From 2dbf5cde8dc0cebd8c3092824ff85d37aaba524c Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 17 Aug 2023 23:10:39 +0200 Subject: [PATCH 01/55] chore(rust): bump MSRV to 1.65 (#10568) --- README.md | 2 +- crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a3d6ed5225b..1fee568eb25c 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ point to the `main` branch of this repo. polars = { git = "https://github.com/pola-rs/polars", rev = "" } ``` -Required Rust version `>=1.62` +Required Rust version `>=1.65`. ## Contributing diff --git a/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs index d9c8927ffdae..efeeb9e183a2 100644 --- a/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs +++ b/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs @@ -168,7 +168,7 @@ macro_rules! minmax_window { }; let empty_overlap = old_last_end <= start; - if entering.is_some_and(|em| $new_is_m(&self.m, em.1) || empty_overlap) { + if entering.map(|em| $new_is_m(&self.m, em.1) || empty_overlap) == Some(true) { // The entering extremum "beats" the previous extremum so we can ignore the overlap self.update_m_and_m_idx(entering.unwrap()); return self.m; From 2a2e25bc8353d4d3de655a078213a8693c6e5b28 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 18 Aug 2023 13:03:24 +0200 Subject: [PATCH 02/55] perf(rust, python): use binary abstraction for atan2 (#10588) --- crates/polars-arrow/src/kernels/atan2.rs | 14 ++++++++++++++ crates/polars-arrow/src/kernels/mod.rs | 1 + .../src/dsl/function_expr/trigonometry.rs | 17 ++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 crates/polars-arrow/src/kernels/atan2.rs diff --git a/crates/polars-arrow/src/kernels/atan2.rs b/crates/polars-arrow/src/kernels/atan2.rs new file mode 100644 index 000000000000..50670dad5696 --- /dev/null +++ b/crates/polars-arrow/src/kernels/atan2.rs @@ -0,0 +1,14 @@ +use arrow::array::PrimitiveArray; +use arrow::compute::arity::binary; +use arrow::types::NativeType; +use num_traits::Float; + +pub fn atan2( + arr_1: &PrimitiveArray, + arr_2: &PrimitiveArray, +) -> PrimitiveArray +where + T: Float, +{ + binary(arr_1, arr_2, arr_1.data_type().clone(), |a, b| a.atan2(b)) +} diff --git a/crates/polars-arrow/src/kernels/mod.rs b/crates/polars-arrow/src/kernels/mod.rs index 64049c4608a5..29f56bc513a3 100644 --- a/crates/polars-arrow/src/kernels/mod.rs +++ b/crates/polars-arrow/src/kernels/mod.rs @@ -4,6 +4,7 @@ use arrow::array::BooleanArray; use arrow::bitmap::utils::BitChunks; #[cfg(feature = "simd")] pub mod agg_mean; +pub mod atan2; #[cfg(feature = "dtype-array")] pub mod comparison; pub mod concatenate; diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 8891fd4daa8f..a07598204858 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -1,6 +1,7 @@ use num::Float; -use polars_arrow::utils::CustomIterTools; +use polars_arrow::kernels::atan2::atan2 as atan2_kernel; use polars_core::export::num; +use polars_core::utils::align_chunks_binary; use super::*; @@ -128,15 +129,13 @@ where Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) } else { + let (ca_1, ca_2) = align_chunks_binary(y, x); + let chunks = ca_1 + .downcast_iter() + 
.zip(ca_2.downcast_iter()) + .map(|(arr_1, arr_2)| atan2_kernel(arr_1, arr_2)); Ok(Some( - y.into_iter() - .zip(x) - .map(|(opt_y, opt_x)| match (opt_y, opt_x) { - (Some(y), Some(x)) => Some(y.atan2(x)), - _ => None, - }) - .collect_trusted::>() - .into_series(), + ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), )) } } From b91cd2d3fa42f80319d23d057fd9691ba474bd61 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 18 Aug 2023 07:54:50 -0400 Subject: [PATCH 03/55] fix(rust): join_asof missing `tolerance` implementation, address edge-cases (#10482) --- .../polars-core/src/frame/asof_join/asof.rs | 95 +++- .../polars-core/src/frame/asof_join/groups.rs | 85 +++- crates/polars-core/src/frame/asof_join/mod.rs | 12 +- py-polars/polars/dataframe/frame.py | 9 +- py-polars/polars/lazyframe/frame.py | 13 +- .../tests/unit/operations/test_join_asof.py | 442 +++++++++++++++++- 6 files changed, 629 insertions(+), 27 deletions(-) diff --git a/crates/polars-core/src/frame/asof_join/asof.rs b/crates/polars-core/src/frame/asof_join/asof.rs index 89a189aca03a..7edbd1372bae 100644 --- a/crates/polars-core/src/frame/asof_join/asof.rs +++ b/crates/polars-core/src/frame/asof_join/asof.rs @@ -1,5 +1,5 @@ use std::fmt::Debug; -use std::ops::Sub; +use std::ops::{Add, Sub}; use num_traits::Bounded; use polars_arrow::index::IdxSize; @@ -182,6 +182,94 @@ pub(super) fn join_asof_backward( out } +pub(super) fn join_asof_nearest_with_tolerance< + T: PartialOrd + Copy + Debug + Sub + Add + Bounded, +>( + left: &[T], + right: &[T], + tolerance: T, +) -> Vec> { + let n_left = left.len(); + + if left.is_empty() { + return Vec::new(); + } + let mut out = Vec::with_capacity(n_left); + if right.is_empty() { + out.extend(std::iter::repeat(None).take(n_left)); + return out; + } + + // If we know the first/last values, we can leave early in many cases. + let n_right = right.len(); + let first_left = left[0]; + let last_left = left[n_left - 1]; + let r_lower_bound = right[0] - tolerance; + let r_upper_bound = right[n_right - 1] + tolerance; + + // If the left and right hand side are disjoint partitions, we can early exit. + if (r_lower_bound > last_left) || (r_upper_bound < first_left) { + out.extend(std::iter::repeat(None).take(n_left)); + return out; + } + + for &val_l in left { + // Detect early exit cases + if val_l < r_lower_bound { + // The left value is too low. + out.push(None); + continue; + } else if val_l > r_upper_bound { + // The left value is too high. Subsequent left values are guaranteed to + // be too high as well, so we can early return. + out.extend(std::iter::repeat(None).take(n_left - out.len())); + return out; + } + + // The left value is contained within the RHS window, so we might have a match. + let mut offset: IdxSize = 0; + let mut dist = tolerance; + let mut found_window = false; + let val_l_upper_bound = val_l + tolerance; + for &val_r in right { + // We haven't reached the window yet; go to next RHS value. + if val_l > val_r + tolerance { + offset += 1; + continue; + } + + // We passed the window without a match, so leave immediately. + if !found_window && (val_r > val_l_upper_bound) { + out.push(None); + break; + } + + // We made it to the window: matches are now possible, start measuring distance. + found_window = true; + let current_dist = if val_l > val_r { + val_l - val_r + } else { + val_r - val_l + }; + if current_dist <= dist { + dist = current_dist; + if offset == (n_right - 1) as IdxSize { + // We're the last item, it's a match. 
+ out.push(Some(offset)); + break; + } + } else { + // We'ved moved farther away, so the last element was the match. + out.push(Some(offset - 1)); + break; + } + offset += 1; + } + } + + out +} + pub(super) fn join_asof_nearest + Bounded>( left: &[T], right: &[T], @@ -189,9 +277,9 @@ pub(super) fn join_asof_nearest + let mut out = Vec::with_capacity(left.len()); let mut offset = 0 as IdxSize; let max_value = ::max_value(); - let mut dist: T = max_value; for &val_l in left { + let mut dist: T = max_value; loop { match right.get(offset as usize) { Some(&val_r) => { @@ -209,9 +297,6 @@ pub(super) fn join_asof_nearest + // distance has increased, we're now farther away, so previous element was closest out.push(Some(offset - 1)); - // reset distance - dist = max_value; - // The next left-item may match on the same item, so we need to rewind the offset offset -= 1; break; diff --git a/crates/polars-core/src/frame/asof_join/groups.rs b/crates/polars-core/src/frame/asof_join/groups.rs index 722e5b779f28..ae27b92fb685 100644 --- a/crates/polars-core/src/frame/asof_join/groups.rs +++ b/crates/polars-core/src/frame/asof_join/groups.rs @@ -1,6 +1,6 @@ use std::fmt::Debug; use std::hash::Hash; -use std::ops::Sub; +use std::ops::{Add, Sub}; use ahash::RandomState; use arrow::types::NativeType; @@ -91,6 +91,69 @@ pub(super) unsafe fn join_asof_forward_with_indirection_and_tolerance< (None, offsets.len()) } +pub(super) unsafe fn join_asof_nearest_with_indirection_and_tolerance< + T: PartialOrd + Copy + Debug + Sub + Add, +>( + val_l: T, + right: &[T], + offsets: &[IdxSize], + tolerance: T, +) -> (Option, usize) { + if offsets.is_empty() { + return (None, 0); + } + + // If we know the first/last values, we can leave early in many cases. + let n_right = offsets.len(); + let r_upper_bound = right[offsets[n_right - 1] as usize] + tolerance; + + // The left value is too high. Subsequent values are guaranteed to be too + // high as well, so we can early return. + if val_l > r_upper_bound { + return (None, n_right - 1); + } + + let mut dist: T = tolerance; + let mut prev_offset: IdxSize = 0; + let mut found_window = false; + for (idx, &offset) in offsets.iter().enumerate() { + let val_r = *right.get_unchecked(offset as usize); + + // We haven't reached the window yet; go to next RHS value. + if val_l > val_r + tolerance { + prev_offset = offset; + continue; + } + + // We passed the window without a match, so leave immediately. + if !found_window && (val_r > val_l + tolerance) { + return (None, n_right - 1); + } + + // We made it to the window: matches are now possible, start measuring distance. + found_window = true; + let current_dist = if val_l > val_r { + val_l - val_r + } else { + val_r - val_l + }; + if current_dist <= dist { + dist = current_dist; + if idx == (n_right - 1) { + // We're the last item, it's a match. + return (Some(offset), idx); + } + prev_offset = offset; + } else { + // We'ved moved farther away, so the last element was the match. + return (Some(prev_offset), idx - 1); + } + } + + // This should be unreachable. 
+ (None, 0) +} + pub(super) unsafe fn join_asof_backward_with_indirection( val_l: T, right: &[T], @@ -167,8 +230,6 @@ pub(super) unsafe fn join_asof_nearest_with_indirection< // candidate for match dist = dist_curr; } else { - // note for a nearest-match, we can re-match on the same val_r next time, - // so we need to rewind the idx by 1 return (Some(prev_offset), idx - 1); } prev_offset = offset; @@ -274,7 +335,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; @@ -408,7 +473,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; @@ -534,7 +603,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; diff --git a/crates/polars-core/src/frame/asof_join/mod.rs b/crates/polars-core/src/frame/asof_join/mod.rs index 30954abb14f9..c496c670d696 100644 --- a/crates/polars-core/src/frame/asof_join/mod.rs +++ b/crates/polars-core/src/frame/asof_join/mod.rs @@ -103,8 +103,16 @@ where ) }, }, - AsofStrategy::Nearest => { - join_asof_nearest(ca.cont_slice().unwrap(), other.cont_slice().unwrap()) + AsofStrategy::Nearest => match tolerance { + None => join_asof_nearest(ca.cont_slice().unwrap(), other.cont_slice().unwrap()), + Some(tolerance) => { + let tolerance = tolerance.extract::().unwrap(); + join_asof_nearest_with_tolerance( + self.cont_slice().unwrap(), + other.cont_slice().unwrap(), + tolerance, + ) + }, }, }; Ok(out) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 736db0185bba..fdf85a85c60c 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5522,7 +5522,7 @@ def join_asof( by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", - tolerance: str | int | float | None = None, + tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, ) -> DataFrame: @@ -5543,7 +5543,8 @@ def join_asof( 'on' key is greater than or equal to the left's key. - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left's key. + is nearest to the left's key. String keys are not currently supported for a + nearest search. The default is "backward". @@ -5571,8 +5572,8 @@ def join_asof( tolerance Numeric tolerance. By setting this the join will only be done if the near keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use the following string - language: + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 7b9f3d0d26df..7079e1ea4f56 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2941,7 +2941,7 @@ def join_asof( by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", - tolerance: str | int | float | None = None, + tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, ) -> Self: @@ -2961,8 +2961,9 @@ def join_asof( - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left's key. + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left's key. String keys are not currently supported for a + nearest search. The default is "backward". @@ -2990,8 +2991,8 @@ def join_asof( tolerance Numeric tolerance. By setting this the join will only be done if the near keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time" you use the following string - language: + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) @@ -3091,6 +3092,8 @@ def join_asof( tolerance_num: float | int | None = None if isinstance(tolerance, str): tolerance_str = tolerance + elif isinstance(tolerance, timedelta): + tolerance_str = _timedelta_to_pl_duration(tolerance) else: tolerance_num = tolerance diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index f30827ac36bf..8021d360e92c 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -1,4 +1,4 @@ -from datetime import date, datetime +from datetime import date, datetime, timedelta from typing import Any import numpy as np @@ -426,6 +426,7 @@ def test_asof_join_sorted_by_group(capsys: Any) -> None: def test_asof_join_nearest() -> None: + # Generic join_asof df1 = pl.DataFrame( { "asof_key": [-1, 1, 2, 4, 6], @@ -435,20 +436,170 @@ def test_asof_join_nearest() -> None: df2 = pl.DataFrame( { - "asof_key": [1, 2, 4, 5], + "asof_key": [-1, 2, 4, 5], "b": [1, 2, 3, 4], } ).sort(by="asof_key") expected = pl.DataFrame( - {"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 3, 4]} + {"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 3, 4]} ) out = df1.join_asof(df2, on="asof_key", strategy="nearest") assert_frame_equal(out, expected) + # Edge case: last item of right matches multiples on left + df1 = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "a": [1, 2, 3, 4, 5], + } + ).set_sorted("asof_key") + + df2 = pl.DataFrame( + { + "asof_key": [1, 2, 3, 10], + "b": [1, 2, 3, 4], + } + ).set_sorted("asof_key") + + expected = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "a": [1, 2, 3, 4, 5], + "b": [4, 4, 4, 4, 4], + } + ) + + out = df1.join_asof(df2, on="asof_key", strategy="nearest") + assert_frame_equal(out, expected) 
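# An illustrative, pure-Python sketch of the matching rule that the tolerance
# tests below exercise: for each left key, take the index of the nearest right
# key, but only when the distance is within `tolerance`. This is a sketch, not
# part of the patch; `nearest_within_tolerance` is a hypothetical helper name,
# both key sequences are assumed sorted ascending, and ties on distance go to
# the later right key (roughly the rule implemented by the Rust kernels above).
def nearest_within_tolerance(left, right, tolerance):
    out = []
    for val_l in left:
        best, best_dist = None, tolerance
        for idx, val_r in enumerate(right):
            dist = abs(val_l - val_r)
            if dist <= best_dist:
                # Still inside the window and at least as close: keep this index.
                best, best_dist = idx, dist
            elif val_r > val_l:
                # Right keys only move farther away from here on; stop scanning.
                break
        out.append(best)
    return out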
+ + +def test_asof_join_nearest_with_tolerance() -> None: + a = b = [1, 2, 3, 4, 5] + + nones = pl.Series([None, None, None, None, None], dtype=pl.Int64) + + # Case 1: complete miss + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [7, 8, 9, 10, 11], + "b": b, + } + ).set_sorted("asof_key") + expected = df1.with_columns(nones.alias("b")) + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + assert_frame_equal(out, expected) + + # Case 2: complete miss in other direction + df1 = pl.DataFrame({"asof_key": [7, 8, 9, 10, 11], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [1, 2, 3, 4, 5], + "b": b, + } + ).set_sorted("asof_key") + expected = df1.with_columns(nones.alias("b")) + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + assert_frame_equal(out, expected) + + # Case 3: match first item + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [6, 7, 8, 9, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([None, None, None, None, 1]).alias("b")) + assert_frame_equal(out, expected) + + # Case 4: match last item + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [-4, -3, -2, -1, 0], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([5, None, None, None, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 5: match multiples, pick closer + df1 = pl.DataFrame( + {"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a} + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [0, 2, 2.4, 3.4, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([2, 2, 4, 4, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 6: use 0 tolerance + df1 = pl.DataFrame( + {"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a} + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [0, 2, 2.4, 3.4, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=0) + expected = df1.with_columns(pl.Series([None, 2, None, None, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 7: test with datetime + df1 = pl.DataFrame( + { + "asof_key": pl.Series( + [ + datetime(2023, 1, 1), + datetime(2023, 1, 2), + datetime(2023, 1, 3), + datetime(2023, 1, 4), + datetime(2023, 1, 6), + ] + ), + "a": a, + } + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": pl.Series( + [ + datetime(2022, 1, 1), + datetime(2022, 1, 2), + datetime(2022, 1, 3), + datetime( + 2023, 1, 2, 21, 30, 0 + ), # should match with 2023-01-02, 2023-01-03, and 2021-01-04 + datetime(2023, 1, 7), + ] + ), + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance="1d4h") + expected = df1.with_columns(pl.Series([None, 4, 4, 4, 5]).alias("b")) + assert_frame_equal(out, expected) + + # Case 8: test using timedelta tolerance + out = df1.join_asof( + df2, on="asof_key", strategy="nearest", tolerance=timedelta(days=1, hours=4) + ) + assert_frame_equal(out, expected) + def 
test_asof_join_nearest_by() -> None: + # Generic join_asof df1 = pl.DataFrame( { "asof_key": [-1, 1, 2, 6, 1], @@ -459,7 +610,7 @@ def test_asof_join_nearest_by() -> None: df2 = pl.DataFrame( { - "asof_key": [1, 2, 5, 1], + "asof_key": [-1, 2, 5, 1], "group": [1, 1, 2, 2], "b": [1, 2, 3, 4], } @@ -469,11 +620,37 @@ def test_asof_join_nearest_by() -> None: { "asof_key": [-1, 1, 2, 6, 1], "group": [1, 1, 1, 2, 2], + "a": [1, 2, 3, 5, 2], + "b": [1, 2, 2, 4, 3], + } + ).sort(by=["group", "asof_key"]) + + # Edge case: last item of right matches multiples on left + df1 = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "group": [1, 1, 1, 2, 2], "a": [1, 2, 3, 2, 5], - "b": [1, 1, 2, 3, 4], } ).sort(by=["group", "asof_key"]) + df2 = pl.DataFrame( + { + "asof_key": [-1, 1, 1, 10], + "group": [1, 1, 2, 2], + "b": [1, 2, 3, 4], + } + ).sort(by=["group", "asof_key"]) + + expected = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "group": [1, 1, 1, 2, 2], + "a": [1, 2, 3, 2, 5], + "b": [2, 2, 2, 4, 4], + } + ) + out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest") assert_frame_equal(out, expected) @@ -503,6 +680,261 @@ def test_asof_join_nearest_by() -> None: assert_frame_equal(out, expected) +def test_asof_join_nearest_by_with_tolerance() -> None: + df1 = pl.DataFrame( + { + "group": [ + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 6, + ], + "asof_key": pl.Series( + [ + 1, + 2, + 3, + 4, + 5, + 7, + 8, + 9, + 10, + 11, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + dtype=pl.Float32, + ), + "a": [ + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + } + ) + + df2 = pl.DataFrame( + { + "group": [ + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 6, + ], + "asof_key": pl.Series( + [ + 7, + 8, + 9, + 10, + 11, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 5, + -3, + -2, + -1, + 0, + 0, + 2, + 2.4, + 3.4, + 10, + -3, + 3, + 8, + 9, + 10, + ], + dtype=pl.Float32, + ), + "b": [ + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + } + ) + + expected = df1.with_columns( + pl.Series( + [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 1, + 5, + None, + None, + 1, + 1, + 2, + 2, + 4, + 4, + None, + None, + 2, + 2, + 2, + None, + ] + ).alias("b") + ) + df1 = df1.sort(by=["group", "asof_key"]) + df2 = df2.sort(by=["group", "asof_key"]) + expected = expected.sort(by=["group", "a"]) + + out = df1.join_asof( + df2, by="group", on="asof_key", strategy="nearest", tolerance=1.0 + ).sort(by=["group", "a"]) + assert_frame_equal(out, expected) + + def test_asof_join_nearest_by_date() -> None: df1 = pl.DataFrame( { From d7bc251c09c59d88ad7a7b8334f95a2054876343 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sat, 19 Aug 2023 10:02:34 +0400 Subject: [PATCH 04/55] chore(python): ensure that `make requirements` fully refreshes unpinned packages/deps (#10591) --- py-polars/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/Makefile b/py-polars/Makefile index dbd16f1428ff..c673c9ed7482 100644 --- a/py-polars/Makefile +++ 
b/py-polars/Makefile @@ -17,9 +17,9 @@ endif .PHONY: requirements requirements: .venv ## Install/refresh all project requirements $(VENV_BIN)/python -m pip install --upgrade pip - $(VENV_BIN)/pip install -r requirements-dev.txt - $(VENV_BIN)/pip install -r requirements-lint.txt - $(VENV_BIN)/pip install -r docs/requirements-docs.txt + $(VENV_BIN)/pip install --upgrade -r requirements-dev.txt + $(VENV_BIN)/pip install --upgrade -r requirements-lint.txt + $(VENV_BIN)/pip install --upgrade -r docs/requirements-docs.txt .PHONY: build build: .venv ## Compile and install Polars for development From 7dff4c3ef75b75eb93606be4cc43e6bb632a11c0 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 19 Aug 2023 10:32:36 +0200 Subject: [PATCH 05/55] refactor(rust): make binary chunkedarray functions DRY (#10607) --- .../src/chunked_array/arithmetic/decimal.rs | 1 + .../src/chunked_array/arithmetic/mod.rs | 16 +- .../src/chunked_array/arithmetic/numeric.rs | 10 +- .../polars-core/src/chunked_array/bitwise.rs | 30 +-- .../src/chunked_array/comparison/mod.rs | 218 ++++++------------ .../src/chunked_array/ops/apply.rs | 2 +- .../src/chunked_array/ops/arity.rs | 153 ++++++++++++ .../src/chunked_array/ops/downcast.rs | 6 + .../src/chunked_array/ops/filter.rs | 97 ++++---- .../src/chunked_array/ops/min_max_binary.rs | 41 +--- .../polars-core/src/chunked_array/ops/mod.rs | 1 + .../src/chunked_array/ops/repeat_by.rs | 66 +++--- .../src/series/arithmetic/borrowed.rs | 101 +++----- .../polars-ops/src/chunked_array/list/sets.rs | 23 +- .../polars-ops/src/series/ops/floor_divide.rs | 9 +- .../polars-plan/src/dsl/function_expr/pow.rs | 9 +- .../src/dsl/function_expr/trigonometry.rs | 8 +- 17 files changed, 378 insertions(+), 413 deletions(-) create mode 100644 crates/polars-core/src/chunked_array/ops/arity.rs diff --git a/crates/polars-core/src/chunked_array/arithmetic/decimal.rs b/crates/polars-core/src/chunked_array/arithmetic/decimal.rs index 4c755ec06d25..9341407d9565 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/decimal.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/decimal.rs @@ -2,6 +2,7 @@ use polars_arrow::compute::arithmetics::decimal; use super::*; use crate::prelude::DecimalChunked; +use crate::utils::align_chunks_binary; // TODO: remove impl ArrayArithmetics for i128 { diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 6460435caccf..6a51424b4e21 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -15,7 +15,7 @@ use polars_arrow::utils::combine_validities_and; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::{align_chunks_binary, align_chunks_binary_owned}; +use crate::utils::align_chunks_binary_owned; pub trait ArrayArithmetics where @@ -148,12 +148,7 @@ impl Add for &BinaryChunked { }; } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(a, b)| concat_binary(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, concat_binary) } } @@ -202,12 +197,7 @@ impl Add for &BooleanChunked { if self.len() == 1 { return rhs.add(self); } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(a, b)| add_boolean(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, add_boolean) } } diff --git 
a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index 6e4216a1ecdc..ebfb835c715f 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -12,15 +12,7 @@ where F: Fn(T::Native, T::Native) -> T::Native, { let mut ca = match (lhs.len(), rhs.len()) { - (a, b) if a == b => { - let (lhs, rhs) = align_chunks_binary(lhs, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| Box::new(kernel(lhs, rhs)) as ArrayRef) - .collect(); - unsafe { lhs.copy_with_chunks(chunks, false, false) } - }, + (a, b) if a == b => arity::binary_mut(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), // broadcast right path (_, 1) => { let opt_rhs = rhs.get(0); diff --git a/crates/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs index 7bf9f6457c61..ea9372ef3adc 100644 --- a/crates/polars-core/src/chunked_array/bitwise.rs +++ b/crates/polars-core/src/chunked_array/bitwise.rs @@ -6,7 +6,6 @@ use polars_arrow::utils::combine_validities_and; use super::arithmetic::arithmetic_helper; use super::*; -use crate::utils::align_chunks_binary; impl BitAnd for &ChunkedArray where @@ -73,12 +72,7 @@ impl BitOr for &BooleanChunked { _ => {}, } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| compute::boolean_kleene::or(lhs, rhs)); - BooleanChunked::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, compute::boolean_kleene::or) } } @@ -123,16 +117,11 @@ impl BitXor for &BooleanChunked { _ => {}, } - let (l, r) = align_chunks_binary(self, rhs); - let chunks = l - .downcast_iter() - .zip(r.downcast_iter()) - .map(|(l_arr, r_arr)| { - let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); - let values = l_arr.values() ^ r_arr.values(); - BooleanArray::from_data_default(values, validity) - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, |l_arr, r_arr| { + let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); + let values = l_arr.values() ^ r_arr.values(); + BooleanArray::from_data_default(values, validity) + }) } } @@ -169,12 +158,7 @@ impl BitAnd for &BooleanChunked { _ => {}, } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| compute::boolean_kleene::and(lhs, rhs)); - BooleanChunked::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, compute::boolean_kleene::and) } } diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 202473428f8a..29350564f8e7 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -14,30 +14,14 @@ use polars_arrow::prelude::FromData; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::align_chunks_binary; impl ChunkedArray where T: PolarsNumericType, { - /// First ensure that the chunks of lhs and rhs match and then iterates over the chunks and applies - /// the comparison operator. 
- fn comparison( - &self, - rhs: &ChunkedArray, - f: impl Fn(&PrimitiveArray, &PrimitiveArray) -> BooleanArray, - ) -> BooleanChunked { - let chunks = self - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(left, right)| f(left, right)); - ChunkedArray::from_chunk_iter("", chunks) - } - // Also includes validity in comparison. pub fn not_equal_and_validity(&self, rhs: &ChunkedArray) -> BooleanChunked { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq_and_validity(x, y)) + arity::binary_mut_with_options(self, rhs, |a, b| comparison::neq_and_validity(a, b), "") } } @@ -64,11 +48,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::eq(a, b), ""), } } @@ -89,11 +69,12 @@ where rhs.is_null() } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::eq_and_validity(x, y)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |a, b| comparison::eq_and_validity(a, b), + "", + ), } } @@ -114,11 +95,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::neq(a, b), ""), } } @@ -139,11 +116,12 @@ where rhs.is_not_null() } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq_and_validity(x, y)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |a, b| comparison::neq_and_validity(a, b), + "", + ), } } @@ -164,11 +142,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::gt(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::gt(a, b), ""), } } @@ -189,11 +163,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::gt_eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::gt_eq(a, b), ""), } } @@ -214,11 +184,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::lt(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::lt(a, b), ""), } } @@ -239,27 +205,11 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::lt_eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::lt_eq(a, b), ""), } } } -fn compare_bools( - lhs: &BooleanChunked, - rhs: &BooleanChunked, - f: impl Fn(&BooleanArray, &BooleanArray) -> BooleanArray, -) -> BooleanChunked { - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(l, r)| f(l, r)); - ChunkedArray::from_chunk_iter("", chunks) -} - impl ChunkCompare<&BooleanChunked> for BooleanChunked { type Item = BooleanChunked; @@ -278,11 +228,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.equal(self), - _ => { - // Same length. 
- let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::eq(lhs, rhs), ""), } } @@ -321,11 +267,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.equal_missing(self), - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::eq_and_validity(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::eq_and_validity(lhs, rhs), + "", + ), } } @@ -345,9 +292,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { }, (1, _) => rhs.not_equal(self), _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::neq(lhs, rhs)) + arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::neq(lhs, rhs), "") }, } } @@ -381,13 +326,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.not_equal_missing(self), - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| { - comparison::neq_and_validity(lhs, rhs) - }) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::neq_and_validity(lhs, rhs), + "", + ), } } @@ -414,11 +358,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::gt(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::gt(lhs, rhs), ""), } } @@ -445,11 +385,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::gt_eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::gt_eq(lhs, rhs), + "", + ), } } @@ -476,11 +417,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::lt(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::lt(lhs, rhs), ""), } } @@ -507,11 +444,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. 
- let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::lt_eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::lt_eq(lhs, rhs), + "", + ), } } } @@ -551,20 +489,6 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked { } } -impl BinaryChunked { - fn comparison( - &self, - rhs: &BinaryChunked, - f: impl Fn(&BinaryArray, &BinaryArray) -> BooleanArray, - ) -> BooleanChunked { - let chunks = self - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(left, right)| f(left, right)); - ChunkedArray::from_chunk_iter("", chunks) - } -} - impl ChunkCompare<&BinaryChunked> for BinaryChunked { type Item = BooleanChunked; @@ -583,8 +507,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", rhs.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::eq) + arity::binary_mut_with_options(self, rhs, comparison::binary::eq, "") } } @@ -603,8 +526,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { rhs.is_null() } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::eq_and_validity) + arity::binary_mut_with_options(self, rhs, comparison::binary::eq_and_validity, "") } } @@ -623,8 +545,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", rhs.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::neq) + arity::binary_mut_with_options(self, rhs, comparison::binary::neq, "") } } @@ -643,8 +564,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { rhs.is_not_null() } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::neq_and_validity) + arity::binary_mut_with_options(self, rhs, comparison::binary::neq_and_validity, "") } } @@ -663,8 +583,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::gt(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::gt, "") } } @@ -683,8 +602,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::gt_eq(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::gt_eq, "") } } @@ -703,8 +621,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::lt(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::lt, "") } } @@ -723,8 +640,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::lt_eq(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::lt_eq, "") } } } @@ -905,12 +821,12 @@ impl ChunkCompare<&StructChunked> for StructChunked { impl ChunkCompare<&ArrayChunked> for ArrayChunked { type Item = BooleanChunked; fn equal(&self, rhs: &ArrayChunked) -> BooleanChunked { - let (a, b) = align_chunks_binary(self, rhs); - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| polars_arrow::kernels::comparison::fixed_size_list_eq(a, b)); - 
ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut_with_options( + self, + rhs, + polars_arrow::kernels::comparison::fixed_size_list_eq, + "", + ) } fn equal_missing(&self, rhs: &ArrayChunked) -> BooleanChunked { @@ -919,12 +835,12 @@ impl ChunkCompare<&ArrayChunked> for ArrayChunked { } fn not_equal(&self, rhs: &ArrayChunked) -> BooleanChunked { - let (a, b) = align_chunks_binary(self, rhs); - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| polars_arrow::kernels::comparison::fixed_size_list_neq(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut_with_options( + self, + rhs, + polars_arrow::kernels::comparison::fixed_size_list_neq, + "", + ) } fn not_equal_missing(&self, rhs: &ArrayChunked) -> Self::Item { diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 533c49f82d43..bff8409d0052 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -13,7 +13,7 @@ use crate::prelude::*; use crate::series::IsSorted; use crate::utils::{CustomIterTools, NoNull}; -fn collect_array>( +pub(super) fn collect_array>( iter: I, validity: Option, ) -> PrimitiveArray { diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs new file mode 100644 index 000000000000..287ab18adb89 --- /dev/null +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -0,0 +1,153 @@ +use arrow::array::{Array, PrimitiveArray}; +use polars_arrow::utils::combine_validities_and; + +use crate::chunked_array::ops::apply::collect_array; +use crate::datatypes::{ + HasUnderlyingArray, PolarsNumericType, StaticArray, StaticallyMatchesPolarsType, +}; +use crate::prelude::{ChunkedArray, PolarsDataType}; +use crate::utils::align_chunks_binary; + +#[inline] +pub fn binary_elementwise( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Option, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + lhs_arr + .iter() + .zip(rhs_arr.iter()) + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)) + .collect::>() + }); + ChunkedArray::from_chunk_iter(lhs.name(), iter) +} + +#[inline] +pub fn binary_elementwise_values( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + ) -> V::Native, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); + + let iter = lhs_arr + .values_iter() + .zip(rhs_arr.values_iter()) + .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); + collect_array(iter, validity) + }); + ChunkedArray::from_chunk_iter(lhs.name(), iter) +} + +/// Applies a 
kernel that produces `Array` types. +#[inline] +pub fn binary_mut_with_options( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + name: &str, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Arr, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); + ChunkedArray::from_chunk_iter(name, iter) +} + +/// Applies a kernel that produces `Array` types. +pub fn binary_mut( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Arr, +{ + binary_mut_with_options(lhs, rhs, op, lhs.name()) +} + +/// Applies a kernel that produces `ArrayRef` of the same type. +/// +/// # Safety +/// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. +#[inline] +pub unsafe fn binary_mut_unchecked_same_type( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + keep_sorted: bool, + keep_fast_explode: bool, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Box, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let chunks = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)) + .collect(); + lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode) +} diff --git a/crates/polars-core/src/chunked_array/ops/downcast.rs b/crates/polars-core/src/chunked_array/ops/downcast.rs index 1b29f939f94c..66197f51efb7 100644 --- a/crates/polars-core/src/chunked_array/ops/downcast.rs +++ b/crates/polars-core/src/chunked_array/ops/downcast.rs @@ -18,6 +18,7 @@ impl<'a, T> Chunks<'a, T> { } } + #[inline] pub fn get(&self, index: usize) -> Option<&'a T> { self.chunks.get(index).map(|arr| { let arr = &**arr; @@ -25,6 +26,7 @@ impl<'a, T> Chunks<'a, T> { }) } + #[inline] pub unsafe fn get_unchecked(&self, index: usize) -> &'a T { let arr = self.chunks.get_unchecked(index); let arr = &**arr; @@ -35,6 +37,7 @@ impl<'a, T> Chunks<'a, T> { self.chunks.len() } + #[inline] pub fn last(&self) -> Option<&'a T> { self.chunks.last().map(|arr| { let arr = &**arr; @@ -48,6 +51,7 @@ impl ChunkedArray where Self: HasUnderlyingArray, { + #[inline] pub fn downcast_iter( &self, ) -> impl Iterator::ArrayT> + DoubleEndedIterator { @@ -62,6 +66,7 @@ where /// The caller must ensure: /// * the length remains correct. /// * the flags (sorted, etc) remain correct. 
+ #[inline] pub unsafe fn downcast_iter_mut( &mut self, ) -> impl Iterator::ArrayT> + DoubleEndedIterator { @@ -72,6 +77,7 @@ where }) } + #[inline] pub fn downcast_chunks(&self) -> Chunks<'_, ::ArrayT> { Chunks::new(&self.chunks) } diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 4f46af830179..408902b3258b 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -5,7 +5,6 @@ use arrow::compute::filter::filter as filter_fn; #[cfg(feature = "object")] use crate::chunked_array::object::builder::ObjectChunkedBuilder; use crate::prelude::*; -use crate::utils::align_chunks_binary; macro_rules! check_filter_len { ($self:expr, $filter:expr) => {{ @@ -30,14 +29,15 @@ where }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -51,14 +51,15 @@ impl ChunkFilter for BooleanChunked { }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -79,15 +80,15 @@ impl ChunkFilter for BinaryChunked { }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -103,19 +104,15 @@ impl ChunkFilter for ListChunked { )), }; } - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - // inner type may be categorical or logical type so we clone the state. - let mut ca = self.clone(); - ca.chunks = chunks; - ca.compute_len(); - Ok(ca) + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -132,19 +129,15 @@ impl ChunkFilter for ArrayChunked { )), }; } - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - // inner type may be categorical or logical type so we clone the state. 
- let mut ca = self.clone(); - ca.chunks = chunks; - ca.compute_len(); - Ok(ca) + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } diff --git a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs index 1270db482f16..bfb8dcc1d014 100644 --- a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs +++ b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs @@ -1,45 +1,20 @@ -use arrow::array::PrimitiveArray; -use polars_arrow::prelude::FromData; - use crate::datatypes::PolarsNumericType; use crate::prelude::*; use crate::series::arithmetic::coerce_lhs_rhs; -use crate::utils::align_chunks_binary; - -fn cmp_binary(left: &ChunkedArray, right: &ChunkedArray, op: F) -> ChunkedArray -where - T: PolarsNumericType, - F: Fn(T::Native, T::Native) -> T::Native, -{ - let (left, right) = align_chunks_binary(left, right); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .map(|(left, right)| { - let values = left - .values() - .iter() - .zip(right.values().iter()) - .map(|(l, r)| op(*l, *r)) - .collect::>(); - PrimitiveArray::from_data_default(values.into(), None) - }); - ChunkedArray::from_chunk_iter(left.name(), chunks) -} fn min_binary(left: &ChunkedArray, right: &ChunkedArray) -> ChunkedArray where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l, r| { + let op = |l: &T::Native, r: &T::Native| { if l < r { - l + *l } else { - r + *r } }; - cmp_binary(left, right, op) + arity::binary_elementwise_values(left, right, op) } fn max_binary(left: &ChunkedArray, right: &ChunkedArray) -> ChunkedArray @@ -47,14 +22,14 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l, r| { + let op = |l: &T::Native, r: &T::Native| { if l > r { - l + *l } else { - r + *r } }; - cmp_binary(left, right, op) + arity::binary_elementwise_values(left, right, op) } pub(crate) fn min_max_binary_series( diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 719ede0ee8a6..3420a041813a 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -13,6 +13,7 @@ pub(crate) mod aggregate; pub(crate) mod any_value; pub(crate) mod append; mod apply; +pub mod arity; mod bit_repr; pub(crate) mod chunkops; pub(crate) mod compare_inner; diff --git a/crates/polars-core/src/chunked_array/ops/repeat_by.rs b/crates/polars-core/src/chunked_array/ops/repeat_by.rs index 31fb62c19971..3932b644ad9f 100644 --- a/crates/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/crates/polars-core/src/chunked_array/ops/repeat_by.rs @@ -30,16 +30,17 @@ where .collect::>(), )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); - - // SAFETY: length of iter is trusted. - let arr = unsafe { - LargeListArray::from_iter_primitive_trusted_len(iter, T::get_dtype().to_arrow()) - }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v.copied()).take(*by as usize)) + }); + + // SAFETY: length of iter is trusted. 
+ unsafe { + LargeListArray::from_iter_primitive_trusted_len(iter, T::get_dtype().to_arrow()) + } + })) } } impl RepeatBy for BooleanChunked { @@ -55,14 +56,14 @@ impl RepeatBy for BooleanChunked { )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_bool_trusted_len(iter) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_bool_trusted_len(iter) } + })) } } impl RepeatBy for Utf8Chunked { @@ -79,14 +80,14 @@ impl RepeatBy for Utf8Chunked { )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_utf8_trusted_len(iter, self.len()) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_utf8_trusted_len(iter, self.len()) } + })) } } @@ -102,13 +103,14 @@ impl RepeatBy for BinaryChunked { .collect::>(), )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); - - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_binary_trusted_len(iter, self.len()) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); + + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_binary_trusted_len(iter, self.len()) } + })) } } diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index 2388bd5e0890..4ca8e3e7e00b 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -118,7 +118,6 @@ pub mod checked { use num_traits::{CheckedDiv, One, ToPrimitive, Zero}; use super::*; - use crate::utils::align_chunks_binary; pub trait NumOpsDispatchCheckedInner: PolarsDataType + Sized { /// Checked integer division. Computes self / rhs, returning None if rhs == 0 or the division results in overflow. @@ -161,24 +160,14 @@ pub mod checked { // Note that the physical type correctness is checked! // The ChunkedArray with the wrong dtype is dropped after this operation let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. 
- .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => l.checked_div(r), - _ => None, - }) + + Ok( + arity::binary_elementwise(lhs, rhs, |opt_l, opt_r| match (opt_l, opt_r) { + (Some(l), Some(r)) => l.checked_div(r), + _ => None, }) - .collect::>() - .into_series()) + .into_series(), + ) } } @@ -187,30 +176,22 @@ pub mod checked { // Safety: // see check_div for chunkedarray let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. - .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => { - if r.is_zero() { - None - } else { - Some(l / r) - } - }, - _ => None, - }) + + Ok( + arity::binary_elementwise::<_, _, Float32Type, _>(lhs, rhs, |opt_l, opt_r| match ( + opt_l, opt_r, + ) { + (Some(l), Some(r)) => { + if r.is_zero() { + None + } else { + Some(l / r) + } + }, + _ => None, }) - .collect::() - .into_series()) + .into_series(), + ) } } @@ -219,30 +200,22 @@ pub mod checked { // Safety: // see check_div let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. - .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => { - if r.is_zero() { - None - } else { - Some(l / r) - } - }, - _ => None, - }) + + Ok( + arity::binary_elementwise::<_, _, Float64Type, _>(lhs, rhs, |opt_l, opt_r| match ( + opt_l, opt_r, + ) { + (Some(l), Some(r)) => { + if r.is_zero() { + None + } else { + Some(l / r) + } + }, + _ => None, }) - .collect::() - .into_series()) + .into_series(), + ) } } diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index 1f3c3a7e8782..a442e820a420 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -10,7 +10,6 @@ use arrow::offset::OffsetsBuffer; use arrow::types::NativeType; use polars_arrow::utils::combine_validities_and; use polars_core::prelude::*; -use polars_core::utils::align_chunks_binary; use polars_core::with_match_physical_integer_type; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -281,16 +280,14 @@ fn array_set_operation( } pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { - let (a, b) = align_chunks_binary(a, b); - - // no downcasting needed as lists - // already have logical types - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| array_set_operation(a, b, set_op).boxed()) - .collect::>(); - - // safety: dtypes are correct - unsafe { a.with_chunks(chunks) } + // we use the unsafe variant because we want to keep the nested logical types type. 
+ unsafe { + arity::binary_mut_unchecked_same_type( + a, + b, + |a, b| array_set_operation(a, b, set_op).boxed(), + false, + false, + ) + } } diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 01e4e038d2c0..1bf7800070f8 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -6,7 +6,6 @@ use polars_core::export::num; use polars_core::prelude::*; #[cfg(feature = "dtype-struct")] use polars_core::series::arithmetic::_struct_arithmetic; -use polars_core::utils::align_chunks_binary; use polars_core::with_match_physical_numeric_polars_type; #[inline] @@ -79,13 +78,7 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) ChunkedArray::full_null(a.name(), a.len()) }; } - let (a, b) = align_chunks_binary(a, b); - - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| floor_div_array(a, b)); - ChunkedArray::from_chunk_iter(a.name(), chunks) + arity::binary_mut(a, b, floor_div_array) } pub fn floor_div_series(a: &Series, b: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index c0e87fb43072..530041707365 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -2,7 +2,6 @@ use num::pow::Pow; use polars_arrow::kernels::pow::pow as pow_kernel; use polars_core::export::num; use polars_core::export::num::{Float, ToPrimitive}; -use polars_core::utils::align_chunks_binary; use super::*; @@ -64,13 +63,9 @@ where exponent.apply(|exp| Pow::pow(base, exp)).into_series(), )) } else { - let (ca_1, ca_2) = align_chunks_binary(base, exponent); - let chunks = ca_1 - .downcast_iter() - .zip(ca_2.downcast_iter()) - .map(|(arr_1, arr_2)| pow_kernel(arr_1, arr_2)); Ok(Some( - ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), + polars_core::chunked_array::ops::arity::binary_mut(base, exponent, pow_kernel) + .into_series(), )) } } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index a07598204858..99cd90cee546 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -1,7 +1,6 @@ use num::Float; use polars_arrow::kernels::atan2::atan2 as atan2_kernel; use polars_core::export::num; -use polars_core::utils::align_chunks_binary; use super::*; @@ -129,13 +128,8 @@ where Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) } else { - let (ca_1, ca_2) = align_chunks_binary(y, x); - let chunks = ca_1 - .downcast_iter() - .zip(ca_2.downcast_iter()) - .map(|(arr_1, arr_2)| atan2_kernel(arr_1, arr_2)); Ok(Some( - ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), + polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), )) } } From 76557703cbf3eb697a6fa893734545d10bf89d1b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 19 Aug 2023 09:33:07 +0100 Subject: [PATCH 06/55] refactor(python): deprecate DataFrame.replace (#10600) --- py-polars/polars/dataframe/frame.py | 20 ++++++++++++++++---- py-polars/tests/unit/dataframe/test_df.py | 3 ++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index fdf85a85c60c..19c09a790fcd 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -83,6 +83,7 @@ 
from polars.utils._wrap import wrap_expr, wrap_ldf, wrap_s from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( + deprecate_function, deprecate_renamed_methods, deprecate_renamed_parameter, ) @@ -1033,6 +1034,11 @@ def _read_ndjson( ) return self + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + self._df.replace(column, new_column._s) + return self + @property def shape(self) -> tuple[int, int]: """ @@ -1701,7 +1707,7 @@ def __setitem__( self.replace_at_idx(col_selection, s) # df["foo"] elif isinstance(col_selection, str): - self.replace(col_selection, s) + self._replace(col_selection, s) else: raise TypeError( f"cannot use `__setitem__` on DataFrame" @@ -4380,6 +4386,13 @@ def frame_equal(self, other: DataFrame, *, null_equal: bool = True) -> bool: """ return self._df.frame_equal(other._df, null_equal) + @deprecate_function( + "DataFrame.replace is deprecated and will be removed in a future version. " + "Please use\n" + " df = df.with_columns(new_column.alias(column_name))\n" + "instead.", + version="0.19.0", + ) def replace(self, column: str, new_column: Series) -> Self: """ Replace a column by a new Series. @@ -4395,7 +4408,7 @@ def replace(self, column: str, new_column: Series) -> Self: -------- >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -4408,8 +4421,7 @@ def replace(self, column: str, new_column: Series) -> Self: └─────┴─────┘ """ - self._df.replace(column, new_column._s) - return self + return self._replace(column, new_column) def slice(self, offset: int, length: int | None = None) -> Self: """ diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index e8b921119912..0258dd0fb1e3 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -511,7 +511,8 @@ def test_sort_maintain_order() -> None: def test_replace() -> None: df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]}) s = pl.Series("c", [True, False, True]) - df.replace("a", s) + with pytest.deprecated_call(): + df.replace("a", s) assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]})) From 1095763fdf5e81f789dddde589161ea97c910cdc Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sat, 19 Aug 2023 19:00:36 +0800 Subject: [PATCH 07/55] fix(rust, python): Fix serialization for categorical chunked. 
(#10609) --- crates/polars-core/src/serde/chunked_array.rs | 2 +- py-polars/tests/unit/test_serde.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs index ad5f31213566..19e8a815b8bf 100644 --- a/crates/polars-core/src/serde/chunked_array.rs +++ b/crates/polars-core/src/serde/chunked_array.rs @@ -144,7 +144,7 @@ impl Serialize for CategoricalChunked { S: Serializer, { { - let mut state = serializer.serialize_map(Some(3))?; + let mut state = serializer.serialize_map(Some(4))?; state.serialize_entry("name", self.name())?; state.serialize_entry("datatype", self.dtype())?; state.serialize_entry("bit_settings", &self.get_flags())?; diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index 5f6c7a45baf1..f5bee11eb449 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -7,6 +7,7 @@ import pytest import polars as pl +from polars import StringCache from polars.testing import assert_frame_equal, assert_series_equal @@ -182,3 +183,10 @@ def inner_df_times2(df: pl.DataFrame) -> pl.DataFrame: q = pickle.loads(b) assert q.collect()["a"].to_list() == [2, 4, 6] + + +@StringCache() +def test_serde_categorical_series_10586() -> None: + s = pl.Series(["a", "b", "b", "a", "c"], dtype=pl.Categorical) + loaded_s = pickle.loads(pickle.dumps(s)) + assert_series_equal(loaded_s, s) From caa6f7a71d6c8568374446f734a22ed34c6ccbdd Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 13:18:24 +0200 Subject: [PATCH 08/55] feat!: Change behavior of `all` - fix Kleene logic implementation for `all`/`any` (#10564) --- Cargo.toml | 10 +- .../src/chunked_array/comparison/mod.rs | 55 ++++--- .../src/dsl/function_expr/boolean.rs | 32 ++-- crates/polars-plan/src/dsl/mod.rs | 36 +++-- py-polars/Cargo.lock | 15 +- py-polars/polars/expr/expr.py | 150 ++++++++++-------- py-polars/polars/expr/list.py | 2 +- .../polars/functions/aggregation/vertical.py | 46 ++++-- py-polars/polars/series/list.py | 2 +- py-polars/polars/series/series.py | 90 +++++++++-- py-polars/src/expr/general.rs | 10 +- py-polars/tests/unit/datatypes/test_bool.py | 6 - py-polars/tests/unit/datatypes/test_list.py | 2 +- py-polars/tests/unit/series/test_all_any.py | 75 +++++++++ 14 files changed, 368 insertions(+), 163 deletions(-) create mode 100644 py-polars/tests/unit/series/test_all_any.py diff --git a/Cargo.toml b/Cargo.toml index 492f2d9dd5a6..735b905a14b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,12 +53,10 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" -# git = "https://github.com/jorgecarleitao/arrow2" -# git = "https://github.com/ritchie46/arrow2" -# rev = "9beabec8cfb5502582d31ab898fdd36e7af0873c" -# path = "../arrow2" -# branch = "duration_json" -version = "0.17.4" +git = "https://github.com/jorgecarleitao/arrow2" +rev = "7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +# branch = "" +# version = "0.17.4" default-features = false features = [ "compute_aggregate", diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 29350564f8e7..31540fbba65a 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -884,31 +884,50 @@ impl Not for BooleanChunked { } impl BooleanChunked { - /// Check if all values are `true` - pub fn all(&self) -> bool { - 
self.downcast_iter().all(compute::boolean::all) - } - - /// Check if any value is `true` + /// Returns whether any of the values in the column are `true`. + /// + /// Null values are ignored. pub fn any(&self) -> bool { self.downcast_iter().any(compute::boolean::any) } - // Three-valued versions which can return None - pub fn all_3val(&self, drop_nulls: bool) -> Option { - if drop_nulls || self.null_count() == 0 { - Some(self.all()) - } else { - None + /// Returns whether all values in the array are `true`. + /// + /// Null values are ignored. + pub fn all(&self) -> bool { + self.downcast_iter().all(compute::boolean::all) + } + + /// Returns whether any of the values in the column are `true`. + /// + /// The output is unknown (`None`) if the array contains any null values and + /// no `true` values. + pub fn any_kleene(&self) -> Option { + let mut result = Some(false); + for arr in self.downcast_iter() { + match compute::boolean_kleene::any(arr) { + Some(true) => return Some(true), + None => result = None, + _ => (), + }; } + result } - pub fn any_3val(&self, drop_nulls: bool) -> Option { - let res = self.any(); - if drop_nulls || res { - Some(res) - } else { - None + + /// Returns whether all values in the column are `true`. + /// + /// The output is unknown (`None`) if the array contains any null values and + /// no `false` values. + pub fn all_kleene(&self) -> Option { + let mut result = Some(true); + for arr in self.downcast_iter() { + match compute::boolean_kleene::all(arr) { + Some(false) => return Some(false), + None => result = None, + _ => (), + }; } + result } } diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index 091922a6fa04..fc2eb6307c19 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -9,11 +9,11 @@ use crate::{map, wrap}; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum BooleanFunction { - All { - drop_nulls: bool, - }, Any { - drop_nulls: bool, + ignore_nulls: bool, + }, + All { + ignore_nulls: bool, }, IsNot, IsNull, @@ -77,8 +77,8 @@ impl From for SpecialEq> { fn from(func: BooleanFunction) -> Self { use BooleanFunction::*; match func { - All { drop_nulls } => map!(all, drop_nulls), - Any { drop_nulls } => map!(any, drop_nulls), + Any { ignore_nulls } => map!(any, ignore_nulls), + All { ignore_nulls } => map!(all, ignore_nulls), IsNot => map!(is_not), IsNull => map!(is_null), IsNotNull => map!(is_not_null), @@ -106,14 +106,22 @@ impl From for FunctionExpr { } } -fn all(s: &Series, drop_nulls: bool) -> PolarsResult { - let boolean = s.bool()?; - Ok(Series::new(s.name(), [boolean.all_3val(drop_nulls)])) +fn any(s: &Series, ignore_nulls: bool) -> PolarsResult { + let ca = s.bool()?; + if ignore_nulls { + Ok(Series::new(s.name(), [ca.any()])) + } else { + Ok(Series::new(s.name(), [ca.any_kleene()])) + } } -fn any(s: &Series, drop_nulls: bool) -> PolarsResult { - let boolean = s.bool()?; - Ok(Series::new(s.name(), [boolean.any_3val(drop_nulls)])) +fn all(s: &Series, ignore_nulls: bool) -> PolarsResult { + let ca = s.bool()?; + if ignore_nulls { + Ok(Series::new(s.name(), [ca.all()])) + } else { + Ok(Series::new(s.name(), [ca.all_kleene()])) + } } fn is_not(s: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index c8e8746eae6a..4e3c8520851a 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ 
b/crates/polars-plan/src/dsl/mod.rs @@ -1652,9 +1652,30 @@ impl Expr { .with_fmt("ewm_var") } - /// Check if any boolean value is `true` - pub fn any(self, drop_nulls: bool) -> Self { - self.apply_private(BooleanFunction::Any { drop_nulls }.into()) + /// Returns whether any of the values in the column are `true`. + /// + /// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls: + /// if the column contains any null values and no `true` values, the output + /// is null. + /// + /// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic + pub fn any(self, ignore_nulls: bool) -> Self { + self.apply_private(BooleanFunction::Any { ignore_nulls }.into()) + .with_function_options(|mut opt| { + opt.auto_explode = true; + opt + }) + } + + /// Returns whether all values in the column are `true`. + /// + /// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls: + /// if the column contains any null values and no `true` values, the output + /// is null. + /// + /// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic + pub fn all(self, ignore_nulls: bool) -> Self { + self.apply_private(BooleanFunction::All { ignore_nulls }.into()) .with_function_options(|mut opt| { opt.auto_explode = true; opt @@ -1668,15 +1689,6 @@ impl Expr { self.map_private(FunctionExpr::ShrinkType) } - /// Check if all boolean values are `true` - pub fn all(self, drop_nulls: bool) -> Self { - self.apply_private(BooleanFunction::All { drop_nulls }.into()) - .with_function_options(|mut opt| { - opt.auto_explode = true; - opt - }) - } - #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count /// Note that it is better to turn multithreaded off in the aggregation context diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 0479f717938a..286f80fe8ae0 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,8 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=7edf5f9e359e0ed02e9d0c6b9318b06964d805f0#7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" dependencies = [ "ahash", "arrow-format", @@ -123,7 +122,7 @@ dependencies = [ "num-traits", "parquet2", "regex", - "regex-syntax 0.6.29", + "regex-syntax", "rustc_version", "simdutf8", "streaming-iterator", @@ -1863,7 +1862,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.7.4", + "regex-syntax", ] [[package]] @@ -1874,15 +1873,9 @@ checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.4", + "regex-syntax", ] -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" version = "0.7.4" diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index f29a813614ab..29735133c9dc 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -317,14 +317,23 @@ def to_physical(self) -> Self: """ return self._from_pyexpr(self._pyexpr.to_physical()) - def any(self, drop_nulls: bool = True) -> Self: + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def any(self, *, ignore_nulls: bool = True) -> Self: 
""" - Check if any boolean value in a Boolean column is `True`. + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. Parameters ---------- - drop_nulls - If False, return None if there are nulls but no Trues. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- @@ -333,51 +342,59 @@ def any(self, drop_nulls: bool = True) -> Self: Examples -------- - >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) - >>> df.select(pl.all().any()) - shape: (1, 2) - ┌──────┬───────┐ - │ TF ┆ FF │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ false │ - └──────┴───────┘ - >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) - >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) - shape: (1, 2) - ┌───────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪══════╡ - │ false ┆ true │ - └───────┴──────┘ - >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) - shape: (1, 2) - ┌──────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪══════╡ - │ null ┆ true │ - └──────┴──────┘ + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.any(drop_nulls)) + return self._from_pyexpr(self._pyexpr.any(ignore_nulls)) - def all(self, drop_nulls: bool = True) -> Self: + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def all(self, *, ignore_nulls: bool = True) -> Self: """ - Check if all boolean values in a Boolean column are `True`. + Return whether all values in the column are ``True``. - This method is an expression - not to be confused with - :func:`polars.all` which is a function to select all columns. + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. Parameters ---------- - drop_nulls - If False, return None if there are any nulls. + ignore_nulls + Ignore null values (default). + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- @@ -387,39 +404,36 @@ def all(self, drop_nulls: bool = True) -> Self: Examples -------- >>> df = pl.DataFrame( - ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } ... 
) >>> df.select(pl.col("*").all()) shape: (1, 3) - ┌──────┬───────┬───────┐ - │ TT ┆ TF ┆ FF │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) - >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) - shape: (1, 2) - ┌───────┬───────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - └───────┴───────┘ - >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) - shape: (1, 2) - ┌──────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪══════╡ - │ null ┆ null │ - └──────┴──────┘ + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.all(drop_nulls)) + return self._from_pyexpr(self._pyexpr.all(ignore_nulls)) def arg_true(self) -> Self: """ diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 6b5c426ebdab..1577353b722e 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -60,7 +60,7 @@ def all(self) -> Expr: │ true │ │ false │ │ false │ - │ false │ + │ true │ │ true │ │ null │ └───────┘ diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index 600b20cc3fca..ef1234b2bc26 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -15,20 +15,24 @@ @overload -def all(exprs: Series) -> bool: # type: ignore[misc] +def all(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] ... @overload def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr] | None = ..., + *more_exprs: IntoExpr, + ignore_nulls: bool = ..., ) -> Expr: ... @deprecate_renamed_parameter("columns", "exprs", version="0.18.7") def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = None, *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr] | None = None, + *more_exprs: IntoExpr, + ignore_nulls: bool = True, ) -> Expr | bool | None: """ Either return an expression representing all columns, or evaluate a bitwise AND operation. @@ -50,6 +54,14 @@ def all( parsed as column names, other non-expression inputs are parsed as literals. *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic See Also -------- @@ -96,27 +108,33 @@ def all( "passing a Series to `all` is deprecated. 
Use `Series.all()` instead.", version="0.18.7", ) - return exprs.all() + return exprs.all(ignore_nulls=ignore_nulls) elif isinstance(exprs, str): - return F.col(exprs).all() + return F.col(exprs).all(ignore_nulls=ignore_nulls) _warn_for_deprecated_horizontal_use("all") return F.all_horizontal(exprs, *more_exprs) @overload -def any(exprs: Series) -> bool: # type: ignore[misc] +def any(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] ... @overload -def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: +def any( + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + ignore_nulls: bool = ..., +) -> Expr: ... @deprecate_renamed_parameter("columns", "exprs", version="0.18.7") def any( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + ignore_nulls: bool = True, ) -> Expr | bool | None: """ Evaluate a bitwise OR operation. @@ -141,6 +159,14 @@ def any( parsed as column names, other non-expression inputs are parsed as literals. *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Examples -------- @@ -167,9 +193,9 @@ def any( "passing a Series to `any` is deprecated. Use `Series.any()` instead.", version="0.18.7", ) - return exprs.any() + return exprs.any(ignore_nulls=ignore_nulls) elif isinstance(exprs, str): - return F.col(exprs).any() + return F.col(exprs).any(ignore_nulls=ignore_nulls) _warn_for_deprecated_horizontal_use("any") return F.any_horizontal(exprs, *more_exprs) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 69a9d8ffec69..7afd49f03a81 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -57,7 +57,7 @@ def all(self) -> Expr: │ true │ │ false │ │ false │ - │ false │ + │ true │ │ true │ │ null │ └───────┘ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 357bd3b87656..fcd637ca0f35 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1229,37 +1229,103 @@ def cbrt(self) -> Series: """ - def any(self, drop_nulls: bool = True) -> bool | None: + @overload + def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: + ... + + @overload + def any(self, *, ignore_nulls: bool) -> bool | None: + ... + + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def any(self, *, ignore_nulls: bool = True) -> bool | None: """ - Check if any boolean value in the column is `True`. + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- - Series - Series of data type :class:`Boolean`. 
+ bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None """ return ( self.to_frame() - .select(F.col(self.name).any(drop_nulls=drop_nulls)) - .to_series() + .select(F.col(self.name).any(ignore_nulls=ignore_nulls)) .item() ) - def all(self, drop_nulls: bool = True) -> bool | None: + @overload + def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: + ... + + @overload + def all(self, *, ignore_nulls: bool) -> bool | None: + ... + + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def all(self, *, ignore_nulls: bool = True) -> bool | None: """ - Check if all boolean values in the column are `True`. + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- - Series - Series of data type :class:`Boolean`. + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None """ return ( self.to_frame() - .select(F.col(self.name).all(drop_nulls=drop_nulls)) - .to_series() + .select(F.col(self.name).all(ignore_nulls=ignore_nulls)) .item() ) diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index f9e8e726b26d..a66318dafe18 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -1145,12 +1145,12 @@ impl PyExpr { .with_fmt("extend") .into() } - fn any(&self, drop_nulls: bool) -> Self { - self.inner.clone().any(drop_nulls).into() - } - fn all(&self, drop_nulls: bool) -> Self { - self.inner.clone().all(drop_nulls).into() + fn any(&self, ignore_nulls: bool) -> Self { + self.inner.clone().any(ignore_nulls).into() + } + fn all(&self, ignore_nulls: bool) -> Self { + self.inner.clone().all(ignore_nulls).into() } fn log(&self, base: f64) -> Self { diff --git a/py-polars/tests/unit/datatypes/test_bool.py b/py-polars/tests/unit/datatypes/test_bool.py index 95c670ae2ebc..34a4b0d589a6 100644 --- a/py-polars/tests/unit/datatypes/test_bool.py +++ b/py-polars/tests/unit/datatypes/test_bool.py @@ -61,9 +61,3 @@ def val(expr: pl.Expr) -> dict[str, list[bool]]: assert val(True | pl.col("x")) == {"literal": [True, True]} assert val(False ^ pl.col("x")) == {"literal": [False, True]} assert val(True ^ pl.col("x")) == {"literal": [True, False]} - - -def test_all_empty() -> None: - s = pl.Series([], dtype=pl.Boolean) - assert s.all() - assert not s.any() diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index d9af8d8296b0..962875782f73 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -311,7 +311,7 @@ def test_list_all() -> None: ] } ).select(pl.col("a").list.all()).to_dict(False) == { - "a": [True, False, True, False, False, False, True] + "a": [True, False, True, False, False, True, 
True] } diff --git a/py-polars/tests/unit/series/test_all_any.py b/py-polars/tests/unit/series/test_all_any.py new file mode 100644 index 000000000000..eae9989dee75 --- /dev/null +++ b/py-polars/tests/unit/series/test_all_any.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import pytest + +import polars as pl + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], False), + ([None], False), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], True), + ], +) +def test_any(data: list[bool | None], expected: bool) -> None: + assert pl.Series(data, dtype=pl.Boolean).any() is expected + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], False), + ([None], None), + ([False], False), + ([False, None], None), + ([True], True), + ([True, None], True), + ], +) +def test_any_kleene(data: list[bool | None], expected: bool | None) -> None: + assert pl.Series(data, dtype=pl.Boolean).any(ignore_nulls=False) is expected + + +def test_any_wrong_dtype() -> None: + with pytest.raises(pl.SchemaError, match="expected `Boolean`"): + pl.Series([0, 1, 0]).any() + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], True), + ([None], True), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], True), + ], +) +def test_all(data: list[bool | None], expected: bool) -> None: + assert pl.Series(data, dtype=pl.Boolean).all() is expected + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], True), + ([None], None), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], None), + ], +) +def test_all_kleene(data: list[bool | None], expected: bool | None) -> None: + assert pl.Series(data, dtype=pl.Boolean).all(ignore_nulls=False) is expected + + +def test_all_wrong_dtype() -> None: + with pytest.raises(pl.SchemaError, match="expected `Boolean`"): + pl.Series([0, 1, 0]).all() From 3beff9a5faf9db48e3ea6ff2235740bcb6adeadc Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 13:32:57 +0200 Subject: [PATCH 09/55] ci: Enforce up-to-date `Cargo.lock` (#10555) --- .github/workflows/lint-py-polars.yml | 2 +- py-polars/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint-py-polars.yml b/.github/workflows/lint-py-polars.yml index 5265f3181b3b..47e05ff3bbfa 100644 --- a/.github/workflows/lint-py-polars.yml +++ b/.github/workflows/lint-py-polars.yml @@ -43,4 +43,4 @@ jobs: run: cargo fmt --all -- --check - name: Run clippy - run: cargo clippy -- -D warnings + run: cargo clippy --locked -- -D warnings diff --git a/py-polars/Makefile b/py-polars/Makefile index c673c9ed7482..e0fcee1b83a5 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -67,7 +67,7 @@ fmt: .venv ## Run autoformatting and linting .PHONY: clippy clippy: ## Run clippy - cargo clippy -- -D warnings + cargo clippy --locked -- -D warnings .PHONY: pre-commit pre-commit: fmt clippy ## Run all code quality checks From 88a8c3c6a7da8342a95956b77f571909b7337762 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Sat, 19 Aug 2023 16:16:24 +0100 Subject: [PATCH 10/55] docs(python): Add docstrings for `Expr.meta` namespace (#10617) Co-authored-by: Liam Brannigan --- py-polars/polars/expr/meta.py | 118 ++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 6 deletions(-) diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 3ac0568bf076..45205b54074d 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -28,19 
+28,63 @@ def __ne__(self, other: ExprMetaNameSpace | Expr) -> bool: # type: ignore[overr return not self == other def eq(self, other: ExprMetaNameSpace | Expr) -> bool: - """Indicate if this expression is the same as another expression.""" + """ + Indicate if this expression is the same as another expression. + + Examples + -------- + >>> foo_bar = pl.col("foo").alias("bar") + >>> foo = pl.col("foo") + >>> foo_bar.meta.eq(foo) + False + >>> foo_bar2 = pl.col("foo").alias("bar") + >>> foo_bar.meta.eq(foo_bar2) + True + + """ return self._pyexpr.meta_eq(other._pyexpr) def ne(self, other: ExprMetaNameSpace | Expr) -> bool: - """Indicate if this expression is NOT the same as another expression.""" + """ + Indicate if this expression is NOT the same as another expression. + + Examples + -------- + >>> foo_bar = pl.col("foo").alias("bar") + >>> foo = pl.col("foo") + >>> foo_bar.meta.ne(foo) + True + >>> foo_bar2 = pl.col("foo").alias("bar") + >>> foo_bar.meta.ne(foo_bar2) + False + + """ return not self.eq(other) def has_multiple_outputs(self) -> bool: - """Whether this expression expands into multiple expressions.""" + """ + Whether this expression expands into multiple expressions. + + Examples + -------- + >>> e = pl.col(["a", "b"]).alias("bar") + >>> e.meta.has_multiple_outputs() + True + + """ return self._pyexpr.meta_has_multiple_outputs() def is_regex_projection(self) -> bool: - """Whether this expression expands to columns that match a regex pattern.""" + """ + Whether this expression expands to columns that match a regex pattern. + + Examples + -------- + >>> e = pl.col("^.*$").alias("bar") + >>> e.meta.is_regex_projection() + True + + """ return self._pyexpr.meta_is_regex_projection() def output_name(self) -> str: @@ -50,6 +94,24 @@ def output_name(self) -> str: It may not always be possible to determine the output name, as that can depend on the schema of the context; in that case this will raise ``ComputeError``. + Examples + -------- + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.output_name() + 'foo' + >>> e_filter = pl.col("foo").filter(pl.col("bar") == 13) + >>> e_filter.meta.output_name() + 'foo' + >>> e_sum_over = pl.sum("foo").over("groups") + >>> e_sum_over.meta.output_name() + 'foo' + >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice.meta.output_name() + 'foo' + >>> e_count = pl.count() + >>> e_count.meta.output_name() + 'count' + """ return self._pyexpr.meta_output_name() @@ -64,15 +126,54 @@ def pop(self) -> list[Expr]: This is not the case when an expression has multiple inputs. For instance in a ``fold`` expression. + Examples + -------- + >>> e = pl.col("foo").alias("bar") + >>> first = e.meta.pop()[0] + >>> first.meta == pl.col("foo") + True + >>> first.meta == pl.col("bar") + False + """ return [wrap_expr(e) for e in self._pyexpr.meta_pop()] def root_names(self) -> list[str]: - """Get a list with the root column name.""" + """ + Get a list with the root column name. 
+ + Examples + -------- + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.root_names() + ['foo', 'bar'] + >>> e_filter = pl.col("foo").filter(pl.col("bar") == 13) + >>> e_filter.meta.root_names() + ['foo', 'bar'] + >>> e_sum_over = pl.sum("foo").over("groups") + >>> e_sum_over.meta.root_names() + ['foo', 'groups'] + >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice.meta.root_names() + ['foo', 'bar'] + + """ return self._pyexpr.meta_root_names() def undo_aliases(self) -> Expr: - """Undo any renaming operation like ``alias`` or ``keep_name``.""" + """ + Undo any renaming operation like ``alias`` or ``keep_name``. + + Examples + -------- + >>> e = pl.col("foo").alias("bar") + >>> e.meta.undo_aliases().meta == pl.col("foo") + True + >>> e = pl.col("foo").sum().over("bar") + >>> e.keep_name().meta.undo_aliases().meta == e + True + + """ return wrap_expr(self._pyexpr.meta_undo_aliases()) def _as_selector(self) -> Expr: @@ -135,6 +236,11 @@ def tree_format(self, return_as_string: bool = False) -> str | None: return_as_string: If True, return as string rather than printing to stdout. + Examples + -------- + >>> e = (pl.col("foo") * pl.col("bar")).sum().over(pl.col("ham")) / 2 + >>> e.meta.tree_format(return_as_string=True) # doctest: +SKIP + """ s = self._pyexpr.meta_tree_format() if return_as_string: From 5bdca8978044eb069fe5c6f45b29ffca1f812d5c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 09:17:20 +0200 Subject: [PATCH 11/55] feat(rust, python): propagate null is in `is_in` and more generic array construction (#10614) --- crates/polars-core/Cargo.toml | 4 +- .../src/chunked_array/ops/apply.rs | 39 +++++++ .../src/chunked_array/ops/bit_repr.rs | 70 ++++++++++-- .../src/chunked_array/ops/is_in.rs | 104 ++++++++---------- .../src/chunked_array/ops/min_max_binary.rs | 12 +- .../polars-core/src/datatypes/from_values.rs | 72 ++++++++++++ crates/polars-core/src/datatypes/mod.rs | 2 + .../polars-core/src/datatypes/static_array.rs | 31 +++++- .../src/frame/groupby/into_groups.rs | 16 +-- .../src/series/arithmetic/borrowed.rs | 2 +- py-polars/tests/unit/operations/test_is_in.py | 24 +++- 11 files changed, 280 insertions(+), 96 deletions(-) create mode 100644 crates/polars-core/src/datatypes/from_values.rs diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index c051e09d7306..becd9894074d 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -59,7 +59,7 @@ lazy = [] # ~40% faster collect, needed until trustedlength iter stabilizes # more fast paths, slower compilation -performant = ["polars-arrow/performant"] +performant = ["polars-arrow/performant", "reinterpret"] # extra utilities for Utf8Chunked strings = ["regex", "polars-arrow/strings", "arrow/compute_substring", "polars-error/regex"] @@ -77,7 +77,7 @@ sort_multiple = [] rows = [] # operations -is_in = [] +is_in = ["reinterpret"] zip_with = [] round_series = [] checked_arithmetic = [] diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index bff8409d0052..48cfdee12d1d 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -13,6 +13,45 @@ use crate::prelude::*; use crate::series::IsSorted; use crate::utils::{CustomIterTools, NoNull}; +impl ChunkedArray +where + T: PolarsDataType, + Self: HasUnderlyingArray, +{ + pub fn apply_values<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + where + U: PolarsDataType, + F: 
FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> K + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.values_iter().map(op); + let array = K::array_from_values_iter(element_iter); + array.with_validity_typed(arr.validity().cloned()) + }); + + ChunkedArray::from_chunk_iter(self.name(), iter) + } + pub fn apply2<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + where + U: PolarsDataType, + F: FnMut( + Option<<::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Option + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.iter().map(op); + K::array_from_iter(element_iter) + }); + + ChunkedArray::from_chunk_iter(self.name(), iter) + } +} + pub(super) fn collect_array>( iter: I, validity: Option, diff --git a/crates/polars-core/src/chunked_array/ops/bit_repr.rs b/crates/polars-core/src/chunked_array/ops/bit_repr.rs index 2724e2813287..2133405b3eb4 100644 --- a/crates/polars-core/src/chunked_array/ops/bit_repr.rs +++ b/crates/polars-core/src/chunked_array/ops/bit_repr.rs @@ -22,17 +22,47 @@ fn reinterpret_chunked_array( ChunkedArray::from_chunk_iter(ca.name(), chunks) } -#[cfg(feature = "performant")] -impl Int16Chunked { - pub(crate) fn reinterpret_unsigned(&self) -> UInt16Chunked { - reinterpret_chunked_array(self) +#[cfg(all(feature = "reinterpret", feature = "dtype-i16", feature = "dtype-u16"))] +impl Reinterpret for Int16Chunked { + fn reinterpret_signed(&self) -> Series { + self.clone().into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt16Type>(self).into_series() } } -#[cfg(feature = "performant")] -impl Int8Chunked { - pub(crate) fn reinterpret_unsigned(&self) -> UInt8Chunked { - reinterpret_chunked_array(self) +#[cfg(all(feature = "reinterpret", feature = "dtype-u16", feature = "dtype-i16"))] +impl Reinterpret for UInt16Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int16Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + self.clone().into_series() + } +} + +#[cfg(all(feature = "reinterpret", feature = "dtype-i8", feature = "dtype-u8"))] +impl Reinterpret for Int8Chunked { + fn reinterpret_signed(&self) -> Series { + self.clone().into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt8Type>(self).into_series() + } +} + +#[cfg(all(feature = "reinterpret", feature = "dtype-u8", feature = "dtype-i8"))] +impl Reinterpret for UInt8Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int8Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + self.clone().into_series() } } @@ -120,7 +150,29 @@ impl Reinterpret for Int32Chunked { } fn reinterpret_unsigned(&self) -> Series { - self.bit_repr_large().into_series() + self.bit_repr_small().into_series() + } +} + +#[cfg(feature = "reinterpret")] +impl Reinterpret for Float32Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int32Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt32Type>(self).into_series() + } +} + +#[cfg(feature = "reinterpret")] +impl Reinterpret for Float64Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int64Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + 
reinterpret_chunked_array::<_, UInt64Type>(self).into_series() } } diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs index f81a5b33e53e..a53b5de54d44 100644 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ b/crates/polars-core/src/chunked_array/ops/is_in.rs @@ -3,42 +3,29 @@ use std::hash::Hash; use crate::prelude::*; use crate::utils::{try_get_supertype, CustomIterTools}; -unsafe fn is_in_helper(ca: &ChunkedArray, other: &Series) -> PolarsResult +fn is_in_helper<'a, T>(ca: &'a ChunkedArray, other: &Series) -> PolarsResult where - T: PolarsNumericType, - P: Eq + Hash + Copy, + T: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, { let mut set = PlHashSet::with_capacity(other.len()); let other = ca.unpack_series_matching_type(other)?; other.downcast_iter().for_each(|iter| { - iter.into_iter().for_each(|opt_val| { - // Safety - // bit sizes are/ should be equal - let ptr = &opt_val.copied() as *const Option as *const Option
; - let opt_val = *ptr; - set.insert(opt_val); + iter.iter().for_each(|opt_val| { + if let Some(v) = opt_val { + set.insert(v); + } }) }); - - let name = ca.name(); - let mut ca: BooleanChunked = ca - .into_iter() - .map(|opt_val| { - // Safety - // bit sizes are/ should be equal - let ptr = &opt_val as *const Option as *const Option
; - let opt_val = *ptr; - set.contains(&opt_val) - }) - .collect_trusted(); - ca.rename(name); - Ok(ca) + Ok(ca.apply_values(|val| set.contains(&val))) } impl IsIn for ChunkedArray where - T: PolarsNumericType, + T: PolarsIntegerType, + T::Native: Hash + Eq, { fn is_in(&self, other: &Series) -> PolarsResult { // We check implicitly cast to supertype here @@ -88,24 +75,7 @@ where let right = other.cast(&st)?; return left.is_in(&right); } - // now that the types are equal, we coerce every 32 bit array to u32 - // and every 64 bit array to u64 (including floats) - // this allows hashing them and greatly reduces the number of code paths. - match self.dtype() { - DataType::UInt64 | DataType::Int64 | DataType::Float64 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt32 | DataType::Int32 | DataType::Float32 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt8 | DataType::Int8 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt16 | DataType::Int16 => unsafe { - is_in_helper::(self, other) - }, - dt => polars_bail!(opq = is_in, dt), - } + is_in_helper(self, other) } } .map(|mut ca| { @@ -114,6 +84,26 @@ where }) } } + +impl IsIn for Float32Chunked { + fn is_in(&self, other: &Series) -> PolarsResult { + let other = other.cast(&DataType::Float32)?; + let other = other.f32().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = self.reinterpret_unsigned(); + ca.is_in(&other) + } +} +impl IsIn for Float64Chunked { + fn is_in(&self, other: &Series) -> PolarsResult { + let other = other.cast(&DataType::Float64)?; + let other = other.f64().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = self.reinterpret_unsigned(); + ca.is_in(&other) + } +} + impl IsIn for Utf8Chunked { fn is_in(&self, other: &Series) -> PolarsResult { match other.dtype() { @@ -209,20 +199,7 @@ impl IsIn for BinaryChunked { Ok(ca) } DataType::Binary => { - let mut set = PlHashSet::with_capacity(other.len()); - - let other = other.binary()?; - other.downcast_iter().for_each(|iter| { - iter.into_iter().for_each(|opt_val| { - set.insert(opt_val); - }) - }); - let mut ca: BooleanChunked = self - .into_iter() - .map(|opt_val| set.contains(&opt_val)) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) + is_in_helper(self, other) } _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), } @@ -363,11 +340,11 @@ impl IsIn for StructChunked { } let mut anyvalues = Vec::with_capacity(other.len() * other.fields().len()); - // Safety: + // SAFETY: // the iterator is unsafe as the lifetime is tied to the iterator // so we copy to an owned buffer first - other.into_iter().for_each(|val| { - anyvalues.extend_from_slice(val); + other.into_iter().for_each(|vals| { + anyvalues.extend_from_slice(vals); }); // then we fill the set @@ -382,7 +359,14 @@ impl IsIn for StructChunked { // and then we check for membership let mut ca: BooleanChunked = self_ca .into_iter() - .map(|vals| set.contains(&vals)) + .map(|vals| { + // If all rows are null we see the struct row as missing. 
+ if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { + Some(set.contains(&vals)) + } else { + None + } + }) .collect(); ca.rename(self.name()); Ok(ca) diff --git a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs index bfb8dcc1d014..279a4ae0719f 100644 --- a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs +++ b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs @@ -7,11 +7,11 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l: &T::Native, r: &T::Native| { + let op = |l: T::Native, r: T::Native| { if l < r { - *l + l } else { - *r + r } }; arity::binary_elementwise_values(left, right, op) @@ -22,11 +22,11 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l: &T::Native, r: &T::Native| { + let op = |l: T::Native, r: T::Native| { if l > r { - *l + l } else { - *r + r } }; arity::binary_elementwise_values(left, right, op) diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs new file mode 100644 index 000000000000..7674adbec0e1 --- /dev/null +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -0,0 +1,72 @@ +use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; +use polars_arrow::array::utf8::Utf8FromIter; +use polars_arrow::trusted_len::TrustedLen; + +use crate::prelude::StaticArray; + +pub trait ArrayFromElementIter +where + Self: Sized, +{ + type ArrayType: StaticArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType; + + fn array_from_values_iter>(iter: I) -> Self::ArrayType; +} + +impl ArrayFromElementIter for bool { + type ArrayType = BooleanArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::from_trusted_len_values_iter_unchecked(iter) } + } +} + +macro_rules! 
impl_primitive { + ($tp:ty) => { + impl ArrayFromElementIter for $tp { + type ArrayType = PrimitiveArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } + } + } + }; +} + +impl_primitive!(u8); +impl_primitive!(u16); +impl_primitive!(u32); +impl_primitive!(u64); +impl_primitive!(i8); +impl_primitive!(i16); +impl_primitive!(i32); +impl_primitive!(i64); + +impl ArrayFromElementIter for &str { + type ArrayType = Utf8Array; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + let len = iter.size_hint().0; + Utf8Array::from_values_iter(iter, len, len * 24) + } +} diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index bdb5221d6b4d..120820365207 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -12,6 +12,7 @@ mod aliases; mod any_value; mod dtype; mod field; +mod from_values; mod static_array; mod time_unit; @@ -31,6 +32,7 @@ use arrow::types::simd::Simd; use arrow::types::NativeType; pub use dtype::*; pub use field::*; +pub use from_values::ArrayFromElementIter; use num_traits::{Bounded, FromPrimitive, Num, NumCast, Zero}; use polars_arrow::data_types::IsFloat; #[cfg(feature = "serde")] diff --git a/crates/polars-core/src/datatypes/static_array.rs b/crates/polars-core/src/datatypes/static_array.rs index 46dedae311bb..ecf54b7179e7 100644 --- a/crates/polars-core/src/datatypes/static_array.rs +++ b/crates/polars-core/src/datatypes/static_array.rs @@ -1,4 +1,5 @@ use arrow::bitmap::utils::{BitmapIter, ZipValidity}; +use arrow::bitmap::Bitmap; #[cfg(feature = "object")] use crate::chunked_array::object::ObjectArray; @@ -16,18 +17,22 @@ pub trait StaticArray: Array { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter>; fn values_iter(&self) -> Self::ValueIterT<'_>; + fn with_validity_typed(self, validity: Option) -> Self; } impl StaticArray for PrimitiveArray { - type ValueT<'a> = &'a T; - type ValueIterT<'a> = std::slice::Iter<'a, T>; + type ValueT<'a> = T; + type ValueIterT<'a> = std::iter::Copied>; fn values_iter(&self) -> Self::ValueIterT<'_> { - self.values_iter() + self.values_iter().copied() } fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { - self.iter() + ZipValidity::new_with_validity(self.values().iter().copied(), self.validity()) + } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) } } @@ -42,6 +47,9 @@ impl StaticArray for BooleanArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for Utf8Array { @@ -55,6 +63,9 @@ impl StaticArray for Utf8Array { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for BinaryArray { @@ -68,6 +79,9 @@ impl StaticArray for BinaryArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, 
BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for ListArray { @@ -81,6 +95,9 @@ impl StaticArray for ListArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } #[cfg(feature = "dtype-array")] @@ -95,6 +112,9 @@ impl StaticArray for FixedSizeListArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } #[cfg(feature = "object")] @@ -109,4 +129,7 @@ impl StaticArray for ObjectArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { todo!() } + fn with_validity_typed(self, _validity: Option) -> Self { + todo!() + } } diff --git a/crates/polars-core/src/frame/groupby/into_groups.rs b/crates/polars-core/src/frame/groupby/into_groups.rs index 4c0141ca41f7..5518f1a760d0 100644 --- a/crates/polars-core/src/frame/groupby/into_groups.rs +++ b/crates/polars-core/src/frame/groupby/into_groups.rs @@ -172,30 +172,30 @@ where let ca = self.bit_repr_small(); num_groups_proxy(&ca, multithreaded, sorted) }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i8", feature = "dtype-u8"))] DataType::Int8 => { // convince the compiler that we are this type. let ca: &Int8Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; - let ca = ca.reinterpret_unsigned(); - num_groups_proxy(&ca, multithreaded, sorted) + let s = ca.reinterpret_unsigned(); + return s.group_tuples(multithreaded, sorted); }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i8", feature = "dtype-u8"))] DataType::UInt8 => { // convince the compiler that we are this type. let ca: &UInt8Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; num_groups_proxy(ca, multithreaded, sorted) }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i16", feature = "dtype-u16"))] DataType::Int16 => { // convince the compiler that we are this type. let ca: &Int16Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; - let ca = ca.reinterpret_unsigned(); - num_groups_proxy(&ca, multithreaded, sorted) + let s = ca.reinterpret_unsigned(); + return s.group_tuples(multithreaded, sorted); }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i16", feature = "dtype-u16"))] DataType::UInt16 => { // convince the compiler that we are this type. 
let ca: &UInt16Chunked = unsafe { diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index 4ca8e3e7e00b..bc71816331f7 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -163,7 +163,7 @@ pub mod checked { Ok( arity::binary_elementwise(lhs, rhs, |opt_l, opt_r| match (opt_l, opt_r) { - (Some(l), Some(r)) => l.checked_div(r), + (Some(l), Some(r)) => l.checked_div(&r), _ => None, }) .into_series(), diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 7e7a23df0c63..9066f49aa171 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -44,14 +44,9 @@ def test_is_in_empty_list_4639() -> None: df = pl.DataFrame({"a": [1, None]}) empty_list: list[int] = [] - print(df.with_columns(pl.col("a").is_in(empty_list))) assert df.with_columns([pl.col("a").is_in(empty_list).alias("a_in_list")]).to_dict( False - ) == {"a": [1, None], "a_in_list": [False, False]} - # df = pl.DataFrame() - # assert df.with_columns( - # [pl.lit(None).cast(pl.Int64).is_in(empty_list).alias("in_empty_list")] - # ).to_dict(False) == {"in_empty_list": [False]} + ) == {"a": [1, None], "a_in_list": [False, None]} def test_is_in_struct() -> None: @@ -71,6 +66,23 @@ def test_is_in_struct() -> None: } +def test_is_in_null_prop() -> None: + assert pl.Series([None], dtype=pl.Float32).is_in(pl.Series([42])).item() is None + assert ( + pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Float32})) + .is_in(pl.Series([{"a": 42}])) + .item() + is None + ) + assert pl.Series([None], dtype=pl.Boolean).is_in(pl.Series([42])).item() is None + assert ( + pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Boolean})) + .is_in(pl.Series([{"a": 42}])) + .item() + is None + ) + + def test_is_in_df() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [ From 046588fed62790c56ffd7474c62a6d5bf033a760 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 10:12:02 +0200 Subject: [PATCH 12/55] fix(rust, python): fix int/float downcast in `is_in` (#10620) --- .../src/logical_plan/optimizer/type_coercion/mod.rs | 6 +++++- py-polars/tests/unit/operations/test_is_in.py | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs index 1ae3d357438c..5ae5f8c9eda1 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs @@ -369,7 +369,11 @@ impl OptimizationRule for TypeCoercionRule { // if right is another type, we cast it to left // we do not use super-type as an `is_in` operation should not // cast the whole column implicitly. - (a, b) if a != b => { + (a, b) + if a != b + // For integer/ float comparison we let them use supertypes. 
+ && !(a.is_integer() && b.is_float()) => + { AExpr::Cast { expr: other_node, data_type: type_left, diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 9066f49aa171..7eefd3758921 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -83,6 +83,10 @@ def test_is_in_null_prop() -> None: ) +def test_is_in_9070() -> None: + assert not pl.Series([1]).is_in(pl.Series([1.99])).item() + + def test_is_in_df() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [ From c3faa051a44ebebeb00dd14e1a1647e8b1979ef1 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sun, 20 Aug 2023 16:34:28 +0800 Subject: [PATCH 13/55] feature(rust, python): Multiple groupby key supports list type (#10615) --- .../src/chunked_array/ops/compare_inner.rs | 39 +++++ .../src/chunked_array/ops/take/take_random.rs | 12 +- .../polars-core/src/hashing/vector_hasher.rs | 133 ++++++++++++++---- .../src/series/implementations/binary.rs | 4 +- .../src/series/implementations/boolean.rs | 4 +- .../src/series/implementations/categorical.rs | 4 +- .../src/series/implementations/dates_time.rs | 4 +- .../src/series/implementations/datetime.rs | 4 +- .../src/series/implementations/duration.rs | 4 +- .../src/series/implementations/floats.rs | 4 +- .../src/series/implementations/list.rs | 24 ++++ .../src/series/implementations/mod.rs | 4 +- .../src/series/implementations/object.rs | 4 +- .../src/series/implementations/utf8.rs | 4 +- crates/polars-error/src/lib.rs | 5 + py-polars/tests/unit/datatypes/test_list.py | 19 +++ .../tests/unit/operations/test_unique.py | 9 ++ py-polars/tests/unit/test_errors.py | 5 - 18 files changed, 228 insertions(+), 58 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/compare_inner.rs b/crates/polars-core/src/chunked_array/ops/compare_inner.rs index 59d064a506c9..a637d423f705 100644 --- a/crates/polars-core/src/chunked_array/ops/compare_inner.rs +++ b/crates/polars-core/src/chunked_array/ops/compare_inner.rs @@ -12,6 +12,7 @@ use crate::chunked_array::ops::take::take_random::{ #[cfg(feature = "object")] use crate::chunked_array::ops::take::take_random::{ObjectTakeRandom, ObjectTakeRandomSingleChunk}; use crate::prelude::*; +use crate::utils::Wrap; pub trait PartialEqInner: Send + Sync { /// Safety: @@ -77,6 +78,18 @@ impl_traits!(BoolTakeRandomSingleChunk<'_>); impl_traits!(NumTakeRandomSingleChunk<'_, T>, T); impl_traits!(NumTakeRandomChunked<'_, T>, T); +impl<'a> PartialEqInner for ListTakeRandomSingleChunk<'a> { + unsafe fn eq_element_unchecked(&self, idx_a: usize, idx_b: usize) -> bool { + self.get_unchecked(idx_a).map(Wrap) == self.get_unchecked(idx_b).map(Wrap) + } +} + +impl<'a> PartialEqInner for ListTakeRandom<'a> { + unsafe fn eq_element_unchecked(&self, idx_a: usize, idx_b: usize) -> bool { + self.get_unchecked(idx_a).map(Wrap) == self.get_unchecked(idx_b).map(Wrap) + } +} + impl PartialEqInner for NumTakeRandomCont<'_, T> where T: Copy + PartialEq + Sync, @@ -123,6 +136,32 @@ where } } +impl<'a> IntoPartialEqInner<'a> for &'a ListChunked { + fn into_partial_eq_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = ListTakeRandomSingleChunk { + arr, + name: self.name(), + }; + Box::new(t) + }, + _ => { + let name = self.name(); + let inner_type = self.inner_dtype().to_physical(); + let t = ListTakeRandom { + inner_type, + name, + chunks: 
self.downcast_iter().collect(), + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + }, + } + } +} + impl<'a> IntoPartialEqInner<'a> for &'a Utf8Chunked { fn into_partial_eq_inner(self) -> Box { match self.chunks.len() { diff --git a/crates/polars-core/src/chunked_array/ops/take/take_random.rs b/crates/polars-core/src/chunked_array/ops/take/take_random.rs index 601ad468a330..43feaca04576 100644 --- a/crates/polars-core/src/chunked_array/ops/take/take_random.rs +++ b/crates/polars-core/src/chunked_array/ops/take/take_random.rs @@ -540,10 +540,10 @@ impl<'a> TakeRandom for BoolTakeRandomSingleChunk<'a> { } pub struct ListTakeRandom<'a> { - inner_type: DataType, - name: &'a str, - chunks: Vec<&'a ListArray>, - chunk_lens: Vec, + pub(crate) inner_type: DataType, + pub(crate) name: &'a str, + pub(crate) chunks: Vec<&'a ListArray>, + pub(crate) chunk_lens: Vec, } impl<'a> TakeRandom for ListTakeRandom<'a> { @@ -579,8 +579,8 @@ impl<'a> TakeRandom for ListTakeRandom<'a> { } pub struct ListTakeRandomSingleChunk<'a> { - arr: &'a ListArray, - name: &'a str, + pub(crate) arr: &'a ListArray, + pub(crate) name: &'a str, } impl<'a> TakeRandom for ListTakeRandomSingleChunk<'a> { diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index fd635e858d9c..537e71ee9e4c 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,6 +1,8 @@ use arrow::bitmap::utils::get_bit_unchecked; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; +#[cfg(feature = "groupby_list")] +use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::utils::CustomIterTools; use rayon::prelude::*; use xxhash_rust::xxh3::xxh3_64_with_seed; @@ -22,12 +24,16 @@ pub trait VecHash { /// Compute the hash for all values in the array. /// /// This currently only works with the AHash RandomState hasher builder. - fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) { - unimplemented!() + fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + polars_bail!(un_impl = vec_hash); } - fn vec_hash_combine(&self, _random_state: RandomState, _hashes: &mut [u64]) { - unimplemented!() + fn vec_hash_combine( + &self, + _random_state: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + polars_bail!(un_impl = vec_hash_combine); } } @@ -138,12 +144,18 @@ where macro_rules! 
vec_hash_int { ($ca:ident) => { impl VecHash for $ca { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - integer_vec_hash(self, random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + integer_vec_hash(self, random_state, buf); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - integer_vec_hash_combine(self, random_state, hashes) + fn vec_hash_combine( + &self, + random_state: RandomState, + hashes: &mut [u64], + ) -> PolarsResult<()> { + integer_vec_hash_combine(self, random_state, hashes); + Ok(()) } } }; @@ -159,12 +171,14 @@ vec_hash_int!(UInt16Chunked); vec_hash_int!(UInt8Chunked); impl VecHash for Utf8Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - self.as_binary().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.as_binary().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.as_binary().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.as_binary().vec_hash_combine(random_state, hashes)?; + Ok(()) } } @@ -183,14 +197,15 @@ pub fn _hash_binary_array(arr: &BinaryArray, random_state: RandomState, buf } impl VecHash for BinaryChunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { buf.clear(); buf.reserve(self.len()); self.downcast_iter() .for_each(|arr| _hash_binary_array(arr, random_state.clone(), buf)); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { let null_h = get_null_hash_value(random_state); let mut offset = 0; @@ -222,11 +237,12 @@ impl VecHash for BinaryChunked { } offset += arr.len(); }); + Ok(()) } } impl VecHash for BooleanChunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { buf.clear(); buf.reserve(self.len()); let true_h = random_state.hash_one(true); @@ -243,9 +259,10 @@ impl VecHash for BooleanChunked { })) } }); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { let true_h = random_state.hash_one(true); let false_h = random_state.hash_one(false); let null_h = get_null_hash_value(random_state); @@ -283,24 +300,83 @@ impl VecHash for BooleanChunked { } offset += arr.len(); }); + Ok(()) } } impl VecHash for Float32Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - self.bit_repr_small().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.bit_repr_small().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.bit_repr_small().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.bit_repr_small() + .vec_hash_combine(random_state, hashes)?; + Ok(()) } } impl VecHash for Float64Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - 
self.bit_repr_large().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.bit_repr_large().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.bit_repr_large().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.bit_repr_large() + .vec_hash_combine(random_state, hashes)?; + Ok(()) + } +} + +#[cfg(feature = "groupby_list")] +impl VecHash for ListChunked { + fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + polars_ensure!( + self.inner_dtype().to_physical().is_numeric(), + ComputeError: "grouping on list type is only allowed if the inner type is numeric" + ); + _buf.clear(); + _buf.reserve(self.len()); + let null_h = get_null_hash_value(_random_state.clone()); + + for arr in self.downcast_iter() { + _buf.extend( + numeric_list_bytes_iter(arr)?.map(|opt_bytes| match opt_bytes { + Some(s) => xxh3_64_with_seed(s, null_h), + None => null_h, + }), + ) + } + Ok(()) + } + + fn vec_hash_combine( + &self, + _random_state: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + polars_ensure!( + self.inner_dtype().to_physical().is_numeric(), + ComputeError: "grouping on list type is only allowed if the inner type is numeric" + ); + + let null_h = get_null_hash_value(_random_state); + + let mut offset = 0; + self.downcast_iter().try_for_each(|arr| { + numeric_list_bytes_iter(arr)? + .zip(&mut _hashes[offset..]) + .for_each(|(opt_bytes, h)| { + let l = match opt_bytes { + Some(s) => xxh3_64_with_seed(s, null_h), + None => null_h, + }; + *h = _boost_hash_combine(l, *h) + }); + offset += arr.len(); + PolarsResult::Ok(()) + })?; + Ok(()) } } @@ -309,7 +385,7 @@ impl VecHash for ObjectChunked where T: PolarsObject, { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { // Note that we don't use the no null branch! This can break in unexpected ways. // for instance with threading we split an array in n_threads, this may lead to // splits that have no nulls and splits that have nulls. 
Then one array is hashed with @@ -326,9 +402,11 @@ where hasher.finish() })) }); + + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { self.apply_to_slice( |opt_v, h| { let mut hasher = random_state.build_hasher(); @@ -336,7 +414,8 @@ where _boost_hash_combine(hasher.finish(), *h) }, hashes, - ) + ); + Ok(()) } } diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 2e1e8f42925c..25f9a44c17ab 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -50,12 +50,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index ce46082adcc9..11baec69aaf0 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -51,12 +51,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 5e134e0273be..4430a1db84a7 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -98,12 +98,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.logical().vec_hash(random_state, buf); + self.0.logical().vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.logical().vec_hash_combine(build_hasher, hashes); + self.0.logical().vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index 886d9198cd57..477af2cd0e7f 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -77,7 +77,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -86,7 +86,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index d070e75e8a8c..cab324d2fcbe 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -75,12 +75,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index a1738ebb9fd5..5caa0deacaa1 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -80,12 +80,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index b0a09e1e0e1b..dd28d742ca21 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -76,7 +76,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -85,7 +85,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index f5c55a051950..bb4fc35987ba 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -1,8 +1,12 @@ use std::any::Any; use std::borrow::Cow; +#[cfg(feature = "groupby_list")] +use ahash::RandomState; + use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; +use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; use crate::frame::groupby::*; @@ -48,6 +52,26 @@ impl private::PrivateSeries for SeriesWrap { fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) } + + #[cfg(feature = "groupby_list")] + fn vec_hash(&self, _build_hasher: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + self.0.vec_hash(_build_hasher, _buf)?; + Ok(()) + } + + #[cfg(feature = "groupby_list")] + fn vec_hash_combine( + &self, + _build_hasher: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + self.0.vec_hash_combine(_build_hasher, _hashes)?; + Ok(()) + } + + fn into_partial_eq_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_eq_inner() + } } impl SeriesTrait for SeriesWrap { diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index cfdbb490a71e..9ca9bce80b91 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -139,7 +139,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -148,7 +148,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index 3a8b8594d41a..01eb9fdbe0e9 100644 --- a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -55,12 +55,12 @@ where } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index ddee3fb82253..a25684c1ffd1 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -51,12 +51,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index 730f484123d0..3241c9faa54d 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -140,6 +140,11 @@ macro_rules! 
polars_err { op = concat!("`", stringify!($op), "`"), got = $arg, expected = $expected ) }; + (un_impl = $op:ident) => { + $crate::polars_err!( + InvalidOperation: "{} operation is not implemented.", concat!("`", stringify!($op), "`") + ) + }; (op = $op:expr, $arg:expr) => { $crate::polars_err!( InvalidOperation: "{} operation not supported for dtype `{}`", $op, $arg diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 962875782f73..70f2117caad4 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -190,6 +190,25 @@ def test_groupby_list_column() -> None: } +def test_groupby_multiple_keys_contains_list_column() -> None: + df = ( + pl.DataFrame( + { + "a": ["x", "x", "y", "y"], + "b": [[1, 2], [1, 2], [3, 4, 5], [6]], + "c": [3, 2, 1, 0], + } + ) + .groupby(["a", "b"], maintain_order=True) + .agg(pl.all()) + ) + assert df.to_dict(False) == { + "a": ["x", "y", "y"], + "b": [[1, 2], [3, 4, 5], [6]], + "c": [[3, 2], [1], [0]], + } + + def test_fast_explode_flag() -> None: df1 = pl.DataFrame({"values": [[[1, 2]]]}) assert df1.clone().vstack(df1)["values"].flags["FAST_EXPLODE"] diff --git a/py-polars/tests/unit/operations/test_unique.py b/py-polars/tests/unit/operations/test_unique.py index 36ba61c701d3..1162a2b4e728 100644 --- a/py-polars/tests/unit/operations/test_unique.py +++ b/py-polars/tests/unit/operations/test_unique.py @@ -26,3 +26,12 @@ def test_unique_predicate_pd() -> None: ) expected = pl.DataFrame({"x": ["abc"], "y": ["xxx"], "z": [True]}) assert_frame_equal(result, expected) + + +def test_unique_on_list_df() -> None: + assert pl.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [[1, 1], [2], [3], [4, 4], [4, 4]]} + ).unique(maintain_order=True).to_dict(False) == { + "a": [1, 2, 3, 4], + "b": [[1, 1], [2], [3], [4, 4]], + } diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index fc0b5b6d6c06..a697301353d4 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -209,11 +209,6 @@ def test_error_on_double_agg() -> None: ) -def test_unique_on_list_df() -> None: - with pytest.raises(pl.InvalidOperationError): - pl.DataFrame({"a": [1, 2, 3, 4], "b": [[1, 1], [2], [3], [4, 4]]}).unique() - - def test_filter_not_of_type_bool() -> None: df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']}) with pytest.raises( From c3f01f8244d96738b408ac54289adb06219e24cf Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 12:09:08 +0200 Subject: [PATCH 14/55] refactor(rust): remove unused apply functions and add fallible generic apply functions (#10621) --- Cargo.toml | 2 +- .../src/chunked_array/arithmetic/mod.rs | 2 +- .../src/chunked_array/arithmetic/numeric.rs | 8 +- .../chunked_array/logical/categorical/mod.rs | 2 +- crates/polars-core/src/chunked_array/mod.rs | 24 +- .../polars-core/src/chunked_array/ops/abs.rs | 2 +- .../src/chunked_array/ops/aggregate/var.rs | 6 +- .../src/chunked_array/ops/apply.rs | 297 ++++-------------- .../src/chunked_array/ops/is_in.rs | 4 +- .../polars-core/src/chunked_array/ops/mod.rs | 39 +-- .../polars-core/src/datatypes/from_values.rs | 130 +++++++- crates/polars-core/src/functions.rs | 4 +- .../src/series/arithmetic/borrowed.rs | 26 +- crates/polars-core/src/series/ops/round.rs | 16 +- .../src/chunked_array/binary/namespace.rs | 8 +- .../src/chunked_array/strings/json_path.rs | 4 +- .../src/chunked_array/strings/namespace.rs | 8 +- 
crates/polars-ops/src/frame/pivot/mod.rs | 2 +- .../polars-ops/src/series/ops/floor_divide.rs | 8 +- crates/polars-ops/src/series/ops/log.rs | 16 +- .../polars-plan/src/dsl/function_expr/pow.rs | 14 +- .../polars-plan/src/dsl/function_expr/sign.rs | 2 +- .../src/dsl/function_expr/strings.rs | 26 +- .../src/dsl/function_expr/trigonometry.rs | 34 +- crates/polars-time/src/base_utc_offset.rs | 2 +- crates/polars-time/src/dst_offset.rs | 2 +- crates/polars/src/docs/eager.rs | 6 +- py-polars/Cargo.lock | 2 +- 28 files changed, 305 insertions(+), 391 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 735b905a14b0..da68e35af692 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" git = "https://github.com/jorgecarleitao/arrow2" -rev = "7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +rev = "2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" # branch = "" # version = "0.17.4" default-features = false diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 6a51424b4e21..101eab32120e 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -189,7 +189,7 @@ impl Add for &BooleanChunked { if rhs.len() == 1 { let rhs = rhs.get(0); return match rhs { - Some(rhs) => self.apply_cast_numeric(|v| v as IdxSize + rhs as IdxSize), + Some(rhs) => self.apply_values_generic(|v| v as IdxSize + rhs as IdxSize), None => IdxCa::full_null(self.name(), self.len()), }; } diff --git a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index ebfb835c715f..f0e5fa53ac12 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -18,14 +18,14 @@ where let opt_rhs = rhs.get(0); match opt_rhs { None => ChunkedArray::full_null(lhs.name(), lhs.len()), - Some(rhs) => lhs.apply(|lhs| operation(lhs, rhs)), + Some(rhs) => lhs.apply_values(|lhs| operation(lhs, rhs)), } }, (1, _) => { let opt_lhs = lhs.get(0); match opt_lhs { None => ChunkedArray::full_null(lhs.name(), rhs.len()), - Some(lhs) => rhs.apply(|rhs| operation(lhs, rhs)), + Some(lhs) => rhs.apply_values(|rhs| operation(lhs, rhs)), } }, _ => panic!("Cannot apply operation on arrays of different lengths"), @@ -253,7 +253,7 @@ where fn add(self, rhs: N) -> Self::Output { let adder: T::Native = NumCast::from(rhs).unwrap(); - let mut out = self.apply(|val| val + adder); + let mut out = self.apply_values(|val| val + adder); out.set_sorted_flag(self.is_sorted_flag()); out } @@ -268,7 +268,7 @@ where fn sub(self, rhs: N) -> Self::Output { let subber: T::Native = NumCast::from(rhs).unwrap(); - let mut out = self.apply(|val| val - subber); + let mut out = self.apply_values(|val| val - subber); out.set_sorted_flag(self.is_sorted_flag()); out } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index f809a474190e..6841d6da5ba1 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -74,7 +74,7 @@ impl CategoricalChunked { // we can skip the apply and only update the rev_map let local_ca = self .logical() - .apply_on_opt(|opt_v| opt_v.map(|v| *physical_map.get(&v).unwrap())); + .apply(|opt_v| opt_v.map(|v| 
*physical_map.get(&v).unwrap())); let mut out = unsafe { Self::from_cats_and_rev_map_unchecked(local_ca, local_rev_map.into()) }; diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 6bde255bb244..19f4600b7510 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -68,30 +68,8 @@ pub type ChunkIdIter<'a> = std::iter::Map, fn(&Ar /// /// ```rust /// # use polars_core::prelude::*; -/// fn apply_cosine(ca: &Float32Chunked) -> Float32Chunked { -/// ca.apply(|v| v.cos()) -/// } -/// ``` -/// -/// If we would like to cast the result we could use a Rust Iterator instead of an `apply` method. -/// Note that Iterators are slightly slower as the null values aren't ignored implicitly. -/// -/// ```rust -/// # use polars_core::prelude::*; -/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked { -/// ca.into_iter() -/// .map(|opt_v| { -/// opt_v.map(|v| v.cos() as f64) -/// }).collect() -/// } -/// ``` -/// -/// Another option is to first cast and then use an apply. -/// -/// ```rust -/// # use polars_core::prelude::*; /// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked { -/// ca.apply_cast_numeric(|v| v.cos() as f64) +/// ca.apply_values_generic(|v| v.cos() as f64) /// } /// ``` /// diff --git a/crates/polars-core/src/chunked_array/ops/abs.rs b/crates/polars-core/src/chunked_array/ops/abs.rs index 2e4da7d44c13..f0e035b6ecf0 100644 --- a/crates/polars-core/src/chunked_array/ops/abs.rs +++ b/crates/polars-core/src/chunked_array/ops/abs.rs @@ -9,6 +9,6 @@ where /// Convert all values to their absolute/positive value. #[must_use] pub fn abs(&self) -> Self { - self.apply(|v| v.abs()) + self.apply_values(|v| v.abs()) } } diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/var.rs b/crates/polars-core/src/chunked_array/ops/aggregate/var.rs index fa7bbe3102b3..d7f4a4828591 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/var.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/var.rs @@ -23,7 +23,7 @@ where let n_values = n_values as f64; let mean = self.mean()?; - let squared = self.apply_cast_numeric::<_, Float64Type>(|value| { + let squared: Float64Chunked = ChunkedArray::apply_values_generic(self, |value| { let tmp = value.to_f64().unwrap() - mean; tmp * tmp }); @@ -50,7 +50,7 @@ impl ChunkVar for Float32Chunked { let n_values = n_values as f32; let mean = self.mean()? as f32; - let squared = self.apply(|value| { + let squared = self.apply_values(|value| { let tmp = value - mean; tmp * tmp }); @@ -74,7 +74,7 @@ impl ChunkVar for Float64Chunked { let n_values = n_values as f64; let mean = self.mean()?; - let squared = self.apply(|value| { + let squared = self.apply_values(|value| { let tmp = value - mean; tmp * tmp }); diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 48cfdee12d1d..d2fad1295b85 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -1,6 +1,7 @@ //! Implementations of the ChunkApply Trait. 
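// A minimal usage sketch of the generic value-apply path on ChunkedArray introduced by this
// commit, mirroring the doc example in chunked_array/mod.rs above; it assumes a
// `Float32Chunked` named `ca` is in scope. The closure maps only the non-null values, the
// output dtype (here Float64) is inferred from the closure's return type, and the existing
// validity bitmap is carried over so nulls stay null:
//
//     fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked {
//         ca.apply_values_generic(|v| v.cos() as f64)
//     }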
use std::borrow::Cow; use std::convert::TryFrom; +use std::error::Error; use arrow::array::{BooleanArray, PrimitiveArray}; use arrow::bitmap::utils::{get_bit_unchecked, set_bit_unchecked}; @@ -11,14 +12,14 @@ use polars_arrow::bitmap::unary_mut; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::{CustomIterTools, NoNull}; +use crate::utils::CustomIterTools; impl ChunkedArray where T: PolarsDataType, Self: HasUnderlyingArray, { - pub fn apply_values<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + pub fn apply_values_generic<'a, U, K, F>(&'a self, op: F) -> ChunkedArray where U: PolarsDataType, F: FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> K + Copy, @@ -33,7 +34,46 @@ where ChunkedArray::from_chunk_iter(self.name(), iter) } - pub fn apply2<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + + pub fn try_apply_values_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> + where + U: PolarsDataType, + F: FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> Result + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.values_iter().map(op); + let array = K::try_array_from_values_iter(element_iter)?; + Ok(array.with_validity_typed(arr.validity().cloned())) + }); + + ChunkedArray::try_from_chunk_iter(self.name(), iter) + } + + pub fn try_apply_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> + where + U: PolarsDataType, + F: FnMut( + Option<<::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Result, E> + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.iter().map(op); + let array = K::try_array_from_iter(element_iter)?; + Ok(array.with_validity_typed(arr.validity().cloned())) + }); + + ChunkedArray::try_from_chunk_iter(self.name(), iter) + } + + pub fn apply_generic<'a, U, K, F>(&'a self, op: F) -> ChunkedArray where U: PolarsDataType, F: FnMut( @@ -85,24 +125,6 @@ macro_rules! apply { }}; } -macro_rules! 
apply_enumerate { - ($self:expr, $f:expr) => {{ - if !$self.has_validity() { - $self - .into_no_null_iter() - .enumerate() - .map($f) - .collect_trusted() - } else { - $self - .into_iter() - .enumerate() - .map(|(idx, opt_v)| opt_v.map(|v| $f((idx, v)))) - .collect_trusted() - } - }}; -} - fn apply_in_place_impl(name: &str, chunks: Vec, f: F) -> ChunkedArray where F: Fn(S::Native) -> S::Native + Copy, @@ -190,7 +212,7 @@ where { type FuncRet = T::Native; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(T::Native) -> T::Native + Copy, { @@ -219,7 +241,7 @@ where Ok(ca) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { @@ -230,44 +252,6 @@ where Self::from_chunk_iter(self.name(), chunks) } - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, T::Native)) -> T::Native + Copy, - { - if !self.has_validity() { - let ca: NoNull<_> = self - .into_no_null_iter() - .enumerate() - .map(f) - .collect_trusted(); - ca.into_inner() - } else { - // we know that we only iterate over length == self.len() - unsafe { - self.downcast_iter() - .flatten() - .trust_my_length(self.len()) - .enumerate() - .map(|(idx, opt_v)| opt_v.map(|v| f((idx, *v)))) - .collect_trusted() - } - } - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - // we know that we only iterate over length == self.len() - unsafe { - self.downcast_iter() - .flatten() - .trust_my_length(self.len()) - .enumerate() - .map(|(idx, v)| f((idx, v.copied()))) - .collect_trusted() - } - } fn apply_to_slice(&'a self, f: F, slice: &mut [V]) where F: Fn(Option, &V) -> V, @@ -290,7 +274,7 @@ where impl<'a> ChunkApply<'a, bool> for BooleanChunked { type FuncRet = bool; - fn apply(&self, f: F) -> Self + fn apply_values(&self, f: F) -> Self where F: Fn(bool) -> bool + Copy, { @@ -351,25 +335,11 @@ impl<'a> ChunkApply<'a, bool> for BooleanChunked { Ok(ret) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { - self.into_iter().map(f).collect_trusted() - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, bool)) -> bool + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - self.into_iter().enumerate().map(f).collect_trusted() + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -426,50 +396,25 @@ impl BinaryChunked { impl<'a> ChunkApply<'a, &'a str> for Utf8Chunked { type FuncRet = Cow<'a, str>; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a str) -> Cow<'a, str> + Copy, { - use polars_arrow::array::utf8::Utf8FromIter; - let chunks = self.downcast_iter().map(|arr| { - let iter = arr.values_iter().map(f); - let size_hint = (arr.get_values_size() as f64 * 1.3) as usize; - let new = Utf8Array::::from_values_iter(iter, arr.len(), size_hint); - new.with_validity(arr.validity().cloned()) - }); - Utf8Chunked::from_chunk_iter(self.name(), chunks) + ChunkedArray::apply_values_generic(self, f) } fn try_apply(&'a self, f: F) -> PolarsResult where F: Fn(&'a str) -> PolarsResult> + Copy, { - try_apply!(self, f) + self.try_apply_values_generic(f) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a str>) -> Option> + Copy, { - let mut ca: Self = self.into_iter().map(f).collect_trusted(); - 
ca.rename(self.name()); - ca - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, &'a str)) -> Cow<'a, str> + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option<&'a str>)) -> Option> + Copy, - { - let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); - ca.rename(self.name()); - ca + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -494,43 +439,25 @@ impl<'a> ChunkApply<'a, &'a str> for Utf8Chunked { impl<'a> ChunkApply<'a, &'a [u8]> for BinaryChunked { type FuncRet = Cow<'a, [u8]>; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a [u8]) -> Cow<'a, [u8]> + Copy, { - apply!(self, f) + self.apply_values_generic(f) } fn try_apply(&'a self, f: F) -> PolarsResult where F: Fn(&'a [u8]) -> PolarsResult> + Copy, { - try_apply!(self, f) + self.try_apply_values_generic(f) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a [u8]>) -> Option> + Copy, { - let mut ca: Self = self.into_iter().map(f).collect_trusted(); - ca.rename(self.name()); - ca - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, &'a [u8])) -> Cow<'a, [u8]> + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option<&'a [u8]>)) -> Option> + Copy, - { - let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); - ca.rename(self.name()); - ca + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -618,7 +545,7 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { type FuncRet = Series; /// Apply a closure `F` elementwise. - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(Series) -> Series + Copy, { @@ -666,7 +593,7 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { Ok(ca) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { @@ -676,54 +603,6 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { self.into_iter().map(f).collect_trusted() } - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, Series)) -> Series + Copy, - { - if self.is_empty() { - return self.clone(); - } - let mut fast_explode = true; - let mut function = |(idx, s)| { - let out = f((idx, s)); - if out.is_empty() { - fast_explode = false; - } - out - }; - let mut ca: ListChunked = apply_enumerate!(self, function); - if fast_explode { - ca.set_fast_explode() - } - ca - } - - /// Apply a closure elementwise. The closure gets the index of the element as first argument. 
- fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - if self.is_empty() { - return self.clone(); - } - let mut fast_explode = true; - let function = |(idx, s)| { - let out = f((idx, s)); - if let Some(out) = &out { - if out.is_empty() { - fast_explode = false; - } - } - out - }; - let mut ca: ListChunked = self.into_iter().enumerate().map(function).collect_trusted(); - if fast_explode { - ca.set_fast_explode() - } - ca - } - fn apply_to_slice(&'a self, f: F, slice: &mut [T]) where F: Fn(Option, &T) -> T, @@ -752,7 +631,7 @@ where { type FuncRet = T; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a T) -> T + Copy, { @@ -768,7 +647,7 @@ where todo!() } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a T>) -> Option + Copy, { @@ -777,20 +656,6 @@ where ca } - fn apply_with_idx(&'a self, _f: F) -> Self - where - F: Fn((usize, &'a T)) -> T + Copy, - { - todo!() - } - - fn apply_with_idx_on_opt(&'a self, _f: F) -> Self - where - F: Fn((usize, Option<&'a T>)) -> Option + Copy, - { - todo!() - } - fn apply_to_slice(&'a self, f: F, slice: &mut [V]) where F: Fn(Option<&'a T>, &V) -> V, @@ -808,41 +673,3 @@ where }); } } - -impl<'a, T: PolarsDataType> ChunkApplyCast<'a> for ChunkedArray -where - ChunkedArray: HasUnderlyingArray, -{ - fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray - where - F: Fn(<::ArrayT as StaticArray>::ValueT<'a>) -> R::Native - + Copy, - R: PolarsNumericType, - { - let chunks = self.downcast_iter().map(|array| { - let values = array.values_iter().map(f); - collect_array(values, array.validity().cloned()) - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) - } - - fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray - where - F: Fn( - Option<<::ArrayT as StaticArray>::ValueT<'a>>, - ) -> R::Native - + Copy, - R: PolarsNumericType, - { - let chunks = self.downcast_iter().map(|array| { - if array.null_count() == 0 { - let values = array.values_iter().map(|v| f(Some(v))); - collect_array(values, None) - } else { - let values = array.iter().map(f); - collect_array(values, None) - } - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) - } -} diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs index a53b5de54d44..cf4becb1cb21 100644 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ b/crates/polars-core/src/chunked_array/ops/is_in.rs @@ -19,7 +19,7 @@ where } }) }); - Ok(ca.apply_values(|val| set.contains(&val))) + Ok(ca.apply_values_generic(|val| set.contains(&val))) } impl IsIn for ChunkedArray @@ -256,7 +256,7 @@ impl IsIn for BooleanChunked { } else { !(other.sum().unwrap() as usize + nc) == other.len() }; - Ok(self.apply(|v| if v { has_true } else { has_false })) + Ok(self.apply_values(|v| if v { has_true } else { has_false })) } _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), } diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 3420a041813a..6e0cc146a6b8 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -296,27 +296,6 @@ pub trait ChunkCast { unsafe fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult; } -pub trait ChunkApplyCast<'a>: HasUnderlyingArray { - /// Apply a closure elementwise and cast to a Numeric [`ChunkedArray`]. 
This is fastest when the null check branching is more expensive - /// than the closure application. - /// - /// Null values remain null. - fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray - where - F: Fn(<::ArrayT as StaticArray>::ValueT<'a>) -> R::Native - + Copy, - R: PolarsNumericType; - - /// Apply a closure on optional values and cast to Numeric ChunkedArray without null values. - fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray - where - F: Fn( - Option<<::ArrayT as StaticArray>::ValueT<'a>>, - ) -> R::Native - + Copy, - R: PolarsNumericType; -} - /// Fastest way to do elementwise operations on a [`ChunkedArray`] when the operation is cheaper than /// branching due to null checking. pub trait ChunkApply<'a, T> { @@ -332,11 +311,11 @@ pub trait ChunkApply<'a, T> { /// ``` /// use polars_core::prelude::*; /// fn double(ca: &UInt32Chunked) -> UInt32Chunked { - /// ca.apply(|v| v * 2) + /// ca.apply_values(|v| v * 2) /// } /// ``` #[must_use] - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(T) -> Self::FuncRet + Copy; @@ -347,22 +326,10 @@ pub trait ChunkApply<'a, T> { /// Apply a closure elementwise including null values. #[must_use] - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy; - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - #[must_use] - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, T)) -> Self::FuncRet + Copy; - - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - #[must_use] - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy; - /// Apply a closure elementwise and write results to a mutable slice. 
fn apply_to_slice(&'a self, f: F, slice: &mut [S]) // (value of chunkedarray, value of slice) -> value of slice diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs index 7674adbec0e1..236495a27305 100644 --- a/crates/polars-core/src/datatypes/from_values.rs +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -1,5 +1,13 @@ -use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; -use polars_arrow::array::utf8::Utf8FromIter; +use std::borrow::Cow; +use std::error::Error; + +use arrow::array::{ + BinaryArray, BooleanArray, MutableBinaryArray, MutableBinaryValuesArray, MutablePrimitiveArray, + MutableUtf8Array, MutableUtf8ValuesArray, PrimitiveArray, Utf8Array, +}; +use arrow::bitmap::Bitmap; +use polars_arrow::array::utf8::{BinaryFromIter, Utf8FromIter}; +use polars_arrow::prelude::FromData; use polars_arrow::trusted_len::TrustedLen; use crate::prelude::StaticArray; @@ -13,6 +21,14 @@ where fn array_from_iter>>(iter: I) -> Self::ArrayType; fn array_from_values_iter>(iter: I) -> Self::ArrayType; + + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result; + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result; } impl ArrayFromElementIter for bool { @@ -27,6 +43,20 @@ impl ArrayFromElementIter for bool { // SAFETY: guarded by `TrustedLen` trait unsafe { BooleanArray::from_trusted_len_values_iter_unchecked(iter) } } + + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::try_from_trusted_len_iter_unchecked(iter) } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + let values = unsafe { Bitmap::try_from_trusted_len_iter_unchecked(iter) }?; + Ok(BooleanArray::from_data_default(values, None)) + } } macro_rules! impl_primitive { @@ -43,6 +73,20 @@ macro_rules! 
impl_primitive { // SAFETY: guarded by `TrustedLen` trait unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { + Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) + } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let values: Vec<_> = iter.collect::, _>>()?; + Ok(PrimitiveArray::from_vec(values)) + } } }; } @@ -55,6 +99,8 @@ impl_primitive!(i8); impl_primitive!(i16); impl_primitive!(i32); impl_primitive!(i64); +impl_primitive!(f32); +impl_primitive!(f64); impl ArrayFromElementIter for &str { type ArrayType = Utf8Array; @@ -64,9 +110,89 @@ impl ArrayFromElementIter for &str { unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } } + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + let len = iter.size_hint().0; + Utf8Array::from_values_iter(iter, len, len * 24) + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } +} + +impl ArrayFromElementIter for Cow<'_, str> { + type ArrayType = Utf8Array; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } + } + fn array_from_values_iter>(iter: I) -> Self::ArrayType { // SAFETY: guarded by `TrustedLen` trait let len = iter.size_hint().0; Utf8Array::from_values_iter(iter, len, len * 24) } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } +} + +impl ArrayFromElementIter for Cow<'_, [u8]> { + type ArrayType = BinaryArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BinaryArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + let len = iter.size_hint().0; + BinaryArray::from_values_iter(iter, len, len * 24) + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableBinaryArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } } diff --git a/crates/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs index 6c15c1bb910a..515913b43aad 100644 --- a/crates/polars-core/src/functions.rs +++ b/crates/polars-core/src/functions.rs @@ -49,8 +49,8 @@ where } else { let a_mean = a.mean()?; let b_mean = b.mean()?; - let a = a.apply_cast_numeric::<_, 
Float64Type>(|a| a.to_f64().unwrap() - a_mean); - let b = b.apply_cast_numeric(|b| b.to_f64().unwrap() - b_mean); + let a: Float64Chunked = a.apply_values_generic(|a| a.to_f64().unwrap() - a_mean); + let b: Float64Chunked = b.apply_values_generic(|b| b.to_f64().unwrap() - b_mean); let tmp = a * b; let n = tmp.len() - tmp.null_count(); diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index bc71816331f7..d34227aa4198 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -234,50 +234,50 @@ pub mod checked { UInt8 => s .u8() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u8().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u8().unwrap()))) .into_series(), #[cfg(feature = "dtype-i8")] Int8 => s .i8() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i8().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i8().unwrap()))) .into_series(), #[cfg(feature = "dtype-i16")] Int16 => s .i16() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i16().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i16().unwrap()))) .into_series(), #[cfg(feature = "dtype-u16")] UInt16 => s .u16() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u16().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u16().unwrap()))) .into_series(), UInt32 => s .u32() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u32().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u32().unwrap()))) .into_series(), Int32 => s .i32() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i32().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i32().unwrap()))) .into_series(), UInt64 => s .u64() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u64().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u64().unwrap()))) .into_series(), Int64 => s .i64() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i64().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i64().unwrap()))) .into_series(), Float32 => s .f32() .unwrap() - .apply_on_opt(|opt_v| { + .apply(|opt_v| { opt_v.and_then(|v| { let res = rhs.to_f32().unwrap(); if res.is_zero() { @@ -291,7 +291,7 @@ pub mod checked { Float64 => s .f64() .unwrap() - .apply_on_opt(|opt_v| { + .apply(|opt_v| { opt_v.and_then(|v| { let res = rhs.to_f64().unwrap(); if res.is_zero() { @@ -698,21 +698,21 @@ where #[must_use] pub fn lhs_sub(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs - v) + self.apply_values(|v| lhs - v) } /// Apply lhs / self #[must_use] pub fn lhs_div(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs / v) + self.apply_values(|v| lhs / v) } /// Apply lhs % self #[must_use] pub fn lhs_rem(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs % v) + self.apply_values(|v| lhs % v) } } diff --git a/crates/polars-core/src/series/ops/round.rs b/crates/polars-core/src/series/ops/round.rs index c58006ba3dbe..edcd3f31cbca 100644 --- a/crates/polars-core/src/series/ops/round.rs +++ b/crates/polars-core/src/series/ops/round.rs @@ -8,26 +8,26 @@ impl 
Series { pub fn round(&self, decimals: u32) -> PolarsResult { if let Ok(ca) = self.f32() { if decimals == 0 { - let s = ca.apply(|val| val.round()).into_series(); + let s = ca.apply_values(|val| val.round()).into_series(); return Ok(s); } else { // Note we do the computation on f64 floats to not lose precision // when the computation is done, we cast to f32 let multiplier = 10.0.pow(decimals as f64); let s = ca - .apply(|val| ((val as f64 * multiplier).round() / multiplier) as f32) + .apply_values(|val| ((val as f64 * multiplier).round() / multiplier) as f32) .into_series(); return Ok(s); } } if let Ok(ca) = self.f64() { if decimals == 0 { - let s = ca.apply(|val| val.round()).into_series(); + let s = ca.apply_values(|val| val.round()).into_series(); return Ok(s); } else { let multiplier = 10.0.pow(decimals as f64); let s = ca - .apply(|val| (val * multiplier).round() / multiplier) + .apply_values(|val| (val * multiplier).round() / multiplier) .into_series(); return Ok(s); } @@ -38,11 +38,11 @@ impl Series { /// Floor underlying floating point array to the lowest integers smaller or equal to the float value. pub fn floor(&self) -> PolarsResult { if let Ok(ca) = self.f32() { - let s = ca.apply(|val| val.floor()).into_series(); + let s = ca.apply_values(|val| val.floor()).into_series(); return Ok(s); } if let Ok(ca) = self.f64() { - let s = ca.apply(|val| val.floor()).into_series(); + let s = ca.apply_values(|val| val.floor()).into_series(); return Ok(s); } polars_bail!(opq = floor, self.dtype()); @@ -51,11 +51,11 @@ impl Series { /// Ceil underlying floating point array to the highest integers smaller or equal to the float value. pub fn ceil(&self) -> PolarsResult { if let Ok(ca) = self.f32() { - let s = ca.apply(|val| val.ceil()).into_series(); + let s = ca.apply_values(|val| val.ceil()).into_series(); return Ok(s); } if let Ok(ca) = self.f64() { - let s = ca.apply(|val| val.ceil()).into_series(); + let s = ca.apply_values(|val| val.ceil()).into_series(); return Ok(s); } polars_bail!(opq = ceil, self.dtype()); diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs index 2a674925cb44..59c444b4ac1d 100644 --- a/crates/polars-ops/src/chunked_array/binary/namespace.rs +++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs @@ -60,7 +60,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { Ok(bytes.into()) }) } else { - Ok(ca.apply_on_opt(|opt_s| opt_s.and_then(|s| hex::decode(s).ok().map(Cow::Owned)))) + Ok(ca.apply(|opt_s| opt_s.and_then(|s| hex::decode(s).ok().map(Cow::Owned)))) } } @@ -68,7 +68,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { fn hex_encode(&self) -> Series { let ca = self.as_binary(); unsafe { - ca.apply(|s| hex::encode(s).into_bytes().into()) + ca.apply_values(|s| hex::encode(s).into_bytes().into()) .cast_unchecked(&DataType::Utf8) .unwrap() } @@ -88,7 +88,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { Ok(bytes.into()) }) } else { - Ok(ca.apply_on_opt(|opt_s| { + Ok(ca.apply(|opt_s| { opt_s.and_then(|s| general_purpose::STANDARD.decode(s).ok().map(Cow::Owned)) })) } @@ -98,7 +98,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { fn base64_encode(&self) -> Series { let ca = self.as_binary(); unsafe { - ca.apply(|s| general_purpose::STANDARD.encode(s).into_bytes().into()) + ca.apply_values(|s| general_purpose::STANDARD.encode(s).into_bytes().into()) .cast_unchecked(&DataType::Utf8) .unwrap() } diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs 
b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 0f10895c689b..39ebd91e4f46 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -45,7 +45,7 @@ pub trait Utf8JsonPathImpl: AsUtf8 { .map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression {}", e))?; Ok(self .as_utf8() - .apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))) + .apply(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))) } /// Returns the inferred DataType for JSON values for each row @@ -93,7 +93,7 @@ pub trait Utf8JsonPathImpl: AsUtf8 { .map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression: {}", e))?; Ok(self .as_utf8() - .apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))) + .apply(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))) } fn json_path_extract( diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 50374fd0a927..ab9b4919abed 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -29,7 +29,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { #[cfg(feature = "string_encoding")] fn hex_encode(&self) -> Utf8Chunked { let ca = self.as_utf8(); - ca.apply(|s| hex::encode(s).into()) + ca.apply_values(|s| hex::encode(s).into()) } #[cfg(not(feature = "binary_encoding"))] @@ -47,7 +47,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { #[cfg(feature = "string_encoding")] fn base64_encode(&self) -> Utf8Chunked { let ca = self.as_utf8(); - ca.apply(|s| general_purpose::STANDARD.encode(s).into()) + ca.apply_values(|s| general_purpose::STANDARD.encode(s).into()) } #[cfg(feature = "string_from_radix")] @@ -178,7 +178,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { let reg = Regex::new(pat)?; let f = |s: &'a str| reg.replace(s, val); let ca = self.as_utf8(); - Ok(ca.apply(f)) + Ok(ca.apply_values(f)) } /// Replace the leftmost literal (sub)string with another string @@ -235,7 +235,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { fn replace_all(&self, pat: &str, val: &str) -> PolarsResult { let ca = self.as_utf8(); let reg = Regex::new(pat)?; - Ok(ca.apply(|s| reg.replace_all(s, val))) + Ok(ca.apply_values(|s| reg.replace_all(s, val))) } /// Replace all matching literal (sub)strings with another string diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index c76474540547..9867dede28d9 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -241,7 +241,7 @@ fn pivot_impl( let headers = column_agg.unique_stable()?.cast(&DataType::Utf8)?; let mut headers = headers.utf8().unwrap().clone(); if values.len() > 1 { - headers = headers.apply(|v| Cow::from(format!("{value_col_name}{sep}{column_column_name}{sep}{v}"))) + headers = headers.apply_values(|v| Cow::from(format!("{value_col_name}{sep}{column_column_name}{sep}{v}"))) } let n_cols = headers.len(); diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 1bf7800070f8..933519cdd2a4 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -57,9 +57,9 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) let name = a.name(); return if let Some(a) = a.get(0) { let mut out = if b.null_count() == 0 { - b.apply(|b| floor_div_element(a, b)) + b.apply_values(|b| floor_div_element(a, b)) } 
else { - b.apply_on_opt(|b| b.map(|b| floor_div_element(a, b))) + b.apply(|b| b.map(|b| floor_div_element(a, b))) }; out.rename(name); out @@ -70,9 +70,9 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) if b.len() == 1 { return if let Some(b) = b.get(0) { if a.null_count() == 0 { - a.apply(|a| floor_div_element(a, b)) + a.apply_values(|a| floor_div_element(a, b)) } else { - a.apply_on_opt(|a| a.map(|a| floor_div_element(a, b))) + a.apply(|a| a.map(|a| floor_div_element(a, b))) } } else { ChunkedArray::full_null(a.name(), a.len()) diff --git a/crates/polars-ops/src/series/ops/log.rs b/crates/polars-ops/src/series/ops/log.rs index 174ca7bc33cc..d73fad24d27e 100644 --- a/crates/polars-ops/src/series/ops/log.rs +++ b/crates/polars-ops/src/series/ops/log.rs @@ -26,8 +26,12 @@ pub trait LogSeries: SeriesSealed { Int64 => log(s.i64().unwrap(), base).into_series(), UInt32 => log(s.u32().unwrap(), base).into_series(), UInt64 => log(s.u64().unwrap(), base).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.log(base as f32)).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.log(base)).into_series(), + Float32 => s + .f32() + .unwrap() + .apply_values(|v| v.log(base as f32)) + .into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.log(base)).into_series(), _ => s.cast(&DataType::Float64).unwrap().log(base), } } @@ -43,8 +47,8 @@ pub trait LogSeries: SeriesSealed { Int64 => log1p(s.i64().unwrap()).into_series(), UInt32 => log1p(s.u32().unwrap()).into_series(), UInt64 => log1p(s.u64().unwrap()).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.ln_1p()).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.ln_1p()).into_series(), + Float32 => s.f32().unwrap().apply_values(|v| v.ln_1p()).into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.ln_1p()).into_series(), _ => s.cast(&DataType::Float64).unwrap().log1p(), } } @@ -60,8 +64,8 @@ pub trait LogSeries: SeriesSealed { Int64 => exp(s.i64().unwrap()).into_series(), UInt32 => exp(s.u32().unwrap()).into_series(), UInt64 => exp(s.u64().unwrap()).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.exp()).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.exp()).into_series(), + Float32 => s.f32().unwrap().apply_values(|v| v.exp()).into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.exp()).into_series(), _ => s.cast(&DataType::Float64).unwrap().exp(), } } diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index 530041707365..cfb1fe14dff3 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -42,7 +42,7 @@ where a if a == 1.0 => base.clone().into_series(), // specialized sqrt will ensure (-inf)^0.5 = NaN // and will likely be faster as well. 
- a if a == 0.5 => base.apply(|v| v.sqrt()).into_series(), + a if a == 0.5 => base.apply_values(|v| v.sqrt()).into_series(), a if a.fract() == 0.0 && a < 10.0 && a > 1.0 => { let mut out = base.clone(); @@ -51,7 +51,9 @@ where } out.into_series() }, - _ => base.apply(|v| Pow::pow(v, exponent_value)).into_series(), + _ => base + .apply_values(|v| Pow::pow(v, exponent_value)) + .into_series(), }; Ok(Some(s)) } else if (base.len() == 1) && (exponent.len() != 1) { @@ -60,7 +62,9 @@ where .ok_or_else(|| polars_err!(ComputeError: "base is null"))?; Ok(Some( - exponent.apply(|exp| Pow::pow(base, exp)).into_series(), + exponent + .apply_values(|exp| Pow::pow(base, exp)) + .into_series(), )) } else { Ok(Some( @@ -129,7 +133,7 @@ where T::Native: num::pow::Pow + ToPrimitive + Float, ChunkedArray: IntoSeries, { - Ok(base.apply(|v| v.sqrt()).into_series()) + Ok(base.apply_values(|v| v.sqrt()).into_series()) } pub(super) fn cbrt(base: &Series) -> PolarsResult { @@ -156,5 +160,5 @@ where T::Native: num::pow::Pow + ToPrimitive + Float, ChunkedArray: IntoSeries, { - Ok(base.apply(|v| v.cbrt()).into_series()) + Ok(base.apply_values(|v| v.cbrt()).into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/sign.rs b/crates/polars-plan/src/dsl/function_expr/sign.rs index 6951f3a9cf45..41707664e3ac 100644 --- a/crates/polars-plan/src/dsl/function_expr/sign.rs +++ b/crates/polars-plan/src/dsl/function_expr/sign.rs @@ -27,7 +27,7 @@ where T::Native: num::Float, ChunkedArray: IntoSeries, { - ca.apply(signum_improved).into_series().cast(&Int64) + ca.apply_values(signum_improved).into_series().cast(&Int64) } // Wrapper for the signum function that handles +/-0.0 inputs differently diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 32abb7cf8596..287356050f66 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -331,15 +331,15 @@ pub(super) fn strip(s: &Series, matches: Option<&str>) -> PolarsResult { if matches.chars().count() == 1 { // Fast path for when a single character is passed Ok(ca - .apply(|s| Cow::Borrowed(s.trim_matches(matches.chars().next().unwrap()))) + .apply_values(|s| Cow::Borrowed(s.trim_matches(matches.chars().next().unwrap()))) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim())).into_series()) + Ok(ca.apply_values(|s| Cow::Borrowed(s.trim())).into_series()) } } @@ -350,15 +350,19 @@ pub(super) fn lstrip(s: &Series, matches: Option<&str>) -> PolarsResult if matches.chars().count() == 1 { // Fast path for when a single character is passed Ok(ca - .apply(|s| Cow::Borrowed(s.trim_start_matches(matches.chars().next().unwrap()))) + .apply_values(|s| { + Cow::Borrowed(s.trim_start_matches(matches.chars().next().unwrap())) + }) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_start_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_start_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim_start())).into_series()) + Ok(ca + .apply_values(|s| Cow::Borrowed(s.trim_start())) + .into_series()) } } @@ -368,15 +372,19 @@ pub(super) fn rstrip(s: &Series, matches: Option<&str>) -> PolarsResult if matches.chars().count() == 1 { // Fast path for when a single character is passed 
Ok(ca - .apply(|s| Cow::Borrowed(s.trim_end_matches(matches.chars().next().unwrap()))) + .apply_values(|s| { + Cow::Borrowed(s.trim_end_matches(matches.chars().next().unwrap())) + }) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_end_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_end_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim_end())).into_series()) + Ok(ca + .apply_values(|s| Cow::Borrowed(s.trim_end())) + .into_series()) } } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 99cd90cee546..32a57a623e59 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -120,13 +120,13 @@ where .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 x value is null"))?; - Ok(Some(y.apply(|v| v.atan2(x_value)).into_series())) + Ok(Some(y.apply_values(|v| v.atan2(x_value)).into_series())) } else if y.len() == 1 { let y_value = y .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 y value is null"))?; - Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) + Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_series())) } else { Ok(Some( polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), @@ -168,7 +168,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cos()).into_series()) + Ok(ca.apply_values(|v| v.cos()).into_series()) } fn cot(ca: &ChunkedArray) -> PolarsResult @@ -177,7 +177,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cos() / v.sin()).into_series()) + Ok(ca.apply_values(|v| v.cos() / v.sin()).into_series()) } fn sin(ca: &ChunkedArray) -> PolarsResult @@ -186,7 +186,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.sin()).into_series()) + Ok(ca.apply_values(|v| v.sin()).into_series()) } fn tan(ca: &ChunkedArray) -> PolarsResult @@ -195,7 +195,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.tan()).into_series()) + Ok(ca.apply_values(|v| v.tan()).into_series()) } fn arccos(ca: &ChunkedArray) -> PolarsResult @@ -204,7 +204,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.acos()).into_series()) + Ok(ca.apply_values(|v| v.acos()).into_series()) } fn arcsin(ca: &ChunkedArray) -> PolarsResult @@ -213,7 +213,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.asin()).into_series()) + Ok(ca.apply_values(|v| v.asin()).into_series()) } fn arctan(ca: &ChunkedArray) -> PolarsResult @@ -222,7 +222,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.atan()).into_series()) + Ok(ca.apply_values(|v| v.atan()).into_series()) } fn cosh(ca: &ChunkedArray) -> PolarsResult @@ -231,7 +231,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cosh()).into_series()) + Ok(ca.apply_values(|v| v.cosh()).into_series()) } fn sinh(ca: &ChunkedArray) -> PolarsResult @@ -240,7 +240,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.sinh()).into_series()) + Ok(ca.apply_values(|v| v.sinh()).into_series()) } fn tanh(ca: &ChunkedArray) -> PolarsResult @@ -249,7 +249,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.tanh()).into_series()) + Ok(ca.apply_values(|v| v.tanh()).into_series()) } fn arccosh(ca: &ChunkedArray) -> PolarsResult @@ -258,7 +258,7 @@ where T::Native: Float, 
ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.acosh()).into_series()) + Ok(ca.apply_values(|v| v.acosh()).into_series()) } fn arcsinh(ca: &ChunkedArray) -> PolarsResult @@ -267,7 +267,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.asinh()).into_series()) + Ok(ca.apply_values(|v| v.asinh()).into_series()) } fn arctanh(ca: &ChunkedArray) -> PolarsResult @@ -276,7 +276,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.atanh()).into_series()) + Ok(ca.apply_values(|v| v.atanh()).into_series()) } fn degrees(ca: &ChunkedArray) -> PolarsResult @@ -285,7 +285,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.to_degrees()).into_series()) + Ok(ca.apply_values(|v| v.to_degrees()).into_series()) } fn radians(ca: &ChunkedArray) -> PolarsResult @@ -294,5 +294,5 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.to_radians()).into_series()) + Ok(ca.apply_values(|v| v.to_radians()).into_series()) } diff --git a/crates/polars-time/src/base_utc_offset.rs b/crates/polars-time/src/base_utc_offset.rs index a5c944885df0..128bfe5d23cb 100644 --- a/crates/polars-time/src/base_utc_offset.rs +++ b/crates/polars-time/src/base_utc_offset.rs @@ -21,7 +21,7 @@ pub fn base_utc_offset( TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - ca.0.apply(|t| { + ca.0.apply_values(|t| { let ndt = timestamp_to_datetime(t); let dt = time_zone.from_utc_datetime(&ndt); dt.offset().base_utc_offset().num_milliseconds() diff --git a/crates/polars-time/src/dst_offset.rs b/crates/polars-time/src/dst_offset.rs index 58e4f34259ec..74c91c19ecec 100644 --- a/crates/polars-time/src/dst_offset.rs +++ b/crates/polars-time/src/dst_offset.rs @@ -18,7 +18,7 @@ pub fn dst_offset(ca: &DatetimeChunked, time_unit: &TimeUnit, time_zone: &Tz) -> TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - ca.0.apply(|t| { + ca.0.apply_values(|t| { let ndt = timestamp_to_datetime(t); let dt = time_zone.from_utc_datetime(&ndt); dt.offset().dst_offset().num_milliseconds() diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 01e1ccde3068..f8030818d94d 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -148,7 +148,7 @@ //! let ca = UInt32Chunked::new("foo", &[1, 2, 3]); //! //! // 1 / ca -//! let divide_one_by_ca = ca.apply(|rhs| 1 / rhs); +//! let divide_one_by_ca = ca.apply_values(|rhs| 1 / rhs); //! ``` //! //! ## Comparisons @@ -245,11 +245,11 @@ //! //! // apply a closure over all values //! let s = Series::new("foo", &[Some(1), Some(2), None]); -//! s.i32()?.apply(|value| value * 20); +//! s.i32()?.apply_values(|value| value * 20); //! //! // count string lengths //! let s = Series::new("foo", &["foo", "bar", "foobar"]); -//! s.utf8()?.apply_cast_numeric::<_, UInt64Type>(|str_val| str_val.len() as u64); +//! s.utf8()?.apply_values_generic(|str_val| str_val.len() as u64); //! //! # Ok(()) //! 
# } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 286f80fe8ae0..32fef5e7a898 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,7 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "git+https://github.com/jorgecarleitao/arrow2?rev=7edf5f9e359e0ed02e9d0c6b9318b06964d805f0#7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=2b3e2a9e83725a557d78b90cd39298c5bef0ca4a#2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" dependencies = [ "ahash", "arrow-format", From bccb11af03af615195b28c5e3bd77355965b5006 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 12:53:44 +0200 Subject: [PATCH 15/55] feat(rust): improve binary (arity) generics (#10622) --- .../src/chunked_array/arithmetic/mod.rs | 4 +- .../src/chunked_array/arithmetic/numeric.rs | 2 +- .../polars-core/src/chunked_array/bitwise.rs | 6 +- .../src/chunked_array/ops/apply.rs | 2 +- .../src/chunked_array/ops/arity.rs | 169 ++++++++++++++++-- .../src/chunked_array/ops/filter.rs | 10 +- .../src/chunked_array/ops/repeat_by.rs | 8 +- .../polars-core/src/datatypes/from_values.rs | 69 +++---- .../src/series/arithmetic/borrowed.rs | 24 +-- .../polars-ops/src/chunked_array/list/sets.rs | 2 +- .../polars-ops/src/series/ops/floor_divide.rs | 2 +- .../polars-plan/src/dsl/function_expr/pow.rs | 2 +- .../src/dsl/function_expr/trigonometry.rs | 2 +- 13 files changed, 214 insertions(+), 88 deletions(-) diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 101eab32120e..ecfad06720d9 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -148,7 +148,7 @@ impl Add for &BinaryChunked { }; } - arity::binary_mut(self, rhs, concat_binary) + arity::binary(self, rhs, concat_binary) } } @@ -197,7 +197,7 @@ impl Add for &BooleanChunked { if self.len() == 1 { return rhs.add(self); } - arity::binary_mut(self, rhs, add_boolean) + arity::binary(self, rhs, add_boolean) } } diff --git a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index f0e5fa53ac12..6efa9a3ffa13 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -12,7 +12,7 @@ where F: Fn(T::Native, T::Native) -> T::Native, { let mut ca = match (lhs.len(), rhs.len()) { - (a, b) if a == b => arity::binary_mut(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), + (a, b) if a == b => arity::binary(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), // broadcast right path (_, 1) => { let opt_rhs = rhs.get(0); diff --git a/crates/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs index ea9372ef3adc..908549a947cc 100644 --- a/crates/polars-core/src/chunked_array/bitwise.rs +++ b/crates/polars-core/src/chunked_array/bitwise.rs @@ -72,7 +72,7 @@ impl BitOr for &BooleanChunked { _ => {}, } - arity::binary_mut(self, rhs, compute::boolean_kleene::or) + arity::binary(self, rhs, compute::boolean_kleene::or) } } @@ -117,7 +117,7 @@ impl BitXor for &BooleanChunked { _ => {}, } - arity::binary_mut(self, rhs, |l_arr, r_arr| { + arity::binary(self, rhs, |l_arr, r_arr| { let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); let values = l_arr.values() ^ r_arr.values(); BooleanArray::from_data_default(values, validity) @@ -158,7 +158,7 @@ impl BitAnd for &BooleanChunked { 
_ => {}, } - arity::binary_mut(self, rhs, compute::boolean_kleene::and) + arity::binary(self, rhs, compute::boolean_kleene::and) } } diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index d2fad1295b85..18a34d5566da 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -92,7 +92,7 @@ where } } -pub(super) fn collect_array>( +fn collect_array>( iter: I, validity: Option, ) -> PrimitiveArray { diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index 287ab18adb89..4214c41deccc 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -1,15 +1,17 @@ -use arrow::array::{Array, PrimitiveArray}; +use std::error::Error; + +use arrow::array::Array; use polars_arrow::utils::combine_validities_and; -use crate::chunked_array::ops::apply::collect_array; use crate::datatypes::{ - HasUnderlyingArray, PolarsNumericType, StaticArray, StaticallyMatchesPolarsType, + ArrayFromElementIter, HasUnderlyingArray, PolarsNumericType, StaticArray, + StaticallyMatchesPolarsType, }; use crate::prelude::{ChunkedArray, PolarsDataType}; use crate::utils::align_chunks_binary; #[inline] -pub fn binary_elementwise( +pub fn binary_elementwise( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -17,30 +19,66 @@ pub fn binary_elementwise( where T: PolarsDataType, U: PolarsDataType, - V: PolarsNumericType, + V: PolarsDataType, ChunkedArray: HasUnderlyingArray, ChunkedArray: HasUnderlyingArray, F: for<'a> FnMut( Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, - ) -> Option, + ) -> Option, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs .downcast_iter() .zip(rhs.downcast_iter()) .map(|(lhs_arr, rhs_arr)| { - lhs_arr + let element_iter = lhs_arr .iter() .zip(rhs_arr.iter()) - .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)) - .collect::>() + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); + K::array_from_iter(element_iter) }); ChunkedArray::from_chunk_iter(lhs.name(), iter) } #[inline] -pub fn binary_elementwise_values( +pub fn try_binary_elementwise( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Result, E>, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let element_iter = lhs_arr + .iter() + .zip(rhs_arr.iter()) + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); + K::try_array_from_iter(element_iter) + }); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + +#[inline] +pub fn binary_elementwise_values( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -54,7 +92,9 @@ where F: for<'a> FnMut( < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, - ) -> 
V::Native, + ) -> K, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -63,15 +103,55 @@ where .map(|(lhs_arr, rhs_arr)| { let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); - let iter = lhs_arr + let element_iter = lhs_arr .values_iter() .zip(rhs_arr.values_iter()) .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); - collect_array(iter, validity) + + let array = K::array_from_values_iter(element_iter); + array.with_validity_typed(validity) }); ChunkedArray::from_chunk_iter(lhs.name(), iter) } +#[inline] +pub fn try_binary_elementwise_values( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + ) -> Result, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); + + let element_iter = lhs_arr + .values_iter() + .zip(rhs_arr.values_iter()) + .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); + + let array = K::try_array_from_values_iter(element_iter)?; + Ok(array.with_validity_typed(validity)) + }); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + /// Applies a kernel that produces `Array` types. #[inline] pub fn binary_mut_with_options( @@ -101,7 +181,7 @@ where } /// Applies a kernel that produces `Array` types. -pub fn binary_mut( +pub fn binary( lhs: &ChunkedArray, rhs: &ChunkedArray, op: F, @@ -121,12 +201,39 @@ where binary_mut_with_options(lhs, rhs, op, lhs.name()) } +/// Applies a kernel that produces `Array` types. +pub fn try_binary( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Result, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + /// Applies a kernel that produces `ArrayRef` of the same type. /// /// # Safety /// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. #[inline] -pub unsafe fn binary_mut_unchecked_same_type( +pub unsafe fn binary_unchecked_same_type( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -151,3 +258,35 @@ where .collect(); lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode) } + +/// Applies a kernel that produces `ArrayRef` of the same type. +/// +/// # Safety +/// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. 
+#[inline] +pub unsafe fn try_binary_unchecked_same_type( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + keep_sorted: bool, + keep_fast_explode: bool, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Result, E>, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let chunks = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)) + .collect::, E>>()?; + Ok(lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode)) +} diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 408902b3258b..7543cff66583 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -30,7 +30,7 @@ where } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -52,7 +52,7 @@ impl ChunkFilter for BooleanChunked { } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -81,7 +81,7 @@ impl ChunkFilter for BinaryChunked { } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -105,7 +105,7 @@ impl ChunkFilter for ListChunked { }; } Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -130,7 +130,7 @@ impl ChunkFilter for ArrayChunked { }; } Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), diff --git a/crates/polars-core/src/chunked_array/ops/repeat_by.rs b/crates/polars-core/src/chunked_array/ops/repeat_by.rs index 3932b644ad9f..419065689dff 100644 --- a/crates/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/crates/polars-core/src/chunked_array/ops/repeat_by.rs @@ -31,7 +31,7 @@ where )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v.copied()).take(*by as usize)) }); @@ -56,7 +56,7 @@ impl RepeatBy for BooleanChunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); @@ -80,7 +80,7 @@ impl RepeatBy for Utf8Chunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); @@ -104,7 +104,7 @@ impl RepeatBy for BinaryChunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs index 236495a27305..07341355caa9 100644 --- 
a/crates/polars-core/src/datatypes/from_values.rs +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -10,6 +10,7 @@ use polars_arrow::array::utf8::{BinaryFromIter, Utf8FromIter}; use polars_arrow::prelude::FromData; use polars_arrow::trusted_len::TrustedLen; +use crate::datatypes::NumericNative; use crate::prelude::StaticArray; pub trait ArrayFromElementIter @@ -59,48 +60,34 @@ impl ArrayFromElementIter for bool { } } -macro_rules! impl_primitive { - ($tp:ty) => { - impl ArrayFromElementIter for $tp { - type ArrayType = PrimitiveArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - // SAFETY: guarded by `TrustedLen` trait - unsafe { - Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) - } - } - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let values: Vec<_> = iter.collect::, _>>()?; - Ok(PrimitiveArray::from_vec(values)) - } - } - }; -} +impl ArrayFromElementIter for T +where + T: NumericNative, +{ + type ArrayType = PrimitiveArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } + } -impl_primitive!(u8); -impl_primitive!(u16); -impl_primitive!(u32); -impl_primitive!(u64); -impl_primitive!(i8); -impl_primitive!(i16); -impl_primitive!(i32); -impl_primitive!(i64); -impl_primitive!(f32); -impl_primitive!(f64); + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let values: Vec<_> = iter.collect::, _>>()?; + Ok(PrimitiveArray::from_vec(values)) + } +} impl ArrayFromElementIter for &str { type ArrayType = Utf8Array; diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index d34227aa4198..bb04ddc9c976 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -177,10 +177,10 @@ pub mod checked { // see check_div for chunkedarray let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - Ok( - arity::binary_elementwise::<_, _, Float32Type, _>(lhs, rhs, |opt_l, opt_r| match ( - opt_l, opt_r, - ) { + Ok(arity::binary_elementwise::<_, _, Float32Type, _, _>( + lhs, + rhs, + |opt_l, opt_r| match (opt_l, opt_r) { (Some(l), Some(r)) => { if r.is_zero() { None @@ -189,9 +189,9 @@ pub mod checked { } }, _ => None, - }) - .into_series(), + }, ) + .into_series()) } } @@ -201,10 +201,10 @@ pub mod checked { // see check_div let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - Ok( - arity::binary_elementwise::<_, _, Float64Type, _>(lhs, rhs, |opt_l, opt_r| match ( - opt_l, opt_r, - ) { + Ok(arity::binary_elementwise::<_, _, Float64Type, _, _>( + lhs, + rhs, + |opt_l, opt_r| match (opt_l, opt_r) { (Some(l), Some(r)) => { if r.is_zero() { None @@ 
-213,9 +213,9 @@ pub mod checked { } }, _ => None, - }) - .into_series(), + }, ) + .into_series()) } } diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index a442e820a420..fe3fff9a3a78 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -282,7 +282,7 @@ fn array_set_operation( pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { // we use the unsafe variant because we want to keep the nested logical types type. unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( a, b, |a, b| array_set_operation(a, b, set_op).boxed(), diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 933519cdd2a4..c8ae34dbb272 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -78,7 +78,7 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) ChunkedArray::full_null(a.name(), a.len()) }; } - arity::binary_mut(a, b, floor_div_array) + arity::binary(a, b, floor_div_array) } pub fn floor_div_series(a: &Series, b: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index cfb1fe14dff3..dc88256ad1e0 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -68,7 +68,7 @@ where )) } else { Ok(Some( - polars_core::chunked_array::ops::arity::binary_mut(base, exponent, pow_kernel) + polars_core::chunked_array::ops::arity::binary(base, exponent, pow_kernel) .into_series(), )) } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 32a57a623e59..a24a0ebb94ab 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -129,7 +129,7 @@ where Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_series())) } else { Ok(Some( - polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), + polars_core::prelude::arity::binary(y, x, atan2_kernel).into_series(), )) } } From a75495ac1a4c6844ee82ba12f00ba9d6849b14a6 Mon Sep 17 00:00:00 2001 From: Sam Damashek Date: Sun, 20 Aug 2023 19:31:38 +0800 Subject: [PATCH 16/55] feat(python, rust!): Read/write support for IPC streams in DataFrames (#10606) --- crates/polars-io/src/ipc/ipc_stream.rs | 11 +-- crates/polars/tests/it/io/ipc_stream.rs | 7 +- py-polars/Cargo.toml | 2 + py-polars/docs/source/reference/io.rst | 2 + py-polars/polars/__init__.py | 2 + py-polars/polars/dataframe/frame.py | 120 +++++++++++++++++++++++- py-polars/polars/io/__init__.py | 3 +- py-polars/polars/io/ipc/__init__.py | 3 +- py-polars/polars/io/ipc/functions.py | 75 +++++++++++++++ py-polars/src/dataframe.rs | 50 ++++++++++ py-polars/tests/unit/io/test_ipc.py | 72 +++++++++----- 11 files changed, 308 insertions(+), 39 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index 52f07918c68c..3a3b6a399e65 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -237,17 +237,16 @@ fn fix_column_order(df: DataFrame, projection: Option>, row_count: bo #[must_use] pub struct IpcStreamWriter { writer: W, - compression: Option, + compression: Option, } use polars_core::frame::ArrowChunk; 
-pub use write::Compression as IpcCompression; use crate::RowCount; impl IpcStreamWriter { /// Set the compression used. Defaults to None. - pub fn with_compression(mut self, compression: Option) -> Self { + pub fn with_compression(mut self, compression: Option) -> Self { self.compression = compression; self } @@ -268,7 +267,7 @@ where let mut ipc_stream_writer = write::StreamWriter::new( &mut self.writer, WriteOptions { - compression: self.compression, + compression: self.compression.map(|c| c.into()), }, ); @@ -286,7 +285,7 @@ where } pub struct IpcStreamWriterOption { - compression: Option, + compression: Option, extension: PathBuf, } @@ -299,7 +298,7 @@ impl IpcStreamWriterOption { } /// Set the compression used. Defaults to None. - pub fn with_compression(mut self, compression: Option) -> Self { + pub fn with_compression(mut self, compression: Option) -> Self { self.compression = compression; self } diff --git a/crates/polars/tests/it/io/ipc_stream.rs b/crates/polars/tests/it/io/ipc_stream.rs index 1bb070af99a0..eb369b284f40 100644 --- a/crates/polars/tests/it/io/ipc_stream.rs +++ b/crates/polars/tests/it/io/ipc_stream.rs @@ -2,7 +2,6 @@ mod test { use std::io::Cursor; - use polars::export::arrow::io::ipc::write; use polars_core::df; use polars_core::prelude::*; use polars_io::ipc::*; @@ -105,11 +104,7 @@ mod test { fn test_write_with_compression() { let mut df = create_df(); - let compressions = vec![ - None, - Some(write::Compression::LZ4), - Some(write::Compression::ZSTD), - ]; + let compressions = vec![None, Some(IpcCompression::LZ4), Some(IpcCompression::ZSTD)]; for compression in compressions.into_iter() { let mut buf: Cursor> = Cursor::new(Vec::new()); diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 741c3f358f7b..ce25037a4e48 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -101,6 +101,7 @@ dtype-u16 = [] avro = ["polars/avro"] parquet = ["polars/parquet"] ipc = ["polars/ipc"] +ipc_streaming = ["polars/ipc_streaming"] is_in = ["polars/is_in"] json = ["polars/serde", "serde_json", "polars/json"] trigonometry = ["polars/trigonometry"] @@ -145,6 +146,7 @@ all = [ "json", "parquet", "ipc", + "ipc_streaming", "avro", "is_in", "repeat_by", diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index 6c315f70929c..d83afcffd10e 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -19,9 +19,11 @@ Feather/ IPC :toctree: api/ read_ipc + read_ipc_stream scan_ipc read_ipc_schema DataFrame.write_ipc + DataFrame.write_ipc_stream LazyFrame.sink_ipc Parquet diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index fac8e02788ad..12557c8bb62b 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -158,6 +158,7 @@ read_excel, read_ipc, read_ipc_schema, + read_ipc_stream, read_json, read_ndjson, read_parquet, @@ -250,6 +251,7 @@ "read_excel", "read_ipc", "read_ipc_schema", + "read_ipc_stream", "read_json", "read_ndjson", "read_parquet", diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 19c09a790fcd..6e8794d791c6 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -904,9 +904,10 @@ def _read_ipc( memory_map: bool = True, ) -> Self: """ - Read into a DataFrame from Arrow IPC stream format. + Read into a DataFrame from Arrow IPC file format. - Arrow IPC is also know as Feather (v2). 
+ See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. Parameters ---------- @@ -972,6 +973,58 @@ def _read_ipc( ) return self + @classmethod + def _read_ipc_stream( + cls, + source: str | Path | BinaryIO | bytes, + *, + columns: Sequence[int] | Sequence[str] | None = None, + n_rows: int | None = None, + row_count_name: str | None = None, + row_count_offset: int = 0, + rechunk: bool = True, + ) -> Self: + """ + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + """ + if isinstance(source, (str, Path)): + source = normalise_filepath(source) + if isinstance(columns, str): + columns = [columns] + + projection, columns = handle_projection_columns(columns) + self = cls.__new__(cls) + self._df = PyDataFrame.read_ipc_stream( + source, + columns, + projection, + n_rows, + _prepare_row_count_args(row_count_name, row_count_offset), + rechunk, + ) + return self + @classmethod def _read_json( cls, @@ -3085,6 +3138,8 @@ def write_ipc( """ Write to Arrow IPC binary stream or Feather file. + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + Parameters ---------- file @@ -3120,6 +3175,67 @@ def write_ipc( self._df.write_ipc(file, compression) return file if return_bytes else None # type: ignore[return-value] + @overload + def write_ipc_stream( + self, + file: None, + compression: IpcCompression = "uncompressed", + ) -> BytesIO: + ... + + @overload + def write_ipc_stream( + self, + file: BinaryIO | BytesIO | str | Path, + compression: IpcCompression = "uncompressed", + ) -> None: + ... + + def write_ipc_stream( + self, + file: BinaryIO | BytesIO | str | Path | None, + compression: IpcCompression = "uncompressed", + ) -> BytesIO | None: + """ + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {'uncompressed', 'lz4', 'zstd'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + """ + return_bytes = file is None + if return_bytes: + file = BytesIO() + elif isinstance(file, (str, Path)): + file = normalise_filepath(file) + + if compression is None: + compression = "uncompressed" + + self._df.write_ipc_stream(file, compression) + return file if return_bytes else None # type: ignore[return-value] + def write_parquet( self, file: str | Path | BytesIO, diff --git a/py-polars/polars/io/__init__.py b/py-polars/polars/io/__init__.py index 4ed17a139803..7243007e82fc 100644 --- a/py-polars/polars/io/__init__.py +++ b/py-polars/polars/io/__init__.py @@ -5,7 +5,7 @@ from polars.io.database import read_database from polars.io.delta import read_delta, scan_delta from polars.io.excel import read_excel -from polars.io.ipc import read_ipc, read_ipc_schema, scan_ipc +from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc from polars.io.json import read_json from polars.io.ndjson import read_ndjson, scan_ndjson from polars.io.parquet import read_parquet, read_parquet_schema, scan_parquet @@ -19,6 +19,7 @@ "read_delta", "read_excel", "read_ipc", + "read_ipc_stream", "read_ipc_schema", "read_json", "read_ndjson", diff --git a/py-polars/polars/io/ipc/__init__.py b/py-polars/polars/io/ipc/__init__.py index e0f4b0d4be27..9423bbceb829 100644 --- a/py-polars/polars/io/ipc/__init__.py +++ b/py-polars/polars/io/ipc/__init__.py @@ -1,7 +1,8 @@ -from polars.io.ipc.functions import read_ipc, read_ipc_schema, scan_ipc +from polars.io.ipc.functions import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc __all__ = [ "read_ipc", + "read_ipc_stream", "read_ipc_schema", "scan_ipc", ] diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index e2bda1b8a7d4..7f661cc2d33f 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -111,6 +111,81 @@ def read_ipc( ) +def read_ipc_stream( + source: str | BinaryIO | BytesIO | Path | bytes, + *, + columns: list[int] | list[str] | None = None, + n_rows: int | None = None, + use_pyarrow: bool = False, + storage_options: dict[str, Any] | None = None, + row_count_name: str | None = None, + row_count_offset: int = 0, + rechunk: bool = True, +) -> DataFrame: + """ + Read into a DataFrame from Arrow IPC record batch stream. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to objects + that have a ``read()`` method, such as a file handler (e.g. via builtin ``open`` + function) or ``BytesIO``). If ``fsspec`` is installed, it will be used to open + remote files. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a list + of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + Only valid when `use_pyarrow=False`. + use_pyarrow + Use pyarrow or the native Rust reader. + storage_options + Extra options that make sense for ``fsspec.open()`` or a particular storage + connection, e.g. host, port, username, password, etc. + row_count_name + If not None, this will insert a row count column with give name into the + DataFrame + row_count_offset + Offset to start the row_count column (only use if the name is set) + rechunk + Make sure that all data is contiguous. 
+ + Returns + ------- + DataFrame + + """ + storage_options = storage_options or {} + with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: + if use_pyarrow: + if not _PYARROW_AVAILABLE: + raise ImportError( + "'pyarrow' is required when using" + " 'read_ipc_stream(..., use_pyarrow=True)'" + ) + + import pyarrow as pa + + with pa.ipc.RecordBatchStreamReader(data) as reader: + tbl = reader.read_all() + df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk) + if row_count_name is not None: + df = df.with_row_count(row_count_name, row_count_offset) + if n_rows is not None: + df = df.slice(0, n_rows) + return df + + return pl.DataFrame._read_ipc_stream( + data, + columns=columns, + n_rows=n_rows, + row_count_name=row_count_name, + row_count_offset=row_count_offset, + rechunk=rechunk, + ) + + def read_ipc_schema(source: str | BinaryIO | Path | bytes) -> dict[str, PolarsDataType]: """ Get the schema of an IPC file without reading data. diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 2ffc201629ae..b3e77cf07f96 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -304,6 +304,30 @@ impl PyDataFrame { Ok(PyDataFrame::new(df)) } + #[staticmethod] + #[cfg(feature = "ipc_streaming")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_count, rechunk))] + pub fn read_ipc_stream( + py_f: &PyAny, + columns: Option>, + projection: Option>, + n_rows: Option, + row_count: Option<(String, IdxSize)>, + rechunk: bool, + ) -> PyResult { + let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); + let mmap_bytes_r = get_mmap_bytes_reader(py_f)?; + let df = IpcStreamReader::new(mmap_bytes_r) + .with_projection(projection) + .with_columns(columns) + .with_n_rows(n_rows) + .with_row_count(row_count) + .set_rechunk(rechunk) + .finish() + .map_err(PyPolarsErr::from)?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "avro")] #[pyo3(signature = (py_f, columns, projection, n_rows))] @@ -626,6 +650,32 @@ impl PyDataFrame { Ok(()) } + #[cfg(feature = "ipc_streaming")] + pub fn write_ipc_stream( + &mut self, + py: Python, + py_f: PyObject, + compression: Wrap>, + ) -> PyResult<()> { + if let Ok(s) = py_f.extract::<&str>(py) { + py.allow_threads(|| { + let f = std::fs::File::create(s).unwrap(); + IpcStreamWriter::new(f) + .with_compression(compression.0) + .finish(&mut self.df) + .map_err(PyPolarsErr::from) + })?; + } else { + let mut buf = get_file_like(py_f, true)?; + + IpcStreamWriter::new(&mut buf) + .with_compression(compression.0) + .finish(&mut self.df) + .map_err(PyPolarsErr::from)?; + } + Ok(()) + } + #[cfg(feature = "object")] pub fn row_tuple(&self, idx: i64) -> PyResult { let idx = if idx < 0 { diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 2f8af8207e31..6a7161fd9fe2 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pandas as pd import pytest @@ -17,83 +17,108 @@ COMPRESSIONS = ["uncompressed", "lz4", "zstd"] +def read_ipc(is_stream: bool, *args: Any, **kwargs: Any) -> pl.DataFrame: + if is_stream: + return pl.read_ipc_stream(*args, **kwargs) + else: + return pl.read_ipc(*args, **kwargs) + + +def write_ipc(df: pl.DataFrame, is_stream: bool, *args: Any, **kwargs: Any) -> Any: + if is_stream: + return df.write_ipc_stream(*args, **kwargs) + else: + return 
df.write_ipc(*args, **kwargs) + + @pytest.mark.parametrize("compression", COMPRESSIONS) -def test_from_to_buffer(df: pl.DataFrame, compression: IpcCompression) -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_from_to_buffer( + df: pl.DataFrame, compression: IpcCompression, stream: bool +) -> None: # use an ad-hoc buffer (file=None) - buf1 = df.write_ipc(None, compression=compression) - read_df = pl.read_ipc(buf1, use_pyarrow=False) + buf1 = write_ipc(df, stream, None, compression=compression) + read_df = read_ipc(stream, buf1, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) # explicitly supply an existing buffer buf2 = io.BytesIO() - df.write_ipc(buf2, compression=compression) + write_ipc(df, stream, buf2, compression=compression) buf2.seek(0) - read_df = pl.read_ipc(buf2, use_pyarrow=False) + read_df = read_ipc(stream, buf2, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) @pytest.mark.parametrize("compression", COMPRESSIONS) @pytest.mark.parametrize("path_as_string", [True, False]) +@pytest.mark.parametrize("stream", [True, False]) @pytest.mark.write_disk() def test_from_to_file( df: pl.DataFrame, compression: IpcCompression, path_as_string: bool, tmp_path: Path, + stream: bool, ) -> None: tmp_path.mkdir(exist_ok=True) file_path = tmp_path / "small.ipc" if path_as_string: file_path = str(file_path) # type: ignore[assignment] - df.write_ipc(file_path, compression=compression) - df_read = pl.read_ipc(file_path, use_pyarrow=False) + write_ipc(df, stream, file_path, compression=compression) + df_read = read_ipc(stream, file_path, use_pyarrow=False) assert_frame_equal(df, df_read, categorical_as_str=True) +@pytest.mark.parametrize("stream", [True, False]) @pytest.mark.write_disk() -def test_select_columns_from_file(df: pl.DataFrame, tmp_path: Path) -> None: +def test_select_columns_from_file( + df: pl.DataFrame, tmp_path: Path, stream: bool +) -> None: tmp_path.mkdir(exist_ok=True) file_path = tmp_path / "small.ipc" - df.write_ipc(file_path) - df_read = pl.read_ipc(file_path, columns=["bools"]) + write_ipc(df, stream, file_path) + df_read = read_ipc(stream, file_path, columns=["bools"]) assert df_read.columns == ["bools"] -def test_select_columns_from_buffer() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_select_columns_from_buffer(stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) - read_df = pl.read_ipc(f, columns=["b", "c"], use_pyarrow=False) + read_df = read_ipc(stream, f, columns=["b", "c"], use_pyarrow=False) assert_frame_equal(expected, read_df) -def test_select_columns_projection() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_select_columns_projection(stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) - read_df = pl.read_ipc(f, columns=[1, 2], use_pyarrow=False) + read_df = read_ipc(stream, f, columns=[1, 2], use_pyarrow=False) assert_frame_equal(expected, read_df) @pytest.mark.parametrize("compression", COMPRESSIONS) -def test_compressed_simple(compression: IpcCompression) -> None: +@pytest.mark.parametrize("stream", [True, False]) +def 
test_compressed_simple(compression: IpcCompression, stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f, compression) + write_ipc(df, stream, f, compression) f.seek(0) - df_read = pl.read_ipc(f, use_pyarrow=False) + df_read = read_ipc(stream, f, use_pyarrow=False) assert_frame_equal(df_read, df) @@ -143,7 +168,8 @@ def test_ipc_schema_from_file( assert schema == expected -def test_ipc_column_order() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_ipc_column_order(stream: bool) -> None: df = pl.DataFrame( { "cola": ["x", "y", "z"], @@ -152,12 +178,12 @@ def test_ipc_column_order() -> None: } ) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) columns = ["colc", "colb", "cola"] # read file into polars; the specified column order is no longer respected - assert pl.read_ipc(f, columns=columns).columns == columns + assert read_ipc(stream, f, columns=columns).columns == columns @pytest.mark.write_disk() From c6a301e3141d40c6c6febdebe0b95632ac8da2ed Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 20 Aug 2023 16:40:43 +0200 Subject: [PATCH 17/55] chore(python): Bump ruff and enable new setting (#10626) --- py-polars/polars/convert.py | 9 +-------- py-polars/polars/expr/expr.py | 4 ++-- py-polars/polars/functions/range.py | 4 +--- py-polars/polars/io/_utils.py | 11 ++--------- py-polars/polars/io/csv/batched_reader.py | 5 +---- py-polars/polars/io/csv/functions.py | 10 +--------- py-polars/polars/sql/context.py | 8 +------- py-polars/polars/testing/parametric/__init__.py | 7 +------ py-polars/polars/testing/parametric/primitives.py | 7 +------ py-polars/pyproject.toml | 3 +++ py-polars/requirements-lint.txt | 2 +- py-polars/tests/unit/io/test_database.py | 6 +----- py-polars/tests/unit/test_lazy.py | 1 - 13 files changed, 16 insertions(+), 61 deletions(-) diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 262eb10d28b0..a6fd5f8401c1 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -7,14 +7,7 @@ import polars._reexport as pl from polars import functions as F -from polars.datatypes import ( - N_INFER_DEFAULT, - Categorical, - List, - Object, - Struct, - Utf8, -) +from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8 from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import NoDataError diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 29735133c9dc..d7bf90aeeb81 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -8827,11 +8827,11 @@ def _remap_key_or_value_series( except OverflowError as exc: if is_keys: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r}: {str(exc)}" + f"remapping keys for map_dict could not be converted to {dtype!r}: {exc!s}" ) from exc else: raise ValueError( - f"choose a more suitable output dtype for map_dict as remapping value could not be converted to {dtype!r}: {str(exc)}" + f"choose a more suitable output dtype for map_dict as remapping value could not be converted to {dtype!r}: {exc!s}" ) from exc if is_keys: diff --git a/py-polars/polars/functions/range.py b/py-polars/polars/functions/range.py index 05a051bb803c..20e6433b9b84 100644 --- a/py-polars/polars/functions/range.py +++ b/py-polars/polars/functions/range.py @@ -8,9 +8,7 @@ from polars.datatypes import Int64 from 
polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.convert import ( - _timedelta_to_pl_duration, -) +from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( deprecate_renamed_parameter, issue_deprecation_warning, diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 65033e28045d..69c21748072f 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -4,14 +4,7 @@ from contextlib import contextmanager from io import BytesIO, StringIO from pathlib import Path -from typing import ( - Any, - BinaryIO, - ContextManager, - Iterator, - TextIO, - overload, -) +from typing import Any, BinaryIO, ContextManager, Iterator, TextIO, overload from polars.dependencies import _FSSPEC_AVAILABLE, fsspec from polars.exceptions import NoDataError @@ -24,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 9bb71d39215c..87b58c055be2 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -4,10 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Sequence -from polars.datatypes import ( - N_INFER_DEFAULT, - py_type_to_dtype, -) +from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype from polars.io.csv._utils import _update_columns from polars.utils._wrap import wrap_df from polars.utils.various import ( diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 13564a3df9c9..45eac753c094 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1,15 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Callable, - Mapping, - Sequence, - TextIO, -) +from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Mapping, Sequence, TextIO import polars._reexport as pl from polars.datatypes import N_INFER_DEFAULT, Utf8 diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index 6ef4ae458512..f4b39bb71c44 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -1,13 +1,7 @@ from __future__ import annotations import contextlib -from typing import ( - TYPE_CHECKING, - Collection, - Generic, - Mapping, - overload, -) +from typing import TYPE_CHECKING, Collection, Generic, Mapping, overload from polars.dataframe import DataFrame from polars.lazyframe import LazyFrame diff --git a/py-polars/polars/testing/parametric/__init__.py b/py-polars/polars/testing/parametric/__init__.py index 3c08421b349c..98272ba93190 100644 --- a/py-polars/polars/testing/parametric/__init__.py +++ b/py-polars/polars/testing/parametric/__init__.py @@ -3,12 +3,7 @@ from polars.dependencies import _HYPOTHESIS_AVAILABLE if _HYPOTHESIS_AVAILABLE: - from polars.testing.parametric.primitives import ( - column, - columns, - dataframes, - series, - ) + from polars.testing.parametric.primitives import column, columns, dataframes, series from polars.testing.parametric.profiles import load_profile, set_profile from polars.testing.parametric.strategies import ( all_strategies, diff --git a/py-polars/polars/testing/parametric/primitives.py 
b/py-polars/polars/testing/parametric/primitives.py index c7b0648f7e80..d91f16ae9358 100644 --- a/py-polars/polars/testing/parametric/primitives.py +++ b/py-polars/polars/testing/parametric/primitives.py @@ -8,12 +8,7 @@ from typing import TYPE_CHECKING, Any, Collection, Sequence, overload from hypothesis.errors import InvalidArgument, NonInteractiveExampleWarning -from hypothesis.strategies import ( - booleans, - composite, - lists, - sampled_from, -) +from hypothesis.strategies import booleans, composite, lists, sampled_from from hypothesis.strategies._internal.utils import defines_strategy from polars.dataframe import DataFrame diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 907694a73abb..f30c9c05f3b8 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -153,6 +153,9 @@ ignore = [ [tool.ruff.pycodestyle] max-doc-length = 88 +[tool.ruff.isort] +split-on-trailing-comma = false + [tool.ruff.flake8-tidy-imports] ban-relative-imports = "all" diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt index ddf13052477e..2adb9a5681c5 100644 --- a/py-polars/requirements-lint.txt +++ b/py-polars/requirements-lint.txt @@ -1,5 +1,5 @@ black==23.7.0 blackdoc==0.3.8 mypy==1.4.1 -ruff==0.0.278 +ruff==0.0.285 typos==1.16.1 diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index 8a3fbcfdc606..d8efec1eb5ec 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -12,11 +12,7 @@ from polars.testing import assert_frame_equal if TYPE_CHECKING: - from polars.type_aliases import ( - DbReadEngine, - DbWriteEngine, - DbWriteMode, - ) + from polars.type_aliases import DbReadEngine, DbWriteEngine, DbWriteMode @pytest.fixture() diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 980ae3caef51..fe2262ba1f80 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -529,7 +529,6 @@ def test_floor() -> None: (123.55, 0, 124.0), (123.55, 1, 123.6), (-1.23456789, 6, -1.234568), - (-1835.665, 2, -1835.67), (1.0e-5, 5, 0.00001), (1.0e-20, 20, 1e-20), (1.0e20, 2, 100000000000000000000.0), From 576b1463c0dba41e167e34946af42c3d9713f7f6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 20 Aug 2023 19:49:17 +0100 Subject: [PATCH 18/55] docs(python): give more relevant example for polars.apply (#10631) --- py-polars/polars/functions/lazy.py | 68 +++++++++++++++++------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 5972aa709cb3..32a930ff1890 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1061,39 +1061,47 @@ def apply( -------- >>> df = pl.DataFrame( ... { - ... "a": [7, 2, 3, 4], - ... "b": [2, 5, 6, 7], + ... "group": [1, 1, 2], + ... "a": [1, 3, 3], + ... "b": [5, 6, 7], ... } ... ) >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 7 ┆ 2 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 4 ┆ 7 │ - └─────┴─────┘ - - Calculate product of ``a``. - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").apply(lambda x: x * x).alias("product_a") - ... 
) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ product_a │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 7 ┆ 2 ┆ 49 │ - │ 2 ┆ 5 ┆ 4 │ - │ 3 ┆ 6 ┆ 9 │ - │ 4 ┆ 7 ┆ 16 │ - └─────┴─────┴───────────┘ + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ group ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 5 │ + │ 1 ┆ 3 ┆ 6 │ + │ 2 ┆ 3 ┆ 7 │ + └───────┴─────┴─────┘ + >>> ( + ... df.groupby("group").agg( + ... pl.apply( + ... exprs=["a", "b"], + ... function=lambda list_of_series: list_of_series[0] + ... / list_of_series[0].sum() + ... + list_of_series[1], + ... ).alias("my_custom_aggregation") + ... ) + ... ).sort("group") + shape: (2, 2) + ┌───────┬───────────────────────┐ + │ group ┆ my_custom_aggregation │ + │ --- ┆ --- │ + │ i64 ┆ list[f64] │ + ╞═══════╪═══════════════════════╡ + │ 1 ┆ [5.25, 6.75] │ + │ 2 ┆ [8.0] │ + └───────┴───────────────────────┘ + + The output for group `1` can be understood as follows: + + - group `1` contains series `'a': [1, 3]` and `'b': [4, 5]` + - applying the function to those lists of Series, one gets the output + `[1 / 4 + 5, 3 / 4 + 6]`, i.e. `[5.25, 6.75]` """ exprs = parse_as_list_of_expressions(exprs) return wrap_expr( From 115fdbaae359afd9eb15ed03ee9530194bdcf960 Mon Sep 17 00:00:00 2001 From: Zverev Konstantin Date: Sun, 20 Aug 2023 23:51:46 +0500 Subject: [PATCH 19/55] feat(python): Add `LazyFrame.collect_async` (#10616) --- py-polars/polars/__init__.py | 2 + py-polars/polars/functions/__init__.py | 2 + py-polars/polars/functions/lazy.py | 85 +++++++++++++++++++ py-polars/polars/lazyframe/frame.py | 111 +++++++++++++++++++++++++ py-polars/polars/utils/_async.py | 45 ++++++++++ py-polars/src/functions/lazy.rs | 32 +++++++ py-polars/src/lazyframe.rs | 26 ++++++ py-polars/src/lib.rs | 2 + 8 files changed, 305 insertions(+) create mode 100644 py-polars/polars/utils/_async.py diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 12557c8bb62b..01f03a68d68e 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -91,6 +91,7 @@ coalesce, col, collect_all, + collect_all_async, concat, concat_list, concat_str, @@ -306,6 +307,7 @@ "coalesce", "col", "collect_all", + "collect_all_async", "concat_list", "concat_str", "corr", diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index 44db012930a6..1c780163cdf8 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -34,6 +34,7 @@ coalesce, col, collect_all, + collect_all_async, corr, count, cov, @@ -113,6 +114,7 @@ "coalesce", "col", "collect_all", + "collect_all_async", "concat_list", "concat_str", "corr", diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 32a930ff1890..427f368d6e2d 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -11,6 +11,7 @@ Int64, is_polars_dtype, ) +from polars.utils._async import _AsyncDataFrameResult from polars.utils._parse_expr_input import ( parse_as_expression, parse_as_list_of_expressions, @@ -27,6 +28,7 @@ if TYPE_CHECKING: + from queue import Queue from typing import Collection, Literal from polars import DataFrame, Expr, LazyFrame, Series @@ -1769,6 +1771,89 @@ def collect_all( return result +def collect_all_async( + lazy_frames: Sequence[LazyFrame], + queue: Queue[list[DataFrame] | Exception], + *, + type_coercion: bool = True, + predicate_pushdown: bool = True, + projection_pushdown: bool = True, 
+ simplify_expression: bool = True, + no_optimization: bool = False, + slice_pushdown: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, + streaming: bool = False, +) -> _AsyncDataFrameResult[list[DataFrame]]: + """ + Collect multiple LazyFrames at the same time asynchronously in thread pool. + + Collects into a list of DataFrame, like :func:`polars.collect_all` + but instead of returning them directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it's not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + LazyFrame.collect_async: To collect single frame. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + """ + if no_optimization: + predicate_pushdown = False + projection_pushdown = False + slice_pushdown = False + comm_subplan_elim = False + comm_subexpr_elim = False + + prepared = [] + + for lf in lazy_frames: + ldf = lf._ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming, + ) + prepared.append(ldf) + + result = _AsyncDataFrameResult(queue) + plr.collect_all_with_callback(prepared, result._callback_all) + return result + + def select(*exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: """ Run polars expressions without a context. 
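A minimal usage sketch of the new `collect_all_async` added above, assuming the standard-library `queue.Queue` as the thread-safe result queue (the gevent/janus variants described in the docstring follow the same put/get pattern); the frames and column names here are illustrative only, not part of the patch:

import queue

import polars as pl

lf1 = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
lf2 = pl.LazyFrame({"b": [4.0, 5.0, 6.0]}).select(pl.col("b").mean())

# Collection starts in the thread pool; this call returns almost immediately.
result = pl.collect_all_async([lf1, lf2], queue.Queue())

# ... do other work while the LazyFrames are being collected ...

# Blocks on queue.get(); returns the list of DataFrames, or re-raises the
# Exception that was put in the queue if collection failed.
dfs = result.get()
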
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 7079e1ea4f56..36a54c86f713 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -50,6 +50,7 @@ from polars.lazyframe.groupby import LazyGroupBy from polars.selectors import _expand_selectors, expand_selector from polars.slice import LazyPolarsSlice +from polars.utils._async import _AsyncDataFrameResult from polars.utils._parse_expr_input import ( parse_as_expression, parse_as_list_of_expressions, @@ -75,6 +76,7 @@ if TYPE_CHECKING: import sys from io import IOBase + from queue import Queue from typing import Literal import pyarrow as pa @@ -1672,6 +1674,115 @@ def collect( ) return wrap_df(ldf.collect()) + def collect_async( + self, + queue: Queue[DataFrame | Exception], + *, + type_coercion: bool = True, + predicate_pushdown: bool = True, + projection_pushdown: bool = True, + simplify_expression: bool = True, + no_optimization: bool = False, + slice_pushdown: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, + streaming: bool = False, + ) -> _AsyncDataFrameResult[DataFrame]: + """ + Collect dataframe asynchronously in thread pool. + + Collects into a DataFrame, like :func:`collect` + but instead of returning dataframe directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it's not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + + Examples + -------- + >>> import queue + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> a = ( + ... lf.groupby("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async(queue.Queue()) + ... 
) + >>> a.get() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + """ + if no_optimization: + predicate_pushdown = False + projection_pushdown = False + slice_pushdown = False + comm_subplan_elim = False + comm_subexpr_elim = False + + if streaming: + comm_subplan_elim = False + + ldf = self._ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming, + ) + + result = _AsyncDataFrameResult(queue) + ldf.collect_with_callback(result._callback) + return result + def sink_parquet( self, path: str | Path, diff --git a/py-polars/polars/utils/_async.py b/py-polars/polars/utils/_async.py new file mode 100644 index 000000000000..d35956156b8c --- /dev/null +++ b/py-polars/polars/utils/_async.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +from polars.utils._wrap import wrap_df + +if TYPE_CHECKING: + from queue import Queue + + from polars.polars import PyDataFrame + + +T = TypeVar("T") + + +class _AsyncDataFrameResult(Generic[T]): + queue: Queue[Exception | T] + _result: Exception | T | None + + __slots__ = ("queue", "_result") + + def __init__(self, queue: Queue[Exception | T]) -> None: + self.queue = queue + self._result = None + + def get(self, **kwargs: Any) -> T: + if self._result is not None: + if isinstance(self._result, Exception): + raise self._result + return self._result + + self._result = self.queue.get(**kwargs) + if isinstance(self._result, Exception): + raise self._result + return self._result + + def _callback(self, obj: PyDataFrame | Exception) -> None: + if not isinstance(obj, Exception): + obj = wrap_df(obj) + self.queue.put_nowait(obj) + + def _callback_all(self, obj: list[PyDataFrame] | Exception) -> None: + if not isinstance(obj, Exception): + obj = [wrap_df(pydf) for pydf in obj] + self.queue.put_nowait(obj) # type: ignore[arg-type] diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index 6d94b2f42fab..fbf78e5ae957 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -104,6 +104,38 @@ pub fn collect_all(lfs: Vec, py: Python) -> PyResult, lambda: PyObject, py: Python) { + use polars_core::utils::rayon::prelude::*; + + py.allow_threads(|| { + polars_core::POOL.install(move || { + polars_core::POOL.spawn(move || { + let result = lfs + .par_iter() + .map(|lf| { + let df = lf.ldf.clone().collect()?; + Ok(PyDataFrame::new(df)) + }) + .collect::>>() + .map_err(PyPolarsErr::from); + + Python::with_gil(|py| match result { + Ok(dfs) => { + lambda.call1(py, (dfs,)).map_err(|err| err.restore(py)).ok(); + }, + Err(err) => { + lambda + .call1(py, (PyErr::from(err).to_object(py),)) + .map_err(|err| err.restore(py)) + .ok(); + }, + }) + }) + }); + }); +} + #[pyfunction] pub fn cols(names: Vec) -> PyExpr { dsl::cols(names).into() diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index 2f288a1bd3bb..6d3b77400a8a 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -444,6 +444,32 @@ impl PyLazyFrame { Ok(df.into()) } + #[pyo3(signature = (lambda,))] + fn collect_with_callback(&self, py: Python, lambda: PyObject) { + py.allow_threads(|| { + let ldf = self.ldf.clone(); + + polars_core::POOL.spawn(move || { + let result = ldf + .collect() + .map(PyDataFrame::new) 
+ .map_err(PyPolarsErr::from); + + Python::with_gil(|py| match result { + Ok(df) => { + lambda.call1(py, (df,)).map_err(|err| err.restore(py)).ok(); + }, + Err(err) => { + lambda + .call1(py, (PyErr::from(err).to_object(py),)) + .map_err(|err| err.restore(py)) + .ok(); + }, + }); + }); + }); + } + #[allow(clippy::too_many_arguments)] #[cfg(all(feature = "streaming", feature = "parquet"))] #[pyo3(signature = (path, compression, compression_level, statistics, row_group_size, data_pagesize_limit, maintain_order))] diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 35c212084317..ebe003438453 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -128,6 +128,8 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::collect_all)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::lazy::collect_all_with_callback)) + .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::cols)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::concat_lf)) From 55cb641a76af3b0dc4c28fbca07f024fecdf9a95 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 20 Aug 2023 21:53:19 +0100 Subject: [PATCH 20/55] chore(python): fix potential memory leak from usage of `inspect.currentframe` (#10630) --- py-polars/polars/utils/udfs.py | 28 +++++++++----- py-polars/polars/utils/various.py | 61 +++++++++++++++++++------------ 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/py-polars/polars/utils/udfs.py b/py-polars/polars/utils/udfs.py index a21ae86dc2c5..58a85d9a02c3 100644 --- a/py-polars/polars/utils/udfs.py +++ b/py-polars/polars/utils/udfs.py @@ -178,16 +178,26 @@ def _get_all_caller_variables() -> dict[str, Any]: # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(str(pkg_dir)): - frame = frame.f_back - n += 1 + try: + while frame: + fname = inspect.getfile(frame) + if fname.startswith(str(pkg_dir)): + frame = frame.f_back + n += 1 + else: + break + variables: dict[str, Any] + if frame is None: + variables = {} else: - break - if frame is None: - return {} - return {**frame.f_locals, **frame.f_globals} + variables = {**frame.f_locals, **frame.f_globals} + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. + del frame + return variables class BytecodeParser: diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 677defeb55fe..88f0b9c1d42f 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -368,13 +368,20 @@ def find_stacklevel() -> int: # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(str(pkg_dir)): - frame = frame.f_back - n += 1 - else: - break + try: + while frame: + fname = inspect.getfile(frame) + if fname.startswith(str(pkg_dir)): + frame = frame.f_back + n += 1 + else: + break + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. 
+ del frame return n @@ -406,22 +413,30 @@ def _get_stack_locals( examined_frames = 0 if n_frames is None: n_frames = sys.maxsize - stack_frame = getattr(inspect.currentframe(), "f_back", None) - - while stack_frame and examined_frames < n_frames: - local_items = list(stack_frame.f_locals.items()) - for nm, obj in reversed(local_items): - if ( - nm not in objects - and (named is None or (nm in named)) - and (of_type is None or isinstance(obj, of_type)) - ): - objects[nm] = obj - if n_objects is not None and len(objects) >= n_objects: - return objects - - stack_frame = stack_frame.f_back - examined_frames += 1 + stack_frame = inspect.currentframe() + stack_frame = getattr(stack_frame, "f_back", None) + + try: + while stack_frame and examined_frames < n_frames: + local_items = list(stack_frame.f_locals.items()) + for nm, obj in reversed(local_items): + if ( + nm not in objects + and (named is None or (nm in named)) + and (of_type is None or isinstance(obj, of_type)) + ): + objects[nm] = obj + if n_objects is not None and len(objects) >= n_objects: + return objects + + stack_frame = stack_frame.f_back + examined_frames += 1 + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. + del stack_frame return objects From 15527ae585a20d70b2680820676b123dfb08523e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 21 Aug 2023 06:56:57 +0200 Subject: [PATCH 21/55] fix(rust, python): fix rename + projection pushdown (#10624) --- .../optimizer/projection_pushdown/rename.rs | 11 +++++++---- py-polars/tests/unit/test_projections.py | 7 +++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs index 7caa3aff226f..1c19036d4ae9 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs @@ -33,13 +33,15 @@ pub(super) fn process_rename( ) -> PolarsResult<()> { let mut processed = BTreeSet::new(); if swapping { + // We clone otherwise we update a data structure whilst we rename it. + let mut new_projected_names = projected_names.clone(); for (existing, new) in existing.iter().zip(new.iter()) { let has_existing = projected_names.contains(existing.as_str()); + // Only if the new column name is projected by the upper node we must update the name. 
let has_new = projected_names.contains(new.as_str()); let has_both = has_existing && has_new; - let has_any = has_existing || has_new; - if has_any { + if has_new { // swapping path // this must leave projected names intact, as we only swap if has_both { @@ -54,9 +56,9 @@ pub(super) fn process_rename( // simple new name path // this must add and remove names else { - projected_names.remove(new.as_str()); + new_projected_names.remove(new.as_str()); let name: Arc = Arc::from(existing.as_str()); - projected_names.insert(name); + new_projected_names.insert(name); iter_and_update_nodes( existing, new, @@ -67,6 +69,7 @@ pub(super) fn process_rename( } } } + *projected_names = new_projected_names; } else { for (existing, new) in existing.iter().zip(new.iter()) { if projected_names.remove(new.as_str()) { diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index b839b986a2c9..4312815eeaa6 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -313,3 +313,10 @@ def test_projection_join_names_9955() -> None: "yearID": pl.Int64, "lgID": pl.Utf8, } + + +def test_projection_rename_10595() -> None: + lf = pl.LazyFrame(schema=["a", "b"]) + assert lf.select("a", "b").rename({"b": "a", "a": "b"}).select( + "a" + ).collect().schema == {"a": pl.Float32} From 6dd34327f8ad83decb6e6af6c77107d765978c4f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 21 Aug 2023 09:31:22 +0200 Subject: [PATCH 22/55] fix(rust, python): respect 'ignore_errors=False' in csv parser (#10641) --- crates/polars-core/src/series/mod.rs | 6 +-- crates/polars-core/src/utils/series.rs | 5 ++ crates/polars-io/src/csv/buffer.rs | 36 +++++++++----- .../src/csv/read_impl/batched_mmap.rs | 2 +- .../src/csv/read_impl/batched_read.rs | 2 +- crates/polars-io/src/csv/read_impl/mod.rs | 48 ++++++++++++------- crates/polars/tests/it/io/csv.rs | 1 + py-polars/tests/unit/io/test_csv.py | 32 +++++++++++++ 8 files changed, 96 insertions(+), 36 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 1b6f114802f6..cdd010643198 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -32,7 +32,7 @@ use crate::chunked_array::Settings; use crate::prelude::unique::rank::rank; #[cfg(feature = "zip_with")] use crate::series::arithmetic::coerce_lhs_rhs; -use crate::utils::{_split_offsets, split_ca, split_series, Wrap}; +use crate::utils::{_split_offsets, get_casting_failures, split_ca, split_series, Wrap}; use crate::POOL; /// # Series @@ -790,14 +790,12 @@ impl Series { } let s = self.0.cast(dtype)?; if null_count != s.null_count() { - let failure_mask = !self.is_null() & s.is_null(); - let failures = self.filter_threaded(&failure_mask, false)?.unique()?; + let failures = get_casting_failures(self, &s)?; polars_bail!( ComputeError: "strict conversion from `{}` to `{}` failed for column: {}, value(s) {}; \ if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`", self.dtype(), dtype, s.name(), failures.fmt_list(), - ); } else { Ok(s) diff --git a/crates/polars-core/src/utils/series.rs b/crates/polars-core/src/utils/series.rs index 2f41790e953c..b6c87b2cff33 100644 --- a/crates/polars-core/src/utils/series.rs +++ b/crates/polars-core/src/utils/series.rs @@ -39,3 +39,8 @@ pub fn ensure_sorted_arg(s: &Series, operation: &str) -> PolarsResult<()> { ", operation); Ok(()) } + +pub fn get_casting_failures(input: &Series, output: &Series) -> PolarsResult { + let 
failure_mask = !input.is_null() & output.is_null(); + input.filter_threaded(&failure_mask, false)?.unique() +} diff --git a/crates/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs index 5d8317be27ad..6eeb98e41e39 100644 --- a/crates/polars-io/src/csv/buffer.rs +++ b/crates/polars-io/src/csv/buffer.rs @@ -422,14 +422,18 @@ where // Safety: // we just checked it is ascii unsafe { std::str::from_utf8_unchecked(bytes) } - } else if ignore_errors { - buf.builder.append_null(); - return Ok(()); - } else if !ignore_errors && std::str::from_utf8(bytes).is_err() { - polars_bail!(ComputeError: "invalid utf-8 sequence"); } else { - buf.builder.append_null(); - return Ok(()); + match std::str::from_utf8(bytes) { + Ok(val) => val, + Err(_) => { + if ignore_errors { + buf.builder.append_null(); + return Ok(()); + } else { + polars_bail!(ComputeError: "invalid utf-8 sequence"); + } + }, + } }; let pattern = match &buf.compiled { @@ -437,8 +441,12 @@ where None => match infer_pattern_single(val) { Some(pattern) => pattern, None => { - buf.builder.append_null(); - return Ok(()); + if ignore_errors { + buf.builder.append_null(); + return Ok(()); + } else { + polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for {}", val) + } }, }, }; @@ -449,9 +457,13 @@ where buf.builder.append_option(parsed); Ok(()) }, - Err(_) => { - buf.builder.append_null(); - Ok(()) + Err(err) => { + if ignore_errors { + buf.builder.append_null(); + Ok(()) + } else { + Err(err) + } }, } } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index b69fb10e4700..20f6f96018fb 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -249,7 +249,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { self.starting_point_offset, )?; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_count { diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 1152cbec2525..2c8a74a23969 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -346,7 +346,7 @@ impl<'a> BatchedCsvReaderRead<'a> { self.starting_point_offset, )?; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_count { diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index a95bacb6fc3f..62aa3578aabf 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -11,7 +11,7 @@ pub use batched_read::*; use polars_arrow::array::*; use polars_core::config::verbose; use polars_core::prelude::*; -use polars_core::utils::accumulate_dataframes_vertical; +use polars_core::utils::{accumulate_dataframes_vertical, get_casting_failures}; use polars_core::POOL; #[cfg(feature = "polars-time")] use polars_time::prelude::*; @@ -32,21 +32,33 @@ pub(crate) fn cast_columns( df: &mut DataFrame, to_cast: &[Field], parallel: bool, + ignore_errors: bool, ) -> PolarsResult<()> { - let cast_fn = |s: &Series, fld: &Field| match (s.dtype(), fld.data_type()) { - #[cfg(feature = "temporal")] - (DataType::Utf8, DataType::Date) => s - 
.utf8() - .unwrap() - .as_date(None, false) - .map(|ca| ca.into_series()), - #[cfg(feature = "temporal")] - (DataType::Utf8, DataType::Datetime(tu, _)) => s - .utf8() - .unwrap() - .as_datetime(None, *tu, false, false, None, None) - .map(|ca| ca.into_series()), - (_, dt) => s.cast(dt), + let cast_fn = |s: &Series, fld: &Field| { + let out = match (s.dtype(), fld.data_type()) { + #[cfg(feature = "temporal")] + (DataType::Utf8, DataType::Date) => s + .utf8() + .unwrap() + .as_date(None, false) + .map(|ca| ca.into_series()), + #[cfg(feature = "temporal")] + (DataType::Utf8, DataType::Datetime(tu, _)) => s + .utf8() + .unwrap() + .as_datetime(None, *tu, false, false, None, None) + .map(|ca| ca.into_series()), + (_, dt) => s.cast(dt), + }?; + if !ignore_errors && s.null_count() != out.null_count() { + let failures = get_casting_failures(s, &out)?; + polars_bail!( + ComputeError: + "parsing to `{}` failed for column: {}, value(s) {};", + fld.data_type(), s.name(), failures.fmt_list(), + ) + } + Ok(out) }; if parallel { @@ -618,7 +630,7 @@ impl<'a> CoreReader<'a> { local_df.with_row_count_mut(&rc.name, Some(rc.offset)); }; - cast_columns(&mut local_df, &self.to_cast, false)?; + cast_columns(&mut local_df, &self.to_cast, false, self.ignore_errors)?; let s = predicate.evaluate(&local_df)?; let mask = s.bool()?; local_df = local_df.filter(mask)?; @@ -681,7 +693,7 @@ impl<'a> CoreReader<'a> { update_string_stats(&str_capacities, &str_columns, &df)?; } - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_count { df.with_row_count_mut(&rc.name, Some(rc.offset)); } @@ -731,7 +743,7 @@ impl<'a> CoreReader<'a> { ) }; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_count { df.with_row_count_mut(&rc.name, Some(rc.offset)); } diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 61abad8a03fe..74a66e320640 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -442,6 +442,7 @@ AUDCAD,1616455921,0.96212,0.95666,1 "b", DataType::Datetime(TimeUnit::Nanoseconds, None), )])))) + .with_ignore_errors(true) .finish()?; assert_eq!( diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 060cb317be99..ca6574a31061 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1440,3 +1440,35 @@ def test_csv_quote_styles() -> None: df.write_csv(quote_style="non_numeric", quote="8") == '8float8,8string8,8int8,8bool8\n1.0,8a8,1,8true8\n2.0,8abc8,2,8false8\n,8"hello8,3,\n' ) + + +def test_ignore_errors_casting_dtypes() -> None: + csv = """inventory + 10 + + 400 + 90 + """ + + assert pl.read_csv( + source=io.StringIO(csv), + dtypes={"inventory": pl.Int8}, + ignore_errors=True, + ).to_dict(False) == {"inventory": [10, None, None, 90]} + + with pytest.raises(pl.ComputeError): + pl.read_csv( + source=io.StringIO(csv), + dtypes={"inventory": pl.Int8}, + ignore_errors=False, + ) + + +def test_ignore_errors_date_parser() -> None: + data_invalid_date = "int,float,date\n3,3.4,X" + with pytest.raises(pl.ComputeError): + pl.read_csv( + source=io.StringIO(data_invalid_date), + dtypes={"date": pl.Date}, + ignore_errors=False, + ) From 6ac24505a8e0a6ce7bdddfde22f921a3c7894112 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Mon, 21 Aug 2023 11:39:41 +0400 Subject: [PATCH 23/55] feat(python): support selector usage 
in `write_excel` arguments (#10589) --- py-polars/polars/dataframe/frame.py | 35 +++++++++++++---------- py-polars/polars/io/excel/_write_utils.py | 33 ++++++++++++++++----- py-polars/polars/selectors.py | 24 ++++++++++++++++ py-polars/polars/type_aliases.py | 10 ++++++- py-polars/tests/unit/io/test_excel.py | 7 +++-- 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 6e8794d791c6..817c49079313 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -66,7 +66,7 @@ _xl_unique_table_name, _XLFormatCache, ) -from polars.selectors import _expand_selectors +from polars.selectors import _expand_selector_dicts, _expand_selectors from polars.slice import PolarsSlice from polars.utils._construction import ( _post_apply_columns, @@ -120,8 +120,10 @@ AsofJoinStrategy, AvroCompression, ClosedInterval, + ColumnFormatDict, ColumnNameOrSelector, ColumnTotalsDefinition, + ColumnWidthsDefinition, ComparisonOperator, ConditionalFormatDict, CsvEncoding, @@ -2626,12 +2628,12 @@ def write_excel( position: tuple[int, int] | str = "A1", table_style: str | dict[str, Any] | None = None, table_name: str | None = None, - column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = None, + column_formats: ColumnFormatDict | None = None, dtype_formats: dict[OneOrMoreDataTypes, str] | None = None, conditional_formats: ConditionalFormatDict | None = None, header_format: dict[str, Any] | None = None, column_totals: ColumnTotalsDefinition | None = None, - column_widths: dict[str | tuple[str, ...], int] | int | None = None, + column_widths: ColumnWidthsDefinition | None = None, row_totals: RowTotalsDefinition | None = None, row_heights: dict[int | tuple[int, ...], int] | int | None = None, sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None, @@ -2674,9 +2676,9 @@ def write_excel( Name of the output table object in the worksheet; can then be referred to in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. column_formats : dict - A ``{colname:str,}`` dictionary for applying an Excel format string to the - given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) - will override any defined in ``dtype_formats`` (below). + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. dtype_formats : dict A ``{dtype:str,}`` dictionary that sets the default Excel format for the given dtype. (This can be overridden on a per-column basis by the @@ -2684,8 +2686,8 @@ def write_excel( ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform integer and float formats. conditional_formats : dict - A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` - dictionary defining conditional format options for the specified columns. + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. * If supplying a string typename, should be one of the valid ``xlsxwriter`` types such as "3_color_scale", "data_bar", etc. @@ -2711,9 +2713,9 @@ def write_excel( Valid total function names are "average", "count_nums", "count", "max", "min", "std_dev", "sum", and "var". 
column_widths : {dict, int} - A ``{colname:int,}`` dict or single integer that sets (or overrides if - autofitting) table column widths in integer pixel units. If given as an - integer the same value is used for all table columns. + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. row_totals : {dict, bool} Add a row-total column to the right-hand side of the exported table. @@ -2754,7 +2756,7 @@ def write_excel( "formula" (mandatory), one of "insert_before" or "insert_after", and optionally "return_dtype". The latter is used to appropriately format the output of the formula and allow it to participate in row/column totals. - float_precision : {dict, int} + float_precision : int Default number of decimals displayed for floating point columns (note that this is purely a formatting directive; the actual values are not rounded). has_header : bool @@ -2764,7 +2766,7 @@ def write_excel( autofit : bool Calculate individual column widths from the data. hidden_columns : list - A list of table columns to hide in the worksheet. + A list or selector representing table columns to hide in the worksheet. hide_gridlines : bool Do not display any gridlines on the output worksheet. sheet_zoom : int @@ -2784,7 +2786,6 @@ def write_excel( scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - Notes ----- * A list of compatible ``xlsxwriter`` format property names can be found here: @@ -3062,9 +3063,13 @@ def write_excel( ) # additional column-level properties - hidden_columns = hidden_columns or () + hidden_columns = _expand_selectors(df, hidden_columns or ()) if isinstance(column_widths, int): column_widths = {column: column_widths for column in df.columns} + else: + column_widths = _expand_selector_dicts( # type: ignore[assignment] + df, column_widths, expand_keys=True, expand_values=False + ) column_widths = _unpack_multi_column_dict(column_widths or {}) # type: ignore[assignment] for column in df.columns: diff --git a/py-polars/polars/io/excel/_write_utils.py b/py-polars/polars/io/excel/_write_utils.py index 4bf5c022b945..89206deecb78 100644 --- a/py-polars/polars/io/excel/_write_utils.py +++ b/py-polars/polars/io/excel/_write_utils.py @@ -19,6 +19,7 @@ ) from polars.dependencies import json from polars.exceptions import DuplicateError +from polars.selectors import _expand_selector_dicts, _expand_selectors if TYPE_CHECKING: from typing import Literal @@ -29,6 +30,7 @@ from polars import DataFrame, Series from polars.type_aliases import ( + ColumnFormatDict, ColumnTotalsDefinition, ConditionalFormatDict, OneOrMoreDataTypes, @@ -113,7 +115,9 @@ def _xl_apply_conditional_formats( """Take all conditional formatting options and apply them to the table/range.""" from xlsxwriter.format import Format - for cols, formats in conditional_formats.items(): + for cols, formats in _expand_selector_dicts( + df, conditional_formats, expand_keys=True, expand_values=False, tuple_keys=True + ).items(): if not isinstance(cols, str) and len(cols) == 1: cols = next(iter(cols)) if isinstance(formats, (str, dict)): @@ -305,7 +309,7 @@ def _xl_setup_table_columns( df: DataFrame, format_cache: _XLFormatCache, column_totals: ColumnTotalsDefinition | None = None, - column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = None, + column_formats: 
ColumnFormatDict | None = None, dtype_formats: dict[OneOrMoreDataTypes, str] | None = None, header_format: dict[str, Any] | None = None, sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None, @@ -327,8 +331,16 @@ def _map_str(s: Series) -> Series: if cast_cols: df = df.with_columns(cast_cols) - column_totals = _unpack_multi_column_dict(column_totals) # type: ignore[assignment] - column_formats = _unpack_multi_column_dict(column_formats) # type: ignore[assignment] + column_totals = _unpack_multi_column_dict( # type: ignore[assignment] + _expand_selector_dicts(df, column_totals, expand_keys=True, expand_values=False) + if isinstance(column_totals, dict) + else _expand_selectors(df, column_totals) + ) + column_formats = _unpack_multi_column_dict( # type: ignore[assignment] + _expand_selector_dicts( + df, column_formats, expand_keys=True, expand_values=False, tuple_keys=True + ) + ) # normalise column totals column_total_funcs = ( @@ -348,12 +360,19 @@ def _map_str(s: Series) -> Series: sum_cols = ( numeric_cols if row_totals is True - else ({row_totals} if isinstance(row_totals, str) else set(row_totals)) + else ( + {row_totals} + if isinstance(row_totals, str) + else set(_expand_selectors(df, row_totals)) + ) ) n_ucase = sum((c[0] if c else "").isupper() for c in df.columns) total = f"{'T' if (n_ucase > len(df.columns) // 2) else 't'}otal" row_total_funcs = {total: _xl_table_formula(df, sum_cols, "sum")} else: + row_totals = _expand_selector_dicts( + df, row_totals, expand_keys=False, expand_values=True + ) row_total_funcs = { name: _xl_table_formula( df, numeric_cols if cols is True else cols, "sum" @@ -368,8 +387,8 @@ def _map_str(s: Series) -> Series: } # normalise formats - column_formats = (column_formats or {}).copy() - dtype_formats = (dtype_formats or {}).copy() + column_formats = dict(column_formats or {}) + dtype_formats = dict(dtype_formats or {}) for tp in list(dtype_formats): if isinstance(tp, (tuple, frozenset)): diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 2c00caeb841b..477f4196e240 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -173,6 +173,30 @@ def _expand_selectors( return expanded +def _expand_selector_dicts( + df: DataFrame, + d: Mapping[Any, Any] | None, + expand_keys: bool, + expand_values: bool, + tuple_keys: bool = False, +) -> dict[str, Any]: + """Expand dict key/value selectors into their underlying column names.""" + expanded = {} + for key, value in (d or {}).items(): + if expand_values and is_selector(value): + expanded[key] = expand_selector(df, selector=value) + value = expanded[key] + if expand_keys and is_selector(key): + cols = expand_selector(df, selector=key) + if tuple_keys: + expanded[cols] = value + else: + expanded.update({c: value for c in cols}) + else: + expanded[key] = value + return expanded + + class _selector_proxy_(Expr): """Base column selector expression/proxy.""" diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index ff907a68ce90..14597c0c6bb7 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -161,9 +161,14 @@ ] # Excel IO +ColumnFormatDict: TypeAlias = Mapping[ + # dict of colname(s) or selector(s) to format string or dict + Union[ColumnNameOrSelector, Tuple[ColumnNameOrSelector, ...]], + Union[str, Mapping[str, str]], +] ConditionalFormatDict: TypeAlias = Mapping[ # dict of colname(s) to str, dict, or sequence of str/dict - Union[str, Collection[str]], + Union[ColumnNameOrSelector, 
Collection[str]], Union[str, Union[Mapping[str, Any], Sequence[Union[str, Mapping[str, Any]]]]], ] ColumnTotalsDefinition: TypeAlias = Union[ @@ -172,6 +177,9 @@ Sequence[str], bool, ] +ColumnWidthsDefinition: TypeAlias = Union[ + Mapping[ColumnNameOrSelector, Union[Tuple[str, ...], int]], int +] RowTotalsDefinition: TypeAlias = Union[ # dict of colname to str(s), a collection of str, or a boolean Mapping[str, Union[str, Collection[str]]], diff --git a/py-polars/tests/unit/io/test_excel.py b/py-polars/tests/unit/io/test_excel.py index 4458c4cf812e..77a654610525 100644 --- a/py-polars/tests/unit/io/test_excel.py +++ b/py-polars/tests/unit/io/test_excel.py @@ -7,6 +7,7 @@ import pytest import polars as pl +import polars.selectors as cs from polars.exceptions import NoDataError from polars.testing import assert_frame_equal @@ -219,7 +220,7 @@ def test_excel_sparklines() -> None: worksheet="frame_data", table_style="Table Style Light 2", dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - column_formats={("h1", "h2"): "#,##0_);(#,##0)"}, + column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"}, sparklines={ "trend": ["q1", "q2", "q3", "q4"], "+/-": { @@ -229,13 +230,13 @@ def test_excel_sparklines() -> None: }, }, conditional_formats={ - ("q1", "q2", "q3", "q4", "h1", "h2"): { + cs.starts_with("q", "h"): { "type": "2_color_scale", "min_color": "#95b3d7", "max_color": "#ffffff", } }, - column_widths={("q1", "q2", "q3", "q4", "h1", "h2"): 40}, + column_widths={cs.starts_with("q", "h"): 40}, row_totals={ "h1": ("q1", "q2"), "h2": ("q3", "q4"), From aa10faab9ef031f3da60b97810487d46abf40abf Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 10:21:01 +0200 Subject: [PATCH 24/55] feat(python)!: Remove deprecated behavior from vertical aggregations (#10602) --- .../polars/functions/aggregation/vertical.py | 323 +++--------------- .../functions/aggregation/test_vertical.py | 34 +- .../tests/unit/functions/test_functions.py | 39 ++- 3 files changed, 68 insertions(+), 328 deletions(-) diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index ef1234b2bc26..952ff10e4352 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -1,59 +1,24 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Iterable, overload +from typing import TYPE_CHECKING -import polars._reexport as pl import polars.functions as F -from polars.utils.deprecation import ( - deprecate_renamed_parameter, - issue_deprecation_warning, -) if TYPE_CHECKING: - from polars import Expr, Series - from polars.type_aliases import IntoExpr, PythonLiteral + from polars import Expr -@overload -def all(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] - ... - - -@overload -def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = ..., - *more_exprs: IntoExpr, - ignore_nulls: bool = ..., -) -> Expr: - ... - - -@deprecate_renamed_parameter("columns", "exprs", version="0.18.7") -def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = None, - *more_exprs: IntoExpr, - ignore_nulls: bool = True, -) -> Expr | bool | None: +def all(*names: str, ignore_nulls: bool = True) -> Expr: """ Either return an expression representing all columns, or evaluate a bitwise AND operation. - If no arguments are passed, this is an alias for ``pl.col("*")``. - If a single string is passed, this is an alias for ``pl.col(name).any()``. 
- - If a single Series is passed, this is an alias for ``Series.any()``. - **This functionality is deprecated**. - - Otherwise, this function computes the bitwise AND horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.all_horizontal`` instead. + If no arguments are passed, this function is syntactic sugar for ``col("*")``. + Otherwise, this function is syntactic sugar for ``col(names).all()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. ignore_nulls Ignore null values (default). @@ -87,7 +52,7 @@ def all( │ 2 ┆ 0 │ └─────┴─────┘ - Evaluate bitwise AND for a column: + Evaluate bitwise AND for a column. >>> df.select(pl.all("a")) shape: (1, 1) @@ -100,53 +65,17 @@ def all( └───────┘ """ # noqa: W505 - if not more_exprs: - if exprs is None: - return F.col("*") - elif isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `all` is deprecated. Use `Series.all()` instead.", - version="0.18.7", - ) - return exprs.all(ignore_nulls=ignore_nulls) - elif isinstance(exprs, str): - return F.col(exprs).all(ignore_nulls=ignore_nulls) - - _warn_for_deprecated_horizontal_use("all") - return F.all_horizontal(exprs, *more_exprs) - - -@overload -def any(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] - ... - - -@overload -def any( - exprs: IntoExpr | Iterable[IntoExpr], - *more_exprs: IntoExpr, - ignore_nulls: bool = ..., -) -> Expr: - ... - - -@deprecate_renamed_parameter("columns", "exprs", version="0.18.7") -def any( - exprs: IntoExpr | Iterable[IntoExpr], - *more_exprs: IntoExpr, - ignore_nulls: bool = True, -) -> Expr | bool | None: - """ - Evaluate a bitwise OR operation. + if not names: + return F.col("*") - If a single string is passed, this is an alias for ``pl.col(name).any()``. + return F.col(*names).all(ignore_nulls=ignore_nulls) - If a single Series is passed, this is an alias for ``Series.any()``. - **This functionality is deprecated**. - Otherwise, this function computes the bitwise OR horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.any_horizontal`` instead. +def any(*names: str, ignore_nulls: bool = True) -> Expr | bool | None: + """ + Evaluate a bitwise OR operation. + + Syntactic sugar for ``col(names).any()``. See Also -------- @@ -154,11 +83,8 @@ def any( Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. ignore_nulls Ignore null values (default). @@ -187,50 +113,19 @@ def any( └──────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `any` is deprecated. 
Use `Series.any()` instead.", - version="0.18.7", - ) - return exprs.any(ignore_nulls=ignore_nulls) - elif isinstance(exprs, str): - return F.col(exprs).any(ignore_nulls=ignore_nulls) - - _warn_for_deprecated_horizontal_use("any") - return F.any_horizontal(exprs, *more_exprs) - - -@overload -def max(exprs: Series) -> PythonLiteral | None: # type: ignore[misc] - ... - + return F.col(*names).any(ignore_nulls=ignore_nulls) -@overload -def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - -def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | Any: +def max(*names: str) -> Expr: """ Get the maximum value. - If a single string is passed, this is an alias for ``pl.col(name).max()``. - - If a single Series is passed, this is an alias for ``Series.max()``. - **This functionality is deprecated**. - - Otherwise, this function computes the maximum value horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.max_horizontal`` instead. + Syntactic sugar for ``col(names).max()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -238,7 +133,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A Examples -------- - Get the maximum value of a column by passing a single column name. + Get the maximum value of a column. >>> df = pl.DataFrame( ... { @@ -257,8 +152,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A │ 8 │ └─────┘ - Get column-wise maximums for multiple columns by passing a regular expression, - or call ``.max()`` on a multi-column expression instead. + Get the maximum value of multiple columns. >>> df.select(pl.max("^a|b$")) shape: (1, 2) @@ -269,7 +163,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A ╞═════╪═════╡ │ 8 ┆ 5 │ └─────┴─────┘ - >>> df.select(pl.col("a", "b").max()) + >>> df.select(pl.max("a", "b")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -280,52 +174,19 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `max` is deprecated. Use `Series.max()` instead.", - version="0.18.7", - ) - return exprs.max() - elif isinstance(exprs, str): - return F.col(exprs).max() - - _warn_for_deprecated_horizontal_use("max") - return F.max_horizontal(exprs, *more_exprs) - + return F.col(*names).max() -@overload -def min(exprs: Series) -> PythonLiteral | None: # type: ignore[misc] - ... - -@overload -def min(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - - -def min( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | PythonLiteral | None: +def min(*names: str) -> Expr: """ Get the minimum value. - If a single string is passed, this is an alias for ``pl.col(name).min()``. - - If a single Series is passed, this is an alias for ``Series.min()``. - **This functionality is deprecated**. - - Otherwise, this function computes the minimum value horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.min_horizontal`` instead. + Syntactic sugar for ``col(names).min()``. 
Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -333,7 +194,7 @@ def min( Examples -------- - Get the minimum value of a column by passing a single column name. + Get the minimum value of a column. >>> df = pl.DataFrame( ... { @@ -352,8 +213,7 @@ def min( │ 1 │ └─────┘ - Get column-wise minimums for multiple columns by passing a regular expression, - or call ``.min()`` on a multi-column expression instead. + Get the minimum value of multiple columns. >>> df.select(pl.min("^a|b$")) shape: (1, 2) @@ -364,7 +224,7 @@ def min( ╞═════╪═════╡ │ 1 ┆ 2 │ └─────┴─────┘ - >>> df.select(pl.col("a", "b").min()) + >>> df.select(pl.min("a", "b")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -375,52 +235,19 @@ def min( └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `min` is deprecated. Use `Series.min()` instead.", - version="0.18.7", - ) - return exprs.min() - elif isinstance(exprs, str): - return F.col(exprs).min() - - _warn_for_deprecated_horizontal_use("min") - return F.min_horizontal(exprs, *more_exprs) - - -@overload -def sum(exprs: Series) -> int | float: # type: ignore[misc] - ... - - -@overload -def sum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... + return F.col(*names).min() -@deprecate_renamed_parameter("column", "exprs", version="0.18.7") -def sum( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | int | float: +def sum(*names: str) -> Expr: """ Sum all values. - If a single string is passed, this is an alias for ``pl.col(name).sum()``. - - If a single Series is passed, this is an alias for ``Series.sum()``. - **This functionality is deprecated**. - - Otherwise, this function computes the sum horizontally across multiple columns. - **This functionality is deprecated**, use ``pl.sum_horizontal`` instead. + Syntactic sugar for ``col(name).sum()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -428,7 +255,7 @@ def sum( Examples -------- - Sum a column by name: + Sum a column. >>> df = pl.DataFrame( ... { @@ -447,10 +274,9 @@ def sum( │ 3 │ └─────┘ - To aggregate the sums for more than one column/expression use ``pl.col(list).sum()`` - or a regular expression selector like ``pl.sum(regex)``: + Sum multiple columns. - >>> df.select(pl.col("a", "c").sum()) + >>> df.select(pl.sum("a", "c")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -459,7 +285,6 @@ def sum( ╞═════╪═════╡ │ 3 ┆ 11 │ └─────┴─────┘ - >>> df.select(pl.sum("^.*[bc]$")) shape: (1, 2) ┌─────┬─────┐ @@ -471,53 +296,19 @@ def sum( └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `sum` is deprecated. 
Use `Series.sum()` instead.", - version="0.18.7", - ) - return exprs.sum() - elif isinstance(exprs, str): - return F.col(exprs).sum() - - _warn_for_deprecated_horizontal_use("sum") - return F.sum_horizontal(exprs, *more_exprs) - + return F.col(*names).sum() -@overload -def cumsum(exprs: Series) -> Series: # type: ignore[misc] - ... - -@overload -def cumsum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - - -@deprecate_renamed_parameter("column", "exprs", version="0.18.7") -def cumsum( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | Series: +def cumsum(*names: str) -> Expr: """ Cumulatively sum all values. - If a single string is passed, this is an alias for ``pl.col(name).cumsum()``. - - If a single Series is passed, this is an alias for ``Series.cumsum()``. - **This functionality is deprecated**. - - Otherwise, this function computes the cumulative sum horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.cumsum_horizontal`` instead. + Syntactic sugar for ``col(names).cumsum()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -544,22 +335,4 @@ def cumsum( └─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `cumsum` is deprecated. Use `Series.cumsum()` instead.", - version="0.18.7", - ) - return exprs.cumsum() - elif isinstance(exprs, str): - return F.col(exprs).cumsum() - - _warn_for_deprecated_horizontal_use("cumsum") - return F.cumsum_horizontal(exprs, *more_exprs) - - -def _warn_for_deprecated_horizontal_use(name: str) -> None: - issue_deprecation_warning( - f"using `{name}` for horizontal computation is deprecated. Use `{name}_horizontal` instead.", - version="0.18.7", - ) + return F.col(*names).cumsum() diff --git a/py-polars/tests/unit/functions/aggregation/test_vertical.py b/py-polars/tests/unit/functions/aggregation/test_vertical.py index 57a45fd14a96..1651e3375e72 100644 --- a/py-polars/tests/unit/functions/aggregation/test_vertical.py +++ b/py-polars/tests/unit/functions/aggregation/test_vertical.py @@ -3,7 +3,7 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal, assert_series_equal +from polars.testing import assert_frame_equal def assert_expr_equal( @@ -55,35 +55,3 @@ def test_alias_for_col_agg(function: str, input: str) -> None: expected = getattr(pl.col(input), function)() # e.g. pl.col(input).min() context = pl.DataFrame({"a": [1, 4], "b": [3, 2]}) assert_expr_equal(result, expected, context) - - -@pytest.mark.parametrize("function", ["all", "any"]) -def test_deprecated_alias_for_series_agg_bool(function: str) -> None: - s = pl.Series([True, True, False]) - with pytest.deprecated_call(): - result = getattr(pl, function)(s) # e.g. pl.all(s) - expected = getattr(s, function)() # e.g. s.all() - assert result == expected - - -@pytest.mark.parametrize("function", ["min", "max", "sum"]) -def test_deprecated_alias_for_series_agg_numeric(function: str) -> None: - s = pl.Series([1, 2, 3]) - with pytest.deprecated_call(): - result = getattr(pl, function)(s) # e.g. pl.max(s) - expected = getattr(s, function)() # e.g. 
s.max() - assert result == expected - - -def test_deprecated_alias_for_series_agg_cumsum() -> None: - s = pl.Series([1, 2, 3]) - with pytest.deprecated_call(): - result = pl.cumsum(s) - expected = s.cumsum() - assert_series_equal(result, expected) - - -@pytest.mark.parametrize("function", ["all", "any", "min", "max", "sum", "cumsum"]) -def test_deprecated_horizontal(function: str) -> None: - with pytest.deprecated_call(): - getattr(pl, function)(pl.col("a")) # e.g. pl.all(pl.col("a")) diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py index 01ad444049a1..af79b585a0d4 100644 --- a/py-polars/tests/unit/functions/test_functions.py +++ b/py-polars/tests/unit/functions/test_functions.py @@ -373,44 +373,43 @@ def test_lazy_functions() -> None: ) expected = 1.0 assert np.isclose(out.to_series(0), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.var(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].var(), expected) # type: ignore[arg-type] + expected = 1.0 assert np.isclose(out.to_series(1), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.std(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].std(), expected) # type: ignore[arg-type] + expected = 3 assert np.isclose(out.to_series(2), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.max(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].max(), expected) # type: ignore[arg-type] + expected = 1 assert np.isclose(out.to_series(3), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.min(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].min(), expected) # type: ignore[arg-type] + expected = 6 assert np.isclose(out.to_series(4), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.sum(df["b"]), expected) + assert np.isclose(df["b"].sum(), expected) + expected = 2 assert np.isclose(out.to_series(5), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.mean(df["b"]), expected) + assert np.isclose(df["b"].mean(), expected) # type: ignore[arg-type] + expected = 2 assert np.isclose(out.to_series(6), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.median(df["b"]), expected) + assert np.isclose(df["b"].median(), expected) # type: ignore[arg-type] + expected = 3 assert np.isclose(out.to_series(7), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.n_unique(df["b"]), expected) + assert np.isclose(df["b"].n_unique(), expected) + expected = 1 assert np.isclose(out.to_series(8), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.first(df["b"]), expected) + assert np.isclose(df["b"][0], expected) + expected = 3 assert np.isclose(out.to_series(9), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.last(df["b"]), expected) + assert np.isclose(df["b"][-1], expected) # regex selection out = df.select( From cdf83247ef279ed90b270cb8903e07198646c452 Mon Sep 17 00:00:00 2001 From: Julian Date: Mon, 21 Aug 2023 11:39:20 +0200 Subject: [PATCH 25/55] feat(python, rust): preserve whitespace in notebook output (#10644) --- py-polars/polars/dataframe/_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py-polars/polars/dataframe/_html.py b/py-polars/polars/dataframe/_html.py index f144f9fa2c30..1d432b8b161a 100644 --- a/py-polars/polars/dataframe/_html.py +++ b/py-polars/polars/dataframe/_html.py @@ -161,6 +161,7 @@ def write_style(self) -> None: .dataframe > thead > tr > 
th, .dataframe > tbody > tr > td { text-align: right; + white-space: pre; } """ From 6f50321b68820238dd1e7c2384048f6519c3e0ad Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 11:59:53 +0200 Subject: [PATCH 26/55] feat(python)!: Update a lot of error types (#10637) --- py-polars/polars/convert.py | 4 +- py-polars/polars/dataframe/frame.py | 94 ++++++++++--------- py-polars/polars/datatypes/constructor.py | 2 +- py-polars/polars/datatypes/convert.py | 4 +- py-polars/polars/dependencies.py | 2 +- py-polars/polars/expr/datetime.py | 2 +- py-polars/polars/expr/expr.py | 34 +++---- py-polars/polars/expr/string.py | 2 +- py-polars/polars/expr/struct.py | 2 +- py-polars/polars/functions/eager.py | 8 +- py-polars/polars/functions/lazy.py | 9 +- py-polars/polars/io/_utils.py | 4 +- py-polars/polars/io/csv/batched_reader.py | 2 +- py-polars/polars/io/csv/functions.py | 12 +-- py-polars/polars/io/database.py | 20 ++-- py-polars/polars/io/delta.py | 5 +- py-polars/polars/io/excel/_write_utils.py | 2 +- py-polars/polars/io/excel/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 4 +- py-polars/polars/io/parquet/functions.py | 5 +- py-polars/polars/lazyframe/frame.py | 16 ++-- py-polars/polars/lazyframe/groupby.py | 2 +- py-polars/polars/series/series.py | 40 ++++---- py-polars/polars/series/struct.py | 2 +- .../polars/testing/parametric/strategies.py | 2 +- py-polars/polars/utils/various.py | 5 +- py-polars/tests/unit/dataframe/test_df.py | 18 ++-- py-polars/tests/unit/io/test_database.py | 6 +- py-polars/tests/unit/io/test_excel.py | 2 +- py-polars/tests/unit/operations/test_join.py | 8 +- py-polars/tests/unit/series/test_series.py | 58 +++++++----- py-polars/tests/unit/test_errors.py | 10 +- py-polars/tests/unit/test_exprs.py | 12 +-- py-polars/tests/unit/test_interop.py | 2 +- py-polars/tests/unit/test_lazy.py | 4 +- 35 files changed, 213 insertions(+), 197 deletions(-) diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index a6fd5f8401c1..16ca69601012 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -291,7 +291,7 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame: for dtype in set(schema.values()): if dtype in (List, Struct, Object): raise NotImplementedError( - f"'from_repr' does not support {dtype.base_type()} dtype" + f"`from_repr` does not support data type {dtype.base_type().__name__!r}" ) # construct DataFrame from string series and cast from repr to native dtype @@ -720,6 +720,6 @@ def from_pandas( include_index=include_index, ) else: - raise ValueError( + raise TypeError( f"expected pandas DataFrame or Series, got {type(data).__name__!r}" ) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 817c49079313..180dfc4daebd 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -403,7 +403,7 @@ def __init__( ) else: raise TypeError( - f"DataFrame constructor received unsupported type {type(data).__name__!r}" + f"DataFrame constructor called with unsupported type {type(data).__name__!r}" " for the `data` parameter" ) @@ -711,7 +711,9 @@ def _read_csv( elif isinstance(dtypes, Sequence): dtype_slice = dtypes else: - raise ValueError("dtype arg should be list or dict") + raise TypeError( + f"`dtypes` should be of type list or dict, got {type(dtypes).__name__!r}" + ) processed_null_values = _process_null_values(null_values) @@ -723,8 +725,8 @@ def _read_csv( dtypes_dict = dict(dtype_list) if dtype_slice is not None: raise ValueError( - 
"cannot use glob patterns and unnamed dtypes as `dtypes` argument;" - " Use dtypes: Mapping[str, Type[DataType]" + "cannot use glob patterns and unnamed dtypes as `dtypes` argument" + "\n\nUse `dtypes`: Mapping[str, Type[DataType]" ) from polars import scan_csv @@ -755,8 +757,8 @@ def _read_csv( return scan.select(columns).collect() else: raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) projection, columns = handle_projection_columns(columns) @@ -843,9 +845,9 @@ def _read_parquet( elif is_str_sequence(columns, allow_str=False): return scan.select(columns).collect() else: - raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + raise TypeError( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) projection, columns = handle_projection_columns(columns) @@ -957,9 +959,9 @@ def _read_ipc( elif is_str_sequence(columns, allow_str=False): df = scan.select(columns).collect() else: - raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + raise TypeError( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) return cls._from_pydf(df._df) @@ -1429,7 +1431,7 @@ def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: return self._div(other, floordiv=False) def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a DataFrame is ambiguous" "\n\nHint: to check if a DataFrame contains any values, use `is_empty()`." ) @@ -1724,10 +1726,10 @@ def __setitem__( raise ValueError("can only set multiple columns with 2D matrix") if value.shape[1] != len(key): raise ValueError( - "matrix columns should be equal to list use to determine column names" + "matrix columns should be equal to list used to determine column names" ) - # todo! we can parallelize this by calling from_numpy + # TODO: we can parallelize this by calling from_numpy columns = [] for i, name in enumerate(key): columns.append(pl.Series(name, value[:, i])) @@ -1740,8 +1742,8 @@ def __setitem__( if ( isinstance(row_selection, pl.Series) and row_selection.dtype == Boolean ) or is_bool_sequence(row_selection): - raise ValueError( - "not allowed to set 'DataFrame' by boolean mask in the row position." + raise TypeError( + "not allowed to set DataFrame by boolean mask in the row position" "\n\nConsider using `DataFrame.with_columns`." 
) @@ -1751,7 +1753,7 @@ def __setitem__( elif isinstance(col_selection, int): s = self[:, col_selection] else: - raise ValueError(f"unexpected column selection {col_selection!r}") + raise TypeError(f"unexpected column selection {col_selection!r}") # dispatch to __setitem__ of Series to do modification s[row_selection] = value @@ -1858,7 +1860,7 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: else self._df.column(column) ) if s is None: - raise ValueError(f"column index {column!r} is out of bounds") + raise IndexError(f"column index {column!r} is out of bounds") return s.get_idx(row) def to_arrow(self) -> pa.Table: @@ -2238,8 +2240,8 @@ def to_series(self, index: int = 0) -> Series: """ if not isinstance(index, int): - raise ValueError( - f'Index value "{index}" should be be an int, but is {type(index)}.' + raise TypeError( + f"index value {index!r} should be an int, but is {type(index).__name__!r}" ) if index < 0: @@ -2984,7 +2986,8 @@ def write_excel( from xlsxwriter.utility import xl_cell_to_rowcol except ImportError: raise ImportError( - "Excel export requires xlsxwriter; please run `pip install XlsxWriter`" + "Excel export requires xlsxwriter" + "\n\nPlease run `pip install XlsxWriter`" ) from None # setup workbook/worksheet @@ -3402,15 +3405,17 @@ def write_database( if engine == "adbc": if if_exists == "fail": - raise ValueError("'if_exists' not yet supported with engine ADBC") + raise NotImplementedError( + "`if_exists` not yet supported with engine ADBC" + ) elif if_exists == "replace": mode = "create" elif if_exists == "append": mode = "append" else: raise ValueError( - f"value for 'if_exists'={if_exists} was unexpected." - f" Choose one of: {'fail', 'replace', 'append'}" + f"unexpected value for `if_exists`: {if_exists!r}" + f"\n\nChoose one of: {'fail', 'replace', 'append'}" ) with _open_adbc_connection(connection) as conn, conn.cursor() as cursor: cursor.adbc_ingest(table_name, self.to_arrow(), mode) @@ -3419,13 +3424,13 @@ def write_database( elif engine == "sqlalchemy": if parse_version(pd.__version__) < parse_version("1.5"): raise ModuleNotFoundError( - f"writing with engine 'sqlalchemy' requires Pandas 1.5.x or higher, found Pandas {pd.__version__!r}" + f"writing with engine 'sqlalchemy' requires pandas 1.5.x or higher, found pandas {pd.__version__!r}" ) try: from sqlalchemy import create_engine - except ImportError as exc: - raise ImportError( + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( "'sqlalchemy' not found. 
Install polars with 'pip install polars[sqlalchemy]'" ) from exc from csv import reader as delimited_read @@ -3434,7 +3439,7 @@ def write_database( # both components and pass them through unquoted (sqlalachemy will quote) table_ident = next(delimited_read([table_name], delimiter=".")) if len(table_ident) > 2: - raise ValueError(f"table_name appears to be invalid: {table_name!r}") + raise ValueError(f"`table_name` appears to be invalid: {table_name!r}") elif len(table_ident) > 1: db_schema = table_ident[0] table_name = table_ident[1] @@ -5781,7 +5786,7 @@ def join_asof( """ if not isinstance(other, DataFrame): raise TypeError( - f"expected 'other' join table to be a DataFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) return ( @@ -5935,7 +5940,7 @@ def join( """ if not isinstance(other, DataFrame): raise TypeError( - f"expected 'other' join table to be a DataFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) return ( @@ -6584,8 +6589,8 @@ def get_column(self, name: str) -> Series: """ if not isinstance(name, str): - raise ValueError( - f'column name "{name!r}" should be be a string, but is {type(name).__name__!r}' + raise TypeError( + f"column name {name!r} should be be a string, but is {type(name).__name__!r}" ) return self[name] @@ -7900,8 +7905,8 @@ def n_chunks(self, strategy: str = "first") -> int | list[int]: return [s.n_chunks() for s in self.__iter__()] else: raise ValueError( - f"strategy: '{strategy}' not understood." - f" Choose one of {{'first', 'all'}}" + f"unexpected input for `strategy`: {strategy!r}" + f"\n\nChoose one of {{'first', 'all'}}" ) @overload @@ -7944,7 +7949,7 @@ def max(self, axis: int = 0) -> Self | Series: return self._from_pydf(self._df.max()) if axis == 1: return wrap_s(self._df.hmax()) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def min(self, axis: Literal[0] = ...) 
-> Self: @@ -7986,7 +7991,7 @@ def min(self, axis: int = 0) -> Self | Series: return self._from_pydf(self._df.min()) if axis == 1: return wrap_s(self._df.hmin()) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def sum( @@ -8063,7 +8068,7 @@ def sum( return self._from_pydf(self._df.sum()) if axis == 1: return wrap_s(self._df.hsum(null_strategy)) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def mean( @@ -8141,7 +8146,7 @@ def mean( return self._from_pydf(self._df.mean()) if axis == 1: return wrap_s(self._df.hmean(null_strategy)) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") def std(self, ddof: int = 1) -> Self: """ @@ -8840,7 +8845,9 @@ def row( "cannot set both 'index' and 'by_predicate'; mutually exclusive" ) elif isinstance(index, pl.Expr): - raise TypeError("expressions should be passed to the 'by_predicate' param") + raise TypeError( + "expressions should be passed to the `by_predicate` parameter" + ) if index is not None: row = self._df.row_tuple(index) @@ -8852,8 +8859,7 @@ def row( elif by_predicate is not None: if not isinstance(by_predicate, pl.Expr): raise TypeError( - f"expected 'by_predicate to be an expression;" - f" found {type(by_predicate).__name__!r}" + f"expected `by_predicate` to be an expression, got {type(by_predicate).__name__!r}" ) rows = self.filter(by_predicate).rows() n_rows = len(rows) @@ -8872,7 +8878,7 @@ def row( else: return row else: - raise ValueError("one of 'index' or 'by_predicate' must be set") + raise ValueError("one of `index` or `by_predicate` must be set") @overload def rows(self, *, named: Literal[False] = ...) 
-> list[tuple[Any, ...]]: @@ -9679,7 +9685,7 @@ def _prepare_other_arg(other: Any, length: int | None = None) -> Series: if isinstance(other, str): pass elif isinstance(other, Sequence): - raise ValueError("operation not supported") + raise TypeError("operation not supported") other = pl.Series("", [other]) if length and length > 1: diff --git a/py-polars/polars/datatypes/constructor.py b/py-polars/polars/datatypes/constructor.py index 7066f3cdcc20..103e8a0bcdee 100644 --- a/py-polars/polars/datatypes/constructor.py +++ b/py-polars/polars/datatypes/constructor.py @@ -122,7 +122,7 @@ def numpy_type_to_constructor(dtype: type[np.dtype[Any]]) -> Callable[..., PySer except KeyError: return PySeries.new_object except NameError: # pragma: no cover - raise ImportError( + raise ModuleNotFoundError( f"'numpy' is required to convert numpy dtype {dtype!r}" ) from None diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index e457e543780d..9d6e096f8a67 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -158,7 +158,7 @@ def is_polars_dtype(dtype: Any, include_unknown: bool = False) -> bool: return include_unknown else: return isinstance(dtype, (DataType, DataTypeClass)) - except ValueError: + except TypeError: return False @@ -517,7 +517,7 @@ def maybe_cast(el: Any, dtype: PolarsDataType) -> Any: try: el = py_type(el) # type: ignore[call-arg, misc] except Exception: - raise ValueError( + raise TypeError( f"cannot convert Python type {type(el).__name__!r} to {dtype!r}" ) from None return el diff --git a/py-polars/polars/dependencies.py b/py-polars/polars/dependencies.py index fac71538f887..b042e0c91382 100644 --- a/py-polars/polars/dependencies.py +++ b/py-polars/polars/dependencies.py @@ -94,7 +94,7 @@ def __getattr__(self, attr: Any) -> Any: # all other attribute access raises a helpful exception pfx = self._mod_pfx.get(self._module_name, "") raise ModuleNotFoundError( - f"{pfx}{attr} requires '{self._module_name}' module to be installed" + f"{pfx}{attr} requires {self._module_name!r} module to be installed" ) from None diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 91caaa3ee422..31fc8f90f118 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -395,7 +395,7 @@ def combine(self, time: dt.time | Expr, time_unit: TimeUnit = "us") -> Expr: """ if not isinstance(time, (dt.time, pl.Expr)): raise TypeError( - f"expected 'time' to be a python time or polars expression, found {time!r}" + f"expected 'time' to be a Python time or Polars expression, found {type(time).__name__!r}" ) time = parse_as_expression(time) return wrap_expr(self._pyexpr.dt_combine(time, time_unit)) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index d7bf90aeeb81..68c013feaca8 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -132,8 +132,8 @@ def __str__(self) -> str: return self._pyexpr.to_str() def __bool__(self) -> NoReturn: - raise ValueError( - "since Expr are lazy, the truthiness of an Expr is ambiguous." 
+ raise TypeError( + "the truth value of an Expr is ambiguous" "\n\nHint: use '&' or '|' to logically combine Expr, not 'and'/'or', and" " use 'x.is_in([y,z])' instead of 'x in [y,z]' to check membership" ) @@ -246,7 +246,8 @@ def __array_ufunc__( if num_expr > 1: if num_expr < len(inputs): raise ValueError( - "Numpy ufunc with more than one expression can only be used if all non-expression inputs are provided as keyword arguments only" + "NumPy ufunc with more than one expression can only be used" + " if all non-expression inputs are provided as keyword arguments only" ) exprs = parse_as_list_of_expressions(inputs) @@ -893,8 +894,8 @@ def exclude( exclude_dtypes.append(item) else: raise TypeError( - "invalid input for `exclude`. Expected one or more `str`," - f"`DataType`, or selector; found {type(item).__name__!r} instead" + "invalid input for `exclude`" + f"\n\nExpected one or more `str`, `DataType`, or selector; found {type(item).__name__!r} instead." ) if exclude_cols and exclude_dtypes: @@ -2483,13 +2484,12 @@ def fill_null( """ if value is not None and strategy is not None: - raise ValueError("cannot specify both 'value' and 'strategy'") + raise ValueError("cannot specify both `value` and `strategy`") elif value is None and strategy is None: - raise ValueError("must specify either a fill 'value' or 'strategy'") + raise ValueError("must specify either a fill `value` or `strategy`") elif strategy not in ("forward", "backward") and limit is not None: raise ValueError( - "can only specify 'limit' when strategy is set to " - "'backward' or 'forward'" + "can only specify `limit` when strategy is set to 'backward' or 'forward'" ) if value is not None: @@ -4944,7 +4944,7 @@ def is_between( return (self >= lower_bound) & (self < upper_bound) else: raise ValueError( - "closed must be one of {'left', 'right', 'both', 'none'}," + "`closed` must be one of {'left', 'right', 'both', 'none'}," f" got {closed!r}" ) @@ -8260,7 +8260,7 @@ def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: """ if isinstance(value, Expr): - raise TypeError(f"'value' must be a supported literal; found {value!r}") + raise TypeError(f"`value` must be a supported literal; found {value!r}") return self._from_pyexpr(self._pyexpr.extend_constant(value, n)) @@ -8805,7 +8805,7 @@ def _remap_key_or_value_series( ) if dtype != s.dtype: raise ValueError( - f"remapping values for map_dict could not be converted to {dtype!r}: found {s.dtype!r}" + f"remapping values for `map_dict` could not be converted to {dtype!r}: found {s.dtype!r}" ) else: # dtype was set, which should always be the case when: @@ -8821,13 +8821,13 @@ def _remap_key_or_value_series( ) if dtype != s.dtype: raise ValueError( - f"remapping {'keys' if is_keys else 'values'} for map_dict could not be converted to {dtype!r}: found {s.dtype!r}" + f"remapping {'keys' if is_keys else 'values'} for `map_dict` could not be converted to {dtype!r}: found {s.dtype!r}" ) except OverflowError as exc: if is_keys: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r}: {exc!s}" + f"remapping keys for `map_dict` could not be converted to {dtype!r}: {exc!s}" ) from exc else: raise ValueError( @@ -8842,7 +8842,7 @@ def _remap_key_or_value_series( pass else: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r} without losing values in the conversion" + f"remapping keys for `map_dict` could not be converted to {dtype!r} without losing values in the conversion" ) else: # values = remapping.values() @@ 
-8852,7 +8852,7 @@ def _remap_key_or_value_series( pass else: raise ValueError( - f"remapping values for map_dict could not be converted to {dtype!r} without losing values in the conversion" + f"remapping values for `map_dict` could not be converted to {dtype!r} without losing values in the conversion" ) return s @@ -9178,7 +9178,7 @@ def _prepare_rolling_window_args( ) -> tuple[str, int]: if isinstance(window_size, int): if window_size < 1: - raise ValueError("'window_size' should be positive") + raise ValueError("`window_size` must be positive") if min_periods is None: min_periods = window_size diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 0efcbcab56a1..11338913e987 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -285,7 +285,7 @@ def strptime( elif dtype == Time: return self.to_time(format, strict=strict, cache=cache) else: - raise ValueError("dtype should be of type {Date, Datetime, Time}") + raise ValueError("`dtype` must be of type {Date, Datetime, Time}") def to_decimal( self, diff --git a/py-polars/polars/expr/struct.py b/py-polars/polars/expr/struct.py index 84440293983c..db9382405921 100644 --- a/py-polars/polars/expr/struct.py +++ b/py-polars/polars/expr/struct.py @@ -22,7 +22,7 @@ def __getitem__(self, item: str | int) -> Expr: elif isinstance(item, int): return wrap_expr(self._pyexpr.struct_field_by_index(item)) else: - raise ValueError( + raise TypeError( f"expected type 'int | str', got {type(item).__name__!r} ({item!r})" ) diff --git a/py-polars/polars/functions/eager.py b/py-polars/polars/functions/eager.py index 59f18d0a1711..820c1e67abec 100644 --- a/py-polars/polars/functions/eager.py +++ b/py-polars/polars/functions/eager.py @@ -136,7 +136,7 @@ def concat( if how == "align": if not isinstance(elems[0], (pl.DataFrame, pl.LazyFrame)): - raise RuntimeError( + raise TypeError( f"'align' strategy is not supported for {type(elems[0]).__name__!r}" ) @@ -194,14 +194,12 @@ def concat( if how == "vertical": out = wrap_s(plr.concat_series(elems)) else: - raise ValueError("'Series' only allows {'vertical'} concat strategy") + raise ValueError("Series only allows {'vertical'} concat strategy") elif isinstance(first, pl.Expr): return wrap_expr(plr.concat_expr([e._pyexpr for e in elems], rechunk)) else: - raise ValueError( - f"did not expect type: {type(first).__name__!r} in 'pl.concat'" - ) + raise TypeError(f"did not expect type: {type(first).__name__!r} in `concat`") if rechunk: return out.rechunk() diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 427f368d6e2d..feb994947557 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -185,7 +185,8 @@ def col( return wrap_expr(plr.dtype_cols(dtypes)) else: raise TypeError( - f"invalid input for `col`. Expected `str` or `DataType`, got {type(name).__name__!r}" + "invalid input for `col`" + f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}." ) if isinstance(name, str): @@ -204,12 +205,14 @@ def col( return wrap_expr(plr.dtype_cols(names)) else: raise TypeError( - "invalid input for `col`. Expected iterable of type `str` or `DataType`," + "invalid input for `col`" + "\n\nExpected iterable of type `str` or `DataType`," f" got iterable of type {type(item).__name__!r}" ) else: raise TypeError( - f"invalid input for `col`. 
Expected `str` or `DataType`, got {type(name).__name__!r}" + "invalid input for `col`" + f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}" ) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 69c21748072f..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -168,7 +168,9 @@ def managed_file(file: Any) -> Iterator[Any]: # todo! add azure/ gcp/ ? if file.startswith("s3://"): - raise ImportError("fsspec needs to be installed to read files from s3") + raise ModuleNotFoundError( + "fsspec needs to be installed to read files from S3" + ) if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file): if _FSSPEC_AVAILABLE: diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 87b58c055be2..27d55afb55e4 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -68,7 +68,7 @@ def __init__( elif isinstance(dtypes, Sequence): dtype_slice = dtypes else: - raise ValueError("dtype arg should be list or dict") + raise TypeError("`dtypes` arg should be list or dict") processed_null_values = _process_null_values(null_values) projection, columns = handle_projection_columns(columns) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 45eac753c094..42039f416e8c 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -186,7 +186,7 @@ def read_csv( for column in columns: if not column.startswith("column_"): raise ValueError( - 'specified column names do not start with "column_",' + "specified column names do not start with 'column_'," " but autogenerated header names were requested" ) @@ -558,8 +558,8 @@ def read_csv_batched( for column in columns: if not column.startswith("column_"): raise ValueError( - 'specified column names do not start with "column_",' - " but autogenerated header names were requested." + "specified column names do not start with 'column_'," + " but autogenerated header names were requested" ) if projection and dtypes and isinstance(dtypes, list): @@ -598,8 +598,7 @@ def read_csv_batched( if columns: if len(columns) < len(new_columns): raise ValueError( - "more new column names are specified than there are selected" - " columns" + "more new column names are specified than there are selected columns" ) # Get column names of requested columns. @@ -610,8 +609,7 @@ def read_csv_batched( if projection: if columns and len(columns) < len(new_columns): raise ValueError( - "more new column names are specified than there are selected" - " columns" + "more new column names are specified than there are selected columns" ) # Convert column indices from projection to 'column_1', 'column_2', ... # column names. 
diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index 4c89c3b144d8..e6cd357c56f5 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -113,7 +113,7 @@ def read_database( """ # noqa: W505 if not isinstance(connection, str): raise TypeError( - f"expect connection to be a URI string; found {type(connection).__name__!r}" + f"expected connection to be a URI string; found {type(connection).__name__!r}" ) elif engine is None: engine = "connectorx" @@ -132,7 +132,9 @@ def read_database( raise ValueError("only a single SQL query string is accepted for adbc") return _read_sql_adbc(query, connection) else: - raise ValueError(f"engine {engine!r} not implemented; use connectorx or adbc") + raise ValueError( + f"engine must be one of {{'connectorx', 'adbc'}}, got {engine!r}" + ) def _read_sql_connectorx( @@ -145,9 +147,10 @@ def _read_sql_connectorx( ) -> DataFrame: try: import connectorx as cx - except ImportError: - raise ImportError( - "connectorx is not installed. Please run `pip install connectorx>=0.3.1`" + except ModuleNotFoundError: + raise ModuleNotFoundError( + "connectorx is not installed" + "\n\nPlease run `pip install connectorx>=0.3.1`." ) from None tbl = cx.read_sql( @@ -182,9 +185,10 @@ def _open_adbc_connection(connection_uri: str) -> Any: import_module(module_name) adbc_driver = sys.modules[module_name] except ImportError: - raise ImportError( - f"ADBC {driver_name} driver not detected; if ADBC supports this database," - f" please run `pip install adbc-driver-{driver_name} pyarrow`" + raise ModuleNotFoundError( + f"ADBC {driver_name} driver not detected" + "\n\nIf ADBC supports this database, please run:" + " `pip install adbc-driver-{driver_name} pyarrow`" ) from None # some backends require the driver name to be stripped from the URI diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 211dd6294411..e04d1a01037f 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -315,8 +315,9 @@ def _get_delta_lake_table( def _check_if_delta_available() -> None: if not _DELTALAKE_AVAILABLE: - raise ImportError( - "deltalake is not installed. Please run `pip install deltalake>=0.9.0`" + raise ModuleNotFoundError( + "deltalake is not installed" + "\n\nPlease run: `pip install deltalake>=0.9.0`" ) diff --git a/py-polars/polars/io/excel/_write_utils.py b/py-polars/polars/io/excel/_write_utils.py index 89206deecb78..1903c61afb5d 100644 --- a/py-polars/polars/io/excel/_write_utils.py +++ b/py-polars/polars/io/excel/_write_utils.py @@ -493,7 +493,7 @@ def _xl_setup_table_options( ) for key in table_style: if key not in valid_options: - raise ValueError(f"invalid table style key:{key!r}") + raise ValueError(f"invalid table style key: {key!r}") table_options = table_style.copy() table_style = table_options.pop("style", None) diff --git a/py-polars/polars/io/excel/functions.py b/py-polars/polars/io/excel/functions.py index 7ec1aceec4e8..f46ffc231111 100644 --- a/py-polars/polars/io/excel/functions.py +++ b/py-polars/polars/io/excel/functions.py @@ -165,13 +165,13 @@ def read_excel( try: import xlsx2csv except ImportError: - raise ImportError( - "xlsx2csv is not installed. 
Please run `pip install xlsx2csv`" + raise ModuleNotFoundError( + "xlsx2csv is not installed\n\nPlease run: `pip install xlsx2csv`" ) from None if sheet_id is not None and sheet_name is not None: raise ValueError( - f"Cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" + f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" ) if isinstance(source, (str, Path)): diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 7f661cc2d33f..1decddc41c12 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -84,7 +84,7 @@ def read_ipc( with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( + raise ModuleNotFoundError( "'pyarrow' is required when using" " 'read_ipc(..., use_pyarrow=True)'" ) @@ -160,7 +160,7 @@ def read_ipc_stream( with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( + raise ModuleNotFoundError( "'pyarrow' is required when using" " 'read_ipc_stream(..., use_pyarrow=True)'" ) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 23c700271d84..26d660c42fe7 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -112,9 +112,8 @@ def read_parquet( ) as source_prep: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( - "'pyarrow' is required when using" - " 'read_parquet(..., use_pyarrow=True)'" + raise ModuleNotFoundError( + "'pyarrow' is required when using `read_parquet(..., use_pyarrow=True)`" ) import pyarrow as pa diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 36a54c86f713..90f9fc452938 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -720,9 +720,9 @@ def width(self) -> int: return self._ldf.width() def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a LazyFrame is ambiguous" - "\n\nLazyFrames cannot be used in boolean context with and/or/not operators" + "\n\nLazyFrames cannot be used in boolean context with and/or/not operators." ) def _comparison_error(self, operator: str) -> NoReturn: @@ -760,8 +760,8 @@ def __deepcopy__(self, memo: None = None) -> Self: def __getitem__(self, item: int | range | slice) -> LazyFrame: if not isinstance(item, slice): raise TypeError( - "'LazyFrame' object is not subscriptable (aside from slicing). Use" - " 'select()' or 'filter()' instead" + "'LazyFrame' object is not subscriptable (aside from slicing)" + "\n\nUse `select()` or `filter()` instead." 
) return LazyPolarsSlice(self).apply(item) @@ -1117,7 +1117,7 @@ def show_graph( import matplotlib.image as mpimg import matplotlib.pyplot as plt except ImportError: - raise ImportError( + raise ModuleNotFoundError( "matplotlib should be installed to show graph" ) from None plt.figure(figsize=figsize) @@ -1577,7 +1577,7 @@ def profile( plt.show() except ImportError: - raise ImportError( + raise ModuleNotFoundError( "matplotlib should be installed to show profiling plot" ) from None @@ -3178,7 +3178,7 @@ def join_asof( """ if not isinstance(other, LazyFrame): raise TypeError( - f"expected 'other' join table to be a LazyFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" ) if isinstance(on, (str, pl.Expr)): @@ -3358,7 +3358,7 @@ def join( """ if not isinstance(other, LazyFrame): raise TypeError( - f"expected 'other' join table to be a LazyFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" ) if how == "cross": diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/groupby.py index cb7ba5d15bd2..85eb9e10eb7a 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/groupby.py @@ -133,7 +133,7 @@ def agg( """ if aggs and isinstance(aggs[0], dict): - raise ValueError( + raise TypeError( "specifying aggregations as a dictionary is not supported" "\n\nTry unpacking the dictionary to take advantage of the keyword syntax" " of the `agg` method." diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index fcd637ca0f35..9c7d9454704e 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -320,8 +320,9 @@ def __init__( dtype_if_empty=dtype_if_empty, ) else: - raise ValueError( - f"Series constructor called with unsupported type; got {type(values).__name__!r}" + raise TypeError( + f"Series constructor called with unsupported type {type(values).__name__!r}" + " for the `values` parameter" ) @classmethod @@ -352,8 +353,7 @@ def _get_ptr(self) -> tuple[int, int, int]: """ Get a pointer to the start of the values buffer of a numeric Series. - This will raise an error if the - ``Series`` contains multiple chunks + This will raise an error if the ``Series`` contains multiple chunks. This will return the offset, length and the pointer itself. @@ -416,7 +416,7 @@ def shape(self) -> tuple[int]: return (self._s.len(),) def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a Series is ambiguous" "\n\nHint: use '&' or '|' to chain Series boolean results together, not and/or." " To check if a Series contains any values, use `is_empty()`." 
@@ -673,7 +673,7 @@ def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: other = maybe_cast(other, self.dtype) f = get_ffi_func(op_ffi, self.dtype, self._s) if f is None: - raise ValueError( + raise TypeError( f"cannot do arithmetic with series of dtype: {self.dtype} and argument" f" of type: {type(other).__name__!r}" ) @@ -725,7 +725,7 @@ def __truediv__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self) / other if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") # this branch is exactly the floordiv function without rounding the floats if self.is_float() or self.dtype == Decimal: @@ -745,7 +745,7 @@ def __floordiv__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self) // other if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") if not isinstance(other, pl.Expr): other = F.lit(other) @@ -772,7 +772,7 @@ def __mul__(self, other: Any) -> Series | DataFrame | Expr: if isinstance(other, pl.Expr): return F.lit(self) * other if self.is_temporal(): - raise ValueError("first cast to integer before multiplying datelike dtypes") + raise TypeError("first cast to integer before multiplying datelike dtypes") elif isinstance(other, pl.DataFrame): return other * self else: @@ -790,14 +790,14 @@ def __mod__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self).__mod__(other) if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before applying modulo on datelike dtypes" ) return self._arithmetic(other, "rem", "rem_<>") def __rmod__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before applying modulo on datelike dtypes" ) return self._arithmetic(other, "rem", "rem_<>_rhs") @@ -812,7 +812,7 @@ def __rsub__(self, other: Any) -> Series: def __rtruediv__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") if self.is_float(): self.__rfloordiv__(other) @@ -822,12 +822,12 @@ def __rtruediv__(self, other: Any) -> Series: def __rfloordiv__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") return self._arithmetic(other, "div", "div_<>_rhs") def __rmul__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before multiplying datelike dtypes") + raise TypeError("first cast to integer before multiplying datelike dtypes") return self._arithmetic(other, "mul", "mul_<>") def __pow__(self, exponent: int | float | None | Series) -> Series: @@ -835,7 +835,7 @@ def __pow__(self, exponent: int | float | None | Series) -> Series: def __rpow__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before raising datelike dtypes to a power" ) return self.to_frame().select(other ** F.col(self.name)).to_series() @@ -1010,7 +1010,7 @@ def __setitem__( if self.is_numeric() or self.is_temporal(): self.set_at_idx(key, value) # type: ignore[arg-type] return None - raise ValueError( + raise 
TypeError( f"cannot set Series of dtype: {self.dtype!r} with list/tuple as value;" " use a scalar value" ) @@ -1036,7 +1036,7 @@ def __setitem__( s = self._from_pyseries(sequence_to_pyseries("", key, dtype=UInt32)) self.__setitem__(s, value) else: - raise ValueError(f'cannot use "{key!r}" for indexing') + raise TypeError(f'cannot use "{key!r}" for indexing') def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]: """ @@ -1075,7 +1075,7 @@ def __array_ufunc__( elif isinstance(arg, Series): args.append(arg.view(ignore_nulls=True)) else: - raise ValueError( + raise TypeError( f"unsupported type {type(arg).__name__!r} for {arg!r}" ) @@ -1551,7 +1551,7 @@ def pow(self, exponent: int | float | None | Series) -> Series: """ if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before raising datelike dtypes to a power" ) if _check_for_numpy(exponent) and isinstance(exponent, np.ndarray): @@ -3764,7 +3764,7 @@ def is_integer(self, signed: bool | None = None) -> bool: elif signed is False: return self.dtype in UNSIGNED_INTEGER_DTYPES - raise ValueError(f"'signed' must be None, True or False; given {signed!r}") + raise ValueError(f"`signed` must be None, True or False; got {signed!r}") def is_temporal(self, excluding: OneOrMoreDataTypes | None = None) -> bool: """ diff --git a/py-polars/polars/series/struct.py b/py-polars/polars/series/struct.py index 9c6d36177e03..529d0fde2d6a 100644 --- a/py-polars/polars/series/struct.py +++ b/py-polars/polars/series/struct.py @@ -30,7 +30,7 @@ def __getitem__(self, item: int | str) -> Series: elif isinstance(item, str): return self.field(item) else: - raise ValueError(f"expected type 'int | str', got {type(item).__name__!r}") + raise TypeError(f"expected type 'int | str', got {type(item).__name__!r}") def _ipython_key_completions_(self) -> list[str]: return self.fields diff --git a/py-polars/polars/testing/parametric/strategies.py b/py-polars/polars/testing/parametric/strategies.py index 71866ed5db70..0959e50d85aa 100644 --- a/py-polars/polars/testing/parametric/strategies.py +++ b/py-polars/polars/testing/parametric/strategies.py @@ -376,7 +376,7 @@ def create_list_strategy( """ if select_from and inner_dtype is None: - raise ValueError("if specifying 'select_from', must also specify 'inner_dtype'") + raise ValueError("if specifying `select_from`, must also specify `inner_dtype`") if inner_dtype is None: strats = list( diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 88f0b9c1d42f..49c854d18789 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -121,9 +121,8 @@ def handle_projection_columns( elif is_int_sequence(columns): projection = list(columns) elif not is_str_sequence(columns): - raise ValueError( - "'columns' arg should contain a list of all integers or all strings" - " values" + raise TypeError( + "'columns' arg should contain a list of all integers or all strings values" ) else: new_columns = columns diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 0258dd0fb1e3..f3e212444836 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -56,7 +56,7 @@ def test_init_empty() -> None: # note: cannot use df (empty or otherwise) in boolean context empty_df = pl.DataFrame() - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not empty_df @@ -2387,7 +2387,7 @@ def test_arithmetic() -> None: 
assert_frame_equal(out, expected) # cannot do arithmetic with a sequence - with pytest.raises(ValueError, match="operation not supported"): + with pytest.raises(TypeError, match="operation not supported"): _ = df + [1] # type: ignore[operator] @@ -3069,14 +3069,14 @@ def test_set() -> None: df["new"] = np.random.rand(10) with pytest.raises( - ValueError, - match=r"not allowed to set 'DataFrame' by boolean mask in the row position." + TypeError, + match=r"not allowed to set DataFrame by boolean mask in the row position" r"\n\nConsider using `DataFrame.with_columns`.", ): df[df["ham"] > 0.5, "ham"] = "a" with pytest.raises( - ValueError, - match=r"not allowed to set 'DataFrame' by boolean mask in the row position." + TypeError, + match=r"not allowed to set DataFrame by boolean mask in the row position" r"\n\nConsider using `DataFrame.with_columns`.", ): df[[True, False], "ham"] = "a" @@ -3101,9 +3101,9 @@ def test_set() -> None: assert df[0, "b"] == 2 # row and col selection have to be int or str - with pytest.raises(ValueError): + with pytest.raises(TypeError): df[:, [1]] = 1 # type: ignore[index] - with pytest.raises(ValueError): + with pytest.raises(TypeError): df[True, :] = 1 # type: ignore[index] # needs to be a 2 element tuple @@ -3315,7 +3315,7 @@ def test_item() -> None: df = pl.DataFrame({}) with pytest.raises(ValueError, match=r".* frame has shape \(0, 0\)"): df.item() - with pytest.raises(ValueError, match="column index 10 is out of bounds"): + with pytest.raises(IndexError, match="column index 10 is out of bounds"): df.item(0, 10) diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index d8efec1eb5ec..7e874032f2f5 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -118,7 +118,7 @@ def test_read_database( "SELECT * FROM test_data", "sqlite", ValueError, - "engine 'not_engine' not implemented; use connectorx or adbc", + "engine must be one of {'connectorx', 'adbc'}, got 'not_engine'", id="Not an available sql engine", ), pytest.param( @@ -142,7 +142,7 @@ def test_read_database( "SELECT * FROM test_data", sqlite3.connect(":memory:"), TypeError, - "expect connection to be a URI string", + "expected connection to be a URI string", id="Invalid connection URI", ), ], @@ -233,7 +233,7 @@ def test_write_database( {"table_name": "w.x.y.z"}, {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, ): - with pytest.raises(ValueError): + with pytest.raises((ValueError, NotImplementedError)): sample_df.write_database( connection=f"sqlite:///{test_db}", engine=engine, diff --git a/py-polars/tests/unit/io/test_excel.py b/py-polars/tests/unit/io/test_excel.py index 77a654610525..4442ba428d32 100644 --- a/py-polars/tests/unit/io/test_excel.py +++ b/py-polars/tests/unit/io/test_excel.py @@ -45,7 +45,7 @@ def test_read_excel_all_sheets(excel_file_path: Path) -> None: def test_read_excel_all_sheets_with_sheet_name(excel_file_path: Path) -> None: with pytest.raises( ValueError, - match=r"Cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)", + match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)", ): pl.read_excel(excel_file_path, sheet_id=1, sheet_name="Sheet1") diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 5b986f6e2221..a1ffc113d9d5 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -467,13 +467,13 @@ def test_join_frame_consistency() -> None: 
df = pl.DataFrame({"A": [1, 2, 3]}) ldf = pl.DataFrame({"A": [1, 2, 5]}).lazy() - with pytest.raises(TypeError, match="expected 'other'.* LazyFrame"): + with pytest.raises(TypeError, match="expected `other`.* LazyFrame"): _ = ldf.join(df, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* DataFrame"): + with pytest.raises(TypeError, match="expected `other`.* DataFrame"): _ = df.join(ldf, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* LazyFrame"): + with pytest.raises(TypeError, match="expected `other`.* LazyFrame"): _ = ldf.join_asof(df, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* DataFrame"): + with pytest.raises(TypeError, match="expected `other`.* DataFrame"): _ = df.join_asof(ldf, on="A") # type: ignore[arg-type] diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 18be8757b499..d8b6336ac9ed 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -138,14 +138,14 @@ def test_init_inputs(monkeypatch: Any) -> None: # Bad inputs with pytest.raises(TypeError): pl.Series([1, 2, 3], [1, 2, 3]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): pl.Series({"a": [1, 2, 3]}) with pytest.raises(OverflowError): pl.Series("bigint", [2**64]) # numpy not available monkeypatch.setattr(pl.series.series, "_check_for_numpy", lambda x: False) - with pytest.raises(ValueError): + with pytest.raises(TypeError): pl.DataFrame(np.array([1, 2, 3]), schema=["a"]) @@ -345,31 +345,37 @@ def test_arithmetic(s: pl.Series) -> None: assert ((1.0 + a) == [2, 3]).sum() == 2 assert ((1.0 % a) == [0, 1]).sum() == 2 + +def test_arithmetic_datetime() -> None: a = pl.Series("a", [datetime(2021, 1, 1)]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): a // 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a / 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a * 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a % 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a**2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 / a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 // a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 * a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 % a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2**a - with pytest.raises(ValueError): + + with pytest.raises(TypeError): +a + + +def test_arithmetic_string() -> None: a = pl.Series("a", [""]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): +a @@ -385,7 +391,7 @@ def test_power() -> None: assert_series_equal(b**b, pl.Series([None, 4.0], dtype=Float64)) assert_series_equal(a**b, pl.Series([None, 4.0], dtype=Float64)) assert_series_equal(a**None, pl.Series([None] * len(a), dtype=Float64)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): c**2 with pytest.raises(pl.ColumnNotFoundError): a ** "hi" # type: ignore[operator] @@ -393,7 +399,7 @@ def test_power() -> None: # rpow assert_series_equal(2.0**a, pl.Series("literal", [2.0, 4.0], dtype=Float64)) assert_series_equal(2**b, pl.Series("literal", [None, 4.0], dtype=Float64)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2**c with pytest.raises(pl.ColumnNotFoundError): "hi" ** a @@ -840,18 +846,18 @@ def 
test_set_value_as_list_fail() -> None: # for other types it is not allowed s = pl.Series("a", ["a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[[0, 1]] = ["d", "e"] s = pl.Series("a", [True, False, False]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[[0, 1]] = [True, False] @pytest.mark.parametrize("key", [True, False, 1.0]) def test_set_invalid_key(key: Any) -> None: s = pl.Series("a", [1, 2, 3]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[key] = 1 @@ -1145,7 +1151,7 @@ def test_empty() -> None: assert a.name == empty_a.name assert len(empty_a) == n - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not empty_a @@ -1470,10 +1476,10 @@ def test_bitwise() -> None: assert_series_equal(out["xor"], pl.Series("xor", [2, 6, 6])) # ensure mistaken use of logical 'and'/'or' raises an exception - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): a and b - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): a or b @@ -1621,17 +1627,17 @@ def test_comparisons_bool_series_to_int() -> None: r"cannot do arithmetic with series of dtype: Boolean" r" and argument of type: 'bool'" ) - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool - 1 - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool + 1 match = ( r"cannot do arithmetic with series of dtype: Boolean" r" and argument of type: 'bool'" ) - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool % 2 - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool * 1 from operator import ge, gt, le, lt diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index a697301353d4..fc6e7b2a96e5 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -117,13 +117,13 @@ def test_join_lazy_on_df() -> None: with pytest.raises( TypeError, - match="expected 'other' .* to be a LazyFrame.* not a 'DataFrame'", + match="expected `other` .* to be a LazyFrame.* not a 'DataFrame'", ): df_left.lazy().join(df_right, on="Id") # type: ignore[arg-type] with pytest.raises( TypeError, - match="expected 'other' .* to be a LazyFrame.* not a 'DataFrame'", + match="expected `other` .* to be a LazyFrame.* not a 'DataFrame'", ): df_left.lazy().join_asof(df_right, on="Id") # type: ignore[arg-type] @@ -298,7 +298,7 @@ def test_series_concat_err(how: ConcatMethod) -> None: s = pl.Series([1, 2, 3]) with pytest.raises( ValueError, - match="'Series' only allows {'vertical'} concat strategy", + match="Series only allows {'vertical'} concat strategy", ): pl.concat([s, s], how=how) @@ -591,7 +591,7 @@ def test_lit_agg_err() -> None: def test_window_size_validation() -> None: df = pl.DataFrame({"x": [1.0]}) - with pytest.raises(ValueError, match=r"'window_size' should be positive"): + with pytest.raises(ValueError, match=r"`window_size` must be positive"): df.with_columns(trailing_min=pl.col("x").rolling_min(window_size=-3)) @@ -605,7 +605,7 @@ def test_invalid_getitem_key_err() -> None: def test_invalid_groupby_arg() -> None: df = pl.DataFrame({"a": [1]}) with pytest.raises( - ValueError, match="specifying aggregations as a dictionary is not supported" + TypeError, match="specifying aggregations as a dictionary 
is not supported" ): df.groupby(1).agg({"a": "sum"}) diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index ed878a857966..5b2e878e82c0 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -425,18 +425,18 @@ def test_abs_expr() -> None: def test_logical_boolean() -> None: # note, cannot use expressions in logical # boolean context (eg: and/or/not operators) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): pl.col("colx") and pl.col("coly") - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): pl.col("colx") or pl.col("coly") df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]}) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): df.select([(pl.col("a") > pl.col("b")) and (pl.col("b") > pl.col("b"))]) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): df.select([(pl.col("a") > pl.col("b")) or (pl.col("b") > pl.col("b"))]) @@ -720,13 +720,13 @@ def test_map_dict() -> None: with pytest.raises( pl.ComputeError, - match="remapping keys for map_dict could not be converted to Utf8 without losing values in the conversion", + match="remapping keys for `map_dict` could not be converted to Utf8 without losing values in the conversion", ): df_int_as_str.with_columns(pl.col("int").map_dict(int_dict)) with pytest.raises( pl.ComputeError, - match="remapping keys for map_dict could not be converted to Utf8 without losing values in the conversion", + match="remapping keys for `map_dict` could not be converted to Utf8 without losing values in the conversion", ): df_int_as_str.with_columns(pl.col("int").map_dict(int_with_none_dict)) diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index 9e50a2d91057..c43fc195ff56 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -477,7 +477,7 @@ def test_from_pandas_dataframe() -> None: assert df.rows() == [(1, 2, 3), (4, 5, 6)] # if not a pandas dataframe, raise a ValueError - with pytest.raises(ValueError): + with pytest.raises(TypeError): _ = pl.from_pandas([1, 2]) # type: ignore[call-overload] diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index fe2262ba1f80..90151c9bebb2 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -63,7 +63,7 @@ def test_lazyframe_membership_operator() -> None: assert "phone" not in ldf # note: cannot use lazyframe in boolean context - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not ldf @@ -675,7 +675,7 @@ def test_fill_null() -> None: df.fill_null() with pytest.raises(ValueError, match="cannot specify both"): df.fill_null(value=3.0, strategy="max") - with pytest.raises(ValueError, match="can only specify 'limit'"): + with pytest.raises(ValueError, match="can only specify `limit`"): df.fill_null(strategy="max", limit=2) From 08154e545a3c22092131fac8af605468881d8788 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 18:01:02 +0800 Subject: [PATCH 27/55] fix(python): fix apply for empty series in threading mode (#10651) --- py-polars/polars/expr/expr.py | 13 ++++++----- py-polars/tests/unit/operations/test_apply.py | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git 
a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 68c013feaca8..258cbc802bf3 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3845,8 +3845,16 @@ def wrap_f(x: Series) -> Series: # pragma: no cover elif strategy == "threading": def wrap_threading(x: Series) -> Series: + def get_lazy_promise(df: DataFrame) -> LazyFrame: + return df.lazy().select( + F.col("x").map(wrap_f, agg_list=True, return_dtype=return_dtype) + ) + df = x.to_frame("x") + if x.len() == 0: + return get_lazy_promise(df).collect().to_series() + n_threads = threadpool_size() chunk_size = x.len() // n_threads remainder = x.len() % n_threads @@ -3858,11 +3866,6 @@ def wrap_threading(x: Series) -> Series: for i in range(n_threads) ] - def get_lazy_promise(df: DataFrame) -> LazyFrame: - return df.lazy().select( - F.col("x").map(wrap_f, agg_list=True, return_dtype=return_dtype) - ) - # create partitions with LazyFrames # these are promises on a computation partitions = [] diff --git a/py-polars/tests/unit/operations/test_apply.py b/py-polars/tests/unit/operations/test_apply.py index d7948408cd2f..af6cb5946633 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -391,3 +391,25 @@ def test_apply_dict_order_10128() -> None: def test_apply_10237() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.all().apply(lambda x: x > 50))["a"].to_list() == [False] * 3 + + +def test_apply_on_empty_col_10639() -> None: + df = pl.DataFrame({"A": [], "B": []}) + res = df.groupby("B").agg( + pl.col("A") + .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } + res = df.groupby("B").agg( + pl.col("A") + .apply(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } From bc166cef669f0e783acae4dd6f703e8605659251 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 18:01:34 +0800 Subject: [PATCH 28/55] fix(rust): List chunked builder should take care of series name (#10642) --- .../src/chunked_array/builder/list/boolean.rs | 23 ----------- .../src/chunked_array/builder/list/mod.rs | 4 +- .../src/chunked_array/builder/list/null.rs | 38 +++++++++++++++++++ crates/polars/tests/it/core/series.rs | 7 ++++ 4 files changed, 48 insertions(+), 24 deletions(-) create mode 100644 crates/polars-core/src/chunked_array/builder/list/null.rs diff --git a/crates/polars-core/src/chunked_array/builder/list/boolean.rs b/crates/polars-core/src/chunked_array/builder/list/boolean.rs index a6483dfcca31..4d7bc490cb3d 100644 --- a/crates/polars-core/src/chunked_array/builder/list/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/list/boolean.rs @@ -69,26 +69,3 @@ impl ListBuilderTrait for ListBooleanChunkedBuilder { self.fast_explode } } - -impl ListBuilderTrait for LargeListNullBuilder { - #[inline] - fn append_series(&mut self, _s: &Series) -> PolarsResult<()> { - self.push_null(); - Ok(()) - } - - #[inline] - fn append_null(&mut self) { - self.push_null() - } - - fn finish(&mut self) -> ListChunked { - unsafe { - ListChunked::from_chunks_and_dtype_unchecked( - "", - vec![self.as_box()], - DataType::List(Box::new(DataType::Null)), - ) - } - } -} diff --git a/crates/polars-core/src/chunked_array/builder/list/mod.rs b/crates/polars-core/src/chunked_array/builder/list/mod.rs index 4484a4da1cd3..e4938e9fca17 100644 --- 
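A quick sketch of the behaviour the "apply for empty series in threading mode" fix targets; it condenses the new test (`test_apply_on_empty_col_10639`) added in the hunk above, and the column names come from that test:

    import polars as pl

    df = pl.DataFrame({"A": [], "B": []})
    out = df.groupby("B").agg(
        pl.col("A")
        .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading")
        .alias("Foo")
    )
    # With an empty input column, the "threading" strategy now short-circuits
    # to the same lazy path instead of erroring on the empty partitioning.
    assert out.to_dict(False) == {"B": [], "Foo": []}
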
a/crates/polars-core/src/chunked_array/builder/list/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/list/mod.rs @@ -4,6 +4,7 @@ mod boolean; #[cfg(feature = "dtype-categorical")] mod categorical; mod dtypes; +mod null; mod primitive; pub use anonymous::*; @@ -12,6 +13,7 @@ pub use boolean::*; #[cfg(feature = "dtype-categorical")] use categorical::*; use dtypes::*; +use null::*; use polars_arrow::array::list::AnonymousBuilder; use polars_arrow::array::null::MutableNullArray; use polars_arrow::prelude::*; @@ -116,7 +118,7 @@ pub fn get_list_builder( list_capacity, Some(inner_type_logical.clone()), ))), - DataType::Null => Ok(Box::new(LargeListNullBuilder::with_capacity(list_capacity))), + DataType::Null => Ok(Box::new(ListNullChunkedBuilder::new(name, list_capacity))), DataType::List(_) => Ok(Box::new(AnonymousOwnedListBuilder::new( name, list_capacity, diff --git a/crates/polars-core/src/chunked_array/builder/list/null.rs b/crates/polars-core/src/chunked_array/builder/list/null.rs new file mode 100644 index 000000000000..70346ed32071 --- /dev/null +++ b/crates/polars-core/src/chunked_array/builder/list/null.rs @@ -0,0 +1,38 @@ +use super::*; + +pub struct ListNullChunkedBuilder { + builder: LargeListNullBuilder, + name: String, +} + +impl ListNullChunkedBuilder { + pub fn new(name: &str, capacity: usize) -> Self { + ListNullChunkedBuilder { + builder: LargeListNullBuilder::with_capacity(capacity), + name: name.into(), + } + } +} + +impl ListBuilderTrait for ListNullChunkedBuilder { + #[inline] + fn append_series(&mut self, _s: &Series) -> PolarsResult<()> { + self.builder.push_null(); + Ok(()) + } + + #[inline] + fn append_null(&mut self) { + self.builder.push_null(); + } + + fn finish(&mut self) -> ListChunked { + unsafe { + ListChunked::from_chunks_and_dtype_unchecked( + &self.name, + vec![self.builder.as_box()], + DataType::List(Box::new(DataType::Null)), + ) + } + } +} diff --git a/crates/polars/tests/it/core/series.rs b/crates/polars/tests/it/core/series.rs index 6e87defa58b6..42b533c78f1a 100644 --- a/crates/polars/tests/it/core/series.rs +++ b/crates/polars/tests/it/core/series.rs @@ -36,3 +36,10 @@ fn test_min_max_sorted_desc() { assert_eq!(a.max(), Some(4)); assert_eq!(a.min(), Some(1)); } + +#[test] +fn test_construct_list_of_null_series() { + let s = Series::new("a", [Series::new_null("a1", 1), Series::new_null("a1", 1)]); + assert_eq!(s.null_count(), s.len()); + assert_eq!(s.field().name(), "a"); +} From 60efadff8f32b6439e31d57ec2e8f7c0fadc89e6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 20:18:57 +0200 Subject: [PATCH 29/55] feat(rust)!: Rename `groupby` to `group_by` (#10654) --- crates/polars-algo/src/algo.rs | 4 +- crates/polars-arrow/src/array/mod.rs | 8 +- crates/polars-core/Cargo.toml | 6 +- .../logical/categorical/builder.rs | 2 +- .../logical/categorical/ops/unique.rs | 2 +- .../logical/categorical/stringcache.rs | 2 +- .../src/chunked_array/ops/take/traits.rs | 2 +- .../src/chunked_array/ops/unique/mod.rs | 6 +- .../src/doc/changelog/v0_10_0_11.rs | 2 +- crates/polars-core/src/doc/changelog/v0_4.rs | 2 +- crates/polars-core/src/doc/changelog/v0_7.rs | 2 +- .../polars-core/src/frame/asof_join/groups.rs | 2 +- .../aggregations/agg_list.rs | 0 .../aggregations/boolean.rs | 0 .../aggregations/dispatch.rs | 0 .../{groupby => group_by}/aggregations/mod.rs | 6 +- .../aggregations/utf8.rs | 0 .../src/frame/{groupby => group_by}/expr.rs | 0 .../frame/{groupby => group_by}/hashing.rs | 18 +-- .../{groupby => group_by}/into_groups.rs | 32 ++--- 
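For anyone migrating across this breaking rename, a minimal sketch of the eager API after the patch (column names are placeholders; it mirrors the doc examples and feature-flag changes shown further down in this diff):

    use polars_core::prelude::*;

    fn group_by_sum(df: &DataFrame) -> PolarsResult<DataFrame> {
        // Previously `df.groupby([...])?`; after this patch the method is
        // `group_by`, and `groupby_stable` likewise becomes `group_by_stable`.
        df.group_by(["column_name"])?
            .select(["agg_column_name"])
            .sum()
    }

    // Cargo features follow the same pattern:
    //   groupby_list    -> group_by_list
    //   dynamic_groupby -> dynamic_group_by
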
.../src/frame/{groupby => group_by}/mod.rs | 126 +++++++++--------- .../frame/{groupby => group_by}/perfect.rs | 0 .../src/frame/{groupby => group_by}/proxy.rs | 2 +- crates/polars-core/src/frame/hash_join/mod.rs | 2 +- .../src/frame/hash_join/multiple_keys.rs | 2 +- crates/polars-core/src/frame/mod.rs | 20 +-- .../polars-core/src/hashing/vector_hasher.rs | 4 +- crates/polars-core/src/prelude.rs | 4 +- .../src/series/implementations/array.rs | 2 +- .../src/series/implementations/binary.rs | 2 +- .../src/series/implementations/boolean.rs | 2 +- .../src/series/implementations/categorical.rs | 2 +- .../src/series/implementations/dates_time.rs | 2 +- .../src/series/implementations/datetime.rs | 2 +- .../src/series/implementations/duration.rs | 2 +- .../src/series/implementations/floats.rs | 2 +- .../src/series/implementations/list.rs | 8 +- .../src/series/implementations/mod.rs | 2 +- .../src/series/implementations/object.rs | 2 +- .../src/series/implementations/struct_.rs | 2 +- .../src/series/implementations/utf8.rs | 2 +- crates/polars-core/src/series/ops/unique.rs | 2 +- crates/polars-core/src/series/series_trait.rs | 2 +- crates/polars-io/src/partition.rs | 2 +- crates/polars-lazy/Cargo.toml | 2 +- crates/polars-lazy/src/dsl/list.rs | 4 +- crates/polars-lazy/src/dsl/mod.rs | 2 +- crates/polars-lazy/src/frame/mod.rs | 70 +++++----- crates/polars-lazy/src/frame/pivot.rs | 2 +- crates/polars-lazy/src/lib.rs | 6 +- .../executors/{groupby.rs => group_by.rs} | 8 +- ...groupby_dynamic.rs => group_by_dynamic.rs} | 24 ++-- ...partitioned.rs => group_by_partitioned.rs} | 28 ++-- ...groupby_rolling.rs => group_by_rolling.rs} | 24 ++-- .../src/physical_plan/executors/mod.rs | 20 +-- .../executors/projection_utils.rs | 10 +- .../physical_plan/expressions/aggregation.rs | 4 +- .../src/physical_plan/expressions/alias.rs | 2 +- .../src/physical_plan/expressions/apply.rs | 4 +- .../src/physical_plan/expressions/binary.rs | 2 +- .../src/physical_plan/expressions/cast.rs | 2 +- .../src/physical_plan/expressions/column.rs | 4 +- .../src/physical_plan/expressions/filter.rs | 2 +- .../src/physical_plan/expressions/literal.rs | 2 +- .../src/physical_plan/expressions/mod.rs | 4 +- .../src/physical_plan/expressions/slice.rs | 2 +- .../src/physical_plan/expressions/sort.rs | 2 +- .../src/physical_plan/expressions/sortby.rs | 4 +- .../src/physical_plan/expressions/take.rs | 2 +- .../src/physical_plan/expressions/ternary.rs | 4 +- .../src/physical_plan/expressions/window.rs | 32 ++--- .../src/physical_plan/planner/expr.rs | 2 +- .../src/physical_plan/planner/lp.rs | 12 +- crates/polars-lazy/src/physical_plan/state.rs | 2 +- .../physical_plan/streaming/convert_alp.rs | 4 +- crates/polars-lazy/src/prelude.rs | 2 +- crates/polars-lazy/src/tests/aggregations.rs | 42 +++--- crates/polars-lazy/src/tests/arity.rs | 4 +- crates/polars-lazy/src/tests/logical.rs | 4 +- .../src/tests/optimization_checks.rs | 14 +- crates/polars-lazy/src/tests/queries.rs | 102 +++++++------- crates/polars-lazy/src/tests/streaming.rs | 12 +- crates/polars-lazy/src/tests/tpch.rs | 2 +- .../nan_propagating_aggregate.rs | 2 +- crates/polars-ops/src/frame/pivot/mod.rs | 20 +-- .../polars-ops/src/series/ops/to_dummies.rs | 2 +- .../aggregates/convert.rs | 14 +- .../{groupby => group_by}/aggregates/count.rs | 0 .../{groupby => group_by}/aggregates/first.rs | 2 +- .../aggregates/interface.rs | 14 +- .../{groupby => group_by}/aggregates/last.rs | 2 +- .../{groupby => group_by}/aggregates/mean.rs | 0 .../aggregates/min_max.rs | 0 .../{groupby => 
group_by}/aggregates/mod.rs | 0 .../{groupby => group_by}/aggregates/null.rs | 2 +- .../{groupby => group_by}/aggregates/sum.rs | 0 .../{groupby => group_by}/generic/eval.rs | 0 .../{groupby => group_by}/generic/global.rs | 0 .../generic/hash_table.rs | 0 .../{groupby => group_by}/generic/mod.rs | 2 +- .../generic/ooc_state.rs | 4 +- .../{groupby => group_by}/generic/sink.rs | 8 +- .../{groupby => group_by}/generic/source.rs | 6 +- .../generic/thread_local.rs | 0 .../sinks/{groupby => group_by}/mod.rs | 0 .../sinks/{groupby => group_by}/ooc.rs | 14 +- .../sinks/{groupby => group_by}/ooc_state.rs | 4 +- .../{groupby => group_by}/primitive/mod.rs | 16 +-- .../sinks/{groupby => group_by}/string.rs | 16 +-- .../sinks/{groupby => group_by}/utils.rs | 4 +- crates/polars-pipe/src/executors/sinks/mod.rs | 2 +- crates/polars-pipe/src/pipeline/convert.rs | 20 +-- crates/polars-pipe/src/pipeline/dispatcher.rs | 2 +- crates/polars-pipe/src/pipeline/mod.rs | 2 +- crates/polars-plan/Cargo.toml | 2 +- crates/polars-plan/src/dsl/functions/arity.rs | 2 +- crates/polars-plan/src/dsl/mod.rs | 10 +- .../polars-plan/src/logical_plan/aexpr/mod.rs | 2 +- .../polars-plan/src/logical_plan/builder.rs | 12 +- .../src/logical_plan/builder_alp.rs | 4 +- .../src/logical_plan/optimizer/cse_expr.rs | 18 +-- .../optimizer/predicate_pushdown/utils.rs | 4 +- .../{groupby.rs => group_by.rs} | 8 +- .../optimizer/projection_pushdown/mod.rs | 6 +- .../optimizer/type_coercion/binary.rs | 2 +- .../optimizer/type_coercion/mod.rs | 4 +- .../polars-plan/src/logical_plan/options.rs | 8 +- .../src/logical_plan/projection.rs | 2 +- crates/polars-sql/src/context.rs | 32 ++--- crates/polars-sql/tests/iss_7437.rs | 2 +- crates/polars-sql/tests/ops_distinct_on.rs | 2 +- crates/polars-sql/tests/simple_exprs.rs | 8 +- .../src/chunkedarray/rolling_window/mod.rs | 4 +- .../rolling_kernels/no_nulls.rs | 24 ++-- .../src/{groupby => group_by}/dynamic.rs | 80 +++++------ .../src/{groupby => group_by}/mod.rs | 0 crates/polars-time/src/lib.rs | 6 +- crates/polars-time/src/prelude.rs | 2 +- crates/polars-time/src/upsample.rs | 4 +- crates/polars-time/src/windows/bounds.rs | 4 +- .../src/windows/{groupby.rs => group_by.rs} | 38 +++--- crates/polars-time/src/windows/mod.rs | 2 +- crates/polars-time/src/windows/test.rs | 61 +++++---- crates/polars/Cargo.toml | 6 +- crates/polars/src/docs/eager.rs | 12 +- crates/polars/src/docs/lazy.rs | 6 +- crates/polars/src/lib.rs | 16 +-- crates/polars/src/prelude.rs | 2 +- .../tests/it/core/{groupby.rs => group_by.rs} | 2 +- crates/polars/tests/it/core/mod.rs | 2 +- crates/polars/tests/it/joins.rs | 6 +- crates/polars/tests/it/lazy/aggregation.rs | 4 +- .../polars/tests/it/lazy/expressions/apply.rs | 6 +- .../polars/tests/it/lazy/expressions/arity.rs | 18 +-- .../tests/it/lazy/expressions/filter.rs | 6 +- .../polars/tests/it/lazy/expressions/slice.rs | 2 +- .../tests/it/lazy/{groupby.rs => group_by.rs} | 20 +-- ...groupby_dynamic.rs => group_by_dynamic.rs} | 10 +- crates/polars/tests/it/lazy/mod.rs | 4 +- crates/polars/tests/it/lazy/queries.rs | 20 +-- py-polars/Cargo.toml | 4 +- py-polars/polars/dataframe/groupby.py | 2 +- py-polars/polars/lazyframe/frame.py | 6 +- py-polars/src/dataframe.rs | 6 +- py-polars/src/lazyframe.rs | 14 +- .../unit/operations/test_groupby_rolling.py | 4 +- py-polars/tests/unit/test_empty.py | 2 +- py-polars/tests/unit/test_errors.py | 4 +- 168 files changed, 740 insertions(+), 723 deletions(-) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/agg_list.rs (100%) rename 
crates/polars-core/src/frame/{groupby => group_by}/aggregations/boolean.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/dispatch.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/mod.rs (99%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/utf8.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/expr.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/hashing.rs (97%) rename crates/polars-core/src/frame/{groupby => group_by}/into_groups.rs (94%) rename crates/polars-core/src/frame/{groupby => group_by}/mod.rs (90%) rename crates/polars-core/src/frame/{groupby => group_by}/perfect.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/proxy.rs (99%) rename crates/polars-lazy/src/physical_plan/executors/{groupby.rs => group_by.rs} (95%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_dynamic.rs => group_by_dynamic.rs} (82%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_partitioned.rs => group_by_partitioned.rs} (93%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_rolling.rs => group_by_rolling.rs} (85%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/convert.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/count.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/first.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/interface.rs (89%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/last.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/mean.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/min_max.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/mod.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/null.rs (92%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/sum.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/eval.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/global.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/hash_table.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/mod.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/ooc_state.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/sink.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/source.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/thread_local.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/mod.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/ooc.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/ooc_state.rs (93%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/primitive/mod.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/string.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/utils.rs (96%) rename crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/{groupby.rs => group_by.rs} (95%) rename crates/polars-time/src/{groupby => 
group_by}/dynamic.rs (94%) rename crates/polars-time/src/{groupby => group_by}/mod.rs (100%) rename crates/polars-time/src/windows/{groupby.rs => group_by.rs} (94%) rename crates/polars/tests/it/core/{groupby.rs => group_by.rs} (98%) rename crates/polars/tests/it/lazy/{groupby.rs => group_by.rs} (90%) rename crates/polars/tests/it/lazy/{groupby_dynamic.rs => group_by_dynamic.rs} (92%) diff --git a/crates/polars-algo/src/algo.rs b/crates/polars-algo/src/algo.rs index 2da0f1787521..92533386ea89 100644 --- a/crates/polars-algo/src/algo.rs +++ b/crates/polars-algo/src/algo.rs @@ -47,7 +47,7 @@ pub fn hist(s: &Series, bins: Option<&Series>, bin_count: Option) -> Resu DataType::UInt16 => (lit(u32::MIN), AnyValue::UInt16(u16::MAX)), _ => polars_bail!( InvalidOperation: - "cannot take histogram of non-numeric types; consider a groupby and count" + "cannot take histogram of non-numeric types; consider a group_by and count" ), }; let mut bins = bins.extend_constant(max_value, 1)?; @@ -92,7 +92,7 @@ pub fn hist(s: &Series, bins: Option<&Series>, bin_count: Option) -> Resu let out = out .select(["category", s.name()])? - .groupby(["category"])? + .group_by(["category"])? .count()?; cuts.left_join(&out, [category_str], [category_str])? diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index 51f51bc63329..51f813440185 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -102,7 +102,7 @@ macro_rules! iter_to_values { pub trait ListFromIter { /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -136,7 +136,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -166,7 +166,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -212,7 +212,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index becd9894074d..d0a2f41c6ef2 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -92,8 +92,8 @@ row_hash = [] reinterpret = [] take_opt_iter = [] mode = [] -# allow groupby operation on list type -groupby_list = [] +# allow group_by operation on list type +group_by_list = [] # cumsum, cummin, etc. 
cum_agg = [] # rolling window functions @@ -114,7 +114,7 @@ semi_anti_join = [] chunked_ids = [] describe = [] timezones = ["chrono-tz", "arrow/chrono-tz", "polars-arrow/timezones"] -dynamic_groupby = ["dtype-datetime", "dtype-date"] +dynamic_group_by = ["dtype-datetime", "dtype-date"] # opt-in datatypes for Series dtype-date = ["temporal"] diff --git a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs index 0f07c5083af2..d8ad21b12f88 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -6,7 +6,7 @@ use hashbrown::hash_map::{Entry, RawEntryMut}; use polars_arrow::trusted_len::TrustedLenPush; use crate::datatypes::PlHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::*; use crate::{using_string_cache, StringCache, POOL}; diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index c7a24a91fe01..9ac7d32ae749 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,5 +1,5 @@ use super::*; -use crate::frame::groupby::IntoGroupsProxy; +use crate::frame::group_by::IntoGroupsProxy; impl CategoricalChunked { pub fn unique(&self) -> PolarsResult { diff --git a/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs b/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs index cf1be4be6525..195579e1392b 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs @@ -8,7 +8,7 @@ use once_cell::sync::Lazy; use smartstring::{LazyCompact, SmartString}; use crate::datatypes::PlIdHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::InitHashMaps; /// We use atomic reference counting diff --git a/crates/polars-core/src/chunked_array/ops/take/traits.rs b/crates/polars-core/src/chunked_array/ops/take/traits.rs index e54e0a1c8a51..818681f831e6 100644 --- a/crates/polars-core/src/chunked_array/ops/take/traits.rs +++ b/crates/polars-core/src/chunked_array/ops/take/traits.rs @@ -1,5 +1,5 @@ //! Traits that indicate the allowed arguments in a ChunkedArray::take operation. 
-use crate::frame::groupby::GroupsProxyIter; +use crate::frame::group_by::GroupsProxyIter; use crate::prelude::*; // Utility traits diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index 66e90d9f56bd..217e7a5494b0 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -8,10 +8,10 @@ use arrow::bitmap::MutableBitmap; #[cfg(feature = "object")] use crate::datatypes::ObjectType; use crate::datatypes::PlHashSet; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; -use crate::frame::groupby::GroupsProxy; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::GroupsProxy; #[cfg(feature = "mode")] -use crate::frame::groupby::IntoGroupsProxy; +use crate::frame::group_by::IntoGroupsProxy; use crate::prelude::*; use crate::series::IsSorted; diff --git a/crates/polars-core/src/doc/changelog/v0_10_0_11.rs b/crates/polars-core/src/doc/changelog/v0_10_0_11.rs index 502f267c1dc0..8136f24f8f80 100644 --- a/crates/polars-core/src/doc/changelog/v0_10_0_11.rs +++ b/crates/polars-core/src/doc/changelog/v0_10_0_11.rs @@ -6,7 +6,7 @@ //! * Performance increase in take kernel //! * Performance increase in ChunkedArray builders //! * Join operation on multiple columns. -//! * ~3.5 x performance increase in groupby operations (measured on db-benchmark), +//! * ~3.5 x performance increase in group_by operations (measured on db-benchmark), //! due to embarrassingly parallel grouping and better branch prediction (tight loops). //! * Performance increase on join operation due to better branch prediction. //! * Categorical datatype and global string cache (BETA). diff --git a/crates/polars-core/src/doc/changelog/v0_4.rs b/crates/polars-core/src/doc/changelog/v0_4.rs index c4f00cf1b50b..d357526134ef 100644 --- a/crates/polars-core/src/doc/changelog/v0_4.rs +++ b/crates/polars-core/src/doc/changelog/v0_4.rs @@ -1,7 +1,7 @@ //! # Changelog v0.4 //! //! * median aggregation added to `ChunkedArray` -//! * Arrow LargeList datatype support (and groupby aggregation into LargeList). +//! * Arrow LargeList datatype support (and group_by aggregation into LargeList). //! * Shift operation. //! * Fill None operation. //! * Buffered serialization (less memory requirements) diff --git a/crates/polars-core/src/doc/changelog/v0_7.rs b/crates/polars-core/src/doc/changelog/v0_7.rs index 4d13947ac46e..55996f2fcaa5 100644 --- a/crates/polars-core/src/doc/changelog/v0_7.rs +++ b/crates/polars-core/src/doc/changelog/v0_7.rs @@ -20,7 +20,7 @@ //! - Type coercion optimizer //! - Selection (filter, where clause) //! - Projection (select foo from bar) -//! - Aggregation (groupby) +//! - Aggregation (group_by) //! - all eager aggregations supported //! - Joins //! 
- WithColumn operation diff --git a/crates/polars-core/src/frame/asof_join/groups.rs b/crates/polars-core/src/frame/asof_join/groups.rs index ae27b92fb685..9c980c935b3b 100644 --- a/crates/polars-core/src/frame/asof_join/groups.rs +++ b/crates/polars-core/src/frame/asof_join/groups.rs @@ -9,7 +9,7 @@ use rayon::prelude::*; use smartstring::alias::String as SmartString; use super::*; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; #[cfg(feature = "dtype-categorical")] use crate::frame::hash_join::_check_categorical_src; use crate::frame::hash_join::{ diff --git a/crates/polars-core/src/frame/groupby/aggregations/agg_list.rs b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/agg_list.rs rename to crates/polars-core/src/frame/group_by/aggregations/agg_list.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/boolean.rs b/crates/polars-core/src/frame/group_by/aggregations/boolean.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/boolean.rs rename to crates/polars-core/src/frame/group_by/aggregations/boolean.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/dispatch.rs b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/dispatch.rs rename to crates/polars-core/src/frame/group_by/aggregations/dispatch.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/mod.rs b/crates/polars-core/src/frame/group_by/aggregations/mod.rs similarity index 99% rename from crates/polars-core/src/frame/groupby/aggregations/mod.rs rename to crates/polars-core/src/frame/group_by/aggregations/mod.rs index 1dbfa47d0662..b40f9137c554 100644 --- a/crates/polars-core/src/frame/groupby/aggregations/mod.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/mod.rs @@ -27,9 +27,9 @@ use rayon::prelude::*; #[cfg(feature = "object")] use crate::chunked_array::object::extension::create_extension; -use crate::frame::groupby::GroupsIdx; +use crate::frame::group_by::GroupsIdx; #[cfg(feature = "object")] -use crate::frame::groupby::GroupsIndicator; +use crate::frame::group_by::GroupsIndicator; use crate::prelude::*; use crate::series::implementations::SeriesWrap; use crate::series::IsSorted; @@ -76,7 +76,7 @@ where } // This iterators length can be trusted - // these represent the number of groups in the groupby operation + // these represent the number of groups in the group_by operation let output_len = offsets.size_hint().0; // start with a dummy index, will be overwritten on first iteration. 
// Safety: diff --git a/crates/polars-core/src/frame/groupby/aggregations/utf8.rs b/crates/polars-core/src/frame/group_by/aggregations/utf8.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/utf8.rs rename to crates/polars-core/src/frame/group_by/aggregations/utf8.rs diff --git a/crates/polars-core/src/frame/groupby/expr.rs b/crates/polars-core/src/frame/group_by/expr.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/expr.rs rename to crates/polars-core/src/frame/group_by/expr.rs diff --git a/crates/polars-core/src/frame/groupby/hashing.rs b/crates/polars-core/src/frame/group_by/hashing.rs similarity index 97% rename from crates/polars-core/src/frame/groupby/hashing.rs rename to crates/polars-core/src/frame/group_by/hashing.rs index 528156448ec3..3e24df8817b3 100644 --- a/crates/polars-core/src/frame/groupby/hashing.rs +++ b/crates/polars-core/src/frame/group_by/hashing.rs @@ -8,7 +8,7 @@ use rayon::prelude::*; use super::GroupsProxy; use crate::datatypes::PlHashMap; -use crate::frame::groupby::{GroupsIdx, IdxItem}; +use crate::frame::group_by::{GroupsIdx, IdxItem}; use crate::hashing::{ df_rows_to_hashes_threaded_vertical, series_to_hashes, this_partition, AsU64, IdBuildHasher, IdxHash, @@ -83,7 +83,7 @@ fn finish_group_order(mut out: Vec>, sorted: bool) -> GroupsProxy { } // The inner vecs should be sorted by IdxSize -// the groupby multiple keys variants suffice +// the group_by multiple keys variants suffice // this requirements as they use an IdxMap strategy fn finish_group_order_vecs( mut vecs: Vec<(Vec, Vec>)>, @@ -144,7 +144,7 @@ fn finish_group_order_vecs( } } -pub(crate) fn groupby(a: impl Iterator, sorted: bool) -> GroupsProxy +pub(crate) fn group_by(a: impl Iterator, sorted: bool) -> GroupsProxy where T: Hash + Eq, { @@ -183,7 +183,7 @@ where // giving the slice info to the compiler is much // faster than the using an iterator, that's why we // have the code duplication -pub(crate) fn groupby_threaded_slice( +pub(crate) fn group_by_threaded_slice( keys: Vec, n_partitions: u64, sorted: bool, @@ -246,7 +246,7 @@ where finish_group_order(out, sorted) } -pub(crate) fn groupby_threaded_iter( +pub(crate) fn group_by_threaded_iter( keys: &[I], n_partitions: u64, sorted: bool, @@ -373,7 +373,7 @@ pub(crate) fn populate_multiple_key_hashmap( idx_hash.hash == original_h && { let key_idx = idx_hash.idx; // Safety: - // indices in a groupby operation are always in bounds. + // indices in a group_by operation are always in bounds. unsafe { compare_df_rows(keys, key_idx as usize, idx as usize) } } }); @@ -435,7 +435,7 @@ pub(crate) fn populate_multiple_key_hashmap2<'a, V, H, F, G>( original_h == idx_hash.hash && { let key_idx = idx_hash.idx; // Safety: - // indices in a groupby operation are always in bounds. + // indices in a group_by operation are always in bounds. 
unsafe { compare_keys(keys_cmp, key_idx as usize, idx as usize) } } }); @@ -450,7 +450,7 @@ pub(crate) fn populate_multiple_key_hashmap2<'a, V, H, F, G>( } } -pub(crate) fn groupby_threaded_multiple_keys_flat( +pub(crate) fn group_by_threaded_multiple_keys_flat( mut keys: DataFrame, n_partitions: usize, sorted: bool, @@ -540,7 +540,7 @@ pub(crate) fn groupby_threaded_multiple_keys_flat( Ok(finish_group_order_vecs(v, sorted)) } -pub(crate) fn groupby_multiple_keys(keys: DataFrame, sorted: bool) -> PolarsResult { +pub(crate) fn group_by_multiple_keys(keys: DataFrame, sorted: bool) -> PolarsResult { let mut hashes = Vec::with_capacity(keys.height()); let _ = series_to_hashes(keys.get_columns(), None, &mut hashes)?; diff --git a/crates/polars-core/src/frame/groupby/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs similarity index 94% rename from crates/polars-core/src/frame/groupby/into_groups.rs rename to crates/polars-core/src/frame/group_by/into_groups.rs index 5518f1a760d0..5144711ffa58 100644 --- a/crates/polars-core/src/frame/groupby/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::kernels::sort_partition::{create_clean_partitions, partition_to_groups}; use polars_arrow::prelude::*; @@ -8,9 +8,9 @@ use crate::config::verbose; use crate::utils::_split_offsets; use crate::utils::flatten::flatten_par; -/// Used to create the tuples for a groupby operation. +/// Used to create the tuples for a group_by operation. pub trait IntoGroupsProxy { - /// Create the tuples need for a groupby operation. + /// Create the tuples need for a group_by operation. /// * The first value in the tuple is the first index of the group. /// * The second value in the tuple is are the indexes of the groups including the first value. 
fn group_tuples(&self, _multithreaded: bool, _sorted: bool) -> PolarsResult { @@ -38,15 +38,15 @@ where .downcast_iter() .map(|arr| arr.values().as_slice()) .collect::>(); - groupby_threaded_slice(keys, n_partitions, sorted) + group_by_threaded_slice(keys, n_partitions, sorted) } else { let keys = ca.downcast_iter().collect::>(); - groupby_threaded_iter(&keys, n_partitions, sorted) + group_by_threaded_iter(&keys, n_partitions, sorted) } } else if !ca.has_validity() { - groupby(ca.into_no_null_iter(), sorted) + group_by(ca.into_no_null_iter(), sorted) } else { - groupby(ca.into_iter(), sorted) + group_by(ca.into_iter(), sorted) } } @@ -57,7 +57,7 @@ where { fn create_groups_from_sorted(&self, multithreaded: bool) -> GroupsSlice { if verbose() { - eprintln!("groupby keys are sorted; running sorted key fast path"); + eprintln!("group_by keys are sorted; running sorted key fast path"); } let arr = self.downcast_iter().next().unwrap(); if arr.is_empty() { @@ -271,7 +271,7 @@ impl IntoGroupsProxy for BinaryChunked { .collect::>() }); let byte_hashes = byte_hashes.iter().collect::>(); - groupby_threaded_slice(byte_hashes, n_partitions as u64, sorted) + group_by_threaded_slice(byte_hashes, n_partitions as u64, sorted) } else { let byte_hashes = self .into_iter() @@ -283,7 +283,7 @@ impl IntoGroupsProxy for BinaryChunked { BytesHash::new(opt_b, hash) }) .collect_trusted::>(); - groupby(byte_hashes.iter(), sorted) + group_by(byte_hashes.iter(), sorted) }; Ok(out) } @@ -293,7 +293,7 @@ impl IntoGroupsProxy for ListChunked { #[allow(clippy::needless_lifetimes)] #[allow(unused_variables)] fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] { polars_ensure!( self.inner_dtype().to_physical().is_numeric(), @@ -338,7 +338,7 @@ impl IntoGroupsProxy for ListChunked { }) .collect::>>()?; let bytes_hashes = bytes_hashes.iter().collect::>(); - Ok(groupby_threaded_slice( + Ok(group_by_threaded_slice( bytes_hashes, n_partitions as u64, sorted, @@ -347,12 +347,12 @@ impl IntoGroupsProxy for ListChunked { groups } else { let hashes = arr_to_hashes(self)?; - Ok(groupby(hashes.iter(), sorted)) + Ok(group_by(hashes.iter(), sorted)) } } - #[cfg(not(feature = "groupby_list"))] + #[cfg(not(feature = "group_by_list"))] { - panic!("activate 'groupby_list' feature") + panic!("activate 'group_by_list' feature") } } } @@ -376,6 +376,6 @@ where T: PolarsObject, { fn group_tuples(&self, _multithreaded: bool, sorted: bool) -> PolarsResult { - Ok(groupby(self.into_iter(), sorted)) + Ok(group_by(self.into_iter(), sorted)) } } diff --git a/crates/polars-core/src/frame/groupby/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs similarity index 90% rename from crates/polars-core/src/frame/groupby/mod.rs rename to crates/polars-core/src/frame/group_by/mod.rs index c44a7ee3f24f..2ff670ac248f 100644 --- a/crates/polars-core/src/frame/groupby/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -50,7 +50,7 @@ fn prepare_dataframe_unsorted(by: &[Series]) -> DataFrame { } impl DataFrame { - pub fn groupby_with_series( + pub fn group_by_with_series( &self, mut by: Vec, multithreaded: bool, @@ -58,7 +58,7 @@ impl DataFrame { ) -> PolarsResult { polars_ensure!( !by.is_empty(), - ComputeError: "at least one key is required in a groupby operation" + ComputeError: "at least one key is required in a group_by operation" ); let by_len = by[0].len(); @@ -88,9 +88,9 @@ impl DataFrame { } let keys_df = prepare_dataframe_unsorted(&by); if 
multithreaded { - groupby_threaded_multiple_keys_flat(keys_df, n_partitions, sorted) + group_by_threaded_multiple_keys_flat(keys_df, n_partitions, sorted) } else { - groupby_multiple_keys(keys_df, sorted) + group_by_multiple_keys(keys_df, sorted) } }; Ok(GroupBy::new(self, by, groups?, None)) @@ -102,34 +102,34 @@ impl DataFrame { /// /// ``` /// use polars_core::prelude::*; - /// fn groupby_sum(df: &DataFrame) -> PolarsResult { - /// df.groupby(["column_name"])? + /// fn group_by_sum(df: &DataFrame) -> PolarsResult { + /// df.group_by(["column_name"])? /// .select(["agg_column_name"]) /// .sum() /// } /// ``` - pub fn groupby(&self, by: I) -> PolarsResult + pub fn group_by(&self, by: I) -> PolarsResult where I: IntoIterator, S: AsRef, { let selected_keys = self.select_series(by)?; - self.groupby_with_series(selected_keys, true, false) + self.group_by_with_series(selected_keys, true, false) } /// Group DataFrame using a Series column. /// The groups are ordered by their smallest row index. - pub fn groupby_stable(&self, by: I) -> PolarsResult + pub fn group_by_stable(&self, by: I) -> PolarsResult where I: IntoIterator, S: AsRef, { let selected_keys = self.select_series(by)?; - self.groupby_with_series(selected_keys, true, true) + self.group_by_with_series(selected_keys, true, true) } } -/// Returned by a groupby operation on a DataFrame. This struct supports +/// Returned by a group_by operation on a DataFrame. This struct supports /// several aggregations. /// /// Until described otherwise, the examples in this struct are performed on the following DataFrame: @@ -329,7 +329,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(&["temp", "rain"]).mean() + /// df.group_by(["date"])?.select(&["temp", "rain"]).mean() /// } /// ``` /// Returns: @@ -352,7 +352,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Mean); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Mean); let mut agg = unsafe { agg_col.agg_mean(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -367,7 +367,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).sum() + /// df.group_by(["date"])?.select(["temp"]).sum() /// } /// ``` /// Returns: @@ -390,7 +390,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Sum); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Sum); let mut agg = unsafe { agg_col.agg_sum(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -405,7 +405,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).min() + /// df.group_by(["date"])?.select(["temp"]).min() /// } /// ``` /// Returns: @@ -427,7 +427,7 @@ impl<'df> GroupBy<'df> { pub fn min(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Min); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Min); let mut agg = unsafe { agg_col.agg_min(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -442,7 
+442,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).max() + /// df.group_by(["date"])?.select(["temp"]).max() /// } /// ``` /// Returns: @@ -464,7 +464,7 @@ impl<'df> GroupBy<'df> { pub fn max(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Max); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Max); let mut agg = unsafe { agg_col.agg_max(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -479,7 +479,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).first() + /// df.group_by(["date"])?.select(["temp"]).first() /// } /// ``` /// Returns: @@ -501,7 +501,7 @@ impl<'df> GroupBy<'df> { pub fn first(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::First); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::First); let mut agg = unsafe { agg_col.agg_first(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -516,7 +516,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).last() + /// df.group_by(["date"])?.select(["temp"]).last() /// } /// ``` /// Returns: @@ -538,7 +538,7 @@ impl<'df> GroupBy<'df> { pub fn last(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Last); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Last); let mut agg = unsafe { agg_col.agg_last(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -553,7 +553,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).n_unique() + /// df.group_by(["date"])?.select(["temp"]).n_unique() /// } /// ``` /// Returns: @@ -575,7 +575,7 @@ impl<'df> GroupBy<'df> { pub fn n_unique(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::NUnique); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::NUnique); let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -592,7 +592,7 @@ impl<'df> GroupBy<'df> { /// # use polars_arrow::prelude::QuantileInterpolOptions; /// /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).quantile(0.2, QuantileInterpolOptions::default()) + /// df.group_by(["date"])?.select(["temp"]).quantile(0.2, QuantileInterpolOptions::default()) /// } /// ``` #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")] @@ -608,7 +608,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { let new_name = - fmt_groupby_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol)); + fmt_group_by_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol)); let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, interpol) }; agg.rename(&new_name); 
cols.push(agg.into_series()); @@ -623,14 +623,14 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).median() + /// df.group_by(["date"])?.select(["temp"]).median() /// } /// ``` #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")] pub fn median(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Median); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Median); let mut agg = unsafe { agg_col.agg_median(&self.groups) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -643,7 +643,7 @@ impl<'df> GroupBy<'df> { pub fn var(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Var(ddof)); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Var(ddof)); let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -656,7 +656,7 @@ impl<'df> GroupBy<'df> { pub fn std(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Std(ddof)); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Std(ddof)); let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -671,7 +671,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).count() + /// df.group_by(["date"])?.select(["temp"]).count() /// } /// ``` /// Returns: @@ -693,7 +693,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Count); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Count); let mut ca = self.groups.group_count(); ca.rename(&new_name); cols.push(ca.into_series()); @@ -701,14 +701,14 @@ impl<'df> GroupBy<'df> { DataFrame::new(cols) } - /// Get the groupby group indexes. + /// Get the group_by group indexes. /// /// # Example /// /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.groups() + /// df.group_by(["date"])?.groups() /// } /// ``` /// Returns: @@ -729,13 +729,13 @@ impl<'df> GroupBy<'df> { pub fn groups(&self) -> PolarsResult { let mut cols = self.keys(); let mut column = self.groups.as_list_chunked(); - let new_name = fmt_groupby_column("", GroupByMethod::Groups); + let new_name = fmt_group_by_column("", GroupByMethod::Groups); column.rename(&new_name); cols.push(column.into_series()); DataFrame::new(cols) } - /// Aggregate the groups of the groupby operation into lists. + /// Aggregate the groups of the group_by operation into lists. 
/// /// # Example /// @@ -743,7 +743,7 @@ impl<'df> GroupBy<'df> { /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { /// // GroupBy and aggregate to Lists - /// df.groupby(["date"])?.select(["temp"]).agg_list() + /// df.group_by(["date"])?.select(["temp"]).agg_list() /// } /// ``` /// Returns: @@ -765,7 +765,7 @@ impl<'df> GroupBy<'df> { pub fn agg_list(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Implode); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Implode); let mut agg = unsafe { agg_col.agg_list(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -774,7 +774,7 @@ impl<'df> GroupBy<'df> { } fn prepare_apply(&self) -> PolarsResult { - polars_ensure!(self.df.height() > 0, ComputeError: "cannot groupby + apply on empty 'DataFrame'"); + polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'"); if let Some(agg) = &self.selected_agg { if agg.is_empty() { Ok(self.df.clone()) @@ -889,7 +889,7 @@ impl Display for GroupByMethod { } // Formatting functions used in eager and lazy code for renaming grouped columns -pub fn fmt_groupby_column(name: &str, method: GroupByMethod) -> String { +pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> String { use GroupByMethod::*; match method { Min => format!("{name}_min"), @@ -935,7 +935,7 @@ mod test { let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01]); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); - let out = df.groupby_stable(["date"])?.select(["temp"]).count()?; + let out = df.group_by_stable(["date"])?.select(["temp"]).count()?; assert_eq!( out.column("temp_count")?, &Series::new("temp_count", [2 as IdxSize, 2, 1]) @@ -945,7 +945,7 @@ mod test { #[allow(deprecated)] // Select multiple let out = df - .groupby_stable(["date"])? + .group_by_stable(["date"])? .select(["temp", "rain"]) .mean()?; assert_eq!( @@ -957,14 +957,14 @@ mod test { #[allow(deprecated)] // Group by multiple let out = df - .groupby_stable(["date", "temp"])? + .group_by_stable(["date", "temp"])? .select(["rain"]) .mean()?; assert!(out.column("rain_mean").is_ok()); // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["date"])?.select(["temp"]).sum()?; + let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?; assert_eq!( out.column("temp_sum")?, &Series::new("temp_sum", [30, 8, 9]) @@ -973,7 +973,7 @@ mod test { // Use of deprecated `n_unique()` for testing purposes #[allow(deprecated)] // implicit select all and only aggregate on methods that support that aggregation - let gb = df.groupby(["date"]).unwrap().n_unique().unwrap(); + let gb = df.group_by(["date"]).unwrap().n_unique().unwrap(); // check the group by column is filtered out. assert_eq!(gb.width(), 3); Ok(()) @@ -981,7 +981,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_static_groupby_by_12_columns() { + fn test_static_group_by_by_12_columns() { // Build GroupBy DataFrame. 
let s0 = Series::new("G1", ["A", "A", "B", "B", "C"].as_ref()); let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref()); @@ -1003,7 +1003,7 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] let adf = df - .groupby([ + .group_by([ "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", ]) .unwrap() @@ -1019,11 +1019,11 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_dynamic_groupby_by_13_columns() { - // The content for every groupby series. + fn test_dynamic_group_by_by_13_columns() { + // The content for every group_by series. let series_content = ["A", "A", "B", "B", "C"]; - // The name of every groupby series. + // The name of every group_by series. let series_names = [ "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13", ]; @@ -1048,7 +1048,7 @@ mod test { #[allow(deprecated)] // Compute the aggregated DataFrame by the 13 columns defined in `series_names`. let adf = df - .groupby(series_names) + .group_by(series_names) .unwrap() .select(["N"]) .sum() @@ -1072,14 +1072,14 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_floats() { + fn test_group_by_floats() { let df = df! {"flt" => [1., 1., 2., 2., 3.], "val" => [1, 1, 1, 1, 1] } .unwrap(); // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let res = df.groupby(["flt"]).unwrap().sum().unwrap(); + let res = df.group_by(["flt"]).unwrap().sum().unwrap(); let res = res.sort(["flt"], false, false).unwrap(); assert_eq!( Vec::from(res.column("val_sum").unwrap().i32().unwrap()), @@ -1090,7 +1090,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] #[cfg(feature = "dtype-categorical")] - fn test_groupby_categorical() { + fn test_group_by_categorical() { let mut df = df! {"foo" => ["a", "a", "b", "b", "c"], "ham" => ["a", "a", "b", "b", "c"], "bar" => [1, 1, 1, 1, 1] @@ -1104,7 +1104,7 @@ mod test { #[allow(deprecated)] // check multiple keys and categorical let res = df - .groupby_stable(["foo", "ham"]) + .group_by_stable(["foo", "ham"]) .unwrap() .select(["bar"]) .sum() @@ -1118,14 +1118,14 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_null_handling() -> PolarsResult<()> { + fn test_group_by_null_handling() -> PolarsResult<()> { let df = df!( "a" => ["a", "a", "a", "b", "b"], "b" => [Some(1), Some(2), None, None, Some(1)] )?; // Use of deprecated `mean()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["a"])?.mean()?; + let out = df.group_by_stable(["a"])?.mean()?; assert_eq!( Vec::from(out.column("b_mean")?.f64()?), @@ -1136,7 +1136,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_var() -> PolarsResult<()> { + fn test_group_by_var() -> PolarsResult<()> { // check variance and proper coercion to f64 let df = df![ "g" => ["foo", "foo", "bar"], @@ -1146,12 +1146,12 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["g"])?.select(["int"]).var(1)?; + let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?; assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5)); // Use of deprecated `std()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["g"])?.select(["int"]).std(1)?; + let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?; let val = out.column("int_agg_std")?.f64()?.get(0).unwrap(); let expected = f64::FRAC_1_SQRT_2(); assert!((val - expected).abs() < 0.000001); @@ -1161,7 +1161,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] #[cfg(feature = 
"dtype-categorical")] - fn test_groupby_null_group() -> PolarsResult<()> { + fn test_group_by_null_group() -> PolarsResult<()> { // check if null is own group let mut df = df![ "g" => [Some("foo"), Some("foo"), Some("bar"), None, None], @@ -1173,7 +1173,7 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let _ = df.groupby(["g"])?.sum()?; + let _ = df.group_by(["g"])?.sum()?; Ok(()) } } diff --git a/crates/polars-core/src/frame/groupby/perfect.rs b/crates/polars-core/src/frame/group_by/perfect.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/perfect.rs rename to crates/polars-core/src/frame/group_by/perfect.rs diff --git a/crates/polars-core/src/frame/groupby/proxy.rs b/crates/polars-core/src/frame/group_by/proxy.rs similarity index 99% rename from crates/polars-core/src/frame/groupby/proxy.rs rename to crates/polars-core/src/frame/group_by/proxy.rs index 2c0a2b48bd18..ebd33232772d 100644 --- a/crates/polars-core/src/frame/groupby/proxy.rs +++ b/crates/polars-core/src/frame/group_by/proxy.rs @@ -296,7 +296,7 @@ pub enum GroupsProxy { Slice { // the groups slices groups: GroupsSlice, - // indicates if we do a rolling groupby + // indicates if we do a rolling group_by rolling: bool, }, } diff --git a/crates/polars-core/src/frame/hash_join/mod.rs b/crates/polars-core/src/frame/hash_join/mod.rs index d596c916c612..f2f7500447da 100644 --- a/crates/polars-core/src/frame/hash_join/mod.rs +++ b/crates/polars-core/src/frame/hash_join/mod.rs @@ -36,7 +36,7 @@ pub(crate) use zip_outer::*; pub use self::multiple_keys::private_left_join_multiple_keys; use crate::datatypes::PlHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; pub use crate::frame::hash_join::multiple_keys::{ _inner_join_multiple_keys, _left_join_multiple_keys, _outer_join_multiple_keys, }; diff --git a/crates/polars-core/src/frame/hash_join/multiple_keys.rs b/crates/polars-core/src/frame/hash_join/multiple_keys.rs index 509ca3f4c223..157584328978 100644 --- a/crates/polars-core/src/frame/hash_join/multiple_keys.rs +++ b/crates/polars-core/src/frame/hash_join/multiple_keys.rs @@ -3,7 +3,7 @@ use hashbrown::HashMap; use rayon::prelude::*; use super::*; -use crate::frame::groupby::hashing::{populate_multiple_key_hashmap, HASHMAP_INIT_SIZE}; +use crate::frame::group_by::hashing::{populate_multiple_key_hashmap, HASHMAP_INIT_SIZE}; use crate::frame::hash_join::{ get_hash_tbl_threaded_join_mut_partitioned, get_hash_tbl_threaded_join_partitioned, }; diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 526b2e9aea82..eb86b38540e5 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -22,7 +22,7 @@ mod chunks; pub(crate) mod cross_join; pub mod explode; mod from; -pub mod groupby; +pub mod group_by; pub mod hash_join; #[cfg(feature = "rows")] pub mod row; @@ -34,7 +34,7 @@ pub use chunks::*; use serde::{Deserialize, Serialize}; use smartstring::alias::String as SmartString; -use crate::frame::groupby::GroupsIndicator; +use crate::frame::group_by::GroupsIndicator; #[cfg(feature = "row_hash")] use crate::hashing::df_rows_to_hashes_threaded_vertical; #[cfg(feature = "zip_with")] @@ -3097,7 +3097,7 @@ impl DataFrame { let columns = match (keep, maintain_order) { (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => { - let gb = df.groupby_stable(names)?; + let gb = df.group_by_stable(names)?; let groups = gb.get_groups(); let 
(offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); @@ -3106,7 +3106,7 @@ impl DataFrame { (UniqueKeepStrategy::Last, true) => { // maintain order by last values, so the sorted groups are not correct as they // are sorted by the first value - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let func = |g: GroupsIndicator| match g { @@ -3126,14 +3126,14 @@ impl DataFrame { return Ok(unsafe { df.take_unchecked(&last_idx) }); }, (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => { - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let (offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); df.apply_columns_par(&|s| unsafe { s.agg_first(&groups) }) }, (UniqueKeepStrategy::Last, false) => { - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let (offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); @@ -3166,7 +3166,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn is_unique(&self) -> PolarsResult { - let gb = self.groupby(self.get_column_names())?; + let gb = self.group_by(self.get_column_names())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -3190,7 +3190,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn is_duplicated(&self) -> PolarsResult { - let gb = self.groupby(self.get_column_names())?; + let gb = self.group_by(self.get_column_names())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -3332,9 +3332,9 @@ impl DataFrame { include_key: bool, ) -> PolarsResult> { let groups = if stable { - self.groupby_stable(cols)?.take_groups() + self.group_by_stable(cols)?.take_groups() } else { - self.groupby(cols)?.take_groups() + self.group_by(cols)?.take_groups() }; // drop key columns prior to calculation if requested diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index 537e71ee9e4c..4aa7b08a527c 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,7 +1,7 @@ use arrow::bitmap::utils::get_bit_unchecked; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::utils::CustomIterTools; use rayon::prelude::*; @@ -328,7 +328,7 @@ impl VecHash for Float64Chunked { } } -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] impl VecHash for ListChunked { fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { polars_ensure!( diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index a6ba458e222b..e80e37899f85 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -38,8 +38,8 @@ pub use crate::error::{ #[cfg(feature = "asof_join")] pub use crate::frame::asof_join::*; pub use crate::frame::explode::MeltArgs; -pub(crate) use crate::frame::groupby::aggregations::*; -pub use crate::frame::groupby::{GroupsIdx, GroupsProxy, GroupsSlice, IntoGroupsProxy}; +pub(crate) use crate::frame::group_by::aggregations::*; +pub use crate::frame::group_by::{GroupsIdx, GroupsProxy, GroupsSlice, IntoGroupsProxy}; pub(crate) use crate::frame::hash_join::*; pub use crate::frame::hash_join::{JoinArgs, JoinType}; pub use 
crate::frame::{DataFrame, UniqueKeepStrategy}; diff --git a/crates/polars-core/src/series/implementations/array.rs b/crates/polars-core/src/series/implementations/array.rs index ef560e50ccbc..44f422e447f0 100644 --- a/crates/polars-core/src/series/implementations/array.rs +++ b/crates/polars-core/src/series/implementations/array.rs @@ -5,7 +5,7 @@ use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::prelude::*; use crate::series::implementations::SeriesWrap; #[cfg(feature = "chunked_ids")] diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 25f9a44c17ab..d2277e2a1f47 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -9,7 +9,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 11baec69aaf0..a1b54ab136c4 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -10,7 +10,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, ChunkIdIter}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 4430a1db84a7..f3bd53c3c012 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -8,7 +8,7 @@ use crate::chunked_array::comparison::*; use crate::chunked_array::ops::compare_inner::{IntoPartialOrdInner, PartialOrdInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; #[cfg(feature = "is_in")] use crate::frame::hash_join::_check_categorical_src; diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index 477af2cd0e7f..a442535efb2b 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -17,7 +17,7 @@ use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::ops::ToBitRepr; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index 
cab324d2fcbe..dce1c7fd9385 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -6,7 +6,7 @@ use ahash::RandomState; use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 5caa0deacaa1..85e8f5b8ed9b 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -7,7 +7,7 @@ use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index dd28d742ca21..b0a8a246b79d 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -11,7 +11,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; #[cfg(feature = "checked_arithmetic")] diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index bb4fc35987ba..7ca4f1371874 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::borrow::Cow; -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use ahash::RandomState; use super::{private, IntoSeries, SeriesTrait}; @@ -9,7 +9,7 @@ use crate::chunked_array::comparison::*; use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::prelude::*; use crate::series::implementations::SeriesWrap; #[cfg(feature = "chunked_ids")] @@ -53,13 +53,13 @@ impl private::PrivateSeries for SeriesWrap { IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) } - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] fn vec_hash(&self, _build_hasher: RandomState, _buf: &mut Vec) -> PolarsResult<()> { self.0.vec_hash(_build_hasher, _buf)?; Ok(()) } - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] fn vec_hash_combine( &self, _build_hasher: RandomState, diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 9ca9bce80b91..0a7b471dc492 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -41,7 +41,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use 
crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; #[cfg(feature = "checked_arithmetic")] diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index 01eb9fdbe0e9..941f1b550183 100644 --- a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -6,7 +6,7 @@ use ahash::RandomState; use crate::chunked_array::object::PolarsObjectSafe; use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::Settings; -use crate::frame::groupby::{GroupsProxy, IntoGroupsProxy}; +use crate::frame::group_by::{GroupsProxy, IntoGroupsProxy}; use crate::prelude::*; use crate::series::implementations::SeriesWrap; use crate::series::private::{PrivateSeries, PrivateSeriesNumeric}; diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs index 32222176b54b..21bf3f78a9e9 100644 --- a/crates/polars-core/src/series/implementations/struct_.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -65,7 +65,7 @@ impl private::PrivateSeries for SeriesWrap { fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { let df = DataFrame::new_no_checks(vec![]); let gb = df - .groupby_with_series(self.0.fields().to_vec(), multithreaded, sorted) + .group_by_with_series(self.0.fields().to_vec(), multithreaded, sorted) .unwrap(); Ok(gb.take_groups()) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index a25684c1ffd1..952591fed652 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -9,7 +9,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/ops/unique.rs b/crates/polars-core/src/series/ops/unique.rs index 1c20b2ad8036..cfae77d687e7 100644 --- a/crates/polars-core/src/series/ops/unique.rs +++ b/crates/polars-core/src/series/ops/unique.rs @@ -2,7 +2,7 @@ use std::hash::Hash; #[cfg(feature = "unique_counts")] -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::*; #[cfg(feature = "unique_counts")] use crate::utils::NoNull; diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 5e298eb2d169..79033df353a5 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -48,7 +48,7 @@ pub(crate) mod private { use crate::chunked_array::ops::compare_inner::{PartialEqInner, PartialOrdInner}; use crate::chunked_array::Settings; #[cfg(feature = "rows")] - use crate::frame::groupby::GroupsProxy; + use crate::frame::group_by::GroupsProxy; pub trait PrivateSeriesNumeric { fn bit_repr_is_large(&self) -> bool { diff --git a/crates/polars-io/src/partition.rs b/crates/polars-io/src/partition.rs index 48044fe6e486..33d7f19abe70 100644 --- a/crates/polars-io/src/partition.rs +++ b/crates/polars-io/src/partition.rs @@ -91,7 +91,7 @@ 
where } pub fn finish(self, df: &DataFrame) -> PolarsResult<()> { - let groups = df.groupby(self.by.clone())?; + let groups = df.group_by(self.by.clone())?; let groups = groups.get_groups(); // don't parallelize this diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index fb6d1d649bd3..f2317ea7f5dc 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -100,7 +100,7 @@ pct_change = ["polars-plan/pct_change"] moment = ["polars-plan/moment", "polars-ops/moment"] abs = ["polars-plan/abs"] random = ["polars-plan/random"] -dynamic_groupby = ["polars-plan/dynamic_groupby", "polars-time", "temporal"] +dynamic_group_by = ["polars-plan/dynamic_group_by", "polars-time", "temporal"] ewma = ["polars-plan/ewma"] dot_diagram = ["polars-plan/dot_diagram"] diagonal_concat = [] diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index 7d6607868502..5e2b851c0904 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -110,7 +110,7 @@ fn run_per_sublist( } } -fn run_on_groupby_engine( +fn run_on_group_by_engine( name: &str, lst: &ListChunked, expr: &Expr, @@ -194,7 +194,7 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { }; if fits_idx_size && s.null_count() == 0 && !is_user_apply() { - run_on_groupby_engine(s.name(), &lst, &expr) + run_on_group_by_engine(s.name(), &lst, &expr) } else { run_per_sublist(s, &lst, &expr, parallel, output_field) } diff --git a/crates/polars-lazy/src/dsl/mod.rs b/crates/polars-lazy/src/dsl/mod.rs index b9126f044fa5..95d475e61ced 100644 --- a/crates/polars-lazy/src/dsl/mod.rs +++ b/crates/polars-lazy/src/dsl/mod.rs @@ -1,7 +1,7 @@ //! Domain specific language for the Lazy API. //! //! This DSL revolves around the [`Expr`] type, which represents an abstract -//! operation on a DataFrame, such as mapping over a column, filtering, groupby, or aggregation. +//! operation on a DataFrame, such as mapping over a column, filtering, group_by, or aggregation. //! In general, functions on [`LazyFrame`](crate::frame::LazyFrame)s consume the LazyFrame and produce a new LazyFrame representing //! the result of applying the function and passed expressions to the consumed LazyFrame. //! 
At runtime, when [`LazyFrame::collect`](crate::frame::LazyFrame::collect) is called, the expressions that comprise diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index ac62ad8f9cbb..b16a809f990d 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -602,7 +602,7 @@ impl LazyFrame { /// /// fn example(df: DataFrame) -> PolarsResult { /// df.lazy() - /// .groupby([col("foo")]) + /// .group_by([col("foo")]) /// .agg([col("bar").sum(), col("ham").mean().alias("avg_ham")]) /// .collect() /// } @@ -764,7 +764,7 @@ impl LazyFrame { /// /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .groupby([col("date")]) + /// .group_by([col("date")]) /// .agg([ /// col("rain").min().alias("min_rain"), /// col("rain").sum().alias("sum_rain"), @@ -772,7 +772,7 @@ impl LazyFrame { /// ]) /// } /// ``` - pub fn groupby, IE: Into + Clone>(self, by: E) -> LazyGroupBy { + pub fn group_by, IE: Into + Clone>(self, by: E) -> LazyGroupBy { let keys = by .as_ref() .iter() @@ -780,7 +780,7 @@ impl LazyFrame { .collect::>(); let opt_state = self.get_opt_state(); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { LazyGroupBy { logical_plan: self.logical_plan, @@ -792,7 +792,7 @@ impl LazyFrame { } } - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] { LazyGroupBy { logical_plan: self.logical_plan, @@ -807,11 +807,11 @@ impl LazyFrame { /// /// Also works for index values of type Int32 or Int64. /// - /// Different from a [`groupby_dynamic`][`Self::groupby_dynamic`], the windows are now determined by the + /// Different from a [`group_by_dynamic`][`Self::group_by_dynamic`], the windows are now determined by the /// individual values and are not of constant intervals. For constant intervals use - /// *groupby_dynamic* - #[cfg(feature = "dynamic_groupby")] - pub fn groupby_rolling>( + /// *group_by_dynamic* + #[cfg(feature = "dynamic_group_by")] + pub fn group_by_rolling>( self, index_column: Expr, by: E, @@ -821,9 +821,11 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let name = expr_output_name(&index_column).unwrap(); - return self - .with_column(index_column) - .groupby_rolling(Expr::Column(name), by, options); + return self.with_column(index_column).group_by_rolling( + Expr::Column(name), + by, + options, + ); } let opt_state = self.get_opt_state(); LazyGroupBy { @@ -839,7 +841,7 @@ impl LazyFrame { /// Group based on a time value (or index value of type Int32, Int64). /// /// Time windows are calculated and rows are assigned to windows. Different from a - /// normal groupby is that a row can be member of multiple groups. The time/index + /// normal group_by is that a row can be member of multiple groups. The time/index /// window could be seen as a rolling window, with a window size determined by /// dates/times/values instead of slots in the DataFrame. /// @@ -850,9 +852,9 @@ impl LazyFrame { /// - offset: offset of the window /// /// The `by` argument should be empty `[]` if you don't want to combine this - /// with a ordinary groupby on these keys. - #[cfg(feature = "dynamic_groupby")] - pub fn groupby_dynamic>( + /// with a ordinary group_by on these keys. 
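(Editor's note — illustrative only, not part of the patch: the doc comment above describes the renamed `group_by_dynamic` entry point. Below is a minimal usage sketch under the new naming; the `DynamicGroupOptions` fields shown (`every`, `period`, `offset`), its `Default` impl, and `Duration::parse` are assumed from Polars of this era and may differ in detail.)

```rust
// Hedged sketch: assumes the post-rename lazy API with the `dynamic_group_by` feature enabled.
use polars::prelude::*;

fn hourly_mean_per_sensor(df: DataFrame) -> PolarsResult<DataFrame> {
    df.lazy()
        .group_by_dynamic(
            col("time"),      // index column; must be sorted
            [col("sensor")],  // ordinary group_by keys combined with the time windows
            DynamicGroupOptions {
                every: Duration::parse("1h"),
                period: Duration::parse("1h"),
                offset: Duration::parse("0s"),
                ..Default::default()
            },
        )
        .agg([col("value").mean().alias("value_mean")])
        .collect()
}
```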
+ #[cfg(feature = "dynamic_group_by")] + pub fn group_by_dynamic>( self, index_column: Expr, by: E, @@ -862,9 +864,11 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let name = expr_output_name(&index_column).unwrap(); - return self - .with_column(index_column) - .groupby_dynamic(Expr::Column(name), by, options); + return self.with_column(index_column).group_by_dynamic( + Expr::Column(name), + by, + options, + ); } let opt_state = self.get_opt_state(); LazyGroupBy { @@ -877,8 +881,8 @@ impl LazyFrame { } } - /// Similar to [`groupby`][`Self::groupby`], but order of the DataFrame is maintained. - pub fn groupby_stable, IE: Into + Clone>(self, by: E) -> LazyGroupBy { + /// Similar to [`group_by`][`Self::group_by`], but order of the DataFrame is maintained. + pub fn group_by_stable, IE: Into + Clone>(self, by: E) -> LazyGroupBy { let keys = by .as_ref() .iter() @@ -886,7 +890,7 @@ impl LazyFrame { .collect::>(); let opt_state = self.get_opt_state(); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { LazyGroupBy { logical_plan: self.logical_plan, @@ -898,7 +902,7 @@ impl LazyFrame { } } - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] { LazyGroupBy { logical_plan: self.logical_plan, @@ -1364,16 +1368,16 @@ impl LazyFrame { } } -/// Utility struct for lazy groupby operation. +/// Utility struct for lazy group_by operation. #[derive(Clone)] pub struct LazyGroupBy { pub logical_plan: LogicalPlan, opt_state: OptState, keys: Vec, maintain_order: bool, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] dynamic_options: Option, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] rolling_options: Option, } @@ -1392,7 +1396,7 @@ impl LazyGroupBy { /// /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .groupby_stable([col("date")]) + /// .group_by_stable([col("date")]) /// .agg([ /// col("rain").min().alias("min_rain"), /// col("rain").sum().alias("sum_rain"), @@ -1401,9 +1405,9 @@ impl LazyGroupBy { /// } /// ``` pub fn agg>(self, aggs: E) -> LazyFrame { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let lp = LogicalPlanBuilder::from(self.logical_plan) - .groupby( + .group_by( self.keys, aggs, None, @@ -1413,9 +1417,9 @@ impl LazyGroupBy { ) .build(); - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let lp = LogicalPlanBuilder::from(self.logical_plan) - .groupby(self.keys, aggs, None, self.maintain_order) + .group_by(self.keys, aggs, None, self.maintain_order) .build(); LazyFrame::from_logical_plan(lp, self.opt_state) } @@ -1450,14 +1454,14 @@ impl LazyGroupBy { where F: 'static + Fn(DataFrame) -> PolarsResult + Send + Sync, { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let options = GroupbyOptions { dynamic: self.dynamic_options, rolling: self.rolling_options, slice: None, }; - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let options = GroupbyOptions { slice: None }; let lp = LogicalPlan::Aggregate { diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index 195d4ad49427..c9e0339593db 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -6,7 +6,7 @@ //! pivot is here, because we want to be able to pass expressions to the pivot operation. //! 
-use polars_core::frame::groupby::expr::PhysicalAggExpr; +use polars_core::frame::group_by::expr::PhysicalAggExpr; use polars_core::prelude::*; use polars_ops::pivot::PivotAgg; diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index d4d0de8f1acd..cbe53c484dbe 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -99,7 +99,7 @@ //! )?; //! //! df.lazy() -//! .groupby([col("date")]) +//! .group_by([col("date")]) //! .agg([ //! col("rain").min().alias("min_rain"), //! col("rain").sum().alias("sum_rain"), @@ -160,7 +160,7 @@ //! .filter( //! col("a").lt(lit(2)) //! ) -//! .groupby([col("b")]) +//! .group_by([col("b")]) //! .agg( //! vec![col("b").first().alias("first_b"), col("c").first().alias("first_c")] //! ) @@ -176,7 +176,7 @@ //! //! fn aggregate_all_columns(df_a: DataFrame) -> LazyFrame { //! df_a.lazy() -//! .groupby([col("b")]) +//! .group_by([col("b")]) //! .agg( //! vec![col("*").first()] //! ) diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby.rs b/crates/polars-lazy/src/physical_plan/executors/group_by.rs similarity index 95% rename from crates/polars-lazy/src/physical_plan/executors/groupby.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by.rs index 55e12d2e8997..24d8bd88eb8d 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by.rs @@ -54,7 +54,7 @@ impl GroupByExec { } #[allow(clippy::too_many_arguments)] -pub(super) fn groupby_helper( +pub(super) fn group_by_helper( mut df: DataFrame, keys: Vec, aggs: &[Arc], @@ -64,7 +64,7 @@ pub(super) fn groupby_helper( slice: Option<(i64, usize)>, ) -> PolarsResult { df.as_single_chunk_par(); - let gb = df.groupby_with_series(keys, true, maintain_order)?; + let gb = df.group_by_with_series(keys, true, maintain_order)?; if let Some(f) = apply { return gb.apply(move |df| f.call_udf(df)); @@ -101,7 +101,7 @@ impl GroupByExec { .iter() .map(|e| e.evaluate(&df, state)) .collect::>()?; - groupby_helper( + group_by_helper( df, keys, &self.aggs, @@ -132,7 +132,7 @@ impl Executor for GroupByExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby".to_string(), &by); + let name = comma_delimited("group_by".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs similarity index 82% rename from crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs index 651ee63716e1..3aa156b062e4 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs @@ -1,18 +1,18 @@ -#[cfg(feature = "dynamic_groupby")] -use polars_core::frame::groupby::GroupBy; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] +use polars_core::frame::group_by::GroupBy; +#[cfg(feature = "dynamic_group_by")] use polars_time::DynamicGroupOptions; use super::*; -#[cfg_attr(not(feature = "dynamic_groupby"), allow(dead_code))] +#[cfg_attr(not(feature = "dynamic_group_by"), allow(dead_code))] pub(crate) struct GroupByDynamicExec { pub(crate) input: Box, // we will use this later #[allow(dead_code)] pub(crate) keys: Vec>, pub(crate) aggs: Vec>, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] 
pub(crate) options: DynamicGroupOptions, pub(crate) input_schema: SchemaRef, pub(crate) slice: Option<(i64, usize)>, @@ -20,7 +20,7 @@ pub(crate) struct GroupByDynamicExec { } impl GroupByDynamicExec { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute_impl( &mut self, state: &ExecutionState, @@ -33,7 +33,7 @@ impl GroupByDynamicExec { .map(|e| e.evaluate(&df, state)) .collect::>>()?; - let (mut time_key, mut keys, groups) = df.groupby_dynamic(keys, &self.options)?; + let (mut time_key, mut keys, groups) = df.group_by_dynamic(keys, &self.options)?; if let Some(f) = &self.apply { let gb = GroupBy::new(&df, vec![], groups, None); @@ -57,7 +57,7 @@ impl GroupByDynamicExec { time_key = time_key.slice(offset, len); // todo! optimize this, we can prevent an agg_first aggregation upstream - // the ordering has changed due to the groupby + // the ordering has changed due to the group_by for key in keys.iter_mut() { *key = key.slice(offset, len) } @@ -75,12 +75,12 @@ impl GroupByDynamicExec { } impl Executor for GroupByDynamicExec { - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] fn execute(&mut self, _state: &mut ExecutionState) -> PolarsResult { - panic!("activate feature dynamic_groupby") + panic!("activate feature dynamic_group_by") } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { #[cfg(debug_assertions)] { @@ -96,7 +96,7 @@ impl Executor for GroupByDynamicExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_dynamic".to_string(), &by); + let name = comma_delimited("group_by_dynamic".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs similarity index 93% rename from crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs index a7341f951c00..068c2bf01754 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs @@ -70,9 +70,9 @@ fn run_partitions( n_threads: usize, maintain_order: bool, ) -> PolarsResult> { - // We do a partitioned groupby. - // Meaning that we first do the groupby operation arbitrarily - // split on several threads. Than the final result we apply the same groupby again. + // We do a partitioned group_by. + // Meaning that we first do the group_by operation arbitrarily + // split on several threads. Than the final result we apply the same group_by again. 
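(Editor's note — illustrative only, not part of the patch: the comment above spells out the partitioned group_by strategy: aggregate each chunk of the frame independently, then run the same aggregation once more over the partial results. Below is a self-contained sketch of that idea for a plain sum, independent of the Polars internals; all names are made up for illustration.)

```rust
// Partition -> partial aggregate -> re-aggregate: the shape of a partitioned group_by.
use std::collections::HashMap;

fn partitioned_sum(chunks: &[Vec<(String, i64)>]) -> HashMap<String, i64> {
    // Phase 1: each chunk is aggregated on its own (in Polars this runs per thread).
    let partials: Vec<HashMap<String, i64>> = chunks
        .iter()
        .map(|chunk| {
            let mut acc: HashMap<String, i64> = HashMap::new();
            for (key, value) in chunk {
                *acc.entry(key.clone()).or_insert(0) += *value;
            }
            acc
        })
        .collect();

    // Phase 2: merge the partial results by applying the same aggregation again.
    let mut merged: HashMap<String, i64> = HashMap::new();
    for partial in partials {
        for (key, value) in partial {
            *merged.entry(key).or_insert(0) += value;
        }
    }
    merged
}
```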
let dfs = split_df(df, n_threads)?; let phys_aggs = &exec.phys_aggs; @@ -81,7 +81,7 @@ fn run_partitions( dfs.into_par_iter() .map(|df| { let keys = compute_keys(keys, &df, state)?; - let gb = df.groupby_with_series(keys, false, maintain_order)?; + let gb = df.group_by_with_series(keys, false, maintain_order)?; let groups = gb.get_groups(); let mut columns = gb.keys(); @@ -151,7 +151,7 @@ fn estimate_unique_count(keys: &[Series], mut sample_size: usize) -> PolarsResul .collect::>(); let df = DataFrame::new_no_checks(keys); let names = df.get_column_names(); - let gb = df.groupby(names).unwrap(); + let gb = df.group_by(names).unwrap(); Ok(finish(gb.get_groups())) } } @@ -180,7 +180,7 @@ fn can_run_partitioned( } Ok(false) } else { - // below this boundary we assume the partitioned groupby will be faster + // below this boundary we assume the partitioned group_by will be faster let unique_count_boundary = std::env::var("POLARS_PARTITION_UNIQUE_COUNT") .map(|s| s.parse::().unwrap()) .unwrap_or(1000); @@ -230,7 +230,7 @@ impl PartitionGroupByExec { original_df: DataFrame, ) -> Option> { #[allow(clippy::needless_update)] - let groupby_options = GroupbyOptions { + let group_by_options = GroupbyOptions { slice: self.slice, ..Default::default() } @@ -242,7 +242,7 @@ impl PartitionGroupByExec { schema: self.output_schema.clone(), apply: None, maintain_order: false, - options: groupby_options, + options: group_by_options, }; let mut expr_arena = Default::default(); let mut lp_arena = Default::default(); @@ -276,14 +276,14 @@ impl PartitionGroupByExec { mut original_df: DataFrame, ) -> PolarsResult { let dfs = { - // already get the keys. This is the very last minute decision which groupby method we choose. + // already get the keys. This is the very last minute decision which group_by method we choose. // If the column is a categorical, we know the number of groups we have and can decide to continue - // partitioned or go for the standard groupby. The partitioned is likely to be faster on a small number + // partitioned or go for the standard group_by. The partitioned is likely to be faster on a small number // of groups. let keys = self.keys(&original_df, state)?; if !can_run_partitioned(&keys, &original_df, state, self.from_partitioned_ds)? { - return groupby_helper( + return group_by_helper( original_df, keys, &self.phys_aggs, @@ -321,11 +321,11 @@ impl PartitionGroupByExec { // MERGE phase // merge and hash aggregate again let df = accumulate_dataframes_vertical(dfs)?; - // the partitioned groupby has added columns so we must update the schema. + // the partitioned group_by has added columns so we must update the schema. 
let keys = self.keys(&df, state)?; // first get mutable access and optionally sort - let gb = df.groupby_with_series(keys, true, self.maintain_order)?; + let gb = df.group_by_with_series(keys, true, self.maintain_order)?; let mut groups = gb.get_groups(); #[allow(unused_assignments)] @@ -377,7 +377,7 @@ impl Executor for PartitionGroupByExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_partitioned".to_string(), &by); + let name = comma_delimited("group_by_partitioned".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs similarity index 85% rename from crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs index 170730fe3258..b6d890cbac0a 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs @@ -1,16 +1,16 @@ -#[cfg(feature = "dynamic_groupby")] -use polars_core::frame::groupby::GroupBy; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] +use polars_core::frame::group_by::GroupBy; +#[cfg(feature = "dynamic_group_by")] use polars_time::RollingGroupOptions; use super::*; -#[cfg_attr(not(feature = "dynamic_groupby"), allow(dead_code))] +#[cfg_attr(not(feature = "dynamic_group_by"), allow(dead_code))] pub(crate) struct GroupByRollingExec { pub(crate) input: Box, pub(crate) keys: Vec>, pub(crate) aggs: Vec>, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub(crate) options: RollingGroupOptions, pub(crate) input_schema: SchemaRef, pub(crate) slice: Option<(i64, usize)>, @@ -18,7 +18,7 @@ pub(crate) struct GroupByRollingExec { } impl GroupByRollingExec { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute_impl( &mut self, state: &ExecutionState, @@ -32,7 +32,7 @@ impl GroupByRollingExec { .map(|e| e.evaluate(&df, state)) .collect::>>()?; - let (mut time_key, mut keys, groups) = df.groupby_rolling(keys, &self.options)?; + let (mut time_key, mut keys, groups) = df.group_by_rolling(keys, &self.options)?; if let Some(f) = &self.apply { let gb = GroupBy::new(&df, vec![], groups, None); @@ -56,7 +56,7 @@ impl GroupByRollingExec { time_key = time_key.slice(offset, len); } - // the ordering has changed due to the groupby + // the ordering has changed due to the group_by if !keys.is_empty() { unsafe { match groups { @@ -92,12 +92,12 @@ impl GroupByRollingExec { } impl Executor for GroupByRollingExec { - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] fn execute(&mut self, _state: &mut ExecutionState) -> PolarsResult { - panic!("activate feature dynamic_groupby") + panic!("activate feature dynamic_group_by") } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { #[cfg(debug_assertions)] { @@ -112,7 +112,7 @@ impl Executor for GroupByRollingExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_rolling".to_string(), &by); + let name = comma_delimited("group_by_rolling".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/mod.rs 
b/crates/polars-lazy/src/physical_plan/executors/mod.rs index bf95a06ed6f7..e43a99c2d53f 100644 --- a/crates/polars-lazy/src/physical_plan/executors/mod.rs +++ b/crates/polars-lazy/src/physical_plan/executors/mod.rs @@ -2,10 +2,10 @@ mod cache; mod executor; mod ext_context; mod filter; -mod groupby; -mod groupby_dynamic; -mod groupby_partitioned; -mod groupby_rolling; +mod group_by; +mod group_by_dynamic; +mod group_by_partitioned; +mod group_by_rolling; mod join; mod projection; mod projection_utils; @@ -31,12 +31,12 @@ use rayon::prelude::*; pub(super) use self::cache::*; pub(super) use self::ext_context::*; pub(super) use self::filter::*; -pub(super) use self::groupby::*; -#[cfg(feature = "dynamic_groupby")] -pub(super) use self::groupby_dynamic::*; -pub(super) use self::groupby_partitioned::*; -#[cfg(feature = "dynamic_groupby")] -pub(super) use self::groupby_rolling::*; +pub(super) use self::group_by::*; +#[cfg(feature = "dynamic_group_by")] +pub(super) use self::group_by_dynamic::*; +pub(super) use self::group_by_partitioned::*; +#[cfg(feature = "dynamic_group_by")] +pub(super) use self::group_by_rolling::*; pub(super) use self::join::*; pub(super) use self::projection::*; #[cfg(feature = "python")] diff --git a/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs b/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs index 00bfa825554c..70e33fb986a0 100644 --- a/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs +++ b/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs @@ -24,7 +24,7 @@ fn execute_projection_cached_window_fns( ) -> PolarsResult> { // We partition by normal expression and window expression // - the normal expressions can run in parallel - // - the window expression take more memory and often use the same groupby keys and join tuples + // - the window expression take more memory and often use the same group_by keys and join tuples // so they are cached and run sequential // the partitioning messes with column order, so we also store the idx @@ -36,7 +36,7 @@ fn execute_projection_cached_window_fns( let mut other = Vec::with_capacity(exprs.len()); // first we partition the window function by the values they group over. - // the groupby values should be cached + // the group_by values should be cached let mut index = 0u32; exprs.iter().for_each(|phys| { index += 1; @@ -45,11 +45,11 @@ fn execute_projection_cached_window_fns( let mut is_window = false; for e in e.into_iter() { if let Expr::Window { partition_by, .. 
} = e { - let groupby = format!("{:?}", partition_by.as_slice()); - if let Some(tpl) = windows.iter_mut().find(|tpl| tpl.0 == groupby) { + let group_by = format!("{:?}", partition_by.as_slice()); + if let Some(tpl) = windows.iter_mut().find(|tpl| tpl.0 == group_by) { tpl.1.push((index, phys.clone())) } else { - windows.push((groupby, vec![(index, phys.clone())])) + windows.push((group_by, vec![(index, phys.clone())])) } is_window = true; break; diff --git a/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs b/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs index 867ea1bd520b..3f7b700dde44 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs @@ -6,7 +6,7 @@ use polars_arrow::export::arrow::compute::concatenate::concatenate; use polars_arrow::export::arrow::offset::Offsets; use polars_arrow::prelude::QuantileInterpolOptions; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::{GroupByMethod, GroupsProxy}; +use polars_core::frame::group_by::{GroupByMethod, GroupsProxy}; use polars_core::prelude::*; use polars_core::utils::NoNull; #[cfg(feature = "dtype-struct")] @@ -426,7 +426,7 @@ impl PartitionedAggregation for AggregationExpr { for (_, idx) in groups { let ca = unsafe { // Safety - // The indexes of the groupby operation are never out of bounds + // The indexes of the group_by operation are never out of bounds ca.take_unchecked(idx.into()) }; process_group(ca)?; diff --git a/crates/polars-lazy/src/physical_plan/expressions/alias.rs b/crates/polars-lazy/src/physical_plan/expressions/alias.rs index 6dc975be1a8a..d9cc5cb73511 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/alias.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/alias.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; diff --git a/crates/polars-lazy/src/physical_plan/expressions/apply.rs b/crates/polars-lazy/src/physical_plan/expressions/apply.rs index 7a8dea411e83..b24d14c48e60 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/apply.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/apply.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; #[cfg(feature = "parquet")] @@ -285,7 +285,7 @@ impl PhysicalExpr for ApplyExpr { polars_ensure!( self.allow_group_aware, expr = self.expr, - ComputeError: "this expression cannot run in the groupby context", + ComputeError: "this expression cannot run in the group_by context", ); if self.inputs.len() == 1 { let mut ac = self.inputs[0].evaluate_on_groups(df, groups, state)?; diff --git a/crates/polars-lazy/src/physical_plan/expressions/binary.rs b/crates/polars-lazy/src/physical_plan/expressions/binary.rs index 329b32167d9c..74c1b7833b7a 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/binary.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/binary.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; diff --git a/crates/polars-lazy/src/physical_plan/expressions/cast.rs b/crates/polars-lazy/src/physical_plan/expressions/cast.rs index 
d1255efe1072..fc63c6aa9dfc 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/cast.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/cast.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; diff --git a/crates/polars-lazy/src/physical_plan/expressions/column.rs b/crates/polars-lazy/src/physical_plan/expressions/column.rs index 4baf5d2224ff..b9656e3a2c4d 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/column.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/column.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; @@ -133,7 +133,7 @@ impl PhysicalExpr for ColumnExpr { match df.get_columns().get(idx) { Some(out) => self.process_by_idx(out, state, schema, df, true), None => { - // partitioned groupby special case + // partitioned group_by special case if let Some(schema) = state.get_schema() { self.process_from_state_schema(df, state, &schema) } else { diff --git a/crates/polars-lazy/src/physical_plan/expressions/filter.rs b/crates/polars-lazy/src/physical_plan/expressions/filter.rs index 9abaf40aacff..a3408a377a2c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/filter.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/filter.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::is_valid::IsValid; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; diff --git a/crates/polars-lazy/src/physical_plan/expressions/literal.rs b/crates/polars-lazy/src/physical_plan/expressions/literal.rs index 4e3061e3d1e3..12937639c968 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/literal.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/literal.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::ops::Deref; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::utils::NoNull; diff --git a/crates/polars-lazy/src/physical_plan/expressions/mod.rs b/crates/polars-lazy/src/physical_plan/expressions/mod.rs index 0e81e54f1b3a..23dec5117d7c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/mod.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/mod.rs @@ -28,7 +28,7 @@ pub(crate) use count::*; pub(crate) use filter::*; pub(crate) use literal::*; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_io::predicates::PhysicalIoExpr; pub(crate) use slice::*; @@ -400,7 +400,7 @@ impl<'a> AggregationContext<'a> { #[cfg(debug_assertions)] { if self.groups.len() > s.len() { - polars_warn!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic groupby") + polars_warn!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic group_by") } } diff --git a/crates/polars-lazy/src/physical_plan/expressions/slice.rs b/crates/polars-lazy/src/physical_plan/expressions/slice.rs index 03abd46bff2d..7cffda73843f 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/slice.rs +++ 
b/crates/polars-lazy/src/physical_plan/expressions/slice.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::{GroupsProxy, IdxItem}; +use polars_core::frame::group_by::{GroupsProxy, IdxItem}; use polars_core::prelude::*; use polars_core::utils::{slice_offsets, CustomIterTools}; use polars_core::POOL; diff --git a/crates/polars-lazy/src/physical_plan/expressions/sort.rs b/crates/polars-lazy/src/physical_plan/expressions/sort.rs index 515320cac6cb..473c43e5befc 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/sort.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/sort.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; diff --git a/crates/polars-lazy/src/physical_plan/expressions/sortby.rs b/crates/polars-lazy/src/physical_plan/expressions/sortby.rs index 0da6c9e0b865..fac35f8e4009 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/sortby.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/sortby.rs @@ -1,7 +1,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; -use polars_core::frame::groupby::{GroupsIndicator, GroupsProxy}; +use polars_core::frame::group_by::{GroupsIndicator, GroupsProxy}; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; @@ -298,7 +298,7 @@ impl PhysicalExpr for SortByExpr { ); // if the rhs is already aggregated once, - // it is reordered by the groupby operation + // it is reordered by the group_by operation // we must ensure that we are as well. if ordered_by_group_operation { let s = ac_in.aggregated(); diff --git a/crates/polars-lazy/src/physical_plan/expressions/take.rs b/crates/polars-lazy/src/physical_plan/expressions/take.rs index 76d1d9338c5e..22b14b517dfa 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/take.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/take.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::utils::NoNull; diff --git a/crates/polars-lazy/src/physical_plan/expressions/ternary.rs b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs index f32547743168..1efad2cfdbdf 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/ternary.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; @@ -140,7 +140,7 @@ impl PhysicalExpr for TernaryExpr { if !aggregation_predicate { // unwrap will not fail as it is not an aggregation expression. eprintln!( - "The predicate '{}' in 'when->then->otherwise' is not a valid aggregation and might produce a different number of rows than the groupby operation would. This behavior is experimental and may be subject to change", self.predicate.as_expression().unwrap() + "The predicate '{}' in 'when->then->otherwise' is not a valid aggregation and might produce a different number of rows than the group_by operation would. 
This behavior is experimental and may be subject to change", self.predicate.as_expression().unwrap() ) } let op_mask = || self.predicate.evaluate_on_groups(df, groups, state); diff --git a/crates/polars-lazy/src/physical_plan/expressions/window.rs b/crates/polars-lazy/src/physical_plan/expressions/window.rs index 670ce3854683..fb26b0e6fb5c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/window.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/window.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use polars_arrow::export::arrow::array::PrimitiveArray; use polars_core::export::arrow::bitmap::Bitmap; -use polars_core::frame::groupby::{GroupBy, GroupsProxy}; +use polars_core::frame::group_by::{GroupBy, GroupsProxy}; use polars_core::frame::hash_join::{ default_join_ids, private_left_join_multiple_keys, ChunkJoinOptIds, JoinValidation, }; @@ -130,7 +130,7 @@ impl WindowExpr { out_column: Series, flattened: Series, mut ac: AggregationContext, - groupby_columns: &[Series], + group_by_columns: &[Series], gb: GroupBy, state: &ExecutionState, cache_key: &str, @@ -175,7 +175,7 @@ impl WindowExpr { if let Some((output, group)) = non_matching_group { let first = group.first(); - let group = groupby_columns + let group = group_by_columns .iter() .map(|s| format_smartstring!("{}", s.get(first as usize).unwrap())) .collect::>(); @@ -371,13 +371,13 @@ impl WindowExpr { impl PhysicalExpr for WindowExpr { // Note: this was first implemented with expression evaluation but this performed really bad. - // Therefore we choose the groupby -> apply -> self join approach + // Therefore we choose the group_by -> apply -> self join approach - // This first cached the groupby and the join tuples, but rayon under a mutex leads to deadlocks: + // This first cached the group_by and the join tuples, but rayon under a mutex leads to deadlocks: // https://github.com/rayon-rs/rayon/issues/592 fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { // This method does the following: - // 1. determine groupby tuples based on the group_column + // 1. determine group_by tuples based on the group_column // 2. apply an aggregation function // 3. 
join the results back to the original dataframe // this stores all group values on the original df size @@ -407,14 +407,14 @@ impl PhysicalExpr for WindowExpr { return Ok(Series::full_null(field.name(), 0, field.data_type())); } - let groupby_columns = self + let group_by_columns = self .group_by .iter() .map(|e| e.evaluate(df, state)) .collect::>>()?; // if the keys are sorted - let sorted_keys = groupby_columns.iter().all(|s| { + let sorted_keys = group_by_columns.iter().all(|s| { matches!( s.is_sorted_flag(), IsSorted::Ascending | IsSorted::Descending @@ -441,16 +441,16 @@ impl PhysicalExpr for WindowExpr { } let create_groups = || { - let gb = df.groupby_with_series(groupby_columns.clone(), true, sort_groups)?; + let gb = df.group_by_with_series(group_by_columns.clone(), true, sort_groups)?; let out: PolarsResult = Ok(gb.take_groups()); out }; // Try to get cached grouptuples let (mut groups, _, cache_key) = if state.cache_window() { - let mut cache_key = String::with_capacity(32 * groupby_columns.len()); + let mut cache_key = String::with_capacity(32 * group_by_columns.len()); write!(&mut cache_key, "{}", state.branch_idx).unwrap(); - for s in &groupby_columns { + for s in &group_by_columns { cache_key.push_str(s.name()); } @@ -488,7 +488,7 @@ impl PhysicalExpr for WindowExpr { if sort_groups || state.cache_window() { groups.sort() } - let gb = GroupBy::new(df, groupby_columns.clone(), groups, Some(apply_columns)); + let gb = GroupBy::new(df, group_by_columns.clone(), groups, Some(apply_columns)); // If the aggregation creates categoricals and `MapStrategy` is `Join`, // the string cache was needed. So we hold it for that case. @@ -531,7 +531,7 @@ impl PhysicalExpr for WindowExpr { out_column, flattened, ac, - &groupby_columns, + &group_by_columns, gb, state, &cache_key, @@ -558,16 +558,16 @@ impl PhysicalExpr for WindowExpr { cache_gb(gb, state, &cache_key); let get_join_tuples = || { - if groupby_columns.len() == 1 { + if group_by_columns.len() == 1 { // group key from right column let right = &keys[0]; - groupby_columns[0] + group_by_columns[0] .hash_join_left(right, JoinValidation::ManyToMany) .unwrap() .1 } else { let df_right = DataFrame::new_no_checks(keys); - let df_left = DataFrame::new_no_checks(groupby_columns); + let df_left = DataFrame::new_no_checks(group_by_columns); private_left_join_multiple_keys(&df_left, &df_right, None, None).1 } }; diff --git a/crates/polars-lazy/src/physical_plan/planner/expr.rs b/crates/polars-lazy/src/physical_plan/planner/expr.rs index a96bb84f1193..2411539969d0 100644 --- a/crates/polars-lazy/src/physical_plan/planner/expr.rs +++ b/crates/polars-lazy/src/physical_plan/planner/expr.rs @@ -1,4 +1,4 @@ -use polars_core::frame::groupby::GroupByMethod; +use polars_core::frame::group_by::GroupByMethod; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::_split_offsets; diff --git a/crates/polars-lazy/src/physical_plan/planner/lp.rs b/crates/polars-lazy/src/physical_plan/planner/lp.rs index df764fd7d2a5..5a08853631dd 100644 --- a/crates/polars-lazy/src/physical_plan/planner/lp.rs +++ b/crates/polars-lazy/src/physical_plan/planner/lp.rs @@ -12,17 +12,17 @@ fn partitionable_gb( expr_arena: &Arena, apply: &Option>, ) -> bool { - // We first check if we can partition the groupby on the latest moment. + // We first check if we can partition the group_by on the latest moment. let mut partitionable = true; // checks: - // 1. complex expressions in the groupby itself are also not partitionable + // 1. 
complex expressions in the group_by itself are also not partitionable // in this case anything more than col("foo") // 2. a custom function cannot be partitioned // 3. we don't bother with more than 2 keys, as the cardinality likely explodes // by the combinations if !keys.is_empty() && keys.len() < 3 && apply.is_none() { - // complex expressions in the groupby itself are also not partitionable + // complex expressions in the group_by itself are also not partitionable // in this case anything more than col("foo") for key in keys { if (expr_arena).iter(*key).count() > 1 { @@ -405,7 +405,7 @@ pub fn create_physical_plan( )?; let _slice = options.slice; - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = options.dynamic { let input = create_physical_plan(input, lp_arena, expr_arena)?; return Ok(Box::new(executors::GroupByDynamicExec { @@ -419,7 +419,7 @@ pub fn create_physical_plan( })); } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = options.rolling { let input = create_physical_plan(input, lp_arena, expr_arena)?; return Ok(Box::new(executors::GroupByRollingExec { @@ -433,7 +433,7 @@ pub fn create_physical_plan( })); } - // We first check if we can partition the groupby on the latest moment. + // We first check if we can partition the group_by on the latest moment. let partitionable = partitionable_gb(&keys, &aggs, &input_schema, expr_arena, &apply); if partitionable { let from_partitioned_ds = (&*lp_arena).iter(input).any(|(_, lp)| { diff --git a/crates/polars-lazy/src/physical_plan/state.rs b/crates/polars-lazy/src/physical_plan/state.rs index 164e25adf8c1..ebf1e501885c 100644 --- a/crates/polars-lazy/src/physical_plan/state.rs +++ b/crates/polars-lazy/src/physical_plan/state.rs @@ -5,7 +5,7 @@ use std::sync::{Mutex, RwLock}; use bitflags::bitflags; use once_cell::sync::OnceCell; use polars_core::config::verbose; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::frame::hash_join::ChunkJoinOptIds; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "csv", feature = "ipc"))] diff --git a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs index 0a78548937ae..20369fafd237 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs @@ -365,7 +365,7 @@ pub(crate) fn insert_streaming_nodes( #[allow(unused_mut)] let mut can_stream = true; - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { if options.rolling.is_some() || options.dynamic.is_some() { can_stream = false @@ -387,7 +387,7 @@ pub(crate) fn insert_streaming_nodes( expr_arena .get(*node) .get_type(schema, Context::Default, expr_arena) - // ensure we don't groupby list + // ensure we don't group_by list .map(|dt| !matches!(dt, DataType::List(_))) .unwrap_or(false) }) diff --git a/crates/polars-lazy/src/prelude.rs b/crates/polars-lazy/src/prelude.rs index 798a6e2515bc..0d4a0e3e67ce 100644 --- a/crates/polars-lazy/src/prelude.rs +++ b/crates/polars-lazy/src/prelude.rs @@ -9,7 +9,7 @@ pub use polars_plan::prelude::ParquetWriteOptions; pub(crate) use polars_plan::prelude::*; #[cfg(feature = "rolling_window")] pub use polars_time::{prelude::RollingOptions, Duration}; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] pub use polars_time::{DynamicGroupOptions, 
PolarsTemporalGroupby, RollingGroupOptions}; pub(crate) use polars_utils::arena::{Arena, Node}; diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index e96c5c0e35e4..d0e620056ea1 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -9,7 +9,7 @@ fn test_agg_exprs() -> PolarsResult<()> { // a binary expression followed by a function and an aggregation. See if it runs let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([(lit(1) - col("A")) .map(|s| Ok(Some(&s * 2)), GetOutput::same_type()) .alias("foo")]) @@ -30,7 +30,7 @@ fn test_agg_unique_first() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("g")]) + .group_by_stable([col("g")]) .agg([ col("v").unique().first().alias("v_first"), col("v").unique().sort(false).first().alias("true_first"), @@ -73,7 +73,7 @@ fn test_cumsum_agg_as_key() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("soil") + .group_by([col("soil") .neq(col("soil").shift_and_fill(1, col("soil").first())) .cumsum(false) .alias("key")]) @@ -100,7 +100,7 @@ fn test_auto_skew_kurtosis_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([ col("B").skew(false).alias("bskew"), col("B").kurtosis(false, false).alias("bkurt"), @@ -121,17 +121,17 @@ fn test_auto_list_agg() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("B").shift_and_fill(-1, lit(-1)).alias("foo")]) .collect()?; assert!(matches!(out.column("foo")?.dtype(), DataType::List(_))); - // test if it runs and groupby executor thus implements a list after shift_and_fill + // test if it runs and group_by executor thus implements a list after shift_and_fill let _out = df .clone() .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("B").shift_and_fill(-1, lit(-1))]) .collect()?; @@ -157,7 +157,7 @@ fn test_power_in_agg_list1() -> PolarsResult<()> { // a flat apply on a final aggregation let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([ col("A") .rolling_min(RollingOptions { @@ -199,7 +199,7 @@ fn test_power_in_agg_list2() -> PolarsResult<()> { // a flat apply on evaluate_on_groups let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("A") .rolling_min(RollingOptions { window_size: Duration::new(2), @@ -233,7 +233,7 @@ fn test_binary_agg_context_0() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").first().neq(lit(1))) .then(repeat(lit("a"), count())) .otherwise(repeat(lit("b"), count())) @@ -274,7 +274,7 @@ fn test_binary_agg_context_1() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").eq(lit(1))) .then(col("vals").sum()) .otherwise(lit(90)) @@ -295,7 +295,7 @@ fn test_binary_agg_context_1() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").eq(lit(1))) .then(lit(90)) .otherwise(col("vals").sum()) @@ -329,7 +329,7 @@ fn test_binary_agg_context_2() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([(col("vals").first() - col("vals")).alias("vals")]) .collect()?; @@ -347,7 +347,7 @@ fn 
test_binary_agg_context_2() -> PolarsResult<()> { // Same, but now we reverse the lhs / rhs. let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([((col("vals")) - col("vals").first()).alias("vals")]) .collect()?; @@ -371,7 +371,7 @@ fn test_binary_agg_context_3() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([(col("A") - col("A").first()).last().alias("last")]) .collect()?; @@ -391,7 +391,7 @@ fn test_shift_elementwise_issue_2509() -> PolarsResult<()> { let out = df .lazy() // Don't use maintain order here! That hides the bug - .groupby([col("x")]) + .group_by([col("x")]) .agg(&[(col("y").shift(-1) + col("x")).alias("sum")]) .sort("x", Default::default()) .collect()?; @@ -419,7 +419,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([col("book").take(col("count").arg_max()).alias("fav_book")]) .sort("user", Default::default()) .collect()?; @@ -432,7 +432,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([ // keep the head as it test slice correctness col("book") @@ -458,7 +458,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([col("book").take(lit(0)).alias("take_lit")]) .sort("user", Default::default()) .collect()?; @@ -493,7 +493,7 @@ fn test_take_consistency() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([col("A") .arg_sort(SortOptions { descending: true, @@ -510,7 +510,7 @@ fn test_take_consistency() -> PolarsResult<()> { let out_df = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([ col("A"), col("A") diff --git a/crates/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs index 77f7c4798ef3..12a2eb8ce9c8 100644 --- a/crates/polars-lazy/src/tests/arity.rs +++ b/crates/polars-lazy/src/tests/arity.rs @@ -12,7 +12,7 @@ fn test_pearson_corr() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("uid")]) + .group_by_stable([col("uid")]) // a double aggregation expression. .agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")]) .collect()?; @@ -22,7 +22,7 @@ fn test_pearson_corr() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("uid")]) + .group_by_stable([col("uid")]) // a double aggregation expression. 
.agg([pearson_corr(col("day"), col("cumcases"), 1) .pow(2.0) diff --git a/crates/polars-lazy/src/tests/logical.rs b/crates/polars-lazy/src/tests/logical.rs index 14dff0e1cd91..6fdef02c8b38 100644 --- a/crates/polars-lazy/src/tests/logical.rs +++ b/crates/polars-lazy/src/tests/logical.rs @@ -20,7 +20,7 @@ fn test_duration() -> PolarsResult<()> { .cast(DataType::Datetime(TimeUnit::Milliseconds, None)) .alias("datetime"), ) - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([ (col("date") - col("date").first()).alias("date"), (col("datetime") - col("datetime").first()).alias("datetime"), @@ -104,7 +104,7 @@ fn test_lazy_logical_plan_schema() { let lp = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([col("sepal.width").min()]) .logical_plan; assert!(lp.schema().unwrap().get("sepal.width").is_some()); diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index 4cd8f36737ac..ab6dcb57b177 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -202,16 +202,16 @@ pub fn test_slice_pushdown_join() -> PolarsResult<()> { } #[test] -pub fn test_slice_pushdown_groupby() -> PolarsResult<()> { +pub fn test_slice_pushdown_group_by() -> PolarsResult<()> { let _guard = SINGLE_LOCK.lock().unwrap(); let q = scan_foods_parquet(false).limit(100); let q = q - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("calories").sum()]) .slice(1, 3); - // test if optimization continued beyond the groupby node + // test if optimization continued beyond the group_by node assert!(slice_at_scan(q.clone())); let (mut expr_arena, mut lp_arena) = get_arenas(); @@ -392,7 +392,7 @@ fn test_with_row_count_opts() -> PolarsResult<()> { } #[test] -fn test_groupby_ternary_literal_predicate() -> PolarsResult<()> { +fn test_group_by_ternary_literal_predicate() -> PolarsResult<()> { let df = df![ "a" => [1, 2, 3], "b" => [1, 2, 3] @@ -402,7 +402,7 @@ fn test_groupby_ternary_literal_predicate() -> PolarsResult<()> { let q = df .clone() .lazy() - .groupby(["a"]) + .group_by(["a"]) .agg([when(lit(predicate)) .then(col("b").sum()) .otherwise(NULL.lit())]) @@ -527,14 +527,14 @@ fn test_with_column_prune() -> PolarsResult<()> { } #[test] -fn test_slice_at_scan_groupby() -> PolarsResult<()> { +fn test_slice_at_scan_group_by() -> PolarsResult<()> { let ldf = scan_foods_csv(); // this tests if slice pushdown restarts aggregation nodes (it did not) let q = ldf .slice(0, 5) .filter(col("calories").lt(lit(10))) - .groupby([col("calories")]) + .group_by([col("calories")]) .agg([col("fats_g").first()]) .select([col("fats_g")]); diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 2a6dc72fc4f9..4984f909de27 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -121,7 +121,7 @@ fn test_lazy_is_null() { let new = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([col("sepal.width").min()]) .collect() .unwrap(); @@ -135,7 +135,7 @@ fn test_lazy_pushdown_through_agg() { let df = get_df(); let new = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([ col("sepal.length").min(), col("petal.length").min().alias("foo"), @@ -234,7 +234,7 @@ fn test_lazy_query_2() { fn test_lazy_query_3() { // query checks if schema of scanning is not changed by aggregation let _ = scan_foods_csv() - .groupby([col("calories")]) + 
.group_by([col("calories")]) .agg([col("fats_g").max()]) .collect() .unwrap(); @@ -253,7 +253,7 @@ fn test_lazy_query_4() { let out = base_df .clone() - .groupby([col("uid")]) + .group_by([col("uid")]) .agg([ col("day").alias("day"), col("cumcases") @@ -290,7 +290,7 @@ fn test_lazy_query_5() { let out = df .lazy() - .groupby([col("uid")]) + .group_by([col("uid")]) .agg([col("day").head(Some(2))]) .collect() .unwrap(); @@ -370,7 +370,7 @@ fn test_lazy_query_9() -> PolarsResult<()> { [col("Cities.City")], JoinType::Inner.into(), ) - .groupby([col("Cities.Country")]) + .group_by([col("Cities.Country")]) .agg([col("Sales.Amount").sum().alias("sum")]) .sort("sum", Default::default()) .collect()?; @@ -561,7 +561,7 @@ fn test_lazy_wildcard() { let new = df .lazy() - .groupby([col("b")]) + .group_by([col("b")]) .agg([col("*").sum().suffix(""), col("*").first().suffix("_first")]) .collect() .unwrap(); @@ -652,7 +652,7 @@ fn test_lazy_partition_agg() { let out = df .lazy() - .groupby([col("foo")]) + .group_by([col("foo")]) .agg([col("bar").mean()]) .sort("foo", Default::default()) .collect() @@ -664,7 +664,7 @@ fn test_lazy_partition_agg() { ); let out = scan_foods_csv() - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("calories")]) .sort("category", Default::default()) .collect() @@ -687,11 +687,11 @@ fn test_lazy_partition_agg() { } #[test] -fn test_lazy_groupby_apply() { +fn test_lazy_group_by_apply() { let df = fruits_cars(); df.lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("cars").apply( |s: Series| Ok(Some(Series::new("", &[s.len() as u32]))), GetOutput::same_type(), @@ -733,7 +733,7 @@ fn test_lazy_shift_and_fill() { } #[test] -fn test_lazy_groupby() { +fn test_lazy_group_by() { let df = df! { "a" => &[Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)], "groups" => &["a", "a", "b", "c", "c"] @@ -742,7 +742,7 @@ fn test_lazy_groupby() { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("a").mean()]) .sort("a", Default::default()) .collect() @@ -763,7 +763,7 @@ fn test_lazy_tail() { } #[test] -fn test_lazy_groupby_sort() { +fn test_lazy_group_by_sort() { let df = df! { "a" => ["a", "b", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] @@ -773,7 +773,7 @@ fn test_lazy_groupby_sort() { let out = df .clone() .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort(false).first()]) .collect() .unwrap() @@ -787,7 +787,7 @@ fn test_lazy_groupby_sort() { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort(false).last()]) .collect() .unwrap() @@ -801,7 +801,7 @@ fn test_lazy_groupby_sort() { } #[test] -fn test_lazy_groupby_sort_by() { +fn test_lazy_group_by_sort_by() { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6], @@ -811,7 +811,7 @@ fn test_lazy_groupby_sort_by() { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort_by([col("c")], [true]).first()]) .collect() .unwrap() @@ -826,17 +826,17 @@ fn test_lazy_groupby_sort_by() { #[test] #[cfg(feature = "dtype-datetime")] -fn test_lazy_groupby_cast() { +fn test_lazy_group_by_cast() { let df = df! 
{ "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] } .unwrap(); - // test if it runs in groupby context + // test if it runs in group_by context let _out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b") .mean() .cast(DataType::Datetime(TimeUnit::Nanoseconds, None))]) @@ -845,17 +845,17 @@ fn test_lazy_groupby_cast() { } #[test] -fn test_lazy_groupby_binary_expr() { +fn test_lazy_group_by_binary_expr() { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] } .unwrap(); - // test if it runs in groupby context + // test if it runs in group_by context let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").mean() * lit(2)]) .sort("a", Default::default()) .collect() @@ -867,18 +867,18 @@ fn test_lazy_groupby_binary_expr() { } #[test] -fn test_lazy_groupby_filter() -> PolarsResult<()> { +fn test_lazy_group_by_filter() -> PolarsResult<()> { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] }?; - // We test if the filters work in the groupby context + // We test if the filters work in the group_by context // and that the aggregations can deal with empty sets let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([ col("b").filter(col("a").eq(lit("a"))).sum().alias("b_sum"), col("b") @@ -926,7 +926,7 @@ fn test_lazy_groupby_filter() -> PolarsResult<()> { } #[test] -fn test_groupby_projection_pd_same_column() -> PolarsResult<()> { +fn test_group_by_projection_pd_same_column() -> PolarsResult<()> { // this query failed when projection pushdown was enabled let a = || { @@ -952,7 +952,7 @@ fn test_groupby_projection_pd_same_column() -> PolarsResult<()> { } #[test] -fn test_groupby_sort_slice() -> PolarsResult<()> { +fn test_group_by_sort_slice() -> PolarsResult<()> { let df = df![ "groups" => [1, 2, 2, 3, 3, 3], "vals" => [1, 5, 6, 3, 9, 8] @@ -975,14 +975,14 @@ fn test_groupby_sort_slice() -> PolarsResult<()> { ..Default::default() }, ) - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").head(Some(2)).alias("foo")]) .sort("groups", SortOptions::default()) .collect()?; let out2 = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").sort(true).head(Some(2)).alias("foo")]) .sort("groups", SortOptions::default()) .collect()?; @@ -992,7 +992,7 @@ fn test_groupby_sort_slice() -> PolarsResult<()> { } #[test] -fn test_groupby_cumsum() -> PolarsResult<()> { +fn test_group_by_cumsum() -> PolarsResult<()> { let df = df![ "groups" => [1, 2, 2, 3, 3, 3], "vals" => [1, 5, 6, 3, 9, 8] @@ -1000,7 +1000,7 @@ fn test_groupby_cumsum() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").cumsum(false)]) .sort("groups", Default::default()) .collect()?; @@ -1059,7 +1059,7 @@ fn test_multiple_explode() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").alias("b_list"), col("c").alias("c_list")]) .explode([col("c_list"), col("b_list")]) .collect()?; @@ -1261,7 +1261,7 @@ fn test_sort_by() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("b")]) + .group_by_stable([col("b")]) .agg([col("a").sort_by([col("b"), col("c")], [false])]) .collect()?; let a = out.column("a")?.explode()?; @@ -1273,7 +1273,7 @@ fn test_sort_by() -> PolarsResult<()> { // evaluate_on_groups let out = df .lazy() - .groupby_stable([col("b")]) + .group_by_stable([col("b")]) .agg([col("a").sort_by([col("b"), col("c")], 
[false])]) .collect()?; @@ -1394,7 +1394,7 @@ fn test_filter_count() -> PolarsResult<()> { #[test] #[cfg(feature = "dtype-i16")] -fn test_groupby_small_ints() -> PolarsResult<()> { +fn test_group_by_small_ints() -> PolarsResult<()> { let df = df![ "id_32" => [1i32, 2], "id_16" => [1i16, 2] @@ -1403,7 +1403,7 @@ fn test_groupby_small_ints() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1255 let out = df .lazy() - .groupby([col("id_16"), col("id_32")]) + .group_by([col("id_16"), col("id_32")]) .agg([col("id_16").sum().alias("foo")]) .sort( "foo", @@ -1468,7 +1468,7 @@ fn test_round_after_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("A") .cast(DataType::Float32) .mean() @@ -1504,7 +1504,7 @@ fn test_round_after_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([((col("b") * col("c")).sum() / col("b").sum()) .round(2) .alias("foo")]) @@ -1549,11 +1549,11 @@ fn test_exclude_regex() -> PolarsResult<()> { #[test] #[cfg(feature = "rank")] -fn test_groupby_rank() -> PolarsResult<()> { +fn test_group_by_rank() -> PolarsResult<()> { let df = fruits_cars(); let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([col("B").rank( RankOptions { method: RankMethod::Dense, @@ -1703,7 +1703,7 @@ fn test_apply_flatten() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("B")]) + .group_by_stable([col("B")]) .agg([col("A").abs().sum().alias("A_sum")]) .collect()?; @@ -1723,7 +1723,7 @@ fn test_is_in() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("fruits")]) + .group_by_stable([col("fruits")]) .agg([col("cars").is_in(col("cars").filter(col("cars").eq(lit("beetle"))))]) .collect()?; let out = out.column("cars").unwrap(); @@ -1737,7 +1737,7 @@ fn test_is_in() -> PolarsResult<()> { // this will be executed by map let out = df .lazy() - .groupby_stable([col("fruits")]) + .group_by_stable([col("fruits")]) .agg([col("cars").is_in(lit(Series::new("a", ["beetle", "vw"])))]) .collect()?; @@ -1761,7 +1761,7 @@ fn test_partitioned_gb_1() -> PolarsResult<()> { "vals" => ["a", "b", "c", "a", "a"] ]? .lazy() - .groupby([col("keys")]) + .group_by([col("keys")]) .agg([ (col("vals").eq(lit("a"))).sum().alias("eq_a"), (col("vals").eq(lit("b"))).sum().alias("eq_b"), @@ -1785,7 +1785,7 @@ fn test_partitioned_gb_count() -> PolarsResult<()> { "col" => (0..100).map(|_| Some(0)).collect::().into_series(), ]? .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([ // we make sure to alias with a different name count().alias("counted"), @@ -1810,7 +1810,7 @@ fn test_partitioned_gb_mean() -> PolarsResult<()> { ]? 
.lazy() .with_columns([lit("a").alias("str"), lit(1).alias("int")]) - .groupby([col("key")]) + .group_by([col("key")]) .agg([ col("str").mean().alias("mean_str"), col("int").mean().alias("mean_int"), @@ -1836,7 +1836,7 @@ fn test_partitioned_gb_binary() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([(col("col") + lit(10)).sum().alias("sum")]) .collect()?; @@ -1847,7 +1847,7 @@ fn test_partitioned_gb_binary() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([(col("col").cast(DataType::Float32) + lit(10.0)) .sum() .alias("sum")]) @@ -1871,7 +1871,7 @@ fn test_partitioned_gb_ternary() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([when(col("val").gt(lit(10))) .then(lit(1)) .otherwise(lit(0)) diff --git a/crates/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs index a64c965a45bf..927beac8370a 100644 --- a/crates/polars-lazy/src/tests/streaming.rs +++ b/crates/polars-lazy/src/tests/streaming.rs @@ -37,7 +37,7 @@ fn test_streaming_parquet() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .sort("sugars_g", Default::default()); @@ -51,7 +51,7 @@ fn test_streaming_csv() -> PolarsResult<()> { let q = q .select([col("sugars_g"), col("calories")]) - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([col("calories").sum()]) .sort("sugars_g", Default::default()); @@ -95,7 +95,7 @@ fn test_streaming_multiple_keys_aggregate() -> PolarsResult<()> { let q = q .filter(col("sugars_g").gt(lit(10))) - .groupby([col("sugars_g"), col("calories")]) + .group_by([col("sugars_g"), col("calories")]) .agg([ (col("fats_g") * lit(10)).sum(), col("calories").mean().alias("cal_mean"), @@ -117,7 +117,7 @@ fn test_streaming_first_sum() -> PolarsResult<()> { let q = q .select([col("sugars_g"), col("calories")]) - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([ col("calories").sum(), col("calories").first().alias("calories_first"), @@ -146,7 +146,7 @@ fn test_streaming_aggregate_slice() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .slice(3, 3); @@ -311,7 +311,7 @@ fn test_streaming_aggregate_join() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .slice(0, 3); diff --git a/crates/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/tpch.rs index c5f876477f5c..929711c751c7 100644 --- a/crates/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/tpch.rs @@ -58,7 +58,7 @@ fn test_q2() -> PolarsResult<()> { .filter(col("p_type").str().ends_with(lit("BRASS".to_string()))); let q = q1 .clone() - .groupby([col("p_partkey")]) + .group_by([col("p_partkey")]) .agg([col("ps_supplycost").min()]) .join( q1, diff --git a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs index 93cf9afd8550..ebf7f0d1545d 100644 --- a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs +++ b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs @@ -9,7 +9,7 @@ use polars_arrow::kernels::take_agg::{ }; use 
polars_arrow::utils::CustomIterTools; use polars_core::export::num::Bounded; -use polars_core::frame::groupby::aggregations::{ +use polars_core::frame::group_by::aggregations::{ _agg_helper_idx, _agg_helper_slice, _rolling_apply_agg_window_no_nulls, _rolling_apply_agg_window_nulls, _slice_from_offsets, _use_rolling_kernels, }; diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 9867dede28d9..df020f010548 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -3,7 +3,7 @@ mod positioning; use std::borrow::Cow; use polars_core::export::rayon::prelude::*; -use polars_core::frame::groupby::expr::PhysicalAggExpr; +use polars_core::frame::group_by::expr::PhysicalAggExpr; use polars_core::prelude::*; use polars_core::utils::_split_offsets; use polars_core::{downcast_as_macro_arg_physical, POOL}; @@ -75,7 +75,7 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { /// /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. -/// If you have a relatively large table, consider using a groupby over a pivot. +/// If you have a relatively large table, consider using a group_by over a pivot. pub fn pivot( pivot_df: &DataFrame, values: I0, @@ -121,7 +121,7 @@ where /// /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. -/// If you have a relatively large table, consider using a groupby over a pivot. +/// If you have a relatively large table, consider using a group_by over a pivot. pub fn pivot_stable( pivot_df: &DataFrame, values: I0, @@ -167,12 +167,12 @@ where #[allow(clippy::too_many_arguments)] fn pivot_impl( pivot_df: &DataFrame, - // these columns will be aggregated in the nested groupby + // these columns will be aggregated in the nested group_by values: &[String], - // keys of the first groupby operation + // keys of the first group_by operation index: &[String], - // these columns will be used for a nested groupby - // the rows of this nested groupby will be pivoted as header column values + // these columns will be used for a nested group_by + // the rows of this nested group_by will be pivoted as header column values columns: &[String], // aggregation function agg_fn: Option, @@ -189,10 +189,10 @@ fn pivot_impl( let mut count = 0; let out: PolarsResult<()> = POOL.install(|| { for column_column_name in columns { - let mut groupby = index.to_vec(); - groupby.push(column_column_name.clone()); + let mut group_by = index.to_vec(); + group_by.push(column_column_name.clone()); - let groups = pivot_df.groupby_stable(groupby)?.take_groups(); + let groups = pivot_df.group_by_stable(group_by)?.take_groups(); // these are the row locations if !stable { diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index a69d4a0d29cd..3d68161a304e 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -1,4 +1,4 @@ -use polars_core::frame::groupby::GroupsIndicator; +use polars_core::frame::group_by::GroupsIndicator; use super::*; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs index d36085dea640..9e3276b9e0a6 100644 --- 
a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs @@ -11,13 +11,13 @@ use polars_plan::prelude::{AAggExpr, AExpr}; use polars_utils::arena::{Arena, Node}; use polars_utils::IdxSize; -use crate::executors::sinks::groupby::aggregates::count::CountAgg; -use crate::executors::sinks::groupby::aggregates::first::FirstAgg; -use crate::executors::sinks::groupby::aggregates::last::LastAgg; -use crate::executors::sinks::groupby::aggregates::mean::MeanAgg; -use crate::executors::sinks::groupby::aggregates::min_max::{new_max, new_min}; -use crate::executors::sinks::groupby::aggregates::null::NullAgg; -use crate::executors::sinks::groupby::aggregates::{AggregateFunction, SumAgg}; +use crate::executors::sinks::group_by::aggregates::count::CountAgg; +use crate::executors::sinks::group_by::aggregates::first::FirstAgg; +use crate::executors::sinks::group_by::aggregates::last::LastAgg; +use crate::executors::sinks::group_by::aggregates::mean::MeanAgg; +use crate::executors::sinks::group_by::aggregates::min_max::{new_max, new_min}; +use crate::executors::sinks::group_by::aggregates::null::NullAgg; +use crate::executors::sinks::group_by::aggregates::{AggregateFunction, SumAgg}; use crate::expressions::PhysicalPipedExpr; use crate::operators::DataChunk; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/count.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/count.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs index 6de591c44a4d..604502902d53 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs @@ -4,7 +4,7 @@ use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; use polars_utils::unwrap::UnwrapUncheckedRelease; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; use crate::operators::IdxSize; pub(crate) struct FirstAgg { diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs similarity index 89% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs index b56a8b9aee34..982506ecaabd 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs @@ -5,13 +5,13 @@ use enum_dispatch::enum_dispatch; use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; -use crate::executors::sinks::groupby::aggregates::count::CountAgg; -use crate::executors::sinks::groupby::aggregates::first::FirstAgg; -use crate::executors::sinks::groupby::aggregates::last::LastAgg; -use crate::executors::sinks::groupby::aggregates::mean::MeanAgg; -use crate::executors::sinks::groupby::aggregates::min_max::MinMaxAgg; -use 
crate::executors::sinks::groupby::aggregates::null::NullAgg; -use crate::executors::sinks::groupby::aggregates::SumAgg; +use crate::executors::sinks::group_by::aggregates::count::CountAgg; +use crate::executors::sinks::group_by::aggregates::first::FirstAgg; +use crate::executors::sinks::group_by::aggregates::last::LastAgg; +use crate::executors::sinks::group_by::aggregates::mean::MeanAgg; +use crate::executors::sinks::group_by::aggregates::min_max::MinMaxAgg; +use crate::executors::sinks::group_by::aggregates::null::NullAgg; +use crate::executors::sinks::group_by::aggregates::SumAgg; use crate::operators::IdxSize; #[enum_dispatch(AggregateFunction)] diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs index 27ddaf790c0b..08f211359064 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs @@ -4,7 +4,7 @@ use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; use polars_utils::unwrap::UnwrapUncheckedRelease; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; use crate::operators::IdxSize; pub(crate) struct LastAgg { diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/min_max.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/min_max.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mod.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/mod.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs similarity index 92% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs index 4846af4faf0b..768bcde96947 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs @@ -2,7 +2,7 @@ use std::any::Any; use polars_core::prelude::*; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; #[derive(Clone)] pub struct NullAgg(DataType); diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs rename to 
crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs index 73e7c762bf58..88a06239ac9a 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs @@ -24,7 +24,7 @@ pub(crate) use sink::GenericGroupby2; use thread_local::ThreadLocalTable; use super::*; -use crate::executors::sinks::groupby::aggregates::{AggregateFn, AggregateFunction}; +use crate::executors::sinks::group_by::aggregates::{AggregateFn, AggregateFunction}; use crate::executors::sinks::io::IOThread; use crate::operators::{DataChunk, FinalizedSink, PExecutionContext, Sink, SinkResult}; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs index 745b5dd99d95..1b8610c54251 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs @@ -53,14 +53,14 @@ pub(super) enum SpillAction { impl OocState { fn init_ooc(&mut self, spill_schema: Schema) -> PolarsResult<()> { if verbose() { - eprintln!("OOC groupby started"); + eprintln!("OOC group_by started"); } self.ooc = true; // start IO thread let mut iot = self.io_thread.lock().unwrap(); if iot.is_none() { - *iot = Some(IOThread::try_new(Arc::new(spill_schema), "groupby").unwrap()); + *iot = Some(IOThread::try_new(Arc::new(spill_schema), "group_by").unwrap()); } Ok(()) } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs index 93c0e0e24066..eb94bccc3839 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs @@ -3,9 +3,9 @@ use std::cell::UnsafeCell; use polars_core::utils::accumulate_dataframes_vertical_unchecked; use 
super::*; -use crate::executors::sinks::groupby::generic::global::GlobalTable; -use crate::executors::sinks::groupby::generic::ooc_state::{OocState, SpillAction}; -use crate::executors::sinks::groupby::generic::source::GroupBySource; +use crate::executors::sinks::group_by::generic::global::GlobalTable; +use crate::executors::sinks::group_by::generic::ooc_state::{OocState, SpillAction}; +use crate::executors::sinks::group_by::generic::source::GroupBySource; use crate::executors::sources::DataFrameSource; use crate::expressions::PhysicalPipedExpr; @@ -174,7 +174,7 @@ impl Sink for GenericGroupby2 { } fn fmt(&self) -> &str { - "generic-groupby" + "generic-group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs index d174e70998d9..bdb52235b3b7 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs @@ -3,7 +3,7 @@ use polars_io::ipc::IpcReader; use polars_io::SerReader; use super::*; -use crate::executors::sinks::groupby::generic::global::GlobalTable; +use crate::executors::sinks::group_by::generic::global::GlobalTable; use crate::executors::sinks::io::{block_thread_until_io_thread_done, IOThread}; use crate::operators::{Source, SourceResult}; use crate::pipeline::PARTITION_SIZE; @@ -26,7 +26,7 @@ impl GroupBySource { let io_thread = io_thread.take().unwrap(); if let Some(slice) = slice { - polars_ensure!(slice.0 >= 0, ComputeError: "negative slice not supported with out-of-core groupby") + polars_ensure!(slice.0 >= 0, ComputeError: "negative slice not supported with out-of-core group_by") } block_thread_until_io_thread_done(&io_thread); @@ -89,6 +89,6 @@ impl Source for GroupBySource { )])) } fn fmt(&self) -> &str { - "generic-groupby-source" + "generic-group_by-source" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/mod.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/ooc.rs b/crates/polars-pipe/src/executors/sinks/group_by/ooc.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/ooc.rs rename to crates/polars-pipe/src/executors/sinks/group_by/ooc.rs index 36c8b2b4ba14..1c57aad65cbd 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/ooc.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/ooc.rs @@ -12,7 +12,7 @@ pub(super) struct GroupBySource { _io_thread: IOThread, already_finished: Option, partitions: std::fs::ReadDir, - groupby_sink: Box, + group_by_sink: Box, chunk_idx: IdxSize, morsels_per_sink: usize, slice: Option<(usize, usize)>, @@ -22,14 +22,14 @@ impl GroupBySource { pub(super) fn new( io_thread: IOThread, already_finished: DataFrame, - groupby_sink: Box, + group_by_sink: Box, slice: Option<(i64, 
usize)>, ) -> PolarsResult { let partitions = std::fs::read_dir(&io_thread.dir)?; if let Some(slice) = slice { if slice.0 < 0 { - polars_bail!(ComputeError: "negative slice not supported with out-of-core groupby") + polars_bail!(ComputeError: "negative slice not supported with out-of-core group_by") } } @@ -37,7 +37,7 @@ impl GroupBySource { _io_thread: io_thread, already_finished: Some(already_finished), partitions, - groupby_sink, + group_by_sink, chunk_idx: 0, morsels_per_sink: morsels_per_sink(), slice: slice.map(|slice| (slice.0 as usize, slice.1)), @@ -81,9 +81,9 @@ impl Source for GroupBySource { }) .collect::>>()?; - // create a pipeline with a the files as sources and the groupby as sink + // create a pipeline with a the files as sources and the group_by as sink let mut pipe = - PipeLine::new_simple(sources, vec![], self.groupby_sink.split(0), verbose()); + PipeLine::new_simple(sources, vec![], self.group_by_sink.split(0), verbose()); match pipe.run_pipeline(context, Default::default())?.unwrap() { FinalizedSink::Finished(mut df) => { @@ -124,6 +124,6 @@ impl Source for GroupBySource { } fn fmt(&self) -> &str { - "ooc-groupby-source" + "ooc-group_by-source" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs similarity index 93% rename from crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs index c43f18b757be..c51af7e81b3e 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs @@ -33,14 +33,14 @@ impl OocState { pub(super) fn init_ooc(&mut self, input_schema: SchemaRef) -> PolarsResult<()> { if verbose() { - eprintln!("OOC groupby started"); + eprintln!("OOC group_by started"); } self.ooc = true; // start IO thread let mut iot = self.io_thread.lock().unwrap(); if iot.is_none() { - *iot = Some(IOThread::try_new(input_schema, "groupby")?) + *iot = Some(IOThread::try_new(input_schema, "group_by")?) 
} Ok(()) } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs index d5e103062cfa..3711969eda20 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs @@ -21,11 +21,11 @@ use polars_utils::unwrap::UnwrapUncheckedRelease; use rayon::prelude::*; use super::aggregates::AggregateFn; -use crate::executors::sinks::groupby::aggregates::AggregateFunction; -use crate::executors::sinks::groupby::ooc_state::OocState; -use crate::executors::sinks::groupby::physical_agg_to_logical; -use crate::executors::sinks::groupby::string::{apply_aggregate, write_agg_idx}; -use crate::executors::sinks::groupby::utils::{compute_slices, finalize_groupby}; +use crate::executors::sinks::group_by::aggregates::AggregateFunction; +use crate::executors::sinks::group_by::ooc_state::OocState; +use crate::executors::sinks::group_by::physical_agg_to_logical; +use crate::executors::sinks::group_by::string::{apply_aggregate, write_agg_idx}; +use crate::executors::sinks::group_by::utils::{compute_slices, finalize_group_by}; use crate::executors::sinks::io::IOThread; use crate::executors::sinks::utils::load_vec; use crate::executors::sinks::HASHMAP_INIT_SIZE; @@ -455,7 +455,7 @@ where let payload = if self.ooc_state.ooc { let mut iot = self.ooc_state.io_thread.lock().unwrap(); // make sure that we reset the shared states - // the OOC groupby will call split as well and it should + // the OOC group_by will call split as well and it should // not send continue spilling to disk let iot = iot.take().unwrap(); self.ooc_state.ooc = false; @@ -464,7 +464,7 @@ where } else { None }; - finalize_groupby(dfs, &self.output_schema, self.slice, payload) + finalize_group_by(dfs, &self.output_schema, self.slice, payload) } fn split(&self, thread_no: usize) -> Box { @@ -487,7 +487,7 @@ where self } fn fmt(&self) -> &str { - "primitive_groupby" + "primitive_group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/string.rs rename to crates/polars-pipe/src/executors/sinks/group_by/string.rs index 7c58a040895d..d8d0ae293bb2 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -15,11 +15,11 @@ use polars_utils::unwrap::UnwrapUncheckedRelease; use rayon::prelude::*; use super::aggregates::AggregateFn; -use crate::executors::sinks::groupby::aggregates::AggregateFunction; -use crate::executors::sinks::groupby::ooc_state::OocState; -use crate::executors::sinks::groupby::physical_agg_to_logical; -use crate::executors::sinks::groupby::primitive::apply_aggregation; -use crate::executors::sinks::groupby::utils::{compute_slices, finalize_groupby}; +use crate::executors::sinks::group_by::aggregates::AggregateFunction; +use crate::executors::sinks::group_by::ooc_state::OocState; +use crate::executors::sinks::group_by::physical_agg_to_logical; +use crate::executors::sinks::group_by::primitive::apply_aggregation; +use crate::executors::sinks::group_by::utils::{compute_slices, finalize_group_by}; use crate::executors::sinks::io::IOThread; use 
crate::executors::sinks::utils::load_vec; use crate::executors::sinks::HASHMAP_INIT_SIZE; @@ -497,7 +497,7 @@ impl Sink for Utf8GroupbySink { let payload = if self.ooc_state.ooc { let mut iot = self.ooc_state.io_thread.lock().unwrap(); // make sure that we reset the shared states - // the OOC groupby will call split as well and it should + // the OOC group_by will call split as well and it should // not send continue spilling to disk let iot = iot.take().unwrap(); self.ooc_state.ooc = false; @@ -506,14 +506,14 @@ impl Sink for Utf8GroupbySink { } else { None }; - finalize_groupby(dfs, &self.output_schema, self.slice, payload) + finalize_group_by(dfs, &self.output_schema, self.slice, payload) } fn as_any(&mut self) -> &mut dyn Any { self } fn fmt(&self) -> &str { - "utf8_groupby" + "utf8_group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/utils.rs b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/utils.rs rename to crates/polars-pipe/src/executors/sinks/group_by/utils.rs index e669fd4a8407..6120bcedcb3f 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/utils.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs @@ -2,7 +2,7 @@ use hashbrown::HashMap; use polars_core::prelude::*; use polars_core::utils::{accumulate_dataframes_vertical_unchecked, slice_offsets}; -use crate::executors::sinks::groupby::ooc::GroupBySource; +use crate::executors::sinks::group_by::ooc::GroupBySource; use crate::executors::sinks::io::{block_thread_until_io_thread_done, IOThread}; use crate::operators::{FinalizedSink, Sink}; @@ -50,7 +50,7 @@ pub(super) fn compute_slices( } } -pub(super) fn finalize_groupby( +pub(super) fn finalize_group_by( dfs: Vec, output_schema: &Schema, slice: Option<(i64, usize)>, diff --git a/crates/polars-pipe/src/executors/sinks/mod.rs b/crates/polars-pipe/src/executors/sinks/mod.rs index fb142566a6b9..328ab178a9e6 100644 --- a/crates/polars-pipe/src/executors/sinks/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/mod.rs @@ -1,6 +1,6 @@ #[cfg(any(feature = "parquet", feature = "ipc"))] mod file_sink; -pub(crate) mod groupby; +pub(crate) mod group_by; mod io; mod joins; mod memory; diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 7ee9f7620875..ff1ce7550c0d 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -8,8 +8,8 @@ use polars_core::with_match_physical_integer_polars_type; use polars_plan::prelude::*; use crate::executors::operators::HstackOperator; -use crate::executors::sinks::groupby::aggregates::convert_to_hash_agg; -use crate::executors::sinks::groupby::GenericGroupby2; +use crate::executors::sinks::group_by::aggregates::convert_to_hash_agg; +use crate::executors::sinks::group_by::GenericGroupby2; use crate::executors::sinks::*; use crate::executors::{operators, sources}; use crate::expressions::PhysicalPipedExpr; @@ -241,13 +241,13 @@ where (keys, aggs, input_schema.clone()) }, Some(keys) => { - let mut groupby_out_schema = Schema::with_capacity(input_schema.len()); + let mut group_by_out_schema = Schema::with_capacity(input_schema.len()); let key_names = PlHashSet::from_iter(keys.iter().map(|s| s.as_ref())); let keys = keys .iter() .map(|key| { let (_, name, dtype) = input_schema.get_full(key.as_str()).unwrap(); - groupby_out_schema.with_column(name.clone(), dtype.clone()); + group_by_out_schema.with_column(name.clone(), 
dtype.clone()); expr_arena.add(AExpr::Column(Arc::from(key.as_str()))) }) .collect(); @@ -260,7 +260,7 @@ where } else { let (_, name, dtype) = input_schema.get_full(name.as_str()).unwrap(); - groupby_out_schema.with_column(name.clone(), dtype.clone()); + group_by_out_schema.with_column(name.clone(), dtype.clone()); let col = expr_arena.add(AExpr::Column(Arc::from(name.as_str()))); Some(match options.keep_strategy { UniqueKeepStrategy::First | UniqueKeepStrategy::Any => { @@ -276,7 +276,7 @@ where } }) .collect(); - (keys, aggs, groupby_out_schema.into()) + (keys, aggs, group_by_out_schema.into()) }, }; @@ -300,7 +300,7 @@ where } let aggregation_columns = Arc::new(aggregation_columns); - let groupby_sink = Box::new(GenericGroupby2::new( + let group_by_sink = Box::new(GenericGroupby2::new( key_columns, aggregation_columns, Arc::from(agg_fns), @@ -309,7 +309,7 @@ where options.slice, )); - Box::new(ReProjectSink::new(input_schema, groupby_sink)) + Box::new(ReProjectSink::new(input_schema, group_by_sink)) }, Aggregate { input, @@ -356,7 +356,7 @@ where ) { (dt, 1) if dt.is_integer() => { with_match_physical_integer_polars_type!(dt, |$T| { - Box::new(groupby::PrimitiveGroupbySink::<$T>::new( + Box::new(group_by::PrimitiveGroupbySink::<$T>::new( key_columns[0].clone(), aggregation_columns, agg_fns, @@ -366,7 +366,7 @@ where )) as Box }) }, - (DataType::Utf8, 1) => Box::new(groupby::Utf8GroupbySink::new( + (DataType::Utf8, 1) => Box::new(group_by::Utf8GroupbySink::new( key_columns[0].clone(), aggregation_columns, agg_fns, diff --git a/crates/polars-pipe/src/pipeline/dispatcher.rs b/crates/polars-pipe/src/pipeline/dispatcher.rs index c9c5b75827f8..ce4b6b4bd4d8 100644 --- a/crates/polars-pipe/src/pipeline/dispatcher.rs +++ b/crates/polars-pipe/src/pipeline/dispatcher.rs @@ -36,7 +36,7 @@ use crate::pipeline::morsels_per_sink; /// /// - 3. One or more sinks /// A sink needs all data in scope to finalize a pipeline branch. -/// Think of sorts, preparing a build phase of a join, groupby + aggregations. +/// Think of sorts, preparing a build phase of a join, group_by + aggregations. /// /// This struct will have the SOS (source, operators, sinks) of its own pipeline branch, but also /// the SOS of other branches. 
The SOS are stored data oriented and the sinks have an offset that diff --git a/crates/polars-pipe/src/pipeline/mod.rs b/crates/polars-pipe/src/pipeline/mod.rs index 5e85ba469225..eced14cd6ece 100644 --- a/crates/polars-pipe/src/pipeline/mod.rs +++ b/crates/polars-pipe/src/pipeline/mod.rs @@ -7,7 +7,7 @@ pub use dispatcher::PipeLine; use polars_core::prelude::*; use polars_core::POOL; -pub use crate::executors::sinks::groupby::aggregates::can_convert_to_hash_agg; +pub use crate::executors::sinks::group_by::aggregates::can_convert_to_hash_agg; pub(crate) fn morsels_per_sink() -> usize { POOL.current_num_threads() diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index e385f9e89a23..9225ec0906fd 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -112,7 +112,7 @@ pct_change = ["polars-core/pct_change"] moment = ["polars-core/moment", "polars-ops/moment"] abs = ["polars-core/abs"] random = ["polars-core/random"] -dynamic_groupby = ["polars-core/dynamic_groupby"] +dynamic_group_by = ["polars-core/dynamic_group_by"] ewma = ["polars-core/ewma"] dot_diagram = [] unique_counts = ["polars-core/unique_counts"] diff --git a/crates/polars-plan/src/dsl/functions/arity.rs b/crates/polars-plan/src/dsl/functions/arity.rs index 1a865efed104..e37b158d19ee 100644 --- a/crates/polars-plan/src/dsl/functions/arity.rs +++ b/crates/polars-plan/src/dsl/functions/arity.rs @@ -22,7 +22,7 @@ where a.map_many(function, &[b], output_type) } -/// Like [`map_binary`], but used in a groupby-aggregation context. +/// Like [`map_binary`], but used in a group_by-aggregation context. /// /// See [`Expr::apply`] for the difference between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn apply_binary(a: Expr, b: Expr, f: F, output_type: GetOutput) -> Expr diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 4e3c8520851a..1d49cbe62a72 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -599,7 +599,7 @@ impl Expr { } } - /// Apply a function/closure over the groups. This should only be used in a groupby aggregation. + /// Apply a function/closure over the groups. This should only be used in a group_by aggregation. /// /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. @@ -637,7 +637,7 @@ impl Expr { } } - /// Apply a function/closure over the groups with many arguments. This should only be used in a groupby aggregation. + /// Apply a function/closure over the groups with many arguments. This should only be used in a group_by aggregation. /// /// See the [`Expr::apply`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn apply_many(self, function: F, arguments: &[Expr], output_type: GetOutput) -> Self @@ -867,7 +867,7 @@ impl Expr { } /// Apply window function over a subgroup. - /// This is similar to a groupby + aggregation + self join. + /// This is similar to a group_by + aggregation + self join. /// Or similar to [window functions in Postgres](https://www.postgresql.org/docs/9.1/tutorial-window.html). /// /// # Example @@ -1057,7 +1057,7 @@ impl Expr { } /// Sort this column by the ordering of another column. - /// Can also be used in a groupby context to sort the groups. + /// Can also be used in a group_by context to sort the groups. 
pub fn sort_by, IE: Into + Clone, R: AsRef<[bool]>>( self, by: E, @@ -1878,7 +1878,7 @@ where } } -/// Apply a function/closure over the groups of multiple columns. This should only be used in a groupby aggregation. +/// Apply a function/closure over the groups of multiple columns. This should only be used in a group_by aggregation. /// /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. diff --git a/crates/polars-plan/src/logical_plan/aexpr/mod.rs b/crates/polars-plan/src/logical_plan/aexpr/mod.rs index 9fbc72ba71aa..09227fc972e9 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/mod.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/mod.rs @@ -4,7 +4,7 @@ mod schema; use std::sync::Arc; use polars_arrow::prelude::QuantileInterpolOptions; -use polars_core::frame::groupby::GroupByMethod; +use polars_core::frame::group_by::GroupByMethod; use polars_core::prelude::*; use polars_core::utils::{get_time_units, try_get_supertype}; use polars_utils::arena::{Arena, Node}; diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index faa1371b9f30..902ef9b6b91d 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -546,14 +546,14 @@ impl LogicalPlanBuilder { .into() } - pub fn groupby>( + pub fn group_by>( self, keys: Vec, aggs: E, apply: Option>, maintain_order: bool, - #[cfg(feature = "dynamic_groupby")] dynamic_options: Option, - #[cfg(feature = "dynamic_groupby")] rolling_options: Option, + #[cfg(feature = "dynamic_group_by")] dynamic_options: Option, + #[cfg(feature = "dynamic_group_by")] rolling_options: Option, ) -> Self { let current_schema = try_delayed!(self.0.schema(), &self.0, into); let current_schema = current_schema.as_ref(); @@ -594,7 +594,7 @@ impl LogicalPlanBuilder { try_delayed!(check_names(), &self.0, into) } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { let index_columns = &[ rolling_options @@ -616,14 +616,14 @@ impl LogicalPlanBuilder { } } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let options = GroupbyOptions { dynamic: dynamic_options, rolling: rolling_options, slice: None, }; - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let options = GroupbyOptions { slice: None }; LogicalPlan::Aggregate { diff --git a/crates/polars-plan/src/logical_plan/builder_alp.rs b/crates/polars-plan/src/logical_plan/builder_alp.rs index 199dfcd921d2..b5ce0d863b73 100644 --- a/crates/polars-plan/src/logical_plan/builder_alp.rs +++ b/crates/polars-plan/src/logical_plan/builder_alp.rs @@ -121,7 +121,7 @@ impl<'a> ALogicalPlanBuilder<'a> { self.add_alp(lp) } - pub fn groupby( + pub fn group_by( self, keys: Vec, aggs: Vec, @@ -143,7 +143,7 @@ impl<'a> ALogicalPlanBuilder<'a> { ); schema.merge(other); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { let index_columns = &[ options diff --git a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs index aa0e44670d6b..4ea6c3de5327 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs @@ -136,7 +136,7 @@ enum VisitRecord { // The `bool` indicates if this expression is valid. 
// This can be `AND` accumulated by the lineage of the expression to determine // of the whole expression can be added. - // For instance a in a groupby we only want to use elementwise operation in cse: + // For instance a in a group_by we only want to use elementwise operation in cse: // - `(col("a") * 2).sum(), (col("a") * 2)` -> we want to do `col("a") * 2` on a `with_columns` // - `col("a").sum() * col("a").sum()` -> we don't want `sum` to run on `with_columns` // as that doesn't have groups context. If we encounter a `sum` it should be flagged as `false` @@ -210,7 +210,7 @@ struct ExprIdentifierVisitor<'a> { // whether the expression replaced a subexpression has_sub_expr: bool, // During aggregation we only identify element-wise operations - is_groupby: bool, + is_group_by: bool, } impl ExprIdentifierVisitor<'_> { @@ -218,7 +218,7 @@ impl ExprIdentifierVisitor<'_> { se_count: &'a mut SubExprCount, identifier_array: &'a mut IdentifierArray, visit_stack: &'a mut Vec, - is_groupby: bool, + is_group_by: bool, ) -> ExprIdentifierVisitor<'a> { let id_array_offset = identifier_array.len(); ExprIdentifierVisitor { @@ -229,7 +229,7 @@ impl ExprIdentifierVisitor<'_> { visit_stack, id_array_offset, has_sub_expr: false, - is_groupby, + is_group_by, } } @@ -274,7 +274,7 @@ impl ExprIdentifierVisitor<'_> { // during aggregation we only store elementwise operation in the state // other operations we cannot add to the state as they have the output size of the // groups, not the original dataframe - if self.is_groupby { + if self.is_group_by { match ae { AExpr::Agg(_) | AExpr::AnonymousFunction { .. } => { Some((VisitRecursion::Continue, false)) @@ -528,13 +528,13 @@ impl<'a> CommonSubExprOptimizer<'a> { fn visit_expression( &mut self, ae_node: AexprNode, - is_groupby: bool, + is_group_by: bool, ) -> PolarsResult<(usize, bool)> { let mut visitor = ExprIdentifierVisitor::new( &mut self.se_count, &mut self.id_array, &mut self.visit_stack, - is_groupby, + is_group_by, ); ae_node.visit(&mut visitor).map(|_| ())?; Ok((visitor.id_array_offset, visitor.has_sub_expr)) @@ -563,7 +563,7 @@ impl<'a> CommonSubExprOptimizer<'a> { expr: &[Node], expr_arena: &mut Arena, id_array_offsets: &mut Vec, - is_groupby: bool, + is_group_by: bool, schema: &Schema, ) -> PolarsResult> { let mut has_sub_expr = false; @@ -577,7 +577,7 @@ impl<'a> CommonSubExprOptimizer<'a> { // visit expressions and collect sub-expression counts let (id_array_offset, this_expr_has_se) = AexprNode::with_context(*node, expr_arena, |ae_node| { - self.visit_expression(ae_node, is_groupby) + self.visit_expression(ae_node, is_group_by) })?; id_array_offsets.push(id_array_offset as u32); has_sub_expr |= this_expr_has_se; diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 22ad1e3e2366..58179b8aae8b 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -123,7 +123,7 @@ pub(super) fn predicate_is_pushdown_boundary(node: Node, expr_arena: &Arena true, // The series might be used in a comparison with exactly the right length diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs similarity index 95% rename from crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs rename to 
crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs index 09586b88490c..aaeca3ae7f79 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs @@ -1,7 +1,7 @@ use super::*; #[allow(clippy::too_many_arguments)] -pub(super) fn process_groupby( +pub(super) fn process_group_by( proj_pd: &mut ProjectionPushDown, input: Node, keys: Vec, @@ -66,13 +66,13 @@ pub(super) fn process_groupby( } // make sure that the dynamic key is projected - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.dynamic { let node = expr_arena.add(AExpr::Column(Arc::from(options.index_column.as_str()))); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); } // make sure that the rolling key is projected - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.rolling { let node = expr_arena.add(AExpr::Column(Arc::from(options.index_column.as_str()))); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); @@ -87,7 +87,7 @@ pub(super) fn process_groupby( expr_arena, )?; - let builder = ALogicalPlanBuilder::new(input, expr_arena, lp_arena).groupby( + let builder = ALogicalPlanBuilder::new(input, expr_arena, lp_arena).group_by( keys, projected_aggs, apply, diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs index edf8b6edbdb7..127e9ac14c95 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs @@ -1,6 +1,6 @@ mod functions; mod generic; -mod groupby; +mod group_by; mod hstack; mod joins; mod projection; @@ -17,7 +17,7 @@ use semi_anti_join::process_semi_anti_join; use crate::logical_plan::Context; use crate::prelude::iterator::ArenaExprIter; use crate::prelude::optimizer::projection_pushdown::generic::process_generic; -use crate::prelude::optimizer::projection_pushdown::groupby::process_groupby; +use crate::prelude::optimizer::projection_pushdown::group_by::process_group_by; use crate::prelude::optimizer::projection_pushdown::hstack::process_hstack; use crate::prelude::optimizer::projection_pushdown::joins::process_join; use crate::prelude::optimizer::projection_pushdown::projection::process_projection; @@ -580,7 +580,7 @@ impl ProjectionPushDown { schema, maintain_order, options, - } => process_groupby( + } => process_group_by( self, input, keys, diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs index 89db0560bcc6..423b699ef240 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs @@ -282,7 +282,7 @@ pub(super) fn process_binary( // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. 
To be able to cast the groups need to be // flattened let new_node_left = if type_left != st { expr_arena.add(AExpr::Cast { diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs index 5ae5f8c9eda1..ba92141239fe 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs @@ -298,7 +298,7 @@ impl OptimizationRule for TypeCoercionRule { // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. To be able to cast the groups need to be // flattened let new_node_truthy = if type_true != st { expr_arena.add(AExpr::Cast { @@ -462,7 +462,7 @@ impl OptimizationRule for TypeCoercionRule { } // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. To be able to cast the groups need to be // flattened let new_node_self = if type_self != super_type { expr_arena.add(AExpr::Cast { diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index fd653f3764c1..9aef73892951 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -8,7 +8,7 @@ use polars_io::ipc::IpcCompression; #[cfg(feature = "parquet")] use polars_io::parquet::ParquetCompression; use polars_io::RowCount; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] use polars_time::{DynamicGroupOptions, RollingGroupOptions}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -105,9 +105,9 @@ pub struct UnionOptions { #[derive(Clone, Debug, PartialEq, Eq, Default)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct GroupbyOptions { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub dynamic: Option, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub rolling: Option, /// Take only a slice of the result pub slice: Option<(i64, usize)>, @@ -195,7 +195,7 @@ pub struct FunctionOptions { // If set to `false` the physical engine will ensure the left input // expression is the output name. pub allow_rename: bool, - // if set, then the `Series` passed to the function in the groupby operation + // if set, then the `Series` passed to the function in the group_by operation // will ensure the name is set. This is an extra heap allocation per group. 
pub pass_name_to_apply: bool, // For example a `unique` or a `slice` diff --git a/crates/polars-plan/src/logical_plan/projection.rs b/crates/polars-plan/src/logical_plan/projection.rs index aa6b5fef739c..fd467fe56496 100644 --- a/crates/polars-plan/src/logical_plan/projection.rs +++ b/crates/polars-plan/src/logical_plan/projection.rs @@ -353,7 +353,7 @@ fn prepare_excluded( } } - // exclude groupby keys + // exclude group_by keys for mut expr in keys.iter() { // Allow a number of aliases of a column expression, still exclude column from aggregation loop { diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 39102198e72c..acba55bd4c62 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -317,7 +317,7 @@ impl SQLContext { // Check for group by // After projection since there might be number. - let groupby_keys: Vec = select_stmt + let group_by_keys: Vec = select_stmt .group_by .iter() .map(|e| match e { @@ -325,7 +325,7 @@ impl SQLContext { let idx = match idx.parse::() { Ok(0) | Err(_) => Err(polars_err!( ComputeError: - "groupby error: a positive number or an expression expected, got {}", + "group_by error: a positive number or an expression expected, got {}", idx )), Ok(idx) => Ok(idx), @@ -334,16 +334,16 @@ impl SQLContext { }, SqlExpr::Value(_) => Err(polars_err!( ComputeError: - "groupby error: a positive number or an expression expected", + "group_by error: a positive number or an expression expected", )), _ => parse_sql_expr(e, self), }) .collect::>()?; - if groupby_keys.is_empty() { + if group_by_keys.is_empty() { lf = lf.select(projections) } else { - lf = self.process_groupby(lf, contains_wildcard, &groupby_keys, &projections)?; + lf = self.process_group_by(lf, contains_wildcard, &group_by_keys, &projections)?; // Apply optional 'having' clause, post-aggregation lf = match select_stmt.having.as_ref() { @@ -481,31 +481,31 @@ impl SQLContext { Ok(lf.sort_by_exprs(&by, descending, false, false)) } - fn process_groupby( + fn process_group_by( &mut self, lf: LazyFrame, contains_wildcard: bool, - groupby_keys: &[Expr], + group_by_keys: &[Expr], projections: &[Expr], ) -> PolarsResult { - // check groupby and projection due to difference between SQL and polars + // check group_by and projection due to difference between SQL and polars // Return error on wild card, shouldn't process this polars_ensure!( !contains_wildcard, - ComputeError: "groupby error: can't process wildcard in groupby" + ComputeError: "group_by error: can't process wildcard in group_by" ); let schema_before = lf.schema()?; - let groupby_keys_schema = - expressions_to_schema(groupby_keys, &schema_before, Context::Default)?; + let group_by_keys_schema = + expressions_to_schema(group_by_keys, &schema_before, Context::Default)?; - // remove the groupby keys as polars adds those implicitly + // remove the group_by keys as polars adds those implicitly let mut aggregation_projection = Vec::with_capacity(projections.len()); let mut aliases: BTreeSet<&str> = BTreeSet::new(); for mut e in projections { // if it is a simple expression & has alias, - // we must defer the aliasing until after the groupby + // we must defer the aliasing until after the group_by if e.clone().meta().is_simple_projection() { if let Expr::Alias(expr, name) = e { aliases.insert(name); @@ -514,12 +514,12 @@ impl SQLContext { } let field = e.to_field(&schema_before, Context::Default)?; - if groupby_keys_schema.get(&field.name).is_none() { + if 
group_by_keys_schema.get(&field.name).is_none() { aggregation_projection.push(e.clone()) } } - let aggregated = lf.groupby(groupby_keys).agg(&aggregation_projection); + let aggregated = lf.group_by(group_by_keys).agg(&aggregation_projection); let projection_schema = expressions_to_schema(projections, &schema_before, Context::Default)?; // a final projection to get the proper order @@ -527,7 +527,7 @@ impl SQLContext { .iter_names() .zip(projections) .map(|(name, projection_expr)| { - if groupby_keys_schema.get(name).is_some() || aliases.contains(name.as_str()) { + if group_by_keys_schema.get(name).is_some() || aliases.contains(name.as_str()) { projection_expr.clone() } else { col(name) diff --git a/crates/polars-sql/tests/iss_7437.rs b/crates/polars-sql/tests/iss_7437.rs index a8479d92691e..29229ba5c4c6 100644 --- a/crates/polars-sql/tests/iss_7437.rs +++ b/crates/polars-sql/tests/iss_7437.rs @@ -25,7 +25,7 @@ fn iss_7437() -> PolarsResult<()> { let expected = LazyCsvReader::new("../../examples/datasets/foods1.csv") .finish()? - .groupby(vec![col("category").alias("category")]) + .group_by(vec![col("category").alias("category")]) .agg(vec![]) .collect()? .sort(["category"], vec![false], false)?; diff --git a/crates/polars-sql/tests/ops_distinct_on.rs b/crates/polars-sql/tests/ops_distinct_on.rs index 9497cf63c530..4adc2ab75b45 100644 --- a/crates/polars-sql/tests/ops_distinct_on.rs +++ b/crates/polars-sql/tests/ops_distinct_on.rs @@ -34,7 +34,7 @@ fn test_distinct_on() { true, false, ) - .groupby_stable(vec![col("Name")]) + .group_by_stable(vec![col("Name")]) .agg(vec![col("*").first()]); let expected = expected.collect().unwrap(); assert!(actual.frame_equal(&expected)) diff --git a/crates/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs index 68653d8def23..5ea4937f6995 100644 --- a/crates/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -54,7 +54,7 @@ fn test_nested_expr() -> PolarsResult<()> { Ok(()) } #[test] -fn test_groupby_simple() -> PolarsResult<()> { +fn test_group_by_simple() -> PolarsResult<()> { let df = create_sample_df()?; let mut context = SQLContext::new(); context.register("df", df.clone().lazy()); @@ -78,7 +78,7 @@ fn test_groupby_simple() -> PolarsResult<()> { .collect()?; let df_pl = df .lazy() - .groupby(&[col("a")]) + .group_by(&[col("a")]) .agg(&[ col("b").sum().alias("b"), (col("a") + col("b")).sum().alias("c"), @@ -463,7 +463,7 @@ fn test_ctes() -> PolarsResult<()> { #[test] #[cfg(feature = "ipc")] -fn test_groupby_2() -> PolarsResult<()> { +fn test_group_by_2() -> PolarsResult<()> { let mut context = SQLContext::new(); let sql = r#" CREATE TABLE foods AS @@ -486,7 +486,7 @@ fn test_groupby_2() -> PolarsResult<()> { let df_sql = df_sql.collect()?; let expected = LazyFrame::scan_ipc("../../examples/datasets/foods1.ipc", Default::default())? 
.select(&[col("*")]) - .groupby(vec![col("category")]) + .group_by(vec![col("category")]) .agg(vec![ col("category").count().alias("count"), col("calories").max(), diff --git a/crates/polars-time/src/chunkedarray/rolling_window/mod.rs b/crates/polars-time/src/chunkedarray/rolling_window/mod.rs index ead6d9012eb9..dbb3e07d18e6 100644 --- a/crates/polars-time/src/chunkedarray/rolling_window/mod.rs +++ b/crates/polars-time/src/chunkedarray/rolling_window/mod.rs @@ -282,7 +282,7 @@ where }) } else { if arr.null_count() > 0 { - panic!("'rolling by' not yet supported for series with null values, consider using 'groupby_rolling'") + panic!("'rolling by' not yet supported for series with null values, consider using 'group_by_rolling'") } let values = arr.values().as_slice(); let duration = options.window_size; @@ -291,7 +291,7 @@ where let by = options.by.unwrap(); let closed_window = options.closed_window.expect("closed window must be set"); let func = rolling_agg_fn_dynamic.expect( - "'rolling by' not yet supported for this expression, consider using 'groupby_rolling'", + "'rolling by' not yet supported for this expression, consider using 'group_by_rolling'", ); func( diff --git a/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs index 79a349009ead..3894a69dc3f1 100644 --- a/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs +++ b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs @@ -61,8 +61,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -82,8 +82,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -103,8 +103,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -124,8 +124,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -145,8 +145,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, 
closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, params) } @@ -166,8 +166,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, params) } diff --git a/crates/polars-time/src/groupby/dynamic.rs b/crates/polars-time/src/group_by/dynamic.rs similarity index 94% rename from crates/polars-time/src/groupby/dynamic.rs rename to crates/polars-time/src/group_by/dynamic.rs index c08248c3ae77..56e2bb969f34 100644 --- a/crates/polars-time/src/groupby/dynamic.rs +++ b/crates/polars-time/src/group_by/dynamic.rs @@ -1,7 +1,7 @@ use polars_arrow::time_zone::Tz; use polars_arrow::utils::CustomIterTools; use polars_core::export::rayon::prelude::*; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::ensure_sorted_arg; @@ -92,13 +92,13 @@ const LB_NAME: &str = "_lower_boundary"; const UP_NAME: &str = "_upper_boundary"; pub trait PolarsTemporalGroupby { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)>; - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, @@ -106,25 +106,25 @@ pub trait PolarsTemporalGroupby { } impl PolarsTemporalGroupby for DataFrame { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)> { - Wrap(self).groupby_rolling(by, options) + Wrap(self).group_by_rolling(by, options) } - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)> { - Wrap(self).groupby_dynamic(by, options) + Wrap(self).group_by_dynamic(by, options) } } impl Wrap<&DataFrame> { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, @@ -138,11 +138,11 @@ impl Wrap<&DataFrame> { if by.is_empty() { // if by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized - ensure_sorted_arg(&time, "groupby_rolling")?; + ensure_sorted_arg(&time, "group_by_rolling")?; } let time_type = time.dtype(); - polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic groupby not supported, fill nulls."); + polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic group_by not supported, fill nulls."); use DataType::*; let (dt, tu, tz): (Series, TimeUnit, Option) = match time_type { @@ -155,7 +155,7 @@ impl Wrap<&DataFrame> { Int32 => { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&Int64).unwrap().cast(&time_type).unwrap(); - let (out, by, gt) = self.impl_groupby_rolling( + let (out, by, gt) = self.impl_group_by_rolling( dt, by, options, @@ -169,7 +169,7 @@ impl Wrap<&DataFrame> { Int64 => { let time_type = 
Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&time_type).unwrap(); - let (out, by, gt) = self.impl_groupby_rolling( + let (out, by, gt) = self.impl_group_by_rolling( dt, by, options, @@ -189,14 +189,14 @@ impl Wrap<&DataFrame> { match tz { #[cfg(feature = "timezones")] Some(tz) => { - self.impl_groupby_rolling(dt, by, options, tu, tz.parse::().ok(), time_type) + self.impl_group_by_rolling(dt, by, options, tu, tz.parse::().ok(), time_type) }, - _ => self.impl_groupby_rolling(dt, by, options, tu, None, time_type), + _ => self.impl_group_by_rolling(dt, by, options, tu, None, time_type), } } /// Returns: time_keys, keys, groupsproxy - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, @@ -214,11 +214,11 @@ impl Wrap<&DataFrame> { if by.is_empty() { // if by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized - ensure_sorted_arg(&time, "groupby_dynamic")?; + ensure_sorted_arg(&time, "group_by_dynamic")?; } let time_type = time.dtype(); - polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic groupby not supported, fill nulls."); + polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic group_by not supported, fill nulls."); use DataType::*; let (dt, tu) = match time_type { @@ -231,7 +231,7 @@ impl Wrap<&DataFrame> { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&Int64).unwrap().cast(&time_type).unwrap(); let (out, mut keys, gt) = - self.impl_groupby_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; + self.impl_group_by_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; let out = out.cast(&Int64).unwrap().cast(&Int32).unwrap(); for k in &mut keys { if k.name() == UP_NAME || k.name() == LB_NAME { @@ -244,7 +244,7 @@ impl Wrap<&DataFrame> { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&time_type).unwrap(); let (out, mut keys, gt) = - self.impl_groupby_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; + self.impl_group_by_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; let out = out.cast(&Int64).unwrap(); for k in &mut keys { if k.name() == UP_NAME || k.name() == LB_NAME { @@ -259,10 +259,10 @@ impl Wrap<&DataFrame> { dt ), }; - self.impl_groupby_dynamic(dt, by, options, tu, time_type) + self.impl_group_by_dynamic(dt, by, options, tu, time_type) } - fn impl_groupby_dynamic( + fn impl_group_by_dynamic( &self, mut dt: Series, mut by: Vec, @@ -313,7 +313,7 @@ impl Wrap<&DataFrame> { let groups = if by.is_empty() { let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (groups, lower, upper) = groupby_windows( + let (groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -331,7 +331,7 @@ impl Wrap<&DataFrame> { } else { let groups = self .0 - .groupby_with_series(by.clone(), true, true)? + .group_by_with_series(by.clone(), true, true)? .take_groups(); // include boundaries cannot be parallel (easily) @@ -349,7 +349,7 @@ impl Wrap<&DataFrame> { { check_sortedness_slice(ts)? 
} - let (sub_groups, lower, upper) = groupby_windows( + let (sub_groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -383,7 +383,7 @@ impl Wrap<&DataFrame> { let dt = dt.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (sub_groups, lower, upper) = groupby_windows( + let (sub_groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -428,7 +428,7 @@ impl Wrap<&DataFrame> { { check_sortedness_slice(ts)? } - let (sub_groups, _, _) = groupby_windows( + let (sub_groups, _, _) = group_by_windows( w, ts, options.closed_window, @@ -450,7 +450,7 @@ impl Wrap<&DataFrame> { let dt = dt.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (sub_groups, _, _) = groupby_windows( + let (sub_groups, _, _) = group_by_windows( w, ts, options.closed_window, @@ -516,7 +516,7 @@ impl Wrap<&DataFrame> { } /// Returns: time_keys, keys, groupsproxy - fn impl_groupby_rolling( + fn impl_group_by_rolling( &self, dt: Series, by: Vec, @@ -535,7 +535,7 @@ impl Wrap<&DataFrame> { let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); PolarsResult::Ok(GroupsProxy::Slice { - groups: groupby_values( + groups: group_by_values( options.period, options.offset, ts, @@ -548,7 +548,7 @@ impl Wrap<&DataFrame> { } else { let groups = self .0 - .groupby_with_series(by.clone(), true, true)? + .group_by_with_series(by.clone(), true, true)? .take_groups(); // we keep a local copy, as we are reordering on next operation. @@ -573,7 +573,7 @@ impl Wrap<&DataFrame> { check_sortedness_slice(ts)? } - let sub_groups = groupby_values( + let sub_groups = group_by_values( options.period, options.offset, ts, @@ -594,7 +594,7 @@ impl Wrap<&DataFrame> { let dt = dt_local.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let sub_groups = groupby_values( + let sub_groups = group_by_values( options.period, options.offset, ts, @@ -641,7 +641,7 @@ fn update_subgroups_idx( let new_first = if len == 0 { // in case the group is empty // keep the original first so that the - // groupby keys still point to the original group + // group_by keys still point to the original group base_g.0 } else { unsafe { *base_g.1.get_unchecked_release(first as usize) } @@ -664,7 +664,7 @@ mod test { use super::*; #[test] - fn test_rolling_groupby_tu() -> PolarsResult<()> { + fn test_rolling_group_by_tu() -> PolarsResult<()> { // test multiple time units for tu in [ TimeUnit::Nanoseconds, @@ -689,7 +689,7 @@ mod test { let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df - .groupby_rolling( + .group_by_rolling( vec![], &RollingGroupOptions { index_column: "dt".into(), @@ -710,7 +710,7 @@ mod test { } #[test] - fn test_rolling_groupby_aggs() -> PolarsResult<()> { + fn test_rolling_group_by_aggs() -> PolarsResult<()> { let mut date = Utf8Chunked::new( "dt", [ @@ -730,7 +730,7 @@ mod test { let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df - .groupby_rolling( + .group_by_rolling( vec![], &RollingGroupOptions { index_column: "dt".into(), @@ -782,7 +782,7 @@ mod test { } #[test] - fn test_dynamic_groupby_window() -> PolarsResult<()> { + fn test_dynamic_group_by_window() -> PolarsResult<()> { let start = NaiveDate::from_ymd_opt(2021, 12, 16) .unwrap() .and_hms_opt(0, 0, 0) @@ -808,7 +808,7 @@ mod test { let df = 
DataFrame::new(vec![range, groups.clone()]).unwrap(); let (time_key, mut keys, groups) = df - .groupby_dynamic( + .group_by_dynamic( vec![groups], &DynamicGroupOptions { index_column: "date".into(), @@ -923,7 +923,7 @@ mod test { let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (mut time_key, keys, _groups) = df - .groupby_dynamic( + .group_by_dynamic( vec![groups], &DynamicGroupOptions { index_column: "date".into(), diff --git a/crates/polars-time/src/groupby/mod.rs b/crates/polars-time/src/group_by/mod.rs similarity index 100% rename from crates/polars-time/src/groupby/mod.rs rename to crates/polars-time/src/group_by/mod.rs diff --git a/crates/polars-time/src/lib.rs b/crates/polars-time/src/lib.rs index 25b2e0704be5..b2162e26a740 100644 --- a/crates/polars-time/src/lib.rs +++ b/crates/polars-time/src/lib.rs @@ -3,7 +3,7 @@ mod base_utc_offset; pub mod chunkedarray; mod date_range; mod dst_offset; -mod groupby; +mod group_by; mod month_end; mod month_start; pub mod prelude; @@ -20,7 +20,7 @@ pub use date_range::*; #[cfg(feature = "timezones")] pub use dst_offset::*; #[cfg(any(feature = "dtype-date", feature = "dtype-datetime"))] -pub use groupby::dynamic::*; +pub use group_by::dynamic::*; pub use month_end::*; pub use month_start::*; pub use round::*; @@ -28,5 +28,5 @@ pub use truncate::*; pub use upsample::*; pub use windows::calendar::temporal_range as temporal_range_vec; pub use windows::duration::Duration; -pub use windows::groupby::ClosedWindow; +pub use windows::group_by::ClosedWindow; pub use windows::window::Window; diff --git a/crates/polars-time/src/prelude.rs b/crates/polars-time/src/prelude.rs index b59367b09453..aa9a2a1d5b21 100644 --- a/crates/polars-time/src/prelude.rs +++ b/crates/polars-time/src/prelude.rs @@ -4,6 +4,6 @@ pub use crate::chunkedarray::*; pub use crate::series::{SeriesOpsTime, TemporalMethods}; pub use crate::windows::bounds::*; pub use crate::windows::duration::*; -pub use crate::windows::groupby::*; +pub use crate::windows::group_by::*; pub use crate::windows::window::*; pub use crate::*; diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index 65c7b69750b5..d6cde94ee98d 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -140,9 +140,9 @@ fn upsample_impl( upsample_single_impl(source, index_column, every, offset) } else { let gb = if stable { - source.groupby_stable(by) + source.group_by_stable(by) } else { - source.groupby(by) + source.group_by(by) }; // don't parallelize this, this may SO on large data. gb?.apply(|df| { diff --git a/crates/polars-time/src/windows/bounds.rs b/crates/polars-time/src/windows/bounds.rs index 64af87b61f8a..c3699be2b278 100644 --- a/crates/polars-time/src/windows/bounds.rs +++ b/crates/polars-time/src/windows/bounds.rs @@ -1,4 +1,4 @@ -use super::groupby::ClosedWindow; +use super::group_by::ClosedWindow; #[derive(Copy, Clone, Debug)] pub struct Bounds { @@ -12,7 +12,7 @@ impl Bounds { assert!( start <= stop, "boundary start must be smaller than stop; is your time column sorted in ascending order?\ - \nIf you did a groupby, note that null values are a separate group." + \nIf you did a group_by, note that null values are a separate group." 
); Self::new(start, stop) } diff --git a/crates/polars-time/src/windows/groupby.rs b/crates/polars-time/src/windows/group_by.rs similarity index 94% rename from crates/polars-time/src/windows/groupby.rs rename to crates/polars-time/src/windows/group_by.rs index 87e285d9847c..40d6c0e7bea4 100644 --- a/crates/polars-time/src/windows/groupby.rs +++ b/crates/polars-time/src/windows/group_by.rs @@ -143,7 +143,7 @@ fn update_groups_and_bounds( /// /// If `include_boundaries` is `false` those `lower` and `upper` vectors will be empty. #[allow(clippy::too_many_arguments)] -pub fn groupby_windows( +pub fn group_by_windows( window: Window, time: &[i64], closed_window: ClosedWindow, @@ -224,7 +224,7 @@ pub fn groupby_windows( } // this assumes that the given time point is the right endpoint of the window -pub(crate) fn groupby_values_iter_lookbehind( +pub(crate) fn group_by_values_iter_lookbehind( period: Duration, offset: Duration, time: &[i64], @@ -281,7 +281,7 @@ pub(crate) fn groupby_values_iter_lookbehind( } // this one is correct for all lookbehind/lookaheads, but is slower -pub(crate) fn groupby_values_iter_window_behind_t( +pub(crate) fn group_by_values_iter_window_behind_t( period: Duration, offset: Duration, time: &[i64], @@ -331,7 +331,7 @@ pub(crate) fn groupby_values_iter_window_behind_t( } // this one is correct for all lookbehind/lookaheads, but is slower -pub(crate) fn groupby_values_iter_partial_lookbehind( +pub(crate) fn group_by_values_iter_partial_lookbehind( period: Duration, offset: Duration, time: &[i64], @@ -369,7 +369,7 @@ pub(crate) fn groupby_values_iter_partial_lookbehind( } #[allow(clippy::too_many_arguments)] -pub(crate) fn groupby_values_iter_partial_lookahead( +pub(crate) fn group_by_values_iter_partial_lookahead( period: Duration, offset: Duration, time: &[i64], @@ -406,7 +406,7 @@ pub(crate) fn groupby_values_iter_partial_lookahead( }) } #[allow(clippy::too_many_arguments)] -pub(crate) fn groupby_values_iter_full_lookahead( +pub(crate) fn group_by_values_iter_full_lookahead( period: Duration, offset: Duration, time: &[i64], @@ -454,7 +454,7 @@ pub(crate) fn groupby_values_iter_full_lookahead( } #[cfg(feature = "rolling_window")] -pub(crate) fn groupby_values_iter<'a>( +pub(crate) fn group_by_values_iter<'a>( period: Duration, time: &'a [i64], closed_window: ClosedWindow, @@ -464,16 +464,16 @@ pub(crate) fn groupby_values_iter<'a>( let mut offset = period; offset.negative = true; // t is at the right endpoint of the window - let iter = groupby_values_iter_lookbehind(period, offset, time, closed_window, tu, tz, 0); + let iter = group_by_values_iter_lookbehind(period, offset, time, closed_window, tu, tz, 0); Box::new(iter) } -/// Different from `groupby_windows`, where define window buckets and search which values fit that +/// Different from `group_by_windows`, where define window buckets and search which values fit that /// pre-defined bucket, this function defines every window based on the: /// - timestamp (lower bound) /// - timestamp + period (upper bound) /// where timestamps are the individual values in the array `time` -pub fn groupby_values( +pub fn group_by_values( period: Duration, offset: Duration, time: &[i64], @@ -496,7 +496,7 @@ pub fn groupby_values( .copied() .map(|(base_offset, len)| { let upper_bound = base_offset + len; - let iter = groupby_values_iter_lookbehind( + let iter = group_by_values_iter_lookbehind( period, offset, &time[..upper_bound], @@ -520,7 +520,7 @@ pub fn groupby_values( // ---------------t--- // [---] let iter = - 
groupby_values_iter_window_behind_t(period, offset, time, closed_window, tu, tz); + group_by_values_iter_window_behind_t(period, offset, time, closed_window, tu, tz); iter.map(|result| result.map(|(offset, len)| [offset, len])) .collect::>() } @@ -531,8 +531,14 @@ pub fn groupby_values( // ----t--- // [---] else { - let iter = - groupby_values_iter_partial_lookbehind(period, offset, time, closed_window, tu, tz); + let iter = group_by_values_iter_partial_lookbehind( + period, + offset, + time, + closed_window, + tu, + tz, + ); iter.map(|result| result.map(|(offset, len)| [offset, len])) .collect::>() } @@ -550,7 +556,7 @@ pub fn groupby_values( .map(|(base_offset, len)| { let lower_bound = base_offset; let upper_bound = base_offset + len; - let iter = groupby_values_iter_full_lookahead( + let iter = group_by_values_iter_full_lookahead( period, offset, time, @@ -578,7 +584,7 @@ pub fn groupby_values( .map(|(base_offset, len)| { let lower_bound = base_offset; let upper_bound = base_offset + len; - let iter = groupby_values_iter_partial_lookahead( + let iter = group_by_values_iter_partial_lookahead( period, offset, time, diff --git a/crates/polars-time/src/windows/mod.rs b/crates/polars-time/src/windows/mod.rs index 120707752d6f..65a8cee9318e 100644 --- a/crates/polars-time/src/windows/mod.rs +++ b/crates/polars-time/src/windows/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod bounds; pub(crate) mod calendar; pub(crate) mod duration; -pub(crate) mod groupby; +pub(crate) mod group_by; #[cfg(test)] mod test; pub(crate) mod window; diff --git a/crates/polars-time/src/windows/test.rs b/crates/polars-time/src/windows/test.rs index adb820837230..652746fbfa93 100644 --- a/crates/polars-time/src/windows/test.rs +++ b/crates/polars-time/src/windows/test.rs @@ -93,7 +93,7 @@ fn test_groups_large_interval() { let dur = Duration::parse("2d"); let w = Window::new(Duration::parse("2d"), dur, Duration::from_nsecs(0)); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -108,7 +108,7 @@ fn test_groups_large_interval() { assert_eq!(groups[1], [1, 1]); assert_eq!(groups[2], [1, 3]); assert_eq!(groups[3], [3, 1]); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -120,7 +120,7 @@ fn test_groups_large_interval() { ); assert_eq!(groups.len(), 3); assert_eq!(groups[2], [3, 1]); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -191,7 +191,7 @@ fn test_boundaries() { assert_eq!(b.start, start.timestamp_nanos()); // test closed: "both" (includes both ends of the interval) - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -287,7 +287,7 @@ fn test_boundaries() { assert_eq!(groups[2], [4, 3]); // test closed: "left" (should not include right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -302,7 +302,7 @@ fn test_boundaries() { assert_eq!(groups[2], [4, 2]); // 02:00:00 -> 02:30:00 // test closed: "right" (should not include left end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -317,7 +317,7 @@ fn test_boundaries() { assert_eq!(groups[2], [5, 2]); // 02:00:00 -> 02:30:00 // test closed: "none" (should not include left or right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = 
group_by_windows( w, &ts, ClosedWindow::None, @@ -367,7 +367,7 @@ fn test_boundaries_2() { assert_eq!(b.start, start.timestamp_nanos() + offset.duration_ns()); - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -475,7 +475,7 @@ fn test_boundaries_ms() { assert_eq!(b.start, start.timestamp_millis()); // test closed: "both" (includes both ends of the interval) - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -571,7 +571,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [4, 3]); // test closed: "left" (should not include right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -586,7 +586,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [4, 2]); // 02:00:00 -> 02:30:00 // test closed: "right" (should not include left end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -601,7 +601,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [5, 2]); // 02:00:00 -> 02:30:00 // test closed: "none" (should not include left or right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::None, @@ -638,7 +638,7 @@ fn test_rolling_lookback() { .unwrap(); // unwrapping as we pass None as the time zone // full lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("-2h"), &dates, @@ -659,7 +659,7 @@ fn test_rolling_lookback() { assert_eq!(groups[8], [5, 4]); // bound: 02:00 -> 04:00 time: 04:00 // partial lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("-1h"), &dates, @@ -680,7 +680,7 @@ fn test_rolling_lookback() { assert_eq!(groups[8], [7, 2]); // no lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("0h"), &dates, @@ -709,13 +709,20 @@ fn test_rolling_lookback() { ClosedWindow::None, ] { let offset = Duration::parse("-2h"); - let g0 = groupby_values_iter_lookbehind(period, offset, &dates, closed_window, tu, None, 0) - .collect::>>() - .unwrap(); - let g1 = - groupby_values_iter_partial_lookbehind(period, offset, &dates, closed_window, tu, None) + let g0 = + group_by_values_iter_lookbehind(period, offset, &dates, closed_window, tu, None, 0) .collect::>>() .unwrap(); + let g1 = group_by_values_iter_partial_lookbehind( + period, + offset, + &dates, + closed_window, + tu, + None, + ) + .collect::>>() + .unwrap(); assert_eq!(g0, g1); } } @@ -746,7 +753,7 @@ fn test_end_membership() { // 2021-03-01 -> 2021-05-01 members: None // 2021-04-01 -> 2021-06-01 members: [1] // 2021-05-01 -> 2021-07-01 members: [1] - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &time, ClosedWindow::Left, @@ -763,14 +770,14 @@ fn test_end_membership() { } #[test] -fn test_groupby_windows_membership_2791() { +fn test_group_by_windows_membership_2791() { let dates = [0, 0, 2, 2]; let window = Window::new( Duration::parse("1ms"), Duration::parse("1ms"), Duration::parse("0ns"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &dates, ClosedWindow::Left, @@ -785,7 +792,7 @@ fn test_groupby_windows_membership_2791() { } #[test] -fn test_groupby_windows_duplicates_2931() { +fn 
test_group_by_windows_duplicates_2931() { let dates = [0, 3, 3, 5, 5]; let window = Window::new( Duration::parse("1ms"), @@ -793,7 +800,7 @@ fn test_groupby_windows_duplicates_2931() { Duration::parse("0ns"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &dates, ClosedWindow::Left, @@ -807,7 +814,7 @@ fn test_groupby_windows_duplicates_2931() { } #[test] -fn test_groupby_windows_offsets_3776() { +fn test_group_by_windows_offsets_3776() { let dates = &[ NaiveDate::from_ymd_opt(2020, 12, 1).unwrap(), NaiveDate::from_ymd_opt(2021, 2, 1).unwrap(), @@ -823,7 +830,7 @@ fn test_groupby_windows_offsets_3776() { Duration::parse("2d"), Duration::parse("-2d"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &ts, ClosedWindow::Right, diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 3e66aaee16b6..7b6d0f3d3097 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -130,7 +130,7 @@ extract_jsonpath = [ ] string_encoding = ["polars-ops/string_encoding", "polars-core/strings"] binary_encoding = ["polars-ops/binary_encoding"] -groupby_list = ["polars-core/groupby_list"] +group_by_list = ["polars-core/group_by_list"] lazy_regex = ["polars-lazy/regex"] cum_agg = ["polars-core/cum_agg", "polars-core/cum_agg"] rolling_window = ["polars-core/rolling_window", "polars-lazy/rolling_window", "polars-time/rolling_window"] @@ -144,7 +144,7 @@ true_div = ["polars-lazy/true_div"] diagonal_concat = ["polars-core/diagonal_concat", "polars-lazy/diagonal_concat"] horizontal_concat = ["polars-core/horizontal_concat"] abs = ["polars-core/abs", "polars-lazy/abs"] -dynamic_groupby = ["polars-core/dynamic_groupby", "polars-lazy/dynamic_groupby"] +dynamic_group_by = ["polars-core/dynamic_group_by", "polars-lazy/dynamic_group_by"] ewma = ["polars-core/ewma", "polars-lazy/ewma"] dot_diagram = ["polars-lazy/dot_diagram"] dataframe_arithmetic = ["polars-core/dataframe_arithmetic"] @@ -327,7 +327,7 @@ docs-selection = [ "arg_where", "propagate_nans", "coalesce", - "dynamic_groupby", + "dynamic_group_by", "extract_groups", ] diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index f8030818d94d..e31a5ada79e7 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -18,7 +18,7 @@ //! * [Filter](#filter) //! * [Sort](#sort) //! * [Joins](#joins) -//! * [GroupBy](#groupby) +//! * [GroupBy](#group_by) //! - [pivot](#pivot) //! * [Melt](#melt) //! * [Explode](#explode) @@ -400,20 +400,20 @@ //! //! ## Groupby //! -//! Note that Polars lazy is a lot more powerful in and more performant in groupby operations. +//! Note that Polars lazy is a lot more powerful in and more performant in group_by operations. //! In lazy a myriad of aggregations can be combined from expressions. //! //! See more in: //! -//! * [Groupby](crate::frame::groupby::GroupBy) +//! * [Groupby](crate::frame::group_by::GroupBy) //! //! ### GroupBy //! ``` //! use polars::prelude::*; //! //! # fn example(df: &DataFrame) -> PolarsResult<()> { -//! // groupby "groups" | sum "foo" -//! let out = df.groupby(["groups"])? +//! // group_by "groups" | sum "foo" +//! let out = df.group_by(["groups"])? //! .select(["foo"]) //! .sum(); //! @@ -434,7 +434,7 @@ //! "bar" => ["k", "l", "m", "n", "0"] //! )?; //! -//! // groupby "foo" | pivot "bar" column | aggregate "N" +//! // group_by "foo" | pivot "bar" column | aggregate "N" //! 
let pivoted = pivot::pivot(&df, ["foo"], ["bar"], ["N"], false, Some(first()), None); //! //! // pivoted: diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index 0e72c5a540c4..0d2404b6a753 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -9,7 +9,7 @@ //! * [Start a lazy computation](#start-a-lazy-computation) //! * [Filter](#filter) //! * [Sort](#sort) -//! * [GroupBy](#groupby) +//! * [GroupBy](#group_by) //! * [Joins](#joins) //! * [Conditionally apply](#conditionally-apply) //! * [Black box function](#black-box-function) @@ -106,7 +106,7 @@ //! //! ## Groupby //! -//! This example is from the polars [user guide](https://pola-rs.github.io/polars-book/user-guide/concepts/contexts/#groupby-aggregation). +//! This example is from the polars [user guide](https://pola-rs.github.io/polars-book/user-guide/concepts/contexts/#group_by-aggregation). //! //! ``` //! use polars::prelude::*; @@ -116,7 +116,7 @@ //! .has_header(true) //! .with_delimiter(b',') //! .finish()? -//! .groupby([col("comment_karma")]) +//! .group_by([col("comment_karma")]) //! .agg([col("name").n_unique().alias("unique_names"), col("link_karma").max()]) //! // take only 100 rows. //! .fetch(100)?; diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index b16dc3917509..530bb4db3da6 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -14,7 +14,7 @@ //! # fn example() -> PolarsResult<()> { //! //! let lf1 = LazyFrame::scan_parquet("myfile_1.parquet", Default::default())? -//! .groupby([col("ham")]) +//! .group_by([col("ham")]) //! .agg([ //! // expressions can be combined into powerful aggregations //! col("foo") @@ -201,7 +201,7 @@ //! * gzip //! //! * `DataFrame` operations: -//! - `dynamic_groupby` - Groupby based on a time window instead of predefined keys. +//! - `dynamic_group_by` - Groupby based on a time window instead of predefined keys. //! Also activates rolling window group by operations. //! - `sort_multiple` - Allow sorting a `DataFrame` on multiple columns //! - `rows` - Create `DataFrame` from rows and extract rows from `DataFrames`. @@ -209,7 +209,7 @@ //! - `asof_join` - Join ASOF, to join on nearest keys instead of exact equality match. //! - `cross_join` - Create the cartesian product of two DataFrames. //! - `semi_anti_join` - SEMI and ANTI joins. -//! - `groupby_list` - Allow groupby operation on keys of type List. +//! - `group_by_list` - Allow group_by operation on keys of type List. //! - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked //! - `diagonal_concat` - Concat diagonally thereby combining different schemas. //! - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match @@ -358,11 +358,11 @@ //! * `POLARS_TABLE_WIDTH` -> width of the tables used during DataFrame formatting. //! * `POLARS_MAX_THREADS` -> maximum number of threads used to initialize thread pool (on startup). //! * `POLARS_VERBOSE` -> print logging info to stderr. -//! * `POLARS_NO_PARTITION` -> polars may choose to partition the groupby operation, based on data -//! cardinality. Setting this env var will turn partitioned groupby's off. -//! * `POLARS_PARTITION_UNIQUE_COUNT` -> at which (estimated) key count a partitioned groupby should run. -//! defaults to `1000`, any higher cardinality will run default groupby. -//! * `POLARS_FORCE_PARTITION` -> force partitioned groupby if the keys and aggregations allow it. +//! 
* `POLARS_NO_PARTITION` -> polars may choose to partition the group_by operation, based on data +//! cardinality. Setting this env var will turn partitioned group_by's off. +//! * `POLARS_PARTITION_UNIQUE_COUNT` -> at which (estimated) key count a partitioned group_by should run. +//! defaults to `1000`, any higher cardinality will run default group_by. +//! * `POLARS_FORCE_PARTITION` -> force partitioned group_by if the keys and aggregations allow it. //! * `POLARS_ALLOW_EXTENSION` -> allows for `[ObjectChunked]` to be used in arrow, opening up possibilities like using //! `T` in complex lazy expressions. However this does require `unsafe` code allow this. //! * `POLARS_NO_PARQUET_STATISTICS` -> if set, statistics in parquet files are ignored. diff --git a/crates/polars/src/prelude.rs b/crates/polars/src/prelude.rs index 044cd574c028..f8901ec52fbb 100644 --- a/crates/polars/src/prelude.rs +++ b/crates/polars/src/prelude.rs @@ -1,6 +1,6 @@ #[cfg(feature = "polars-algo")] pub use polars_algo::prelude::*; -pub use polars_core::frame::groupby::*; +pub use polars_core::frame::group_by::*; pub use polars_core::prelude::*; pub use polars_core::utils::NoNull; #[cfg(feature = "polars-io")] diff --git a/crates/polars/tests/it/core/groupby.rs b/crates/polars/tests/it/core/group_by.rs similarity index 98% rename from crates/polars/tests/it/core/groupby.rs rename to crates/polars/tests/it/core/group_by.rs index 9a5696e5aa35..f14caad753dd 100644 --- a/crates/polars/tests/it/core/groupby.rs +++ b/crates/polars/tests/it/core/group_by.rs @@ -3,7 +3,7 @@ use polars_core::series::IsSorted; use super::*; #[test] -fn test_sorted_groupby() -> PolarsResult<()> { +fn test_sorted_group_by() -> PolarsResult<()> { // nulls last let mut s = Series::new("a", &[Some(1), Some(1), Some(1), Some(6), Some(6), None]); s.set_sorted_flag(IsSorted::Ascending); diff --git a/crates/polars/tests/it/core/mod.rs b/crates/polars/tests/it/core/mod.rs index a2017782af9d..76adb01d5677 100644 --- a/crates/polars/tests/it/core/mod.rs +++ b/crates/polars/tests/it/core/mod.rs @@ -1,5 +1,5 @@ mod date_like; -mod groupby; +mod group_by; mod joins; mod list; mod ops; diff --git a/crates/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs index ea286750123f..fa224b01f4ee 100644 --- a/crates/polars/tests/it/joins.rs +++ b/crates/polars/tests/it/joins.rs @@ -12,10 +12,10 @@ fn join_nans_outer() -> PolarsResult<()> { .lazy(); let a1 = df1 .clone() - .groupby(vec![col("w").alias("w"), col("t")]) + .group_by(vec![col("w").alias("w"), col("t")]) .agg(vec![col("c").sum().alias("c_sum")]); let a2 = df1 - .groupby(vec![col("w").alias("w"), col("t")]) + .group_by(vec![col("w").alias("w"), col("t")]) .agg(vec![col("c").max().alias("c_max")]); let res = a1 @@ -42,7 +42,7 @@ fn join_empty_datasets() -> PolarsResult<()> { .unwrap(); a.lazy() - .groupby([col("foo")]) + .group_by([col("foo")]) .agg([all().last()]) .inner_join(b.lazy(), "foo", "foo") .collect() diff --git a/crates/polars/tests/it/lazy/aggregation.rs b/crates/polars/tests/it/lazy/aggregation.rs index 180db6a5b85e..21e19152303e 100644 --- a/crates/polars/tests/it/lazy/aggregation.rs +++ b/crates/polars/tests/it/lazy/aggregation.rs @@ -51,7 +51,7 @@ fn test_lazy_agg() { let lf = df .lazy() - .groupby([col("date")]) + .group_by([col("date")]) .agg([ col("rain").min().alias("min"), col("rain").sum().alias("sum"), @@ -90,7 +90,7 @@ fn test_apply_multiple_error() { let _res = df .lazy() .with_streaming(false) - .groupby_stable([col("rf")]) + .group_by_stable([col("rf")]) .agg([issue()]) 
.collect() .unwrap(); diff --git a/crates/polars/tests/it/lazy/expressions/apply.rs b/crates/polars/tests/it/lazy/expressions/apply.rs index 09ed578e9ab2..1c61aca40be3 100644 --- a/crates/polars/tests/it/lazy/expressions/apply.rs +++ b/crates/polars/tests/it/lazy/expressions/apply.rs @@ -28,7 +28,7 @@ fn test_groups_update() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("group")]) + .group_by_stable([col("group")]) .agg([col("id").unique_counts().log(2.0)]) .explode([col("id")]) .collect()?; @@ -50,7 +50,7 @@ fn test_groups_update_binary_shift_log() -> PolarsResult<()> { "b" => [1, 2, 1, 2], ]? .lazy() - .groupby([col("b")]) + .group_by([col("b")]) .agg([col("a") - col("a").shift(1).log(2.0)]) .sort("b", Default::default()) .explode([col("a")]) @@ -93,7 +93,7 @@ fn test_apply_groups_empty() -> PolarsResult<()> { let out = df .lazy() .filter(col("id").eq(lit(2))) - .groupby([col("id")]) + .group_by([col("id")]) .agg([col("hi").drop_nulls().unique()]) .collect()?; diff --git a/crates/polars/tests/it/lazy/expressions/arity.rs b/crates/polars/tests/it/lazy/expressions/arity.rs index 07e7c6700ff5..290bd9f3efca 100644 --- a/crates/polars/tests/it/lazy/expressions/arity.rs +++ b/crates/polars/tests/it/lazy/expressions/arity.rs @@ -10,7 +10,7 @@ fn test_list_broadcast() { ] .unwrap() .lazy() - .groupby([col("g")]) + .group_by([col("g")]) .agg([col("a").unique_counts() * count()]) .collect() .unwrap(); @@ -161,7 +161,7 @@ fn test_when_then_otherwise_single_bool() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("key")]) + .group_by_stable([col("key")]) .agg([when(col("val").null_count().gt(lit(0))) .then(Null {}.lit()) .otherwise(col("val").sum()) @@ -191,7 +191,7 @@ fn test_update_groups_in_cast() -> PolarsResult<()> { // in aggregation that cast coerces a list and the cast may forget to update groups let out = df .lazy() - .groupby_stable([col("group")]) + .group_by_stable([col("group")]) .agg([col("id").unique_counts() * lit(-1)]) .collect()?; @@ -214,7 +214,7 @@ fn test_when_then_otherwise_sum_in_agg() -> PolarsResult<()> { let q = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([when(all().exclude(["groups"]).sum().eq(lit(1))) .then(all().exclude(["groups"]).sum()) .otherwise(lit(NULL))]) @@ -292,7 +292,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(col("value").rank(Default::default(), None)) .otherwise(lit(Series::new("", &[10 as IdxSize])))]) @@ -312,7 +312,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(lit(Series::new("", &[10 as IdxSize])).alias("value")) .otherwise(col("value").rank(Default::default(), None))]) @@ -332,7 +332,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(col("value").rank(Default::default(), None)) .otherwise(Null {}.lit())]) @@ -346,7 +346,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { // swapped branch let out = df .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(Null {}.lit().alias("value")) .otherwise(col("value").rank(Default::default(), None))]) @@ -370,7 +370,7 @@ fn 
test_binary_group_consistency() -> PolarsResult<()> { .lazy(); let out = lf - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("name").filter(col("score").eq(col("score").max()))]) .sort("category", Default::default()) .collect()?; diff --git a/crates/polars/tests/it/lazy/expressions/filter.rs b/crates/polars/tests/it/lazy/expressions/filter.rs index 706f764e5a37..2d60525c3d1a 100644 --- a/crates/polars/tests/it/lazy/expressions/filter.rs +++ b/crates/polars/tests/it/lazy/expressions/filter.rs @@ -1,7 +1,7 @@ use super::*; #[test] -fn test_filter_in_groupby_agg() -> PolarsResult<()> { +fn test_filter_in_group_by_agg() -> PolarsResult<()> { // This tests if the filter is correctly handled by the binary expression. // This could lead to UB if it were not the case. The filter creates an empty column. // but the group tuples could still be untouched leading to out of bounds aggregation. @@ -13,7 +13,7 @@ fn test_filter_in_groupby_agg() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([(col("b").filter(col("b").eq(lit(100))) * lit(2)) .mean() .alias("b_mean")]) @@ -23,7 +23,7 @@ fn test_filter_in_groupby_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([(col("b") .filter(col("b").eq(lit(100))) .map(|v| Ok(Some(v)), GetOutput::same_type())) diff --git a/crates/polars/tests/it/lazy/expressions/slice.rs b/crates/polars/tests/it/lazy/expressions/slice.rs index 0d996d9ccc83..c57e8f83b7ad 100644 --- a/crates/polars/tests/it/lazy/expressions/slice.rs +++ b/crates/polars/tests/it/lazy/expressions/slice.rs @@ -14,7 +14,7 @@ fn test_slice_args() -> PolarsResult<()> { "vals" => 0i32..30 ]? .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([col("vals").slice(lit(0i64), count() * lit(0.2))]) .collect()?; diff --git a/crates/polars/tests/it/lazy/groupby.rs b/crates/polars/tests/it/lazy/group_by.rs similarity index 90% rename from crates/polars/tests/it/lazy/groupby.rs rename to crates/polars/tests/it/lazy/group_by.rs index 6e3ed7e09666..9ed05bc25bd0 100644 --- a/crates/polars/tests/it/lazy/groupby.rs +++ b/crates/polars/tests/it/lazy/group_by.rs @@ -18,7 +18,7 @@ fn test_filter_sort_diff_2984() -> PolarsResult<()> { let out = df .lazy() // don't use stable in this test, it hides wrong state - .groupby([col("group")]) + .group_by([col("group")]) .agg([col("id") .filter(col("id").lt(lit(3))) .sort(false) @@ -40,7 +40,7 @@ fn test_filter_after_tail() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("a")]) + .group_by_stable([col("a")]) .tail(Some(1)) .filter(col("b").eq(lit(3))) .with_predicate_pushdown(false) @@ -66,7 +66,7 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([(col("value") .filter(col("group").eq(lit(1))) .diff(1, Default::default()) @@ -83,14 +83,14 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { } #[test] -fn test_groupby_lit_agg() -> PolarsResult<()> { +fn test_group_by_lit_agg() -> PolarsResult<()> { let df = df![ "group" => [1, 2, 1, 1, 2], ]?; let out = df .lazy() - .groupby([col("group")]) + .group_by([col("group")]) .agg([lit("foo").alias("foo")]) .collect()?; @@ -101,7 +101,7 @@ fn test_groupby_lit_agg() -> PolarsResult<()> { #[test] #[cfg(feature = "diff")] -fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { +fn test_group_by_agg_list_with_not_aggregated() -> PolarsResult<()> { let df = df![ 
"group" => ["a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"], "value" => [0, 2, 3, 6, 2, 4, 7, 9, 3, 4, 6, 7, ], @@ -109,7 +109,7 @@ fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("group")]) + .group_by([col("group")]) .agg([when(col("value").diff(1, NullBehavior::Ignore).gt_eq(0)) .then(col("value").diff(1, NullBehavior::Ignore)) .otherwise(col("value"))]) @@ -127,7 +127,7 @@ fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { #[test] #[cfg(all(feature = "dtype-duration", feature = "dtype-struct"))] -fn test_logical_mean_partitioned_groupby_block() -> PolarsResult<()> { +fn test_logical_mean_partitioned_group_by_block() -> PolarsResult<()> { let _guard = SINGLE_LOCK.lock(); let df = df![ "a" => [1, 1, 2], @@ -137,7 +137,7 @@ fn test_logical_mean_partitioned_groupby_block() -> PolarsResult<()> { let out = df .lazy() .with_column(col("duration").cast(DataType::Duration(TimeUnit::Microseconds))) - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("duration").mean()]) .sort("duration", Default::default()) .collect()?; @@ -164,7 +164,7 @@ fn test_filter_aggregated_expression() -> PolarsResult<()> { let df = df .lazy() - .groupby([col("day")]) + .group_by([col("day")]) .agg([(col("x") - col("x").first()).filter(f)]) .sort("day", Default::default()) .collect() diff --git a/crates/polars/tests/it/lazy/groupby_dynamic.rs b/crates/polars/tests/it/lazy/group_by_dynamic.rs similarity index 92% rename from crates/polars/tests/it/lazy/groupby_dynamic.rs rename to crates/polars/tests/it/lazy/group_by_dynamic.rs index 44be5a7bcf19..1fa5ec6a396f 100644 --- a/crates/polars/tests/it/lazy/groupby_dynamic.rs +++ b/crates/polars/tests/it/lazy/group_by_dynamic.rs @@ -1,8 +1,8 @@ -// used only if feature="temporal", "dtype-date", "dynamic_groupby" +// used only if feature="temporal", "dtype-date", "dynamic_group_by" #[allow(unused_imports)] use polars::export::chrono::prelude::*; -// used only if feature="temporal", "dtype-date", "dynamic_groupby" +// used only if feature="temporal", "dtype-date", "dynamic_group_by" #[allow(unused_imports)] use super::*; @@ -10,9 +10,9 @@ use super::*; #[cfg(all( feature = "temporal", feature = "dtype-date", - feature = "dynamic_groupby" + feature = "dynamic_group_by" ))] -fn test_groupby_dynamic_week_bounds() -> PolarsResult<()> { +fn test_group_by_dynamic_week_bounds() -> PolarsResult<()> { let start = NaiveDate::from_ymd_opt(2022, 2, 1) .unwrap() .and_hms_opt(0, 0, 0) @@ -40,7 +40,7 @@ fn test_groupby_dynamic_week_bounds() -> PolarsResult<()> { let out = df .lazy() - .groupby_dynamic( + .group_by_dynamic( col("dt"), [], DynamicGroupOptions { diff --git a/crates/polars/tests/it/lazy/mod.rs b/crates/polars/tests/it/lazy/mod.rs index 7dc6c62bb775..a5808478f150 100644 --- a/crates/polars/tests/it/lazy/mod.rs +++ b/crates/polars/tests/it/lazy/mod.rs @@ -5,8 +5,8 @@ mod explodes; mod expressions; mod folds; mod functions; -mod groupby; -mod groupby_dynamic; +mod group_by; +mod group_by_dynamic; mod predicate_queries; mod projection_queries; mod queries; diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index c83752f88100..d0af51efaab3 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -33,8 +33,8 @@ fn test_drop() -> PolarsResult<()> { } #[test] -#[cfg(feature = "dynamic_groupby")] -fn test_special_groupby_schemas() -> PolarsResult<()> { +#[cfg(feature = "dynamic_group_by")] +fn 
test_special_group_by_schemas() -> PolarsResult<()> { let df = df![ "a" => [1, 2, 3, 4, 5], "b" => [1, 2, 3, 4, 5], @@ -44,7 +44,7 @@ fn test_special_groupby_schemas() -> PolarsResult<()> { .clone() .lazy() .with_column(col("a").set_sorted_flag(IsSorted::Ascending)) - .groupby_rolling( + .group_by_rolling( col("a"), [], RollingGroupOptions { @@ -69,7 +69,7 @@ fn test_special_groupby_schemas() -> PolarsResult<()> { let out = df .lazy() .with_column(col("a").set_sorted_flag(IsSorted::Ascending)) - .groupby_dynamic( + .group_by_dynamic( col("a"), [], DynamicGroupOptions { @@ -108,7 +108,7 @@ fn max_on_empty_df_3027() -> PolarsResult<()> { let out = df .lazy() - .groupby(&[col("id"), col("name")]) + .group_by(&[col("id"), col("name")]) .agg(&[col("numb").max()]) .collect()?; assert_eq!(out.shape(), (0, 3)); @@ -144,7 +144,7 @@ fn test_sorted_path() -> PolarsResult<()> { .lazy() .with_row_count("row_nr", None) .explode(["a"]) - .groupby(["row_nr"]) + .group_by(["row_nr"]) .agg([col("a").count().alias("count")]) .collect()?; @@ -215,7 +215,7 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([apply_multiple( multiply, [col("A"), col("B")], @@ -233,7 +233,7 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { } #[test] -fn test_groupby_on_lists() -> PolarsResult<()> { +fn test_group_by_on_lists() -> PolarsResult<()> { let s0 = Series::new("", [1i32, 2, 3]); let s1 = Series::new("groups", [4i32, 5]); @@ -247,7 +247,7 @@ fn test_groupby_on_lists() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("arrays").first()]) .collect()?; @@ -258,7 +258,7 @@ fn test_groupby_on_lists() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("arrays").implode()]) .collect()?; diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index ce25037a4e48..608895f110bc 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -45,7 +45,7 @@ features = [ "dot_product", "dtype-categorical", "dtype-full", - "dynamic_groupby", + "dynamic_group_by", "ewma", "fmt", "horizontal_concat", @@ -169,7 +169,7 @@ all = [ "build_info", "cse", "propagate_nans", - "polars/groupby_list", + "polars/group_by_list", "polars/fused", "sql", "binary_encoding", diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/groupby.py index 33a4dbea9a44..4f6259d6bcd3 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/groupby.py @@ -333,7 +333,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: raise TypeError("cannot call `apply` when grouping by an expression") return self.df.__class__._from_pydf( - self.df._df.groupby_apply(by, function, self.maintain_order) + self.df._df.group_by_apply(by, function, self.maintain_order) ) def head(self, n: int = 5) -> DataFrame: diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 90f9fc452938..c40b6c72d15a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2533,7 +2533,7 @@ def groupby( """ exprs = parse_as_list_of_expressions(by, *more_by) - lgb = self._ldf.groupby(exprs, maintain_order) + lgb = self._ldf.group_by(exprs, maintain_order) return LazyGroupBy(lgb) def groupby_rolling( @@ -2681,7 +2681,7 @@ def groupby_rolling( period = _timedelta_to_pl_duration(period) offset = _timedelta_to_pl_duration(offset) - lgb = 
self._ldf.groupby_rolling( + lgb = self._ldf.group_by_rolling( index_column, period, offset, closed, pyexprs_by, check_sorted ) return LazyGroupBy(lgb) @@ -3026,7 +3026,7 @@ def groupby_dynamic( every = _timedelta_to_pl_duration(every) pyexprs_by = parse_as_list_of_expressions(by) if by is not None else [] - lgb = self._ldf.groupby_dynamic( + lgb = self._ldf.group_by_dynamic( index_column, every, period, diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index b3e77cf07f96..355b24d5bec8 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1130,16 +1130,16 @@ impl PyDataFrame { Ok(df.into()) } - pub fn groupby_apply( + pub fn group_by_apply( &self, by: Vec<&str>, lambda: PyObject, maintain_order: bool, ) -> PyResult { let gb = if maintain_order { - self.df.groupby_stable(&by) + self.df.group_by_stable(&by) } else { - self.df.groupby(&by) + self.df.group_by(&by) } .map_err(PyPolarsErr::from)?; diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index 6d3b77400a8a..ae7df9579ff0 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -550,19 +550,19 @@ impl PyLazyFrame { ldf.select_seq(exprs).into() } - fn groupby(&mut self, by: Vec, maintain_order: bool) -> PyLazyGroupBy { + fn group_by(&mut self, by: Vec, maintain_order: bool) -> PyLazyGroupBy { let ldf = self.ldf.clone(); let by = by.to_exprs(); let lazy_gb = if maintain_order { - ldf.groupby_stable(by) + ldf.group_by_stable(by) } else { - ldf.groupby(by) + ldf.group_by(by) }; PyLazyGroupBy { lgb: Some(lazy_gb) } } - fn groupby_rolling( + fn group_by_rolling( &mut self, index_column: PyExpr, period: &str, @@ -577,7 +577,7 @@ impl PyLazyFrame { .into_iter() .map(|pyexpr| pyexpr.inner) .collect::>(); - let lazy_gb = ldf.groupby_rolling( + let lazy_gb = ldf.group_by_rolling( index_column.inner, by, RollingGroupOptions { @@ -593,7 +593,7 @@ impl PyLazyFrame { } #[allow(clippy::too_many_arguments)] - fn groupby_dynamic( + fn group_by_dynamic( &mut self, index_column: PyExpr, every: &str, @@ -612,7 +612,7 @@ impl PyLazyFrame { .map(|pyexpr| pyexpr.inner) .collect::>(); let ldf = self.ldf.clone(); - let lazy_gb = ldf.groupby_dynamic( + let lazy_gb = ldf.group_by_dynamic( index_column.inner, by, DynamicGroupOptions { diff --git a/py-polars/tests/unit/operations/test_groupby_rolling.py b/py-polars/tests/unit/operations/test_groupby_rolling.py index 193693178165..36be0b12bf1e 100644 --- a/py-polars/tests/unit/operations/test_groupby_rolling.py +++ b/py-polars/tests/unit/operations/test_groupby_rolling.py @@ -248,14 +248,14 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: # no `by` argument with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_dynamic' is not explicitly sorted", + match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): df.groupby_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) # no `by` argument with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_rolling' is not explicitly sorted", + match=r"argument in operation 'group_by_rolling' is not explicitly sorted", ): df.groupby_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index ba07619b104c..12cd4f84680c 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -75,7 +75,7 @@ def test_empty_9137() -> None: def test_empty_groupby_apply_err() -> None: df = 
pl.DataFrame(schema={"x": pl.Int64}) with pytest.raises( - pl.ComputeError, match=r"cannot groupby \+ apply on empty 'DataFrame'" + pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" ): df.groupby("x").apply(lambda x: x) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index fc6e7b2a96e5..f79c7752e0b8 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -17,7 +17,7 @@ def test_error_on_empty_groupby() -> None: with pytest.raises( - pl.ComputeError, match="at least one key is required in a groupby operation" + pl.ComputeError, match="at least one key is required in a group_by operation" ): pl.DataFrame({"x": [0, 0, 1, 1]}).groupby([]).agg(pl.count()) @@ -618,7 +618,7 @@ def test_no_sorted_err() -> None: ) with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_dynamic' is not explicitly sorted", + match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): df.groupby_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) From 4b388e3c0c0369a05c690f9a5c6e77827802d7c5 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 22 Aug 2023 03:01:09 +0800 Subject: [PATCH 30/55] feat(rust, python): support cast to list (#10623) --- Cargo.toml | 2 +- py-polars/tests/unit/datatypes/test_list.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index da68e35af692..0ec7f425cce6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" git = "https://github.com/jorgecarleitao/arrow2" -rev = "2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" +rev = "ba6a882bc1542b0b899774b696ebea77482b5c31" # branch = "" # version = "0.17.4" default-features = false diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 70f2117caad4..53901bab76a1 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -5,6 +5,7 @@ import pandas as pd import polars as pl +from polars.testing import assert_series_equal def test_dtype() -> None: @@ -439,6 +440,15 @@ def test_list_recursive_categorical_cast() -> None: assert s.to_list() == values +def test_non_nested_cast_to_list() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + + df = df.with_columns([pl.col("a").cast(pl.List(pl.Int64))]) + + expected = pl.Series("a", [[1], [2], [3]]) + assert_series_equal(df.to_series(), expected) + + def test_list_new_from_index_logical() -> None: s = ( pl.select(pl.struct(pl.Series("a", [date(2001, 1, 1)])).implode()) From 29fff1772606b40ebea30279e3b1bda8665526de Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 05:49:08 +0200 Subject: [PATCH 31/55] fix(rust): `AllHorizontal` format string (#10658) --- .github/workflows/lint-py-polars.yml | 2 ++ crates/polars-plan/src/dsl/function_expr/boolean.rs | 2 +- py-polars/Cargo.lock | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint-py-polars.yml b/.github/workflows/lint-py-polars.yml index 47e05ff3bbfa..2af80ca0b9e6 100644 --- a/.github/workflows/lint-py-polars.yml +++ b/.github/workflows/lint-py-polars.yml @@ -4,6 +4,7 @@ on: pull_request: paths: - crates/** + - Cargo.toml - py-polars/src/** - py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml @@ -12,6 +13,7 @@ on: - main paths: - crates/** + - Cargo.toml - py-polars/src/** - 
py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index fc2eb6307c19..41d9ceebcaeb 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -67,7 +67,7 @@ impl Display for BooleanFunction { #[cfg(feature = "is_in")] IsIn => "is_in", AnyHorizontal => "any_horizontal", - AllHorizontal => "any_horizontal", + AllHorizontal => "all_horizontal", }; write!(f, "{s}") } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 32fef5e7a898..740d3614dc21 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,7 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "git+https://github.com/jorgecarleitao/arrow2?rev=2b3e2a9e83725a557d78b90cd39298c5bef0ca4a#2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=ba6a882bc1542b0b899774b696ebea77482b5c31#ba6a882bc1542b0b899774b696ebea77482b5c31" dependencies = [ "ahash", "arrow-format", From 5fbb7197e050a10b3b3f8af750508c0ec229a310 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 07:24:49 +0200 Subject: [PATCH 32/55] refactor(rust): Clean up schema calculation for `date_range` (#10653) --- .../src/dsl/function_expr/schema.rs | 115 ++++++++++-------- py-polars/tests/unit/functions/test_range.py | 10 +- 2 files changed, 69 insertions(+), 56 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index 3f777fa25510..91c272a0f082 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -68,7 +68,11 @@ impl FunctionExpr { time_zone, } => { // output dtype may change based on `every`, `time_unit`, and `time_zone` - let dtype = mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + let dtype = mapper.map_to_date_range_dtype( + every, + time_unit.as_ref(), + time_zone.as_deref(), + )?; return Ok(Field::new("date", dtype)); }, DateRanges { @@ -78,8 +82,11 @@ impl FunctionExpr { time_zone, } => { // output dtype may change based on `every`, `time_unit`, and `time_zone` - let inner_dtype = - mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + let inner_dtype = mapper.map_to_date_range_dtype( + every, + time_unit.as_ref(), + time_zone.as_deref(), + )?; return Ok(Field::new( "date_range", DataType::List(Box::new(inner_dtype)), @@ -379,59 +386,65 @@ impl<'a> FieldsMapper<'a> { pub(super) fn map_to_date_range_dtype( &self, every: &Duration, - time_unit: &Option, - tz: &Option, + time_unit: Option<&TimeUnit>, + time_zone: Option<&str>, ) -> PolarsResult { - let inner_dtype = match (&self.map_to_supertype()?.dtype, time_unit, tz, every) { - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, Some(field_tz)), time_unit, Some(tz), _) => { - if field_tz != tz { - polars_bail!(ComputeError: format!("Given time_zone is different from that of timezone aware datetimes. 
\ - Given: '{}', got: '{}'.", tz, field_tz)) - } - if let Some(time_unit) = time_unit { - DataType::Datetime(*time_unit, Some(tz.to_string())) - } else { - DataType::Datetime(*tu, Some(tz.to_string())) - } - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(_, Some(tz)), Some(time_unit), _, _) => { - DataType::Datetime(*time_unit, Some(tz.to_string())) - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, Some(tz)), None, _, _) => { - DataType::Datetime(*tu, Some(tz.to_string())) - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(_, _), Some(time_unit), Some(tz), _) => { - DataType::Datetime(*time_unit, Some(tz.to_string())) + let data_dtype = self.map_to_supertype()?.dtype; + match data_dtype { + DataType::Datetime(tu, tz) => { + self.map_datetime_to_date_range_dtype(tu, tz, time_unit, time_zone) }, - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, _), None, Some(tz), _) => { - DataType::Datetime(*tu, Some(tz.to_string())) + DataType::Date => { + let schema_dtype = self.map_date_to_date_range_dtype(every, time_unit, time_zone); + Ok(schema_dtype) }, - (DataType::Datetime(_, _), Some(time_unit), _, _) => { - DataType::Datetime(*time_unit, None) - }, - (DataType::Datetime(tu, _), None, _, _) => DataType::Datetime(*tu, None), - (DataType::Date, time_unit, time_zone, every) => { - let nsecs = every.nanoseconds(); - if nsecs == 0 { - DataType::Date - } else if let Some(tu) = time_unit { - DataType::Datetime(*tu, time_zone.clone()) - } else if nsecs % 1000 != 0 { - DataType::Datetime(TimeUnit::Nanoseconds, time_zone.clone()) - } else { - DataType::Datetime(TimeUnit::Microseconds, time_zone.clone()) - } - }, - (dtype, _, _, _) => { - polars_bail!(ComputeError: "expected Date or Datetime, got {}", dtype) + _ => polars_bail!(ComputeError: "expected Date or Datetime, got {}", data_dtype), + } + } + #[cfg(feature = "temporal")] + fn map_datetime_to_date_range_dtype( + &self, + data_time_unit: TimeUnit, + data_time_zone: Option, + given_time_unit: Option<&TimeUnit>, + given_time_zone: Option<&str>, + ) -> PolarsResult { + let schema_time_zone = match (data_time_zone, given_time_zone) { + (Some(data_tz), Some(given_tz)) => { + polars_ensure!( + data_tz == given_tz, + ComputeError: format!( + "`time_zone` does not match the data\ + \n\nData has time zone '{}', got '{}'.", data_tz, given_tz) + ); + Some(data_tz) }, + (_, Some(given_tz)) => Some(given_tz.to_string()), + (Some(data_tz), None) => Some(data_tz), + (_, _) => None, }; - Ok(inner_dtype) + let schema_time_unit = given_time_unit.unwrap_or(&data_time_unit); + + let schema_dtype = DataType::Datetime(*schema_time_unit, schema_time_zone); + Ok(schema_dtype) + } + #[cfg(feature = "temporal")] + fn map_date_to_date_range_dtype( + &self, + every: &Duration, + time_unit: Option<&TimeUnit>, + time_zone: Option<&str>, + ) -> DataType { + let nsecs = every.nanoseconds(); + if nsecs == 0 { + DataType::Date + } else if let Some(tu) = time_unit { + DataType::Datetime(*tu, time_zone.map(String::from)) + } else if nsecs % 1000 != 0 { + DataType::Datetime(TimeUnit::Nanoseconds, time_zone.map(String::from)) + } else { + DataType::Datetime(TimeUnit::Microseconds, time_zone.map(String::from)) + } } /// Map the dtypes to the "supertype" of a list of lists. 
diff --git a/py-polars/tests/unit/functions/test_range.py b/py-polars/tests/unit/functions/test_range.py index abeb42bb2786..436dd2add813 100644 --- a/py-polars/tests/unit/functions/test_range.py +++ b/py-polars/tests/unit/functions/test_range.py @@ -230,7 +230,10 @@ def test_date_range_lazy_time_zones_invalid() -> None: stop = datetime(2020, 1, 2, tzinfo=ZoneInfo("Asia/Kathmandu")) with pytest.raises( ComputeError, - match="Given time_zone is different from that of timezone aware datetimes. Given: 'Pacific/Tarawa', got: 'Asia/Kathmandu", + match=( + "`time_zone` does not match the data" + "\n\nData has time zone 'Asia/Kathmandu', got 'Pacific/Tarawa'." + ), ), pytest.warns(TimeZoneAwareConstructorWarning, match="Series with UTC"): ( pl.DataFrame({"start": [start], "stop": [stop]}) @@ -380,10 +383,7 @@ def test_timezone_aware_date_range() -> None: eager=True, ) - with pytest.raises( - ComputeError, - match=r"Given time_zone is different from that of timezone aware datetimes", - ): + with pytest.raises(ComputeError, match="`time_zone` does not match the data"): pl.date_range( low, high, interval=timedelta(days=5), time_zone="UTC", eager=True ) From c33d7055f1350a362dda2b14b3f7bb8e1c0f5f8a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 07:27:37 +0200 Subject: [PATCH 33/55] refactor(rust): move 'is_in' to polars-ops (#10645) --- crates/polars-core/Cargo.toml | 2 - .../src/chunked_array/ops/is_in.rs | 403 ------------------ .../polars-core/src/chunked_array/ops/mod.rs | 11 - .../src/series/implementations/binary.rs | 4 - .../src/series/implementations/boolean.rs | 4 - .../src/series/implementations/categorical.rs | 7 - .../src/series/implementations/dates_time.rs | 4 - .../src/series/implementations/datetime.rs | 4 - .../src/series/implementations/duration.rs | 4 - .../src/series/implementations/floats.rs | 4 - .../src/series/implementations/mod.rs | 4 - .../src/series/implementations/struct_.rs | 5 - .../src/series/implementations/utf8.rs | 4 - crates/polars-core/src/series/series_trait.rs | 5 - crates/polars-lazy/Cargo.toml | 2 +- crates/polars-ops/Cargo.toml | 1 + crates/polars-ops/src/series/ops/is_in.rs | 400 +++++++++++++++++ crates/polars-ops/src/series/ops/mod.rs | 4 + crates/polars-plan/Cargo.toml | 2 +- .../src/dsl/function_expr/boolean.rs | 2 +- .../polars-plan/src/dsl/function_expr/list.rs | 2 +- crates/polars/Cargo.toml | 2 +- crates/polars/src/lib.rs | 4 +- py-polars/polars/io/_utils.py | 2 +- 24 files changed, 413 insertions(+), 473 deletions(-) delete mode 100644 crates/polars-core/src/chunked_array/ops/is_in.rs create mode 100644 crates/polars-ops/src/series/ops/is_in.rs diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index d0a2f41c6ef2..adb530002559 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -77,7 +77,6 @@ sort_multiple = [] rows = [] # operations -is_in = ["reinterpret"] zip_with = [] round_series = [] checked_arithmetic = [] @@ -141,7 +140,6 @@ serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde", "smartstring/serd docs-selection = [ "ndarray", - "is_in", "rows", "docs", "strings", diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs deleted file mode 100644 index cf4becb1cb21..000000000000 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ /dev/null @@ -1,403 +0,0 @@ -use std::hash::Hash; - -use crate::prelude::*; -use crate::utils::{try_get_supertype, CustomIterTools}; - -fn is_in_helper<'a, T>(ca: &'a 
ChunkedArray, other: &Series) -> PolarsResult -where - T: PolarsDataType, - ChunkedArray: HasUnderlyingArray, - < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, -{ - let mut set = PlHashSet::with_capacity(other.len()); - - let other = ca.unpack_series_matching_type(other)?; - other.downcast_iter().for_each(|iter| { - iter.iter().for_each(|opt_val| { - if let Some(v) = opt_val { - set.insert(v); - } - }) - }); - Ok(ca.apply_values_generic(|val| set.contains(&val))) -} - -impl IsIn for ChunkedArray -where - T: PolarsIntegerType, - T::Native: Hash + Eq, -{ - fn is_in(&self, other: &Series) -> PolarsResult { - // We check implicitly cast to supertype here - match other.dtype() { - DataType::List(dt) => { - let st = try_get_supertype(self.dtype(), dt)?; - if &st != self.dtype() || **dt != st { - let left = self.cast(&st)?; - let right = other.cast(&DataType::List(Box::new(st)))?; - return left.is_in(&right); - } - - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - - other - .list()? - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect_trusted() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - _ => { - // first make sure that the types are equal - if self.dtype() != other.dtype() { - let st = try_get_supertype(self.dtype(), other.dtype())?; - let left = self.cast(&st)?; - let right = other.cast(&st)?; - return left.is_in(&right); - } - is_in_helper(self, other) - } - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -impl IsIn for Float32Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - let other = other.cast(&DataType::Float32)?; - let other = other.f32().unwrap(); - let other = other.reinterpret_unsigned(); - let ca = self.reinterpret_unsigned(); - ca.is_in(&other) - } -} -impl IsIn for Float64Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - let other = other.cast(&DataType::Float64)?; - let other = other.f64().unwrap(); - let other = other.reinterpret_unsigned(); - let ca = self.reinterpret_unsigned(); - ca.is_in(&other) - } -} - -impl IsIn for Utf8Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - #[cfg(feature = "dtype-categorical")] - DataType::List(dt) if matches!(&**dt, DataType::Categorical(_)) => { - if let DataType::Categorical(Some(rev_map)) = &**dt { - let opt_val = self.get(0); - - let other = other.list()?; - match opt_val { - None => { - let mut ca: BooleanChunked = other - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| s.as_ref().null_count() > 0) == Some(true) - }) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) - }, - Some(value) => { - match rev_map.find(value) { - // all false - None => Ok(BooleanChunked::full(self.name(), false, other.len())), - Some(idx) => { - let mut ca: BooleanChunked = other - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let s = s.as_ref().to_physical_repr(); - let ca = s.as_ref().u32().unwrap(); - if ca.null_count() == 0 { - 
ca.into_no_null_iter().any(|a| a == idx) - } else { - ca.into_iter().any(|a| a == Some(idx)) - } - }) == Some(true) - }) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) - }, - } - }, - } - } else { - unreachable!() - } - }, - DataType::List(dt) if DataType::Utf8 == **dt => self.as_binary().is_in( - &other - .cast(&DataType::List(Box::new(DataType::Binary))) - .unwrap(), - ), - DataType::Utf8 => self - .as_binary() - .is_in(&other.cast(&DataType::Binary).unwrap()), - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - } -} - -impl IsIn for BinaryChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(dt) if DataType::Binary == **dt => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - other - .list()? - .amortized_iter() - .map(|opt_b| { - opt_b.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect_trusted() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - DataType::Binary => { - is_in_helper(self, other) - } - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -impl IsIn for BooleanChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(dt) if self.dtype() == &**dt => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - // safety: we know the iterators len - unsafe { - other - .list()? 
- .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .trust_my_length(other.len()) - .collect_trusted() - } - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - DataType::Boolean => { - let other = other.bool().unwrap(); - let has_true = other.any(); - let nc = other.null_count(); - - let has_false = if nc == 0 { - !other.all() - } else { - !(other.sum().unwrap() as usize + nc) == other.len() - }; - Ok(self.apply_values(|v| if v { has_true } else { has_false })) - } - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -#[cfg(feature = "dtype-struct")] -impl IsIn for StructChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(_) => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let mut value = vec![]; - let left = self.clone().into_series(); - let av = left.get(0).unwrap(); - if let AnyValue::Struct(_, _, _) = av { - av._materialize_struct_av(&mut value); - } - other - .list()? - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().struct_().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().struct_().unwrap(); - ca.into_iter().any(|a| a == val) - }, - _ => false, - }) - .collect() - }; - ca.rename(self.name()); - Ok(ca) - }, - _ => { - let other = other.cast(&other.dtype().to_physical()).unwrap(); - let other = other.struct_()?; - - polars_ensure!( - self.fields().len() == other.fields().len(), - ComputeError: "`is_in`: mismatch in the number of struct fields: {} and {}", - self.fields().len(), other.fields().len() - ); - - // first make sure that the types are equal - let self_dtypes: Vec<_> = self.fields().iter().map(|f| f.dtype()).collect(); - let other_dtypes: Vec<_> = other.fields().iter().map(|f| f.dtype()).collect(); - if self_dtypes != other_dtypes { - let self_names = self.fields().iter().map(|f| f.name()); - let other_names = other.fields().iter().map(|f| f.name()); - let supertypes = self_dtypes - .iter() - .zip(other_dtypes.iter()) - .map(|(dt1, dt2)| try_get_supertype(dt1, dt2)) - .collect::, _>>()?; - let self_supertype_fields = self_names - .zip(supertypes.iter()) - .map(|(name, st)| Field::new(name, st.clone())) - .collect(); - let self_super = self.cast(&DataType::Struct(self_supertype_fields))?; - let other_supertype_fields = other_names - .zip(supertypes.iter()) - .map(|(name, st)| Field::new(name, st.clone())) - .collect(); - let other_super = other.cast(&DataType::Struct(other_supertype_fields))?; - return self_super.is_in(&other_super); - } - - let mut anyvalues = Vec::with_capacity(other.len() * 
other.fields().len()); - // SAFETY: - // the iterator is unsafe as the lifetime is tied to the iterator - // so we copy to an owned buffer first - other.into_iter().for_each(|vals| { - anyvalues.extend_from_slice(vals); - }); - - // then we fill the set - let mut set = PlHashSet::with_capacity(other.len()); - for key in anyvalues.chunks_exact(other.fields().len()) { - set.insert(key); - } - // physical self - let self_ca = self.cast(&self.dtype().to_physical()).unwrap(); - let self_ca = self_ca.struct_().unwrap(); - - // and then we check for membership - let mut ca: BooleanChunked = self_ca - .into_iter() - .map(|vals| { - // If all rows are null we see the struct row as missing. - if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { - Some(set.contains(&vals)) - } else { - None - } - }) - .collect(); - ca.rename(self.name()); - Ok(ca) - }, - } - } -} - -#[cfg(test)] -mod test { - use crate::prelude::*; - - #[test] - fn test_is_in() -> PolarsResult<()> { - let a = Int32Chunked::new("a", &[1, 2, 3, 4]); - let b = Int64Chunked::new("b", &[4, 5, 1]); - - let out = a.is_in(&b.into_series())?; - assert_eq!( - Vec::from(&out), - [Some(true), Some(false), Some(false), Some(true)] - ); - - let a = Utf8Chunked::new("a", &["a", "b", "c", "d"]); - let b = Utf8Chunked::new("b", &["d", "e", "c"]); - - let out = a.is_in(&b.into_series())?; - assert_eq!( - Vec::from(&out), - [Some(false), Some(false), Some(true), Some(true)] - ); - Ok(()) - } -} diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 6e0cc146a6b8..280ecc63a3a7 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -32,8 +32,6 @@ mod filter; pub mod full; #[cfg(feature = "interpolate")] mod interpolate; -#[cfg(feature = "is_in")] -mod is_in; mod len; #[cfg(feature = "zip_with")] pub(crate) mod min_max_binary; @@ -714,15 +712,6 @@ pub trait ChunkPeaks { } } -/// Check if element is member of list array -#[cfg(feature = "is_in")] -pub trait IsIn { - /// Check if elements of this array are in the right Series, or List values of the right Series. - fn is_in(&self, _other: &Series) -> PolarsResult { - unimplemented!() - } -} - /// Repeat the values `n` times. 
#[cfg(feature = "repeat_by")] pub trait RepeatBy { diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index d2277e2a1f47..d03e48836358 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -274,10 +274,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index a1b54ab136c4..bc29640ff17d 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -328,10 +328,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index f3bd53c3c012..dccea0498d0b 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -10,8 +10,6 @@ use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; -#[cfg(feature = "is_in")] -use crate::frame::hash_join::_check_categorical_src; use crate::prelude::*; use crate::series::implementations::SeriesWrap; @@ -358,11 +356,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - _check_categorical_src(self.dtype(), other.dtype())?; - self.0.logical().is_in(&other.to_physical_repr()) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { let out = self.0.logical().repeat_by(by)?; diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index a442535efb2b..d2de3c250129 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -433,10 +433,6 @@ macro_rules! 
impl_dyn_series { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { match self.0.dtype() { diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index dce1c7fd9385..eeae18499da2 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -440,10 +440,6 @@ impl SeriesTrait for SeriesWrap { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { Ok(self diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 85e8f5b8ed9b..4d7a97c16bdc 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -444,10 +444,6 @@ impl SeriesTrait for SeriesWrap { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { Ok(self diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index b0a8a246b79d..26704508a154 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -365,10 +365,6 @@ macro_rules! impl_dyn_series { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 0a7b471dc492..f3ab17eb7da3 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -459,10 +459,6 @@ macro_rules! 
impl_dyn_series { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs index 21bf3f78a9e9..db79be60ae12 100644 --- a/crates/polars-core/src/series/implementations/struct_.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -346,11 +346,6 @@ impl SeriesTrait for SeriesWrap { self.0.apply_fields(|s| s.shift(periods)).into_series() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } - fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index 952591fed652..3bcf1d3fdefa 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -289,10 +289,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 79033df353a5..fc212efdc2fc 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -501,11 +501,6 @@ pub trait SeriesTrait: invalid_operation_panic!(peak_min, self) } - /// Check if elements of this Series are in the right Series, or List values of the right Series. 
- #[cfg(feature = "is_in")] - fn is_in(&self, _other: &Series) -> PolarsResult { - polars_bail!(opq = is_in, self._dtype()); - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, _by: &IdxCa) -> PolarsResult { polars_bail!(opq = repeat_by, self._dtype()); diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index f2317ea7f5dc..eb432e860d98 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -78,7 +78,7 @@ extract_jsonpath = ["polars-plan/extract_jsonpath", "polars-ops/extract_jsonpath # operations approx_unique = ["polars-plan/approx_unique"] -is_in = ["polars-plan/is_in"] +is_in = ["polars-plan/is_in", "polars-ops/is_in"] repeat_by = ["polars-plan/repeat_by"] round_series = ["polars-plan/round_series", "polars-ops/round_series"] is_first = ["polars-plan/is_first"] diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 305476bec432..7819ffaa7f35 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -89,3 +89,4 @@ list_take = [] list_sets = [] list_any_all = [] extract_groups = ["dtype-struct", "polars-core/regex"] +is_in = ["polars-core/reinterpret"] diff --git a/crates/polars-ops/src/series/ops/is_in.rs b/crates/polars-ops/src/series/ops/is_in.rs new file mode 100644 index 000000000000..6574d78d070f --- /dev/null +++ b/crates/polars-ops/src/series/ops/is_in.rs @@ -0,0 +1,400 @@ +use std::hash::Hash; + +use polars_core::prelude::*; +use polars_core::utils::{try_get_supertype, CustomIterTools}; +use polars_core::with_match_physical_integer_polars_type; + +fn is_in_helper<'a, T>(ca: &'a ChunkedArray, other: &Series) -> PolarsResult +where + T: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, +{ + let mut set = PlHashSet::with_capacity(other.len()); + + let other = ca.unpack_series_matching_type(other)?; + other.downcast_iter().for_each(|iter| { + iter.iter().for_each(|opt_val| { + if let Some(v) = opt_val { + set.insert(v); + } + }) + }); + Ok(ca.apply_values_generic(|val| set.contains(&val))) +} + +fn is_in_numeric(ca_in: &ChunkedArray, other: &Series) -> PolarsResult +where + T: PolarsIntegerType, + T::Native: Hash + Eq, +{ + // We check implicitly cast to supertype here + match other.dtype() { + DataType::List(dt) => { + let st = try_get_supertype(ca_in.dtype(), dt)?; + if &st != ca_in.dtype() || **dt != st { + let left = ca_in.cast(&st)?; + let right = other.cast(&DataType::List(Box::new(st)))?; + return is_in(&left, &right); + } + + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + + other + .list()? 
+ .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + _ => { + // first make sure that the types are equal + if ca_in.dtype() != other.dtype() { + let st = try_get_supertype(ca_in.dtype(), other.dtype())?; + let left = ca_in.cast(&st)?; + let right = other.cast(&st)?; + return is_in(&left, &right); + } + is_in_helper(ca_in, other) + } + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +fn is_in_utf8(ca_in: &Utf8Chunked, other: &Series) -> PolarsResult { + match other.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::List(dt) if matches!(&**dt, DataType::Categorical(_)) => { + if let DataType::Categorical(Some(rev_map)) = &**dt { + let opt_val = ca_in.get(0); + + let other = other.list()?; + match opt_val { + None => { + let mut ca: BooleanChunked = other + .amortized_iter() + .map(|opt_s| opt_s.map(|s| s.as_ref().null_count() > 0) == Some(true)) + .collect_trusted(); + ca.rename(ca_in.name()); + Ok(ca) + }, + Some(value) => { + match rev_map.find(value) { + // all false + None => Ok(BooleanChunked::full(ca_in.name(), false, other.len())), + Some(idx) => { + let mut ca: BooleanChunked = other + .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let s = s.as_ref().to_physical_repr(); + let ca = s.as_ref().u32().unwrap(); + if ca.null_count() == 0 { + ca.into_no_null_iter().any(|a| a == idx) + } else { + ca.into_iter().any(|a| a == Some(idx)) + } + }) == Some(true) + }) + .collect_trusted(); + ca.rename(ca_in.name()); + Ok(ca) + }, + } + }, + } + } else { + unreachable!() + } + }, + DataType::List(dt) if DataType::Utf8 == **dt => is_in_binary( + &ca_in.as_binary(), + &other + .cast(&DataType::List(Box::new(DataType::Binary))) + .unwrap(), + ), + DataType::Utf8 => is_in_binary(&ca_in.as_binary(), &other.cast(&DataType::Binary).unwrap()), + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } +} + +fn is_in_binary(ca_in: &BinaryChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if DataType::Binary == **dt => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + other + .list()? 
+ .amortized_iter() + .map(|opt_b| { + opt_b.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + DataType::Binary => { + is_in_helper(ca_in, other) + } + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +fn is_in_boolean(ca_in: &BooleanChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if ca_in.dtype() == &**dt => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + // safety: we know the iterators len + unsafe { + other + .list()? + .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .trust_my_length(other.len()) + .collect_trusted() + } + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + DataType::Boolean => { + let other = other.bool().unwrap(); + let has_true = other.any(); + let nc = other.null_count(); + + let has_false = if nc == 0 { + !other.all() + } else { + !(other.sum().unwrap() as usize + nc) == other.len() + }; + Ok(ca_in.apply_values(|v| if v { has_true } else { has_false })) + } + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +#[cfg(feature = "dtype-struct")] +fn is_in_struct(ca_in: &StructChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(_) => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let mut value = vec![]; + let left = ca_in.clone().into_series(); + let av = left.get(0).unwrap(); + if let AnyValue::Struct(_, _, _) = av { + av._materialize_struct_av(&mut value); + } + other + .list()? 
+ .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().struct_().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in + .into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().struct_().unwrap(); + ca.into_iter().any(|a| a == val) + }, + _ => false, + }) + .collect() + }; + ca.rename(ca_in.name()); + Ok(ca) + }, + _ => { + let other = other.cast(&other.dtype().to_physical()).unwrap(); + let other = other.struct_()?; + + polars_ensure!( + ca_in.fields().len() == other.fields().len(), + ComputeError: "`is_in`: mismatch in the number of struct fields: {} and {}", + ca_in.fields().len(), other.fields().len() + ); + + // first make sure that the types are equal + let ca_in_dtypes: Vec<_> = ca_in.fields().iter().map(|f| f.dtype()).collect(); + let other_dtypes: Vec<_> = other.fields().iter().map(|f| f.dtype()).collect(); + if ca_in_dtypes != other_dtypes { + let ca_in_names = ca_in.fields().iter().map(|f| f.name()); + let other_names = other.fields().iter().map(|f| f.name()); + let supertypes = ca_in_dtypes + .iter() + .zip(other_dtypes.iter()) + .map(|(dt1, dt2)| try_get_supertype(dt1, dt2)) + .collect::, _>>()?; + let ca_in_supertype_fields = ca_in_names + .zip(supertypes.iter()) + .map(|(name, st)| Field::new(name, st.clone())) + .collect(); + let ca_in_super = ca_in.cast(&DataType::Struct(ca_in_supertype_fields))?; + let other_supertype_fields = other_names + .zip(supertypes.iter()) + .map(|(name, st)| Field::new(name, st.clone())) + .collect(); + let other_super = other.cast(&DataType::Struct(other_supertype_fields))?; + return is_in(&ca_in_super, &other_super); + } + + let mut anyvalues = Vec::with_capacity(other.len() * other.fields().len()); + // SAFETY: + // the iterator is unsafe as the lifetime is tied to the iterator + // so we copy to an owned buffer first + other.into_iter().for_each(|vals| { + anyvalues.extend_from_slice(vals); + }); + + // then we fill the set + let mut set = PlHashSet::with_capacity(other.len()); + for key in anyvalues.chunks_exact(other.fields().len()) { + set.insert(key); + } + // physical ca_in + let ca_in_ca = ca_in.cast(&ca_in.dtype().to_physical()).unwrap(); + let ca_in_ca = ca_in_ca.struct_().unwrap(); + + // and then we check for membership + let mut ca: BooleanChunked = ca_in_ca + .into_iter() + .map(|vals| { + // If all rows are null we see the struct row as missing. 
+ if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { + Some(set.contains(&vals)) + } else { + None + } + }) + .collect(); + ca.rename(ca_in.name()); + Ok(ca) + }, + } +} + +pub fn is_in(s: &Series, other: &Series) -> PolarsResult { + match s.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_) => { + use polars_core::frame::hash_join::_check_categorical_src; + _check_categorical_src(s.dtype(), other.dtype())?; + let ca = s.categorical().unwrap(); + let ca = ca.logical(); + is_in_numeric(ca, &other.to_physical_repr()) + }, + #[cfg(feature = "dtype-struct")] + DataType::Struct(_) => { + let ca = s.struct_().unwrap(); + is_in_struct(ca, other) + }, + DataType::Utf8 => { + let ca = s.utf8().unwrap(); + is_in_utf8(ca, other) + }, + DataType::Binary => { + let ca = s.binary().unwrap(); + is_in_binary(ca, other) + }, + DataType::Boolean => { + let ca = s.bool().unwrap(); + is_in_boolean(ca, other) + }, + DataType::Float32 => { + let other = other.cast(&DataType::Float32)?; + let other = other.f32().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = s.f32().unwrap(); + let s = ca.reinterpret_unsigned(); + is_in(&s, &other) + }, + DataType::Float64 => { + let other = other.cast(&DataType::Float64)?; + let other = other.f64().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = s.f64().unwrap(); + let s = ca.reinterpret_unsigned(); + is_in(&s, &other) + }, + dt if dt.to_physical().is_integer() => { + let s = s.to_physical_repr(); + with_match_physical_integer_polars_type!(s.dtype(), |$T| { + let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); + is_in_numeric(ca, other) + }) + }, + dt => polars_bail!(opq = is_int, dt), + } +} diff --git a/crates/polars-ops/src/series/ops/mod.rs b/crates/polars-ops/src/series/ops/mod.rs index 0781a8655e9d..6abde8b44886 100644 --- a/crates/polars-ops/src/series/ops/mod.rs +++ b/crates/polars-ops/src/series/ops/mod.rs @@ -10,6 +10,8 @@ mod floor_divide; mod fused; #[cfg(feature = "is_first")] mod is_first; +#[cfg(feature = "is_in")] +mod is_in; #[cfg(feature = "is_unique")] mod is_unique; #[cfg(feature = "log")] @@ -36,6 +38,8 @@ pub use floor_divide::*; pub use fused::*; #[cfg(feature = "is_first")] pub use is_first::*; +#[cfg(feature = "is_in")] +pub use is_in::*; #[cfg(feature = "is_unique")] pub use is_unique::*; #[cfg(feature = "log")] diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 9225ec0906fd..aefc78e45f4f 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -88,7 +88,7 @@ extract_jsonpath = ["polars-ops/extract_jsonpath"] # operations approx_unique = ["polars-ops/approx_unique"] -is_in = ["polars-core/is_in"] +is_in = ["polars-ops/is_in"] repeat_by = ["polars-core/repeat_by"] round_series = ["polars-core/round_series"] is_first = ["polars-core/is_first", "polars-ops/is_first"] diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index 41d9ceebcaeb..2f534f95de93 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -171,7 +171,7 @@ fn is_duplicated(s: &Series) -> PolarsResult { fn is_in(s: &mut [Series]) -> PolarsResult> { let left = &s[0]; let other = &s[1]; - left.is_in(other).map(|ca| Some(ca.into_series())) + polars_ops::prelude::is_in(left, other).map(|ca| Some(ca.into_series())) } fn any_horizontal(s: &mut [Series]) -> PolarsResult> { diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs 
b/crates/polars-plan/src/dsl/function_expr/list.rs index 3d4cceaa8680..1aed68b37cd9 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -55,7 +55,7 @@ pub(super) fn contains(args: &mut [Series]) -> PolarsResult> { let list = &args[0]; let is_in = &args[1]; - is_in.is_in(list).map(|mut ca| { + polars_ops::prelude::is_in(is_in, list).map(|mut ca| { ca.rename(list.name()); Some(ca.into_series()) }) diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 7b6d0f3d3097..7763c91d807e 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -104,7 +104,7 @@ sort_multiple = ["polars-core/sort_multiple"] # extra operations approx_unique = ["polars-lazy/approx_unique", "polars-ops/approx_unique"] -is_in = ["polars-core/is_in", "polars-lazy/is_in"] +is_in = ["polars-lazy/is_in"] zip_with = ["polars-core/zip_with"] round_series = ["polars-core/round_series", "polars-lazy/round_series", "polars-ops/round_series"] checked_arithmetic = ["polars-core/checked_arithmetic"] diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 530bb4db3da6..01516feff4bf 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -216,8 +216,8 @@ //! - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) //! - `partition_by` - Split into multiple DataFrames partitioned by groups. //! * `Series`/`Expression` operations: -//! - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn) -//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip) +//! - `is_in` - Check for membership in `Series`. +//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip). //! - `round_series` - round underlying float types of `Series`. //! - `repeat_by` - [Repeat element in an Array N times, where N is given by another array. //! - `is_first` - Check if element is first unique value. 
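For reference, a minimal Python-level sketch of the membership semantics these `is_in` kernels back (illustrative only, not taken from the patch; assumes a py-polars build with the `is_in` feature enabled):

>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3], "lst": [[1, 9], [4, 5], [3, 6]]})
>>> df.select(pl.col("a").is_in([1, 3]))         # membership against a literal collection
>>> df.select(pl.col("a").is_in(pl.col("lst")))  # row-wise membership against a list column
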
diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index ec3301bbd930..4a59dd65353c 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) # noqa: PTH207 + next(glob.iglob(file, recursive=True)) return True except StopIteration: return False From c92260db3cc81601d94eb396bf52b623ec0e96b6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 11:12:03 +0200 Subject: [PATCH 34/55] depr(python): Rename `groupby` to `group_by` (#10656) --- .github/deploy_manylinux.sh | 2 +- .github/workflows/release-python.yml | 2 +- README.md | 2 +- .../dataframe/{groupby.rst => group_by.rst} | 4 +- .../docs/source/reference/dataframe/index.rst | 2 +- .../reference/dataframe/modify_select.rst | 3 + .../lazyframe/{groupby.rst => group_by.rst} | 4 +- .../docs/source/reference/lazyframe/index.rst | 2 +- .../reference/lazyframe/modify_select.rst | 3 + py-polars/docs/source/reference/selectors.rst | 2 +- py-polars/polars/dataframe/frame.py | 238 ++++++++++++++-- .../dataframe/{groupby.py => group_by.py} | 90 +++--- py-polars/polars/expr/expr.py | 48 ++-- py-polars/polars/expr/list.py | 2 +- py-polars/polars/functions/lazy.py | 4 +- py-polars/polars/lazyframe/frame.py | 263 +++++++++++++++--- .../lazyframe/{groupby.py => group_by.py} | 48 ++-- py-polars/polars/selectors.py | 6 +- py-polars/polars/series/list.py | 2 +- py-polars/polars/series/series.py | 2 +- .../tests/benchmark/run_h2oai_benchmark.py | 42 +-- py-polars/tests/benchmark/test_release.py | 10 +- .../tests/parametric/test_groupby_rolling.py | 4 +- py-polars/tests/unit/dataframe/test_df.py | 42 +-- py-polars/tests/unit/datatypes/test_array.py | 8 +- .../tests/unit/datatypes/test_categorical.py | 10 +- .../tests/unit/datatypes/test_decimal.py | 2 +- py-polars/tests/unit/datatypes/test_float.py | 8 +- py-polars/tests/unit/datatypes/test_list.py | 28 +- py-polars/tests/unit/datatypes/test_struct.py | 12 +- .../tests/unit/datatypes/test_temporal.py | 68 ++--- .../tests/unit/functions/test_as_datatype.py | 4 +- .../tests/unit/functions/test_whenthen.py | 4 +- py-polars/tests/unit/io/test_lazy_parquet.py | 2 +- py-polars/tests/unit/io/test_parquet.py | 4 +- py-polars/tests/unit/namespaces/test_list.py | 4 +- .../tests/unit/namespaces/test_string.py | 2 +- .../unit/operations/test_aggregations.py | 30 +- py-polars/tests/unit/operations/test_apply.py | 18 +- .../tests/unit/operations/test_explode.py | 14 +- .../tests/unit/operations/test_filter.py | 8 +- .../{test_groupby.py => test_group_by.py} | 204 +++++++++----- ...by_rolling.py => test_group_by_rolling.py} | 46 +-- py-polars/tests/unit/operations/test_join.py | 4 +- .../tests/unit/operations/test_join_asof.py | 2 +- .../tests/unit/operations/test_profile.py | 4 +- .../tests/unit/operations/test_random.py | 6 +- .../tests/unit/operations/test_rolling.py | 60 ++-- py-polars/tests/unit/operations/test_sort.py | 18 +- .../tests/unit/streaming/test_streaming.py | 20 +- .../unit/streaming/test_streaming_cse.py | 4 +- ..._groupby.py => test_streaming_group_by.py} | 70 ++--- .../unit/streaming/test_streaming_unique.py | 2 +- py-polars/tests/unit/test_context.py | 2 +- py-polars/tests/unit/test_cse.py | 14 +- py-polars/tests/unit/test_datatypes.py | 2 +- py-polars/tests/unit/test_empty.py | 6 +- py-polars/tests/unit/test_errors.py | 26 +- py-polars/tests/unit/test_expr_multi_cols.py | 2 +- 
py-polars/tests/unit/test_exprs.py | 16 +- py-polars/tests/unit/test_fmt.py | 2 +- py-polars/tests/unit/test_interop.py | 2 +- py-polars/tests/unit/test_lazy.py | 40 +-- py-polars/tests/unit/test_projections.py | 14 +- py-polars/tests/unit/test_queries.py | 28 +- py-polars/tests/unit/test_rows.py | 4 +- py-polars/tests/unit/test_schema.py | 12 +- py-polars/tests/unit/test_selectors.py | 4 +- py-polars/tests/unit/test_show_graph.py | 2 +- py-polars/tests/unit/test_sql.py | 2 +- 70 files changed, 1051 insertions(+), 620 deletions(-) rename py-polars/docs/source/reference/dataframe/{groupby.rst => group_by.rst} (87%) rename py-polars/docs/source/reference/lazyframe/{groupby.rst => group_by.rst} (77%) rename py-polars/polars/dataframe/{groupby.py => group_by.py} (94%) rename py-polars/polars/lazyframe/{groupby.py => group_by.py} (94%) rename py-polars/tests/unit/operations/{test_groupby.py => test_group_by.py} (76%) rename py-polars/tests/unit/operations/{test_groupby_rolling.py => test_group_by_rolling.py} (84%) rename py-polars/tests/unit/streaming/{test_streaming_groupby.py => test_streaming_group_by.py} (88%) diff --git a/.github/deploy_manylinux.sh b/.github/deploy_manylinux.sh index 993f4b39f2f5..2d27619e2a53 100644 --- a/.github/deploy_manylinux.sh +++ b/.github/deploy_manylinux.sh @@ -19,7 +19,7 @@ maturin publish \ # now compile polars with bigidx feature sed -i 's/name = "polars"/name = "polars-u64-idx"/' pyproject.toml # a brittle hack to insert the 'bigidx' feature -sed -i 's/"dynamic_groupby",/"dynamic_groupby",\n"bigidx",/' Cargo.toml +sed -i 's/"dynamic_group_by",/"dynamic_group_by",\n"bigidx",/' Cargo.toml maturin publish \ --skip-existing \ diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index f8819dd9a923..2e9a611f52c1 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: run: | sed -i 's/name = "polars"/name = "polars-u64-idx"/' py-polars/pyproject.toml # A brittle hack to insert the 'bigidx' feature - sed -i 's/"dynamic_groupby",/"dynamic_groupby",\n"bigidx",/' py-polars/Cargo.toml + sed -i 's/"dynamic_group_by",/"dynamic_group_by",\n"bigidx",/' py-polars/Cargo.toml - name: Publish wheel uses: PyO3/maturin-action@v1 diff --git a/README.md b/README.md index 1fee568eb25c..6a45f6a1f28f 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ shape: (5, 8) >>> # and continue in python >>> lf = context.execute(query) >>> (lf.join(other_table) -... .groupby("foo") +... .group_by("foo") ... .agg( ... pl.col("sum_v1").count() ... ).collect()) diff --git a/py-polars/docs/source/reference/dataframe/groupby.rst b/py-polars/docs/source/reference/dataframe/group_by.rst similarity index 87% rename from py-polars/docs/source/reference/dataframe/groupby.rst rename to py-polars/docs/source/reference/dataframe/group_by.rst index bd25b45699e9..5855d518f492 100644 --- a/py-polars/docs/source/reference/dataframe/groupby.rst +++ b/py-polars/docs/source/reference/dataframe/group_by.rst @@ -2,9 +2,9 @@ GroupBy ======= -This namespace is available after calling :code:`DataFrame.groupby(...)`. +This namespace is available after calling :code:`DataFrame.group_by(...)`. -.. currentmodule:: polars.dataframe.groupby +.. currentmodule:: polars.dataframe.group_by .. 
autosummary:: :toctree: api/ diff --git a/py-polars/docs/source/reference/dataframe/index.rst b/py-polars/docs/source/reference/dataframe/index.rst index 5fdaebbf9e27..ffcc810cc829 100644 --- a/py-polars/docs/source/reference/dataframe/index.rst +++ b/py-polars/docs/source/reference/dataframe/index.rst @@ -13,7 +13,7 @@ This page gives an overview of all public DataFrame methods. computation descriptive export - groupby + group_by modify_select miscellaneous diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index fad9e70b34b0..7feb84ffbc6c 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -20,6 +20,9 @@ Manipulation/selection DataFrame.find_idx_by_name DataFrame.get_column DataFrame.get_columns + DataFrame.group_by + DataFrame.group_by_dynamic + DataFrame.group_by_rolling DataFrame.groupby DataFrame.groupby_dynamic DataFrame.groupby_rolling diff --git a/py-polars/docs/source/reference/lazyframe/groupby.rst b/py-polars/docs/source/reference/lazyframe/group_by.rst similarity index 77% rename from py-polars/docs/source/reference/lazyframe/groupby.rst rename to py-polars/docs/source/reference/lazyframe/group_by.rst index 9745656e0bc7..05e786726e3a 100644 --- a/py-polars/docs/source/reference/lazyframe/groupby.rst +++ b/py-polars/docs/source/reference/lazyframe/group_by.rst @@ -2,9 +2,9 @@ GroupBy ======= -This namespace comes available by calling `LazyFrame.groupby(..)`. +This namespace comes available by calling `LazyFrame.group_by(..)`. -.. currentmodule:: polars.lazyframe.groupby +.. currentmodule:: polars.lazyframe.group_by .. autosummary:: :toctree: api/ diff --git a/py-polars/docs/source/reference/lazyframe/index.rst b/py-polars/docs/source/reference/lazyframe/index.rst index 702cb4bf2cca..70f2b5434a7a 100644 --- a/py-polars/docs/source/reference/lazyframe/index.rst +++ b/py-polars/docs/source/reference/lazyframe/index.rst @@ -11,7 +11,7 @@ This page gives an overview of all public LazyFrame methods. 
aggregation attributes descriptive - groupby + group_by modify_select miscellaneous diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index f52bab24e662..2257467fb127 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -16,6 +16,9 @@ Manipulation/selection LazyFrame.fill_null LazyFrame.filter LazyFrame.first + LazyFrame.group_by + LazyFrame.group_by_dynamic + LazyFrame.group_by_rolling LazyFrame.groupby LazyFrame.groupby_dynamic LazyFrame.groupby_rolling diff --git a/py-polars/docs/source/reference/selectors.rst b/py-polars/docs/source/reference/selectors.rst index 1a0a71a45541..064cd530f968 100644 --- a/py-polars/docs/source/reference/selectors.rst +++ b/py-polars/docs/source/reference/selectors.rst @@ -28,7 +28,7 @@ Importing "z": ["a", "b", "a", "b", "b"], }, ) - df.groupby(by=cs.string()).agg(cs.numeric().sum()) + df.group_by(by=cs.string()).agg(cs.numeric().sum()) Set operations -------------- diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 180dfc4daebd..9c4f3656f7a9 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -30,7 +30,7 @@ import polars._reexport as pl from polars import functions as F from polars.dataframe._html import NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy, GroupBy, RollingGroupBy +from polars.dataframe.group_by import DynamicGroupBy, GroupBy, RollingGroupBy from polars.datatypes import ( FLOAT_DTYPES, INTEGER_DTYPES, @@ -84,6 +84,7 @@ from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( deprecate_function, + deprecate_renamed_function, deprecate_renamed_methods, deprecate_renamed_parameter, ) @@ -4923,14 +4924,14 @@ def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ return self._from_pydf(self._df.with_row_count(name, offset)) - def groupby( + def group_by( self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = False, ) -> GroupBy: """ - Start a groupby operation. + Start a group by operation. Parameters ---------- @@ -4941,7 +4942,7 @@ def groupby( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. Settings this to ``True`` blocks the possibility to run on the streaming engine. @@ -4966,7 +4967,7 @@ def groupby( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -4981,7 +4982,7 @@ def groupby( Set ``maintain_order=True`` to ensure the order of the groups is consistent with the input. - >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) shape: (3, 2) ┌─────┬───────────┐ │ a ┆ c │ @@ -4995,7 +4996,7 @@ def groupby( Group by multiple columns by passing a list of column names. - >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5011,7 +5012,7 @@ def groupby( Or use positional arguments to group by multiple columns in the same way. 
Expressions are also accepted. - >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5026,7 +5027,7 @@ def groupby( The ``GroupBy`` object returned by this method is iterable, returning the name and data of each group. - >>> for name, data in df.groupby("a"): # doctest: +SKIP + >>> for name, data in df.group_by("a"): # doctest: +SKIP ... print(name) ... print(data) ... @@ -5063,7 +5064,7 @@ def groupby( """ return GroupBy(self, by, *more_by, maintain_order=maintain_order) - def groupby_rolling( + def group_by_rolling( self, index_column: IntoExpr, *, @@ -5076,9 +5077,9 @@ def groupby_rolling( """ Create rolling groups based on a time, Int32, or Int64 column. - Different from a ``dynamic_groupby`` the windows are now determined by the + Different from a ``group_by_dynamic`` the windows are now determined by the individual values and are not of constant intervals. For constant intervals use - *groupby_dynamic*. + :func:`DataFrame.group_by_dynamic`. If you have a time series ````, then by default the windows created will be @@ -5115,7 +5116,7 @@ def groupby_rolling( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_rolling on an integer column, the windows are defined by: + In case of a group_by_rolling on an integer column, the windows are defined by: - **"1i" # length 1** - **"10i" # length 10** @@ -5128,7 +5129,7 @@ def groupby_rolling( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a rolling groupby on indices, dtype needs to be one of + In case of a rolling group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. period @@ -5155,7 +5156,7 @@ def groupby_rolling( See Also -------- - groupby_dynamic + group_by_dynamic Examples -------- @@ -5170,7 +5171,7 @@ def groupby_rolling( >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() ... ) - >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( ... [ ... pl.sum("a").alias("sum_a"), ... pl.min("a").alias("min_a"), @@ -5200,7 +5201,7 @@ def groupby_rolling( self, index_column, period, offset, closed, by, check_sorted ) - def groupby_dynamic( + def group_by_dynamic( self, index_column: IntoExpr, *, @@ -5218,7 +5219,7 @@ def groupby_dynamic( Group based on a time value (or index value of type Int32, Int64). Time windows are calculated and rows are assigned to windows. Different from a - normal groupby is that a row can be member of multiple groups. The time/index + normal group by is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame. @@ -5255,7 +5256,7 @@ def groupby_dynamic( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". 
- In case of a groupby_dynamic on an integer column, the windows are defined by: + In case of a group_by_dynamic on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -5272,7 +5273,7 @@ def groupby_dynamic( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a dynamic groupby on indices, dtype needs to be one of + In case of a dynamic group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. every @@ -5325,7 +5326,7 @@ def groupby_dynamic( .. code-block:: python # polars - df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) is equivalent to @@ -5371,7 +5372,7 @@ def groupby_dynamic( Group by windows of 1 hour starting at 2021-12-16 00:00:00. - >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( ... [ ... pl.col("time").min().alias("time_min"), ... pl.col("time").max().alias("time_max"), @@ -5391,7 +5392,7 @@ def groupby_dynamic( The window boundaries can also be added to the aggregation result - >>> df.groupby_dynamic( + >>> df.group_by_dynamic( ... "time", every="1h", include_boundaries=True, closed="right" ... ).agg([pl.col("time").count().alias("time_count")]) shape: (4, 4) @@ -5409,7 +5410,7 @@ def groupby_dynamic( When closed="left", should not include right end of interval [lower_bound, upper_bound) - >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( ... [ ... pl.col("time").count().alias("time_count"), ... pl.col("time").alias("time_agg_list"), @@ -5429,7 +5430,7 @@ def groupby_dynamic( When closed="both" the time values at the window boundaries belong to 2 groups. - >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( ... [pl.col("time").count().alias("time_count")] ... ) shape: (5, 2) @@ -5445,7 +5446,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ 1 │ └─────────────────────┴────────────┘ - Dynamic groupbys can also be combined with grouping on normal keys + Dynamic group bys can also be combined with grouping on normal keys >>> df = pl.DataFrame( ... { @@ -5473,7 +5474,7 @@ def groupby_dynamic( │ 2021-12-16 02:30:00 ┆ a │ │ 2021-12-16 03:00:00 ┆ a │ └─────────────────────┴────────┘ - >>> df.groupby_dynamic( + >>> df.group_by_dynamic( ... "time", ... every="1h", ... closed="both", @@ -5495,7 +5496,7 @@ def groupby_dynamic( │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ - Dynamic groupby on an index column + Dynamic group by on an index column >>> df = pl.DataFrame( ... { @@ -5504,7 +5505,7 @@ def groupby_dynamic( ... } ... ) >>> ( - ... df.groupby_dynamic( + ... df.group_by_dynamic( ... "idx", ... every="2i", ... 
period="3i", @@ -8512,7 +8513,7 @@ def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> i In aggregate context there is also an equivalent method for returning the unique values per-group: - >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() Examples -------- @@ -9677,6 +9678,183 @@ def update( """ return self.lazy().update(other.lazy(), on, how).collect(no_optimization=True) + @deprecate_renamed_function("group_by", version="0.19.0") + def groupby( + self, + by: IntoExpr | Iterable[IntoExpr], + *more_by: IntoExpr, + maintain_order: bool = False, + ) -> GroupBy: + """ + Start a group by operation. + + Alias for :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + return self.group_by(by, *more_by, maintain_order=maintain_order) + + @deprecate_renamed_function("group_by_rolling", version="0.19.0") + def groupby_rolling( + self, + index_column: IntoExpr, + *, + period: str | timedelta, + offset: str | timedelta | None = None, + closed: ClosedInterval = "right", + by: IntoExpr | Iterable[IntoExpr] | None = None, + check_sorted: bool = True, + ) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + """ + return self.group_by_rolling( + index_column, + period=period, + offset=offset, + closed=closed, + by=by, + check_sorted=check_sorted, + ) + + @deprecate_renamed_function("group_by_dynamic", version="0.19.0") + def groupby_dynamic( + self, + index_column: IntoExpr, + *, + every: str | timedelta, + period: str | timedelta | None = None, + offset: str | timedelta | None = None, + truncate: bool = True, + include_boundaries: bool = False, + closed: ClosedInterval = "left", + by: IntoExpr | Iterable[IntoExpr] | None = None, + start_by: StartBy = "window", + check_sorted: bool = True, + ) -> DynamicGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to 'every' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it's harder to + parallelize + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'} + The strategy to determine the start of the first window by. + + * 'window': Truncate the start of the window with the 'every' argument. + Note that weekly windows start on Monday. + * 'datapoint': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``'w'``): + + * 'monday': Start the window on the Monday before the first data point. + * 'tuesday': Start the window on the Tuesday before the first data point. + * ... + * 'sunday': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). 
+ + """ # noqa: W505 + return self.group_by_dynamic( + index_column, + every=every, + period=period, + offset=offset, + truncate=truncate, + include_boundaries=include_boundaries, + closed=closed, + by=by, + start_by=start_by, + check_sorted=check_sorted, + ) + def _prepare_other_arg(other: Any, length: int | None = None) -> Series: # if not a series create singleton series such that it will broadcast diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/group_by.py similarity index 94% rename from py-polars/polars/dataframe/groupby.py rename to py-polars/polars/dataframe/group_by.py index 4f6259d6bcd3..306b2e575ea7 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/group_by.py @@ -36,14 +36,14 @@ def __init__( maintain_order: bool, ): """ - Utility class for performing a groupby operation over the given dataframe. + Utility class for performing a group by operation over the given dataframe. - Generated by calling ``df.groupby(...)``. + Generated by calling ``df.group_by(...)``. Parameters ---------- df - DataFrame to perform the groupby operation over. + DataFrame to perform the group by operation over. by Column or columns to group by. Accepts expression input. Strings are parsed as column names. @@ -51,7 +51,7 @@ def __init__( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. """ self.df = df @@ -61,14 +61,14 @@ def __init__( def __iter__(self) -> Self: """ - Allows iteration over the groups of the groupby operation. + Allows iteration over the groups of the group by operation. Each group is represented by a tuple of (name, data). Examples -------- >>> df = pl.DataFrame({"foo": ["a", "a", "b"], "bar": [1, 2, 3]}) - >>> for name, data in df.groupby("foo"): # doctest: +SKIP + >>> for name, data in df.group_by("foo"): # doctest: +SKIP ... print(name) ... print(data) ... @@ -97,7 +97,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .agg(F.col(temp_col)) .collect(no_optimization=True) ) @@ -135,12 +135,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -158,7 +158,7 @@ def agg( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.groupby("a").agg([pl.col("b"), pl.col("c")]) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg([pl.col("b"), pl.col("c")]) # doctest: +IGNORE_RESULT shape: (3, 3) ┌─────┬───────────┬───────────┐ │ a ┆ b ┆ c │ @@ -174,7 +174,7 @@ def agg( Compute the sum of a column for each group. - >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -188,7 +188,7 @@ def agg( Compute multiple aggregates at once by passing a list of expressions. 
- >>> df.groupby("a").agg([pl.sum("b"), pl.mean("c")]) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg([pl.sum("b"), pl.mean("c")]) # doctest: +IGNORE_RESULT shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -202,7 +202,7 @@ def agg( Or use positional arguments to compute multiple aggregations in the same way. - >>> df.groupby("a").agg( + >>> df.group_by("a").agg( ... pl.sum("b").suffix("_sum"), ... (pl.col("c") ** 2).mean().suffix("_mean_squared"), ... ) # doctest: +IGNORE_RESULT @@ -219,7 +219,7 @@ def agg( Use keyword arguments to easily name your expression inputs. - >>> df.groupby("a").agg( + >>> df.group_by("a").agg( ... b_sum=pl.sum("b"), ... c_mean_squared=(pl.col("c") ** 2).mean(), ... ) # doctest: +IGNORE_RESULT @@ -237,7 +237,7 @@ def agg( """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .agg(*aggs, **named_aggs) .collect(no_optimization=True) ) @@ -296,7 +296,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: For each color group sample two rows: - >>> df.groupby("color").apply( + >>> df.group_by("color").apply( ... lambda group_df: group_df.sample(2) ... ) # doctest: +IGNORE_RESULT shape: (4, 3) @@ -367,7 +367,7 @@ def head(self, n: int = 5) -> DataFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").head(2).sort("letters") + >>> df.group_by("letters").head(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -384,7 +384,7 @@ def head(self, n: int = 5) -> DataFrame: """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .head(n) .collect(no_optimization=True) ) @@ -420,7 +420,7 @@ def tail(self, n: int = 5) -> DataFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").tail(2).sort("letters") + >>> df.group_by("letters").tail(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -437,7 +437,7 @@ def tail(self, n: int = 5) -> DataFrame: """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .tail(n) .collect(no_optimization=True) ) @@ -449,7 +449,7 @@ def all(self) -> DataFrame: Examples -------- >>> df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]}) - >>> df.groupby("a", maintain_order=True).all() + >>> df.group_by("a", maintain_order=True).all() shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ @@ -480,7 +480,7 @@ def count(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).count() + >>> df.group_by("d", maintain_order=True).count() shape: (3, 2) ┌────────┬───────┐ │ d ┆ count │ @@ -509,7 +509,7 @@ def first(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).first() + >>> df.group_by("d", maintain_order=True).first() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -538,7 +538,7 @@ def last(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).last() + >>> df.group_by("d", maintain_order=True).last() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -567,7 +567,7 @@ def max(self) -> DataFrame: ... 
"d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).max() + >>> df.group_by("d", maintain_order=True).max() shape: (3, 4) ┌────────┬─────┬──────┬──────┐ │ d ┆ a ┆ b ┆ c │ @@ -596,7 +596,7 @@ def mean(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).mean() + >>> df.group_by("d", maintain_order=True).mean() shape: (3, 4) ┌────────┬─────┬──────────┬──────────┐ │ d ┆ a ┆ b ┆ c │ @@ -624,7 +624,7 @@ def median(self) -> DataFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).median() + >>> df.group_by("d", maintain_order=True).median() shape: (2, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -652,7 +652,7 @@ def min(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).min() + >>> df.group_by("d", maintain_order=True).min() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -680,7 +680,7 @@ def n_unique(self) -> DataFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).n_unique() + >>> df.group_by("d", maintain_order=True).n_unique() shape: (2, 3) ┌────────┬─────┬─────┐ │ d ┆ a ┆ b │ @@ -716,7 +716,7 @@ def quantile( ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).quantile(1) + >>> df.group_by("d", maintain_order=True).quantile(1) shape: (3, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -745,7 +745,7 @@ def sum(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).sum() + >>> df.group_by("d", maintain_order=True).sum() shape: (3, 4) ┌────────┬─────┬──────┬─────┐ │ d ┆ a ┆ b ┆ c │ @@ -766,7 +766,7 @@ class RollingGroupBy: A rolling grouper. This has an `.agg` method which will allow you to run all polars expressions in a - groupby context. + group by context. """ def __init__( @@ -795,7 +795,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -840,12 +840,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -854,7 +854,7 @@ def agg( """ return ( self.df.lazy() - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -922,7 +922,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -949,7 +949,7 @@ def apply( """ return ( self.df.lazy() - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -967,7 +967,7 @@ class DynamicGroupBy: A dynamic grouper. 
This has an `.agg` method which allows you to run all polars expressions in a - groupby context. + group by context. """ def __init__( @@ -1005,7 +1005,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, @@ -1054,12 +1054,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -1068,7 +1068,7 @@ def agg( """ return ( self.df.lazy() - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, @@ -1140,7 +1140,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -1167,7 +1167,7 @@ def apply( """ return ( self.df.lazy() - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 258cbc802bf3..2999118b0bb9 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -1205,7 +1205,7 @@ def agg_groups(self) -> Self: ... "value": [94, 95, 96, 97, 97, 99], ... } ... ) - >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) shape: (2, 2) ┌───────┬───────────┐ │ group ┆ value │ @@ -1851,7 +1851,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: Sort this column. When used in a projection/selection context, the whole column is sorted. - When used in a groupby context, the groups are sorted. + When used in a group by context, the groups are sorted. Parameters ---------- @@ -1904,7 +1904,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ null │ └──────┘ - When sorting in a groupby context, the groups are sorted. + When sorting in a group by context, the groups are sorted. >>> df = pl.DataFrame( ... { @@ -1912,7 +1912,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) - >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT shape: (2, 2) ┌───────┬────────────┐ │ group ┆ value │ @@ -2158,7 +2158,7 @@ def sort_by( Sort this column by the ordering of other columns. When used in a projection/selection context, the whole column is sorted. - When used in a groupby context, the groups are sorted. + When used in a group by context, the groups are sorted. Parameters ---------- @@ -2240,9 +2240,9 @@ def sort_by( │ b │ └───────┘ - When sorting in a groupby context, the groups are sorted. + When sorting in a group by context, the groups are sorted. - >>> df.groupby("group").agg( + >>> df.group_by("group").agg( ... pl.col("value1").sort_by("value2") ... 
) # doctest: +IGNORE_RESULT shape: (2, 2) @@ -2258,7 +2258,7 @@ def sort_by( Take a single row from each group where a column attains its minimal value within that group. - >>> df.groupby("group").agg( + >>> df.group_by("group").agg( ... pl.all().sort_by("value2").first() ... ) # doctest: +IGNORE_RESULT shape: (2, 3) @@ -2312,7 +2312,7 @@ def take( ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) - >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) shape: (2, 2) ┌───────┬───────┐ │ group ┆ value │ @@ -3056,7 +3056,7 @@ def over( """ Compute expressions over the given groups. - This expression is similar to performing a groupby aggregation and joining the + This expression is similar to performing a group by aggregation and joining the result back into the original dataframe. The outcome is similar to how `window functions @@ -3576,7 +3576,7 @@ def filter(self, predicate: Expr) -> Self: ... "b": [1, 2, 3], ... } ... ) - >>> df.groupby("group_col").agg( + >>> df.group_by("group_col").agg( ... [ ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), @@ -3614,7 +3614,7 @@ def where(self, predicate: Expr) -> Self: ... "b": [1, 2, 3], ... } ... ) - >>> df.groupby("group_col").agg( + >>> df.group_by("group_col").agg( ... [ ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), @@ -3791,7 +3791,7 @@ def apply( In a GroupBy context the function is applied by group: - >>> df.lazy().groupby("b", maintain_order=True).agg( + >>> df.lazy().group_by("b", maintain_order=True).agg( ... pl.col("a").apply(lambda x: x.sum()) ... ).collect() shape: (3, 2) @@ -3807,7 +3807,7 @@ def apply( It is better to implement this with an expression: - >>> df.groupby("b", maintain_order=True).agg( + >>> df.group_by("b", maintain_order=True).agg( ... pl.col("a").sum(), ... ) # doctest: +IGNORE_RESULT @@ -3897,7 +3897,7 @@ def flatten(self) -> Self: ... "values": [[1, 2], [2, 3], [4]], ... } ... ) - >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP shape: (2, 2) ┌───────┬───────────┐ │ group ┆ values │ @@ -5234,7 +5234,7 @@ def rolling_min( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -5440,7 +5440,7 @@ def rolling_max( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -5673,7 +5673,7 @@ def rolling_mean( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. 
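        A minimal sketch of that pattern, assuming a DataFrame ``df`` with a sorted
        datetime column "dt" and a numeric column "a" (illustrative only):

        >>> (
        ...     df.group_by_rolling(index_column="dt", period="2d").agg(
        ...         pl.col("a").mean().alias("mean_a"),
        ...         pl.col("a").sum().alias("sum_a"),
        ...     )
        ... )  # doctest: +SKIP

        This computes both aggregations over the same rolling window in a single
        pass, rather than issuing separate ``rolling_mean`` and ``rolling_sum`` calls.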
Examples @@ -5906,7 +5906,7 @@ def rolling_sum( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6138,7 +6138,7 @@ def rolling_std( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6370,7 +6370,7 @@ def rolling_var( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6605,7 +6605,7 @@ def rolling_median( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6766,7 +6766,7 @@ def rolling_quantile( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -8446,7 +8446,7 @@ def cumulative_eval( Number of valid values there should be in the window before the expression is evaluated. valid values = `length - null_count` parallel - Run in parallel. Don't do this in a groupby or another operation that + Run in parallel. Don't do this in a group by or another operation that already has much parallelization. Warnings diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 1577353b722e..269366c413e8 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -870,7 +870,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr: Run all expression parallel. Don't activate this blindly. Parallelism is worth it if there is enough work to do per thread. - This likely should not be use in the groupby context, because we already + This likely should not be use in the group by context, because we already parallel execution per group Examples diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index feb994947557..78862b54abd5 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -303,7 +303,7 @@ def count(column: str | Series | None = None) -> Expr | int: ╞═══════╡ │ 3 │ └───────┘ - >>> df.groupby("c", maintain_order=True).agg(pl.count()) + >>> df.group_by("c", maintain_order=True).agg(pl.count()) shape: (2, 2) ┌─────┬───────┐ │ c ┆ count │ @@ -1083,7 +1083,7 @@ def apply( │ 2 ┆ 3 ┆ 7 │ └───────┴─────┴─────┘ >>> ( - ... df.groupby("group").agg( + ... df.group_by("group").agg( ... pl.apply( ... exprs=["a", "b"], ... 
function=lambda list_of_series: list_of_series[0] diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index c40b6c72d15a..75703adbd228 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -47,7 +47,7 @@ from polars.io._utils import _is_local_file from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy +from polars.lazyframe.group_by import LazyGroupBy from polars.selectors import _expand_selectors, expand_selector from polars.slice import LazyPolarsSlice from polars.utils._async import _AsyncDataFrameResult @@ -992,7 +992,7 @@ def explain( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).explain() # doctest: +SKIP """ @@ -1071,7 +1071,7 @@ def show_graph( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).show_graph() # doctest: +SKIP @@ -1496,7 +1496,7 @@ def profile( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).profile() # doctest: +SKIP (shape: (3, 3) @@ -1510,15 +1510,15 @@ def profile( │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘, shape: (3, 3) - ┌────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ groupby_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └────────────────────────┴───────┴──────┘) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) """ if no_optimization: @@ -1639,7 +1639,7 @@ def collect( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -1741,7 +1741,7 @@ def collect_async( ... } ... ) >>> a = ( - ... lf.groupby("a", maintain_order=True) + ... lf.group_by("a", maintain_order=True) ... .agg(pl.all().sum()) ... .collect_async(queue.Queue()) ... ) @@ -2020,7 +2020,7 @@ def fetch( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) shape: (2, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2438,14 +2438,14 @@ def select_seq( ) return self._from_pyldf(self._ldf.select_seq(pyexprs)) - def groupby( + def group_by( self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = False, ) -> LazyGroupBy: """ - Start a groupby operation. + Start a group by operation. Parameters ---------- @@ -2456,7 +2456,7 @@ def groupby( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. 
Settings this to ``True`` blocks the possibility to run on the streaming engine. @@ -2472,7 +2472,7 @@ def groupby( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2487,7 +2487,7 @@ def groupby( Set ``maintain_order=True`` to ensure the order of the groups is consistent with the input. - >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() shape: (3, 2) ┌─────┬───────────┐ │ a ┆ c │ @@ -2501,7 +2501,7 @@ def groupby( Group by multiple columns by passing a list of column names. - >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2517,7 +2517,7 @@ def groupby( Or use positional arguments to group by multiple columns in the same way. Expressions are also accepted. - >>> lf.groupby("a", pl.col("b") // 2).agg( + >>> lf.group_by("a", pl.col("b") // 2).agg( ... pl.col("c").mean() ... ).collect() # doctest: +SKIP shape: (3, 3) @@ -2536,7 +2536,7 @@ def groupby( lgb = self._ldf.group_by(exprs, maintain_order) return LazyGroupBy(lgb) - def groupby_rolling( + def group_by_rolling( self, index_column: IntoExpr, *, @@ -2549,9 +2549,9 @@ def groupby_rolling( """ Create rolling groups based on a time, Int32, or Int64 column. - Different from a ``dynamic_groupby`` the windows are now determined by the + Different from a ``dynamic_group_by`` the windows are now determined by the individual values and are not of constant intervals. For constant intervals - use *groupby_dynamic*. + use :func:`LazyFrame.group_by_dynamic`. If you have a time series ````, then by default the windows created will be @@ -2588,7 +2588,7 @@ def groupby_rolling( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_rolling on an integer column, the windows are defined by: + In case of a group_by_rolling on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -2601,7 +2601,7 @@ def groupby_rolling( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a rolling groupby on indices, dtype needs to be one of + In case of a rolling group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. period @@ -2628,7 +2628,7 @@ def groupby_rolling( See Also -------- - groupby_dynamic + group_by_dynamic Examples -------- @@ -2644,7 +2644,7 @@ def groupby_rolling( ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() ... ) >>> out = ( - ... df.groupby_rolling(index_column="dt", period="2d") + ... df.group_by_rolling(index_column="dt", period="2d") ... .agg( ... [ ... pl.sum("a").alias("sum_a"), @@ -2686,7 +2686,7 @@ def groupby_rolling( ) return LazyGroupBy(lgb) - def groupby_dynamic( + def group_by_dynamic( self, index_column: IntoExpr, *, @@ -2704,7 +2704,7 @@ def groupby_dynamic( Group based on a time value (or index value of type Int32, Int64). Time windows are calculated and rows are assigned to windows. Different from a - normal groupby is that a row can be member of multiple groups. 
The time/index + normal group by is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame. @@ -2741,7 +2741,7 @@ def groupby_dynamic( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_dynamic on an integer column, the windows are defined by: + In case of a group_by_dynamic on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -2758,7 +2758,7 @@ def groupby_dynamic( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a dynamic groupby on indices, dtype needs to be one of + In case of a dynamic group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. every @@ -2806,7 +2806,7 @@ def groupby_dynamic( See Also -------- - groupby_rolling + group_by_rolling Notes ----- @@ -2815,7 +2815,7 @@ def groupby_dynamic( .. code-block:: python # polars - df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) is equivalent to @@ -2861,7 +2861,7 @@ def groupby_dynamic( Group by windows of 1 hour starting at 2021-12-16 00:00:00. - >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( ... [ ... pl.col("time").min().alias("time_min"), ... pl.col("time").max().alias("time_max"), @@ -2881,7 +2881,7 @@ def groupby_dynamic( The window boundaries can also be added to the aggregation result - >>> lf.groupby_dynamic( + >>> lf.group_by_dynamic( ... "time", every="1h", include_boundaries=True, closed="right" ... ).agg([pl.col("time").count().alias("time_count")]).collect() shape: (4, 4) @@ -2899,7 +2899,7 @@ def groupby_dynamic( When closed="left", should not include right end of interval [lower_bound, upper_bound) - >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( ... [ ... pl.col("time").count().alias("time_count"), ... pl.col("time").alias("time_agg_list"), @@ -2919,7 +2919,7 @@ def groupby_dynamic( When closed="both" the time values at the window boundaries belong to 2 groups. - >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( ... pl.col("time").count().alias("time_count") ... ).collect() shape: (5, 2) @@ -2935,7 +2935,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ 1 │ └─────────────────────┴────────────┘ - Dynamic groupbys can also be combined with grouping on normal keys + Dynamic group bys can also be combined with grouping on normal keys >>> lf = pl.LazyFrame( ... { @@ -2964,7 +2964,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ a │ └─────────────────────┴────────┘ >>> ( - ... lf.groupby_dynamic( + ... lf.group_by_dynamic( ... "time", ... every="1h", ... closed="both", @@ -2987,7 +2987,7 @@ def groupby_dynamic( │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ - Dynamic groupby on an index column + Dynamic group by on an index column >>> lf = pl.LazyFrame( ... { @@ -2995,7 +2995,7 @@ def groupby_dynamic( ... 
"A": ["A", "A", "B", "B", "B", "C"], ... } ... ) - >>> lf.groupby_dynamic( + >>> lf.group_by_dynamic( ... "idx", ... every="2i", ... period="3i", @@ -5333,3 +5333,178 @@ def update( result = result.drop(row_count_name) return self._from_pyldf(result._ldf) + + @deprecate_renamed_function("group_by", version="0.19.0") + def groupby( + self, + by: IntoExpr | Iterable[IntoExpr], + *more_by: IntoExpr, + maintain_order: bool = False, + ) -> LazyGroupBy: + """ + Start a group by operation. + + Alias for :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + return self.group_by(by, *more_by, maintain_order=maintain_order) + + @deprecate_renamed_function("group_by_rolling", version="0.19.0") + def groupby_rolling( + self, + index_column: IntoExpr, + *, + period: str | timedelta, + offset: str | timedelta | None = None, + closed: ClosedInterval = "right", + by: IntoExpr | Iterable[IntoExpr] | None = None, + check_sorted: bool = True, + ) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + Alias for :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + return self.group_by_rolling( + index_column, + period=period, + offset=offset, + closed=closed, + by=by, + check_sorted=check_sorted, + ) + + @deprecate_renamed_function("group_by_dynamic", version="0.19.0") + def groupby_dynamic( + self, + index_column: IntoExpr, + *, + every: str | timedelta, + period: str | timedelta | None = None, + offset: str | timedelta | None = None, + truncate: bool = True, + include_boundaries: bool = False, + closed: ClosedInterval = "left", + by: IntoExpr | Iterable[IntoExpr] | None = None, + start_by: StartBy = "window", + check_sorted: bool = True, + ) -> LazyGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`LazyFrame.group_by_rolling`. 
+ + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to 'every' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it's harder to + parallelize + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'} + The strategy to determine the start of the first window by. + + * 'window': Truncate the start of the window with the 'every' argument. + Note that weekly windows start on Monday. + * 'datapoint': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``'w'``): + + * 'monday': Start the window on the Monday before the first data point. + * 'tuesday': Start the window on the Tuesday before the first data point. + * ... + * 'sunday': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ # noqa: W505 + return self.group_by_dynamic( + index_column, + every=every, + period=period, + offset=offset, + truncate=truncate, + include_boundaries=include_boundaries, + closed=closed, + by=by, + start_by=start_by, + check_sorted=check_sorted, + ) diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/group_by.py similarity index 94% rename from py-polars/polars/lazyframe/groupby.py rename to py-polars/polars/lazyframe/group_by.py index 85eb9e10eb7a..06d700d86315 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/group_by.py @@ -14,9 +14,9 @@ class LazyGroupBy: """ - Utility class for performing a groupby operation over a lazy dataframe. + Utility class for performing a group by operation over a lazy dataframe. - Generated by calling ``df.lazy().groupby(...)``. + Generated by calling ``df.lazy().group_by(...)``. """ def __init__(self, lgb: PyLazyGroupBy) -> None: @@ -28,12 +28,12 @@ def agg( **named_aggs: IntoExpr, ) -> LazyFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. 
Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -51,7 +51,7 @@ def agg( ... "c": [5, 4, 3, 2, 1], ... } ... ).lazy() - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... [pl.col("b"), pl.col("c")] ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 3) @@ -69,7 +69,9 @@ def agg( Compute the sum of a column for each group. - >>> ldf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + >>> ldf.group_by("a").agg( + ... pl.col("b").sum() + ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -83,7 +85,7 @@ def agg( Compute multiple aggregates at once by passing a list of expressions. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... [pl.sum("b"), pl.mean("c")] ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 3) @@ -99,7 +101,7 @@ def agg( Or use positional arguments to compute multiple aggregations in the same way. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... pl.sum("b").suffix("_sum"), ... (pl.col("c") ** 2).mean().suffix("_mean_squared"), ... ).collect() # doctest: +IGNORE_RESULT @@ -116,7 +118,7 @@ def agg( Use keyword arguments to easily name your expression inputs. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... b_sum=pl.sum("b"), ... c_mean_squared=(pl.col("c") ** 2).mean(), ... ).collect() # doctest: +IGNORE_RESULT @@ -202,7 +204,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -260,7 +262,7 @@ def head(self, n: int = 5) -> LazyFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").head(2).sort("letters") + >>> df.group_by("letters").head(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -308,7 +310,7 @@ def tail(self, n: int = 5) -> LazyFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").tail(2).sort("letters") + >>> df.group_by("letters").tail(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -337,7 +339,7 @@ def all(self) -> LazyFrame: ... "b": [1, 2, 3, 4], ... } ... ).lazy() - >>> ldf.groupby("a", maintain_order=True).all().collect() + >>> ldf.group_by("a", maintain_order=True).all().collect() shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ @@ -368,7 +370,7 @@ def count(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).count().collect() + >>> ldf.group_by("d", maintain_order=True).count().collect() shape: (3, 2) ┌────────┬───────┐ │ d ┆ count │ @@ -397,7 +399,7 @@ def first(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).first().collect() + >>> ldf.group_by("d", maintain_order=True).first().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -426,7 +428,7 @@ def last(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).last().collect() + >>> ldf.group_by("d", maintain_order=True).last().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -455,7 +457,7 @@ def max(self) -> LazyFrame: ... 
"d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).max().collect() + >>> ldf.group_by("d", maintain_order=True).max().collect() shape: (3, 4) ┌────────┬─────┬──────┬──────┐ │ d ┆ a ┆ b ┆ c │ @@ -484,7 +486,7 @@ def mean(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).mean().collect() + >>> ldf.group_by("d", maintain_order=True).mean().collect() shape: (3, 4) ┌────────┬─────┬──────────┬──────────┐ │ d ┆ a ┆ b ┆ c │ @@ -512,7 +514,7 @@ def median(self) -> LazyFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).median().collect() + >>> ldf.group_by("d", maintain_order=True).median().collect() shape: (2, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -540,7 +542,7 @@ def min(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).min().collect() + >>> ldf.group_by("d", maintain_order=True).min().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -568,7 +570,7 @@ def n_unique(self) -> LazyFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).n_unique().collect() + >>> ldf.group_by("d", maintain_order=True).n_unique().collect() shape: (2, 3) ┌────────┬─────┬─────┐ │ d ┆ a ┆ b │ @@ -604,7 +606,7 @@ def quantile( ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).quantile(1).collect() + >>> ldf.group_by("d", maintain_order=True).quantile(1).collect() shape: (3, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -633,7 +635,7 @@ def sum(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... 
).lazy() - >>> ldf.groupby("d", maintain_order=True).sum().collect() + >>> ldf.group_by("d", maintain_order=True).sum().collect() shape: (3, 4) ┌────────┬─────┬──────┬─────┐ │ d ┆ a ┆ b ┆ c │ diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 477f4196e240..c792735f9b64 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -420,7 +420,7 @@ def by_dtype( Group by string columns and sum the numeric columns: - >>> df.groupby(cs.string()).agg(cs.numeric().sum()).sort(by="other") + >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by="other") shape: (2, 2) ┌───────┬──────────┐ │ other ┆ value │ @@ -1519,7 +1519,7 @@ def string(include_categorical: bool = False) -> SelectorType: Group by all string columns, sum the numeric columns, then sort by the string cols: - >>> df.groupby(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string()) + >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string()) shape: (2, 3) ┌─────┬─────┬─────┐ │ w ┆ x ┆ y │ @@ -1532,7 +1532,7 @@ def string(include_categorical: bool = False) -> SelectorType: Group by all string *and* categorical columns: - >>> df.groupby(cs.string(True)).agg(cs.numeric().sum()).sort(by=cs.string(True)) + >>> df.group_by(cs.string(True)).agg(cs.numeric().sum()).sort(by=cs.string(True)) shape: (3, 4) ┌─────┬─────┬─────┬──────┐ │ w ┆ z ┆ x ┆ y │ diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 7afd49f03a81..76170741c761 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -547,7 +547,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Series: Run all expression parallel. Don't activate this blindly. Parallelism is worth it if there is enough work to do per thread. - This likely should not be use in the groupby context, because we already + This likely should not be use in the group by context, because we already parallel execution per group Examples diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 9c7d9454704e..e3b1025f9f95 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2327,7 +2327,7 @@ def cumulative_eval( Number of valid values there should be in the window before the expression is evaluated. valid values = `length - null_count` parallel - Run in parallel. Don't do this in a groupby or another operation that + Run in parallel. Don't do this in a group by or another operation that already has much parallelization. 
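
For context, a small usage sketch of ``cumulative_eval`` on toy data (``parallel`` is not enabled, per the note above about group by contexts):

    import polars as pl

    s = pl.Series("values", [1, 2, 3, 4, 5])

    # Evaluate the expression over the expanding window [0, i] for each row i:
    # first() - last() ** 2 gives 0, -3, -8, -15, -24 here.
    out = s.cumulative_eval(pl.element().first() - pl.element().last() ** 2)
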
Warnings diff --git a/py-polars/tests/benchmark/run_h2oai_benchmark.py b/py-polars/tests/benchmark/run_h2oai_benchmark.py index 133389cfa428..961f8a3a5f9f 100644 --- a/py-polars/tests/benchmark/run_h2oai_benchmark.py +++ b/py-polars/tests/benchmark/run_h2oai_benchmark.py @@ -39,7 +39,7 @@ t00 = time.time() t0 = time.time() print("q1") -out = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by("id1").agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) @@ -47,7 +47,7 @@ t0easy = time.time() t0 = time.time() print("q2") -out = x.groupby(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) @@ -55,7 +55,7 @@ t0 = time.time() print("q3") out = ( - x.groupby("id3") + x.group_by("id3") .agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]) .collect() ) @@ -67,7 +67,7 @@ t0 = time.time() print("q4") out = ( - x.groupby("id4") + x.group_by("id4") .agg( [ pl.mean("v1").alias("v1_mean"), @@ -86,7 +86,7 @@ t0 = time.time() print("q5") out = ( - x.groupby("id6") + x.group_by("id6") .agg( [ pl.sum("v1").alias("v1_sum"), @@ -106,7 +106,7 @@ t0 = time.time() print("q6") out = ( - x.groupby(["id4", "id5"]) + x.group_by(["id4", "id5"]) .agg([pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]) .collect() ) @@ -118,7 +118,9 @@ t0 = time.time() print("q7") out = ( - x.groupby("id3").agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]).collect() + x.group_by("id3") + .agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]) + .collect() ) print(time.time() - t0) print("out.shape", out.shape) @@ -128,7 +130,7 @@ print("q8") out = ( x.drop_nulls("v3") - .groupby("id6") + .group_by("id6") .agg(pl.col("v3").top_k(2).alias("largest2_v3")) .explode("largest2_v3") .collect() @@ -139,7 +141,7 @@ t0 = time.time() print("q9") -out = x.groupby(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() +out = x.group_by(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["r2"].sum()', out["r2"].sum()) @@ -147,7 +149,7 @@ t0 = time.time() print("q10") out = ( - x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) + x.group_by(["id1", "id2", "id3", "id4", "id5", "id6"]) .agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]) .collect() ) @@ -160,7 +162,7 @@ t00 = time.time() t0 = time.time() print("q1") -out = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by("id1").agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) assert out.shape == (96, 2) assert out["v1_sum"].sum() == 28501451 @@ -168,7 +170,7 @@ t0easy = time.time() t0 = time.time() print("q2") -out = x.groupby(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) assert out.shape == (9216, 3) assert out["v1_sum"].sum() == 28501451 @@ -176,7 +178,7 @@ t0 = time.time() print("q3") out = ( - x.groupby("id3") + x.group_by("id3") .agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]) .collect() ) @@ -188,7 +190,7 @@ t0 = time.time() print("q4") out = ( - x.groupby("id4") + x.group_by("id4") .agg( [ pl.mean("v1").alias("v1_mean"), @@ -207,7 +209,7 @@ t0 = time.time() 
print("q5") out = ( - x.groupby("id6") + x.group_by("id6") .agg( [ pl.sum("v1").alias("v1_sum"), @@ -227,7 +229,7 @@ t0 = time.time() print("q6") out = ( - x.groupby(["id4", "id5"]) + x.group_by(["id4", "id5"]) .agg([pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]) .collect() ) @@ -239,7 +241,7 @@ t0 = time.time() print("q7") out = ( - x.groupby("id3") + x.group_by("id3") .agg( [ (pl.max("v1").alias("v1_max") - pl.min("v2").alias("v2_mean")).alias( @@ -258,7 +260,7 @@ out = ( x.drop_nulls("v3") .sort("v3", descending=True) - .groupby("id6") + .group_by("id6") .agg(pl.col("v3").head(2).alias("largest2_v3")) .explode("largest2_v3") .collect() @@ -269,7 +271,7 @@ t0 = time.time() print("q9") -out = x.groupby(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() +out = x.group_by(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() print(time.time() - t0) assert out.shape == (9216, 3) assert np.isclose(out["r2"].sum(), 9.902706276948825) @@ -277,7 +279,7 @@ t0 = time.time() print("q10") out = ( - x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) + x.group_by(["id1", "id2", "id3", "id4", "id5", "id6"]) .agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]) .collect() ) diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py index 7b8adbad0c22..607f19ed0184 100644 --- a/py-polars/tests/benchmark/test_release.py +++ b/py-polars/tests/benchmark/test_release.py @@ -170,7 +170,7 @@ def test_boolean_min_max_agg() -> None: df = pl.DataFrame({"idx": idx, "c": c}) aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] - assert df.groupby("idx").agg(aggs).sum().to_dict(False) == { + assert df.group_by("idx").agg(aggs).sum().to_dict(False) == { "idx": [107583], "c_min": [120], "c_max": [321], @@ -179,14 +179,14 @@ def test_boolean_min_max_agg() -> None: nulls = np.random.randint(0, 500, 1000) < 100 assert df.with_columns( c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c")) - ).groupby("idx").agg(aggs).sum().to_dict(False) == { + ).group_by("idx").agg(aggs).sum().to_dict(False) == { "idx": [107583], "c_min": [133], "c_max": [276], } -def test_categorical_vs_str_groupby() -> None: +def test_categorical_vs_str_group_by() -> None: # this triggers the perfect hash table s = pl.Series("a", np.random.randint(0, 50, 100)) s_with_nulls = pl.select( @@ -198,11 +198,11 @@ def test_categorical_vs_str_groupby() -> None: cat_out = ( s_.cast(pl.Categorical) .to_frame("a") - .groupby("a") + .group_by("a") .agg(pl.first().alias("first")) ) - str_out = s_.to_frame("a").groupby("a").agg(pl.first().alias("first")) + str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) cat_out.with_columns(pl.col("a").cast(str)) assert_frame_equal( cat_out.with_columns( diff --git a/py-polars/tests/parametric/test_groupby_rolling.py b/py-polars/tests/parametric/test_groupby_rolling.py index cb55825f951b..c4c62b36a250 100644 --- a/py-polars/tests/parametric/test_groupby_rolling.py +++ b/py-polars/tests/parametric/test_groupby_rolling.py @@ -25,7 +25,7 @@ data=st.data(), time_unit=strategy_time_unit, ) -def test_groupby_rolling( +def test_group_by_rolling( period: str, offset: str, closed: ClosedInterval, @@ -43,7 +43,7 @@ def test_groupby_rolling( ) df = dataframe.sort("ts").unique("ts") try: - result = df.groupby_rolling( + result = df.group_by_rolling( "ts", period=period, offset=offset, closed=closed ).agg(pl.col("value")) except pl.exceptions.PolarsPanicError as exc: diff --git 
a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index f3e212444836..e3abcab1c271 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -744,9 +744,9 @@ def test_shift() -> None: assert_frame_equal(a, b) -def test_custom_groupby() -> None: +def test_custom_group_by() -> None: df = pl.DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]}) - out = df.groupby("b", maintain_order=True).agg( + out = df.group_by("b", maintain_order=True).agg( [pl.col("a").apply(lambda x: x.sum(), return_dtype=pl.Int64)] ) assert out.rows() == [("a", 1), ("b", 2), ("c", 2)] @@ -981,7 +981,7 @@ def test_init_series_edge_cases() -> None: assert df3.columns == ["column_0", "column_1"] -def test_head_groupby() -> None: +def test_head_group_by() -> None: commodity_prices = { "commodity": [ "Wheat", @@ -1024,7 +1024,7 @@ def test_head_groupby() -> None: keys = ["commodity", "location"] out = ( df.sort(by="price", descending=True) - .groupby(keys, maintain_order=True) + .group_by(keys, maintain_order=True) .agg([pl.col("*").exclude(keys).head(2).keep_name()]) .explode(pl.col("*").exclude(keys)) ) @@ -1041,12 +1041,12 @@ def test_head_groupby() -> None: df = pl.DataFrame( {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]} ) - out = df.groupby("letters").tail(2).sort("letters") + out = df.group_by("letters").tail(2).sort("letters") assert_frame_equal( out, pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}), ) - out = df.groupby("letters").head(2).sort("letters") + out = df.group_by("letters").head(2).sort("letters") assert_frame_equal( out, pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}), @@ -1854,7 +1854,7 @@ def __repr__(self) -> str: def test_hashing_on_python_objects() -> None: - # see if we can do a groupby, drop_duplicates on a DataFrame with objects. + # see if we can do a group_by, drop_duplicates on a DataFrame with objects. 
# this requires that the hashing and aggregations are done on python objects df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]}) @@ -1867,7 +1867,7 @@ def __eq__(self, other: Any) -> bool: return True df = df.with_columns(pl.col("a").apply(lambda x: Foo()).alias("foo")) - assert df.groupby(["foo"]).first().shape == (1, 3) + assert df.group_by(["foo"]).first().shape == (1, 3) assert df.unique().shape == (3, 3) @@ -1943,7 +1943,7 @@ def test_apply_dataframe_return() -> None: assert_frame_equal(out, expected) -def test_groupby_cat_list() -> None: +def test_group_by_cat_list() -> None: grouped = ( pl.DataFrame( [ @@ -1952,7 +1952,7 @@ def test_groupby_cat_list() -> None: ] ) .with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column")) - .groupby("int_column", maintain_order=True) + .group_by("int_column", maintain_order=True) .agg([pl.col("cat_column")])["cat_column"] ) @@ -1961,12 +1961,12 @@ def test_groupby_cat_list() -> None: assert out[0] == "a" -def test_groupby_agg_n_unique_floats() -> None: +def test_group_by_agg_n_unique_floats() -> None: # tests proper dispatch df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) for dtype in [pl.Float32, pl.Float64]: - out = df.groupby("a", maintain_order=True).agg( + out = df.group_by("a", maintain_order=True).agg( [pl.col("b").cast(dtype).n_unique()] ) assert out["b"].to_list() == [2, 1] @@ -2033,7 +2033,7 @@ def __repr__(self) -> str: df = pl.DataFrame({"groups": [1, 1, 2], "a": foos}) assert sys.getrefcount(foos[0]) == base_count + 1 - out = df.groupby("groups", maintain_order=True).agg(pl.col("a").alias("a")) + out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a")) assert sys.getrefcount(foos[0]) == base_count + 2 s = out["a"].list.explode() assert sys.getrefcount(foos[0]) == base_count + 3 @@ -2048,25 +2048,25 @@ def __repr__(self) -> str: assert sys.getrefcount(foos[0]) == base_count -def test_groupby_order_dispatch() -> None: +def test_group_by_order_dispatch() -> None: df = pl.DataFrame({"x": list("bab"), "y": range(3)}) - result = df.groupby("x", maintain_order=True).count() + result = df.group_by("x", maintain_order=True).count() expected = pl.DataFrame( {"x": ["b", "a"], "count": [2, 1]}, schema_overrides={"count": pl.UInt32} ) assert_frame_equal(result, expected) - result = df.groupby("x", maintain_order=True).all() + result = df.group_by("x", maintain_order=True).all() expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]}) assert_frame_equal(result, expected) -def test_partitioned_groupby_order() -> None: +def test_partitioned_group_by_order() -> None: # check if group ordering is maintained. 
# we only have 30 groups, so this triggers a partitioned group by df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)}) - out = df.groupby("x", maintain_order=True).agg(pl.all().implode()) + out = df.group_by("x", maintain_order=True).agg(pl.all().implode()) assert_series_equal(out["x"], df["x"]) @@ -2721,11 +2721,11 @@ def test_empty_is_in() -> None: assert df_empty_isin.schema == {"foo": pl.Utf8} -def test_groupby_slice_expression_args() -> None: +def test_group_by_slice_expression_args() -> None: df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)}) out = ( - df.groupby("groups", maintain_order=True) + df.group_by("groups", maintain_order=True) .agg([pl.col("vals").slice(pl.count() * 0.1, (pl.count() // 5))]) .explode("vals") ) @@ -2751,7 +2751,7 @@ def test_join_suffixes() -> None: def test_explode_empty() -> None: df = ( pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]}) - .groupby("x", maintain_order=True) + .group_by("x", maintain_order=True) .agg(pl.col("y").take([])) ) assert df.explode("y").to_dict(False) == {"x": ["a", "b"], "y": [None, None]} diff --git a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index d5888b5f9070..2eb2fedde73c 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -51,7 +51,7 @@ def test_array_construction() -> None: assert df.rows() == [] -def test_array_in_groupby() -> None: +def test_array_in_group_by() -> None: df = pl.DataFrame( [ pl.Series("id", [1, 2]), @@ -59,7 +59,7 @@ def test_array_in_groupby() -> None: ] ) - assert next(iter(df.groupby("id", maintain_order=True)))[1]["list"].to_list() == [ + assert next(iter(df.group_by("id", maintain_order=True)))[1]["list"].to_list() == [ [1, 2] ] @@ -68,8 +68,8 @@ def test_array_in_groupby() -> None: schema={"a": pl.Array(inner=pl.Int64, width=2), "g": pl.Int64}, ) - out0 = df.groupby("g").agg(pl.col("a")).sort("g") - out1 = df.set_sorted("g").groupby("g").agg(pl.col("a")) + out0 = df.group_by("g").agg(pl.col("a")).sort("g") + out1 = df.set_sorted("g").group_by("g").agg(pl.col("a")) for out in [out0, out1]: assert out.schema == { diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 9db461143d58..2275b5211a2c 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -87,8 +87,8 @@ def test_categorical_describe_3487() -> None: @StringCache() def test_categorical_is_in_list() -> None: # this requires type coercion to cast. 
- # we should not cast within the function as this would be expensive within a groupby - # context that would be a cast per group + # we should not cast within the function as this would be expensive within a + # group by context that would be a cast per group df = pl.DataFrame( {"a": [1, 2, 3, 1, 2], "b": ["a", "b", "c", "d", "e"]} ).with_columns(pl.col("b").cast(pl.Categorical)) @@ -115,7 +115,7 @@ def test_unset_sorted_on_append() -> None: ] ).sort("key") df = pl.concat([df1, df2], rechunk=False) - assert df.groupby("key").count()["count"].to_list() == [4, 4] + assert df.group_by("key").count()["count"].to_list() == [4, 4] def test_categorical_error_on_local_cmp() -> None: @@ -307,11 +307,11 @@ def test_nested_categorical_aggregation_7848() -> None: "group": [1, 1, 2, 2, 2, 3, 3], "letter": ["a", "b", "c", "d", "e", "f", "g"], } - ).with_columns([pl.col("letter").cast(pl.Categorical)]).groupby( + ).with_columns([pl.col("letter").cast(pl.Categorical)]).group_by( maintain_order=True, by=["group"] ).all().with_columns( [pl.col("letter").list.lengths().alias("c_group")] - ).groupby( + ).group_by( by=["c_group"], maintain_order=True ).agg( pl.col("letter") diff --git a/py-polars/tests/unit/datatypes/test_decimal.py b/py-polars/tests/unit/datatypes/test_decimal.py index e6f8029dd0b8..aa89b8eb9dad 100644 --- a/py-polars/tests/unit/datatypes/test_decimal.py +++ b/py-polars/tests/unit/datatypes/test_decimal.py @@ -180,7 +180,7 @@ def test_decimal_aggregations() -> None: } ) - assert df.groupby("g", maintain_order=True).agg( + assert df.group_by("g", maintain_order=True).agg( sum=pl.sum("a"), min=pl.min("a"), max=pl.max("a"), diff --git a/py-polars/tests/unit/datatypes/test_float.py b/py-polars/tests/unit/datatypes/test_float.py index 6ed39b35e6c7..16dd1df2022c 100644 --- a/py-polars/tests/unit/datatypes/test_float.py +++ b/py-polars/tests/unit/datatypes/test_float.py @@ -1,7 +1,7 @@ import polars as pl -def test_nan_in_groupby_agg() -> None: +def test_nan_in_group_by_agg() -> None: df = pl.DataFrame( { "key": ["a", "a", "a", "a"], @@ -10,8 +10,8 @@ def test_nan_in_groupby_agg() -> None: } ) - assert df.groupby("bar", "key").agg(pl.col("value").max())["value"].item() == 18.78 - assert df.groupby("bar", "key").agg(pl.col("value").min())["value"].item() == 18.58 + assert df.group_by("bar", "key").agg(pl.col("value").max())["value"].item() == 18.78 + assert df.group_by("bar", "key").agg(pl.col("value").min())["value"].item() == 18.58 def test_nan_aggregations() -> None: @@ -29,6 +29,6 @@ def test_nan_aggregations() -> None: == "{'max': [3.0], 'min': [1.0], 'nan_max': [nan], 'nan_min': [nan]}" ) assert ( - str(df.groupby("b").agg(aggs).to_dict(False)) + str(df.group_by("b").agg(aggs).to_dict(False)) == "{'b': [1], 'max': [3.0], 'min': [1.0], 'nan_max': [nan], 'nan_min': [nan]}" ) diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 53901bab76a1..c6b0cbc68fb1 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -61,7 +61,7 @@ def test_categorical() -> None: ] ) out = ( - df.groupby(["a", "b"]) + df.group_by(["a", "b"]) .agg( [ pl.col("c").count().alias("num_different_c"), @@ -90,11 +90,11 @@ def test_cast_inner() -> None: ) -def test_list_empty_groupby_result_3521() -> None: +def test_list_empty_group_by_result_3521() -> None: # Create a left relation where the join column contains a null value left = pl.DataFrame().with_columns( [ - pl.lit(1).alias("groupby_column"), + 
pl.lit(1).alias("group_by_column"), pl.lit(None).cast(pl.Int32).alias("join_column"), ] ) @@ -111,9 +111,9 @@ def test_list_empty_groupby_result_3521() -> None: # This will panic on polars version 0.13.38 and 0.13.39 assert ( left.join(right, on="join_column", how="left") - .groupby("groupby_column") + .group_by("group_by_column") .agg(pl.col("n_unique_column").drop_nulls()) - ).to_dict(False) == {"groupby_column": [1], "n_unique_column": [[]]} + ).to_dict(False) == {"group_by_column": [1], "n_unique_column": [[]]} def test_list_fill_null() -> None: @@ -177,21 +177,21 @@ def test_inner_type_categorical_on_rechunk() -> None: assert pl.concat([df, df], rechunk=True).dtypes == [pl.List(pl.Categorical)] -def test_groupby_list_column() -> None: +def test_group_by_list_column() -> None: df = ( pl.DataFrame({"a": ["a", "b", "a"]}) .with_columns(pl.col("a").cast(pl.Categorical)) - .groupby("a", maintain_order=True) + .group_by("a", maintain_order=True) .agg(pl.col("a").alias("a_list")) ) - assert df.groupby("a_list", maintain_order=True).first().to_dict(False) == { + assert df.group_by("a_list", maintain_order=True).first().to_dict(False) == { "a_list": [["a", "a"], ["b"]], "a": ["a", "b"], } -def test_groupby_multiple_keys_contains_list_column() -> None: +def test_group_by_multiple_keys_contains_list_column() -> None: df = ( pl.DataFrame( { @@ -200,7 +200,7 @@ def test_groupby_multiple_keys_contains_list_column() -> None: "c": [3, 2, 1, 0], } ) - .groupby(["a", "b"], maintain_order=True) + .group_by(["a", "b"], maintain_order=True) .agg(pl.all()) ) assert df.to_dict(False) == { @@ -263,7 +263,7 @@ def test_fast_explode_on_list_struct_6208() -> None: def test_flat_aggregation_to_list_conversion_6918() -> None: df = pl.DataFrame({"a": [1, 2, 2], "b": [[0, 1], [2, 3], [4, 5]]}) - assert df.groupby("a", maintain_order=True).agg( + assert df.group_by("a", maintain_order=True).agg( pl.concat_list([pl.col("b").list.get(i).mean().implode() for i in range(2)]) ).to_dict(False) == {"a": [1, 2], "b": [[[0.0, 1.0]], [[3.0, 4.0]]]} @@ -398,7 +398,7 @@ def test_logical_type_struct_agg_list() -> None: {"cats": ["Value1", "Value2", "Value1"]}, schema_overrides={"cats": pl.Categorical}, ) - out = df.groupby(1).agg(pl.struct("cats")) + out = df.group_by(1).agg(pl.struct("cats")) assert out.dtypes == [ pl.Int32, pl.List(pl.Struct([pl.Field("cats", pl.Categorical)])), @@ -418,7 +418,7 @@ def test_logical_parallel_list_collect() -> None: }, schema_overrides={"Values": pl.Categorical}, ) - .groupby("Group") + .group_by("Group") .agg(pl.col("Values").value_counts(sort=True)) .explode("Values") .unnest("Values") @@ -498,7 +498,7 @@ def test_list_amortized_iter_clear_settings_10126() -> None: out = ( pl.DataFrame({"a": [[1], [1], [2]], "b": [[1, 2], [1, 3], [4]]}) .explode("a") - .groupby("a") + .group_by("a") .agg(pl.col("b").flatten()) .with_columns(pl.col("b").list.unique()) .sort("a") diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 50a4f21315aa..cf694f984387 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -188,7 +188,7 @@ def test_value_counts_expr() -> None: df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]}) - assert df.groupby("session").agg( + assert df.group_by("session").agg( [pl.col("id").value_counts(sort=True).first()] ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]} @@ -375,7 +375,7 @@ def test_struct_agg_all() -> None: } ) - assert df.groupby("group", 
maintain_order=True).all().to_dict(False) == { + assert df.group_by("group", maintain_order=True).all().to_dict(False) == { "group": ["a", "b"], "col1": [ [{"x": 1, "y": 100}, {"x": 2, "y": 200}], @@ -607,9 +607,9 @@ def test_nested_struct_sliced_append() -> None: ] -def test_struct_groupby_field_agg_4216() -> None: +def test_struct_group_by_field_agg_4216() -> None: df = pl.DataFrame([{"a": {"b": 1}, "c": 0}]) - assert df.groupby("c").agg(pl.col("a").struct.field("b").count()).to_dict( + assert df.group_by("c").agg(pl.col("a").struct.field("b").count()).to_dict( False ) == {"c": [0], "b": [1]} @@ -816,7 +816,7 @@ def test_struct_name_passed_in_agg_apply() -> None: ] ).alias("index") - assert pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 2]}).groupby( + assert pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 2]}).group_by( "C" ).agg(struct_expr).sort("C", descending=True).to_dict(False) == { "C": [2, 1], @@ -828,7 +828,7 @@ def test_struct_name_passed_in_agg_apply() -> None: df = pl.DataFrame({"val": [-3, -2, -1, 0, 1, 2, 3], "k": [0] * 7}) - assert df.groupby("k").agg( + assert df.group_by("k").agg( pl.struct( [ pl.col("val").value_counts(sort=True).struct.field("val").alias("val"), diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index f3a2a1862b35..1b6767dc07bc 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -529,7 +529,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: } ).set_sorted("event_date") df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic( + out = df.group_by_dynamic( index_column="event_date", every="1mo", period="2mo", @@ -562,7 +562,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: ).set_sorted("event_date") df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic( + out = df.group_by_dynamic( index_column="event_date", every="1mo", by=["admin", "five_type", "actor"], @@ -586,7 +586,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: .set_sorted("idx") ) - out = df.groupby_dynamic( + out = df.group_by_dynamic( "idx", every="2i", period="3i", include_boundaries=True ).agg(pl.col("A")) @@ -624,7 +624,7 @@ def test_explode_date() -> None: } ) out = ( - df.groupby("b", maintain_order=True) + df.group_by("b", maintain_order=True) .agg([pl.col("a"), pl.col("c").pct_change()]) .explode(["a", "c"]) ) @@ -637,7 +637,7 @@ def test_explode_date() -> None: ] -def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: +def test_group_by_dynamic_when_conversion_crosses_dates_7274() -> None: df = ( pl.DataFrame( data={ @@ -658,7 +658,7 @@ def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: .set_sorted() ) ) - result = df.groupby_dynamic( + result = df.group_by_dynamic( index_column="timestamp", every="1d", closed="left" ).agg(pl.col("value").count()) expected = pl.DataFrame({"timestamp": [datetime(1970, 1, 1)], "value": [2]}) @@ -667,7 +667,7 @@ def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: pl.col("value").cast(pl.UInt32), ) assert_frame_equal(result, expected) - result = df.groupby_dynamic( + result = df.group_by_dynamic( index_column="timestamp_utc", every="1d", closed="left" ).agg(pl.col("value").count()) expected = pl.DataFrame( @@ -701,7 +701,7 @@ def test_rolling() -> None: period: str | timedelta for period in ("2d", 
timedelta(days=2)): # type: ignore[assignment] - out = df.groupby_rolling(index_column="dt", period=period).agg( + out = df.group_by_rolling(index_column="dt", period=period).agg( [ pl.sum("a").alias("sum_a"), pl.min("a").alias("min_a"), @@ -882,7 +882,7 @@ def test_read_utc_times_parquet() -> None: @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> None: +def test_default_negative_every_offset_dynamic_group_by(time_zone: str | None) -> None: # 2791 dts = [ datetime(2020, 1, 1), @@ -892,7 +892,7 @@ def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> ] df = pl.DataFrame({"dt": dts, "idx": range(len(dts))}).set_sorted("dt") df = df.with_columns(pl.col("dt").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic(index_column="dt", every="1mo", closed="right").agg( + out = df.group_by_dynamic(index_column="dt", every="1mo", closed="right").agg( pl.col("idx") ) @@ -918,14 +918,14 @@ def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> ("1w", timedelta(weeks=2)), ], ) -def test_groupby_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: +def test_group_by_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: start_dt = datetime(2021, 11, 7) end_dt = start_dt + offset date_range = pl.date_range( start_dt, end_dt, rule, time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every=rule, start_by="datapoint").agg( + result = df.group_by_dynamic("time", every=rule, start_by="datapoint").agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -996,7 +996,7 @@ def test_groupby_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: ), ], ) -def test_groupby_dynamic_startby_monday_crossing_dst( +def test_group_by_dynamic_startby_monday_crossing_dst( start_by: StartBy, expected_time: list[datetime], expected_value: list[float] ) -> None: start_dt = datetime(2021, 11, 7) @@ -1005,7 +1005,7 @@ def test_groupby_dynamic_startby_monday_crossing_dst( start_dt, end_dt, "1d", time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1w", start_by=start_by).agg( + result = df.group_by_dynamic("time", every="1w", start_by=start_by).agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -1015,14 +1015,14 @@ def test_groupby_dynamic_startby_monday_crossing_dst( assert_frame_equal(result, expected) -def test_groupby_dynamic_startby_monday_dst_8737() -> None: +def test_group_by_dynamic_startby_monday_dst_8737() -> None: start_dt = datetime(2021, 11, 6, 20) stop_dt = datetime(2021, 11, 7, 20) date_range = pl.date_range( start_dt, stop_dt, "1d", time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1w", start_by="monday").agg( + result = df.group_by_dynamic("time", every="1w", start_by="monday").agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -1037,14 +1037,14 @@ def test_groupby_dynamic_startby_monday_dst_8737() -> None: assert_frame_equal(result, expected) -def test_groupby_dynamic_monthly_crossing_dst() -> None: +def test_group_by_dynamic_monthly_crossing_dst() -> None: start_dt = datetime(2021, 11, 1) end_dt = datetime(2021, 12, 1) date_range = pl.date_range( start_dt, end_dt, "1mo", time_zone="US/Central", eager=True ) df = 
pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1mo").agg(pl.col("value").mean()) + result = df.group_by_dynamic("time", every="1mo").agg(pl.col("value").mean()) expected = pl.DataFrame( {"time": date_range, "value": range(len(date_range))}, schema_overrides={"value": pl.Float64}, @@ -1052,10 +1052,10 @@ def test_groupby_dynamic_monthly_crossing_dst() -> None: assert_frame_equal(result, expected) -def test_groupby_dynamic_2d_9333() -> None: +def test_group_by_dynamic_2d_9333() -> None: df = pl.DataFrame({"ts": [datetime(2000, 1, 1, 3)], "values": [10.0]}) df = df.with_columns(pl.col("ts").set_sorted()) - result = df.groupby_dynamic("ts", every="2d").agg(pl.col("values")) + result = df.group_by_dynamic("ts", every="2d").agg(pl.col("values")) expected = pl.DataFrame({"ts": [datetime(1999, 12, 31, 0)], "values": [[10.0]]}) assert_frame_equal(result, expected) @@ -1190,10 +1190,10 @@ def test_add_duration_3786() -> None: } -def test_rolling_groupby_by_argument() -> None: +def test_rolling_group_by_by_argument() -> None: df = pl.DataFrame({"times": range(10), "groups": [1] * 4 + [2] * 6}) - out = df.groupby_rolling("times", period="5i", by=["groups"]).agg( + out = df.group_by_rolling("times", period="5i", by=["groups"]).agg( pl.col("times").alias("agg_list") ) @@ -1219,7 +1219,7 @@ def test_rolling_groupby_by_argument() -> None: assert_frame_equal(out, expected) -def test_groupby_rolling_mean_3020() -> None: +def test_group_by_rolling_mean_3020() -> None: df = pl.DataFrame( { "Date": [ @@ -1237,7 +1237,7 @@ def test_groupby_rolling_mean_3020() -> None: period: str | timedelta for period in ("1w", timedelta(days=7)): # type: ignore[assignment] - result = df.groupby_rolling(index_column="Date", period=period).agg( + result = df.group_by_rolling(index_column="Date", period=period).agg( pl.col("val").mean().alias("val_mean") ) expected = pl.DataFrame( @@ -1539,7 +1539,7 @@ def test_duration_aggregations() -> None: } ) df = df.with_columns((pl.col("end") - pl.col("start")).alias("duration")) - assert df.groupby("group", maintain_order=True).agg( + assert df.group_by("group", maintain_order=True).agg( [ pl.col("duration").mean().alias("mean"), pl.col("duration").sum().alias("sum"), @@ -1648,7 +1648,7 @@ def test_unique_counts_on_dates() -> None: } -def test_groupby_rolling_by_ordering() -> None: +def test_group_by_rolling_by_ordering() -> None: # we must check that the keys still match the time labels after the rolling window # with a `by` argument. 
df = pl.DataFrame( @@ -1667,7 +1667,7 @@ def test_groupby_rolling_by_ordering() -> None: } ).set_sorted("dt") - assert df.groupby_rolling( + assert df.group_by_rolling( index_column="dt", period="2m", closed="both", @@ -1694,7 +1694,7 @@ def test_groupby_rolling_by_ordering() -> None: } -def test_groupby_rolling_by_() -> None: +def test_group_by_rolling_by_() -> None: df = pl.DataFrame({"group": pl.arange(0, 3, eager=True)}).join( pl.DataFrame( { @@ -1707,13 +1707,13 @@ def test_groupby_rolling_by_() -> None: ) out = ( df.sort("datetime") - .groupby_rolling(index_column="datetime", by="group", period=timedelta(days=3)) + .group_by_rolling(index_column="datetime", by="group", period=timedelta(days=3)) .agg([pl.count().alias("count")]) ) expected = ( df.sort(["group", "datetime"]) - .groupby_rolling(index_column="datetime", by="group", period="3d") + .group_by_rolling(index_column="datetime", by="group", period="3d") .agg([pl.count().alias("count")]) ) assert_frame_equal(out.sort(["group", "datetime"]), expected) @@ -2571,7 +2571,7 @@ def test_datetime_cum_agg_schema() -> None: } -def test_rolling_groupby_empty_groups_by_take_6330() -> None: +def test_rolling_group_by_empty_groups_by_take_6330() -> None: df = ( pl.DataFrame({"Event": ["Rain", "Sun"]}) .join( @@ -2585,7 +2585,7 @@ def test_rolling_groupby_empty_groups_by_take_6330() -> None: .set_sorted("Date") ) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="Date", period="2i", offset="-2i", @@ -2777,12 +2777,12 @@ def test_pytime_conversion(tm: time) -> None: ) ], ) -def test_groupby_dynamic( +def test_group_by_dynamic( input_df: pl.DataFrame, expected_grouped_df: pl.DataFrame ) -> None: result = ( input_df.sort("dt") - .groupby_dynamic("dt", every="1q") + .group_by_dynamic("dt", every="1q") .agg(pl.col("dt").count().alias("num_points")) .sort("dt") ) diff --git a/py-polars/tests/unit/functions/test_as_datatype.py b/py-polars/tests/unit/functions/test_as_datatype.py index ebffc580d0de..9442e4a5ff13 100644 --- a/py-polars/tests/unit/functions/test_as_datatype.py +++ b/py-polars/tests/unit/functions/test_as_datatype.py @@ -156,7 +156,7 @@ def test_concat_list_in_agg_6397() -> None: df = pl.DataFrame({"group": [1, 2, 2, 3], "value": ["a", "b", "c", "d"]}) # single list - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ # this casts every element to a list pl.concat_list(pl.col("value")), @@ -167,7 +167,7 @@ def test_concat_list_in_agg_6397() -> None: } # nested list - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ pl.concat_list(pl.col("value").implode()).alias("result"), ] diff --git a/py-polars/tests/unit/functions/test_whenthen.py b/py-polars/tests/unit/functions/test_whenthen.py index b55192975f7e..b3ed26991615 100644 --- a/py-polars/tests/unit/functions/test_whenthen.py +++ b/py-polars/tests/unit/functions/test_whenthen.py @@ -187,7 +187,7 @@ def test_when_then_edge_cases_3994() -> None: # this tests if lazy correctly assigns the list schema to the column aggregation assert ( df.lazy() - .groupby(["id"]) + .group_by(["id"]) .agg(pl.col("type")) .with_columns( pl.when(pl.col("type").list.lengths() == 0) @@ -201,7 +201,7 @@ def test_when_then_edge_cases_3994() -> None: # this tests ternary with an empty argument assert ( df.filter(pl.col("id") == 42) - .groupby(["id"]) + .group_by(["id"]) .agg(pl.col("type")) .with_columns( pl.when(pl.col("type").list.lengths() == 0) diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 
1c7848ff3c6f..02e51bc93765 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -331,7 +331,7 @@ def test_streaming_categorical(tmp_path: Path) -> None: with pl.StringCache(): result = ( pl.scan_parquet(file_path) - .groupby("name") + .group_by("name") .agg(pl.col("amount").sum()) .collect() .sort("name") diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 1d225decb0f8..be34b546ac92 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -250,7 +250,7 @@ def test_recursive_logical_type() -> None: df = pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]}) df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("cat_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("cat_list")]) f = io.BytesIO() df_groups.write_parquet(f, use_pyarrow=True) f.seek(0) @@ -264,7 +264,7 @@ def test_nested_dictionary() -> None: df = ( pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]}) .with_columns(pl.col("str").cast(pl.Categorical)) - .groupby("group") + .group_by("group") .agg([pl.col("str").alias("cat_list")]) ) f = io.BytesIO() diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 3ba208b9a49c..7a9cbd7505b8 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ -211,7 +211,7 @@ def test_arr_contains_categorical() -> None: {"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]} ).lazy() df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("str_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("str_list")]) assert df_groups.filter(pl.col("str_list").list.contains("C")).collect().to_dict( False ) == {"group": [2], "str_list": [["A", "C"]]} @@ -364,7 +364,7 @@ def test_list_function_group_awareness() -> None: } ) - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ pl.col("a").implode().list.get(0).alias("get"), pl.col("a").implode().list.take([0]).alias("take"), diff --git a/py-polars/tests/unit/namespaces/test_string.py b/py-polars/tests/unit/namespaces/test_string.py index 6595f9e4365a..fbdaa0010de9 100644 --- a/py-polars/tests/unit/namespaces/test_string.py +++ b/py-polars/tests/unit/namespaces/test_string.py @@ -316,7 +316,7 @@ def test_auto_explode() -> None: ) pl.col("val").str.concat(delimiter=",") grouped = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("val").str.concat(delimiter=",").alias("grouped")) .get_column("grouped") ) diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/test_aggregations.py index b0e731c3c536..2629dfc6fa20 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/test_aggregations.py @@ -35,7 +35,7 @@ def test_boolean_aggs() -> None: "var": [0.3333333432674408], } - assert df.groupby(pl.lit(1)).agg(aggs).to_dict(False) == { + assert df.group_by(pl.lit(1)).agg(aggs).to_dict(False) == { "literal": [1], "mean": [0.6666666666666666], "std": [0.5773502691896258], @@ -66,7 +66,7 @@ def test_duration_aggs() -> None: assert df.select("time_difference").mean().to_dict(False) == { "time_difference": [timedelta(days=31)] } - assert df.groupby(pl.lit(1)).agg(pl.mean("time_difference")).to_dict(False) == { 
+ assert df.group_by(pl.lit(1)).agg(pl.mean("time_difference")).to_dict(False) == { "literal": [1], "time_difference": [timedelta(days=31)], } @@ -80,8 +80,8 @@ def test_hmean_with_str_column() -> None: def test_list_aggregation_that_filters_all_data_6017() -> None: out = ( - pl.DataFrame({"col_to_groupby": [2], "flt": [1672740910.967138], "col3": [1]}) - .groupby("col_to_groupby") + pl.DataFrame({"col_to_group_by": [2], "flt": [1672740910.967138], "col3": [1]}) + .group_by("col_to_group_by") .agg( (pl.col("flt").filter(pl.col("col3") == 0).diff() * 1000) .diff() @@ -89,8 +89,8 @@ def test_list_aggregation_that_filters_all_data_6017() -> None: ) ) - assert out.schema == {"col_to_groupby": pl.Int64, "calc": pl.List(pl.Float64)} - assert out.to_dict(False) == {"col_to_groupby": [2], "calc": [[]]} + assert out.schema == {"col_to_group_by": pl.Int64, "calc": pl.List(pl.Float64)} + assert out.to_dict(False) == {"col_to_group_by": [2], "calc": [[]]} def test_median() -> None: @@ -167,7 +167,7 @@ def test_literal_group_agg_chunked_7968() -> None: ser = pl.concat([pl.Series([3]), pl.Series([4, 5])], rechunk=False) assert_frame_equal( - df.groupby("A").agg(pl.col("B").search_sorted(ser)), + df.group_by("A").agg(pl.col("B").search_sorted(ser)), pl.DataFrame( [ pl.Series("A", [1], dtype=pl.Int64), @@ -191,7 +191,7 @@ def test_duration_function_literal() -> None: ) # this checks if the `pl.duration` is flagged as AggState::Literal - assert df.groupby("A", maintain_order=True).agg( + assert df.group_by("A", maintain_order=True).agg( [((pl.col("T").max() + pl.duration(seconds=1)) - pl.col("T"))] ).to_dict(False) == { "A": ["x", "y"], @@ -214,7 +214,7 @@ def test_string_par_materialize_8207() -> None: } ) - assert df.groupby(["a"]).agg(pl.min("b")).sort("a").collect().to_dict(False) == { + assert df.group_by(["a"]).agg(pl.min("b")).sort("a").collect().to_dict(False) == { "a": ["a", "b", "c", "d", "e"], "b": ["P", "L", "T", "R", "a long string"], } @@ -230,7 +230,7 @@ def test_online_variance() -> None: ) assert_frame_equal( - df.groupby("id") + df.group_by("id") .agg(pl.all().exclude("id").std()) .select(["no_nulls", "nulls"]), df.select(pl.all().exclude("id").std()), @@ -245,10 +245,10 @@ def test_err_on_implode_and_agg() -> None: pl.InvalidOperationError, match=r"'implode' followed by an aggregation is not allowed", ): - df.groupby("type").agg(pl.col("type").implode().first().alias("foo")) + df.group_by("type").agg(pl.col("type").implode().first().alias("foo")) - # implode + function should be allowed in groupby - assert df.groupby("type", maintain_order=True).agg( + # implode + function should be allowed in group_by + assert df.group_by("type", maintain_order=True).agg( pl.col("type").implode().list.head().alias("foo") ).to_dict(False) == { "type": ["water", "fire", "earth"], @@ -265,7 +265,7 @@ def test_err_on_implode_and_agg() -> None: def test_mapped_literal_to_literal_9217() -> None: df = pl.DataFrame({"unique_id": ["a", "b"]}) - assert df.groupby(True).agg( + assert df.group_by(True).agg( pl.struct(pl.lit("unique_id").alias("unique_id")) ).to_dict(False) == {"literal": [True], "unique_id": [{"unique_id": "unique_id"}]} @@ -279,4 +279,4 @@ def test_sum_empty_and_null_set() -> None: df = pl.DataFrame({"a": [None, None, None], "b": [1, 1, 1]}) assert df.select(pl.sum("a")).item() == 0.0 - assert df.groupby("b").agg(pl.sum("a"))["a"].item() == 0.0 + assert df.group_by("b").agg(pl.sum("a"))["a"].item() == 0.0 diff --git a/py-polars/tests/unit/operations/test_apply.py 
b/py-polars/tests/unit/operations/test_apply.py index af6cb5946633..db1dc686155c 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -23,7 +23,7 @@ def test_apply_none() -> None: ) out = ( - df.groupby("g", maintain_order=True).agg( + df.group_by("g", maintain_order=True).agg( pl.apply( exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=lambda x: x[0] * x[1] + x[2].sum(), @@ -44,7 +44,7 @@ def func(s: Sequence[pl.Series]) -> pl.Series | None: return s[0] out = ( - df.groupby("g", maintain_order=True).agg( + df.group_by("g", maintain_order=True).agg( pl.apply( exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=func ).alias("multiple") @@ -72,7 +72,7 @@ class Foo: def __init__(self, payload: Any): self.payload = payload - out = df.groupby("groups").agg( + out = df.group_by("groups").agg( [ pl.apply( [pl.col("dates"), pl.col("names")], lambda s: Foo(dict(zip(s[0], s[1]))) @@ -98,7 +98,7 @@ def test_apply_arithmetic_consistency() -> None: with pytest.warns( PolarsInefficientApplyWarning, match="In this case, you can replace" ): - assert df.groupby("A").agg(pl.col("B").apply(lambda x: x + 1.0))[ + assert df.group_by("A").agg(pl.col("B").apply(lambda x: x + 1.0))[ "B" ].to_list() == [[3.0, 4.0]] @@ -135,7 +135,7 @@ def test_apply_numpy_out_3057() -> None: "y": [0.0, 1, 1.3, 2, 3, 4], } ) - result = df.groupby("id", maintain_order=True).agg( + result = df.group_by("id", maintain_order=True).agg( pl.apply(["y", "t"], lambda lst: np.trapz(y=lst[0], x=lst[1])).alias("result") ) expected = pl.DataFrame({"id": [0, 1], "result": [1.955, 13.0]}) @@ -220,7 +220,7 @@ def test_apply_type_propagation() -> None: "b": [{"c": 1, "d": 2}, {"c": 2, "d": 3}, {"c": None, "d": None}], } ) - .groupby("a", maintain_order=True) + .group_by("a", maintain_order=True) .agg( [ pl.when(pl.col("b").null_count() == 0) @@ -322,7 +322,7 @@ def test_apply_pass_name() -> None: def applyer(s: pl.Series) -> pl.Series: return pl.Series([mapper[s.name]]) - assert df.groupby("bar", maintain_order=True).agg( + assert df.group_by("bar", maintain_order=True).agg( [ pl.col("foo").apply(applyer, pass_name=True), ] @@ -395,7 +395,7 @@ def test_apply_10237() -> None: def test_apply_on_empty_col_10639() -> None: df = pl.DataFrame({"A": [], "B": []}) - res = df.groupby("B").agg( + res = df.group_by("B").agg( pl.col("A") .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading") .alias("Foo") @@ -404,7 +404,7 @@ def test_apply_on_empty_col_10639() -> None: "B": [], "Foo": [], } - res = df.groupby("B").agg( + res = df.group_by("B").agg( pl.col("A") .apply(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") .alias("Foo") diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 6cd6a85e0e0a..4c39eeeefc24 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -25,17 +25,17 @@ def test_explode_multiple() -> None: assert_frame_equal(df.explode("a", "b"), expected) -def test_groupby_flatten_list() -> None: +def test_group_by_flatten_list() -> None: df = pl.DataFrame({"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]}) - result = df.groupby("group", maintain_order=True).agg(pl.col("values").flatten()) + result = df.group_by("group", maintain_order=True).agg(pl.col("values").flatten()) expected = pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]}) assert_frame_equal(result, expected) -def test_groupby_flatten_string() -> 
None: +def test_group_by_flatten_string() -> None: df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]}) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("values").str.explode() ) @@ -217,7 +217,7 @@ def test_explode_in_agg_context() -> None: assert ( df.with_row_count("row_nr") .explode("idxs") - .groupby("row_nr") + .group_by("row_nr") .agg(pl.col("array").flatten()) ).to_dict(False) == { "row_nr": [0, 1, 2], @@ -231,7 +231,7 @@ def test_explode_inner_lists_3985() -> None: ).lazy() assert ( - df.groupby("id") + df.group_by("id") .agg(pl.col("categories")) .with_columns(pl.col("categories").list.eval(pl.element().list.explode())) ).collect().to_dict(False) == {"id": [1], "categories": [["a", "b", "a", "c"]]} @@ -291,7 +291,7 @@ def test_logical_explode() -> None: {"cats": ["Value1", "Value2", "Value1"]}, schema_overrides={"cats": pl.Categorical}, ) - .groupby(1) + .group_by(1) .agg(pl.struct("cats")) .explode("cats") .unnest("cats") diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index 551138742e11..72ca0a0dfe93 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -32,19 +32,19 @@ def test_melt_values_predicate_pushdown() -> None: def test_filter_is_in_4572() -> None: df = pl.DataFrame({"id": [1, 2, 1, 2], "k": ["a"] * 2 + ["b"] * 2}) expected = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("k").filter(pl.col("k") == "a").implode()) .sort("id") ) result = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("k").filter(pl.col("k").is_in(["a"])).implode()) .sort("id") ) assert_frame_equal(result, expected) result = ( df.sort("id") - .groupby("id") + .group_by("id") .agg(pl.col("k").filter(pl.col("k").is_in(["a"])).implode()) ) assert_frame_equal(result, expected) @@ -61,7 +61,7 @@ def test_filter_aggregation_any() -> None: ) result = ( - df.groupby("group") + df.group_by("group") .agg( pl.any_horizontal("pred_a", "pred_b"), pl.col("id") diff --git a/py-polars/tests/unit/operations/test_groupby.py b/py-polars/tests/unit/operations/test_group_by.py similarity index 76% rename from py-polars/tests/unit/operations/test_groupby.py rename to py-polars/tests/unit/operations/test_group_by.py index 0e5addbe1070..2be46e4dbde3 100644 --- a/py-polars/tests/unit/operations/test_groupby.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -17,7 +17,7 @@ from polars.testing import assert_frame_equal, assert_series_equal -def test_groupby() -> None: +def test_group_by() -> None: df = pl.DataFrame( { "a": ["a", "b", "a", "b", "b", "c"], @@ -26,16 +26,16 @@ def test_groupby() -> None: } ) - assert df.groupby("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1 + assert df.group_by("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1 - # Use lazy API in eager groupby - assert sorted(df.groupby("a").agg([pl.sum("b")]).rows()) == [ + # Use lazy API in eager group_by + assert sorted(df.group_by("a").agg([pl.sum("b")]).rows()) == [ ("a", 4), ("b", 11), ("c", 6), ] # test if it accepts a single expression - assert df.groupby("a", maintain_order=True).agg(pl.sum("b")).rows() == [ + assert df.group_by("a", maintain_order=True).agg(pl.sum("b")).rows() == [ ("a", 4), ("b", 11), ("c", 6), @@ -50,10 +50,10 @@ def test_groupby() -> None: ) # check if this query runs and thus column names propagate - df.groupby("b").agg(pl.col("c").forward_fill()).explode("c") + 
df.group_by("b").agg(pl.col("c").forward_fill()).explode("c") # get a specific column - result = df.groupby("b", maintain_order=True).agg(pl.count("a")) + result = df.group_by("b", maintain_order=True).agg(pl.count("a")) assert result.rows() == [("a", 2), ("b", 3)] assert result.columns == ["b", "a"] @@ -83,28 +83,28 @@ def df() -> pl.DataFrame: ("n_unique", [("a", 2, 2), ("b", 3, 2)]), ], ) -def test_groupby_shorthands( +def test_group_by_shorthands( df: pl.DataFrame, method: str, expected: list[tuple[Any]] ) -> None: - gb = df.groupby("b", maintain_order=True) + gb = df.group_by("b", maintain_order=True) result = getattr(gb, method)() assert result.rows() == expected - gb_lazy = df.lazy().groupby("b", maintain_order=True) + gb_lazy = df.lazy().group_by("b", maintain_order=True) result = getattr(gb_lazy, method)().collect() assert result.rows() == expected -def test_groupby_shorthand_quantile(df: pl.DataFrame) -> None: - result = df.groupby("b", maintain_order=True).quantile(0.5) +def test_group_by_shorthand_quantile(df: pl.DataFrame) -> None: + result = df.group_by("b", maintain_order=True).quantile(0.5) expected = [("a", 2.0, 1.0), ("b", 4.0, 1.0)] assert result.rows() == expected - result = df.lazy().groupby("b", maintain_order=True).quantile(0.5).collect() + result = df.lazy().group_by("b", maintain_order=True).quantile(0.5).collect() assert result.rows() == expected -def test_groupby_args() -> None: +def test_group_by_args() -> None: df = pl.DataFrame( { "a": ["a", "b", "a", "b", "b", "c"], @@ -114,30 +114,30 @@ def test_groupby_args() -> None: ) # Single column name - assert df.groupby("a").agg("b").columns == ["a", "b"] + assert df.group_by("a").agg("b").columns == ["a", "b"] # Column names as list expected = ["a", "b", "c"] - assert df.groupby(["a", "b"]).agg("c").columns == expected + assert df.group_by(["a", "b"]).agg("c").columns == expected # Column names as positional arguments - assert df.groupby("a", "b").agg("c").columns == expected + assert df.group_by("a", "b").agg("c").columns == expected # With keyword argument - assert df.groupby("a", "b", maintain_order=True).agg("c").columns == expected + assert df.group_by("a", "b", maintain_order=True).agg("c").columns == expected # Multiple aggregations as list - assert df.groupby("a").agg(["b", "c"]).columns == expected + assert df.group_by("a").agg(["b", "c"]).columns == expected # Multiple aggregations as positional arguments - assert df.groupby("a").agg("b", "c").columns == expected + assert df.group_by("a").agg("b", "c").columns == expected # Multiple aggregations as keyword arguments - assert df.groupby("a").agg(q="b", r="c").columns == ["a", "q", "r"] + assert df.group_by("a").agg(q="b", r="c").columns == ["a", "q", "r"] -def test_groupby_empty() -> None: +def test_group_by_empty() -> None: df = pl.DataFrame({"a": [1, 1, 2]}) - result = df.groupby("a").agg() + result = df.group_by("a").agg() expected = pl.DataFrame({"a": [1, 2]}) assert_frame_equal(result, expected, check_row_order=False) -def test_groupby_iteration() -> None: +def test_group_by_iteration() -> None: df = pl.DataFrame( { "foo": ["a", "b", "a", "b", "b", "c"], @@ -151,21 +151,21 @@ def test_groupby_iteration() -> None: [("b", 2, 5), ("b", 4, 3), ("b", 5, 2)], [("c", 6, 1)], ] - for i, (group, data) in enumerate(df.groupby("foo", maintain_order=True)): + for i, (group, data) in enumerate(df.group_by("foo", maintain_order=True)): assert group == expected_names[i] assert data.rows() == expected_rows[i] # Grouped by ALL columns should give groups of a single row 
- result = list(df.groupby(["foo", "bar", "baz"])) + result = list(df.group_by(["foo", "bar", "baz"])) assert len(result) == 6 # Iterating over groups should also work when grouping by expressions - result2 = list(df.groupby(["foo", pl.col("bar") * pl.col("baz")])) + result2 = list(df.group_by(["foo", pl.col("bar") * pl.col("baz")])) assert len(result2) == 5 - # Single column, alias in groupby + # Single column, alias in group_by df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6]}) - gb = df.groupby((pl.col("foo") // 2).alias("bar"), maintain_order=True) + gb = df.group_by((pl.col("foo") // 2).alias("bar"), maintain_order=True) result3 = [(group, df.rows()) for group, df in gb] expected3 = [(0, [(1,)]), (1, [(2,), (3,)]), (2, [(4,), (5,)]), (3, [(6,)])] assert result3 == expected3 @@ -184,27 +184,27 @@ def good_agg_parameters() -> list[pl.Expr | list[pl.Expr]]: @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_agg_input_types(lazy: bool) -> None: +def test_group_by_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, 4]}) df_or_lazy: pl.DataFrame | pl.LazyFrame = df.lazy() if lazy else df for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby("a").agg(bad_param) + result = df_or_lazy.group_by("a").agg(bad_param) if lazy: result.collect() # type: ignore[union-attr] expected = pl.DataFrame({"a": [1, 2], "b": [3, 7]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby("a", maintain_order=True).agg(good_param) + result = df_or_lazy.group_by("a", maintain_order=True).agg(good_param) if lazy: result = result.collect() # type: ignore[union-attr] assert_frame_equal(result, expected) @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: +def test_group_by_dynamic_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted( "index_column" ) @@ -212,7 +212,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby_dynamic( + result = df_or_lazy.group_by_dynamic( index_column="index_column", every="2i", closed="right" ).agg(bad_param) if lazy: @@ -221,7 +221,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: expected = pl.DataFrame({"index_column": [-2, 0, 2], "b": [1, 4, 2]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby_dynamic( + result = df_or_lazy.group_by_dynamic( index_column="index_column", every="2i", closed="right" ).agg(good_param) if lazy: @@ -229,7 +229,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: assert_frame_equal(result, expected) -def test_groupby_sorted_empty_dataframe_3680() -> None: +def test_group_by_sorted_empty_dataframe_3680() -> None: df = ( pl.DataFrame( [ @@ -239,7 +239,7 @@ def test_groupby_sorted_empty_dataframe_3680() -> None: ) .lazy() .sort("key") - .groupby("key") + .group_by("key") .tail(1) .collect() ) @@ -248,7 +248,7 @@ def test_groupby_sorted_empty_dataframe_3680() -> None: assert df.schema == {"key": pl.Categorical, "val": pl.Float64} -def test_groupby_custom_agg_empty_list() -> None: +def test_group_by_custom_agg_empty_list() -> None: assert ( pl.DataFrame( [ @@ -256,7 +256,7 @@ def test_groupby_custom_agg_empty_list() -> None: pl.Series("val", [], dtype=pl.Float64), ] ) - .groupby("key") + .group_by("key") .agg( [ 
pl.col("val").mean().alias("mean"), @@ -268,7 +268,7 @@ def test_groupby_custom_agg_empty_list() -> None: ).dtypes == [pl.Categorical, pl.Float64, pl.Float64, pl.Float64, pl.Float64] -def test_apply_after_take_in_groupby_3869() -> None: +def test_apply_after_take_in_group_by_3869() -> None: assert ( pl.DataFrame( { @@ -277,20 +277,20 @@ def test_apply_after_take_in_groupby_3869() -> None: "v": [3, 1, 2, 5, 6, 4], } ) - .groupby("k", maintain_order=True) + .group_by("k", maintain_order=True) .agg( pl.col("v").take(pl.col("t").arg_max()).sqrt() ) # <- fails for sqrt, exp, log, pow, etc. ).to_dict(False) == {"k": ["a", "b"], "v": [1.4142135623730951, 2.0]} -def test_groupby_signed_transmutes() -> None: +def test_group_by_signed_transmutes() -> None: df = pl.DataFrame({"foo": [-1, -2, -3, -4, -5], "bar": [500, 600, 700, 800, 900]}) for dt in [pl.Int8, pl.Int16, pl.Int32, pl.Int64]: df = ( df.with_columns([pl.col("foo").cast(dt), pl.col("bar")]) - .groupby("foo", maintain_order=True) + .group_by("foo", maintain_order=True) .agg(pl.col("bar").median()) ) @@ -340,10 +340,10 @@ def test_unique_order() -> None: } -def test_groupby_dynamic_flat_agg_4814() -> None: +def test_group_by_dynamic_flat_agg_4814() -> None: df = pl.DataFrame({"a": [1, 2, 2], "b": [1, 8, 12]}).set_sorted("a") - assert df.groupby_dynamic("a", every="1i", period="2i").agg( + assert df.group_by_dynamic("a", every="1i", period="2i").agg( [ (pl.col("b").sum() / pl.col("a").sum()).alias("sum_ratio_1"), (pl.col("b").last() / pl.col("a").last()).alias("last_ratio_1"), @@ -365,7 +365,7 @@ def test_groupby_dynamic_flat_agg_4814() -> None: ], ) @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( +def test_group_by_dynamic_overlapping_groups_flat_apply_multiple_5038( every: str | timedelta, period: str | timedelta, time_zone: str | None ) -> None: res = ( @@ -382,7 +382,7 @@ def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( .with_columns(pl.col("a").dt.replace_time_zone(time_zone)) .lazy() .set_sorted("a") - .groupby_dynamic("a", every=every, period=period) + .group_by_dynamic("a", every=every, period=period) .agg([pl.col("b").var().sqrt().alias("corr")]) ) .collect() @@ -394,26 +394,26 @@ def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( assert res["a"] == [None] -def test_take_in_groupby() -> None: +def test_take_in_group_by() -> None: df = pl.DataFrame({"group": [1, 1, 1, 2, 2, 2], "values": [10, 200, 3, 40, 500, 6]}) - assert df.groupby("group").agg( + assert df.group_by("group").agg( pl.col("values").take(1) - pl.col("values").take(2) ).sort("group").to_dict(False) == {"group": [1, 2], "values": [197, 494]} -def test_groupby_wildcard() -> None: +def test_group_by_wildcard() -> None: df = pl.DataFrame( { "a": [1, 2], "b": [1, 2], } ) - assert df.groupby([pl.col("*")], maintain_order=True).agg( + assert df.group_by([pl.col("*")], maintain_order=True).agg( [pl.col("a").first().suffix("_agg")] ).to_dict(False) == {"a": [1, 2], "b": [1, 2], "a_agg": [1, 2]} -def test_groupby_all_masked_out() -> None: +def test_group_by_all_masked_out() -> None: df = pl.DataFrame( { "val": pl.Series( @@ -427,24 +427,24 @@ def test_groupby_all_masked_out() -> None: assert_frame_equal(parts[0], df) -def test_groupby_null_propagation_6185() -> None: +def test_group_by_null_propagation_6185() -> None: df_1 = pl.DataFrame({"A": [0, 0], "B": [1, 2]}) expr = pl.col("A").filter(pl.col("A") > 0) expected = {"B": [1, 2], "A": [None, None]} 
assert ( - df_1.groupby("B").agg((expr - expr.mean()).mean()).sort("B").to_dict(False) + df_1.group_by("B").agg((expr - expr.mean()).mean()).sort("B").to_dict(False) == expected ) -def test_groupby_when_then_with_binary_and_agg_in_pred_6202() -> None: +def test_group_by_when_then_with_binary_and_agg_in_pred_6202() -> None: df = pl.DataFrame( {"code": ["a", "b", "b", "b", "a"], "xx": [1.0, -1.5, -0.2, -3.9, 3.0]} ) assert ( - df.groupby("code", maintain_order=True).agg( + df.group_by("code", maintain_order=True).agg( [pl.when(pl.col("xx") > pl.min("xx")).then(True).otherwise(False)] ) ).to_dict(False) == { @@ -455,7 +455,7 @@ def test_groupby_when_then_with_binary_and_agg_in_pred_6202() -> None: @pytest.mark.parametrize("every", ["1h", timedelta(hours=1)]) @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: time_zone = tzinfo.key if tzinfo is not None else None df = pl.DataFrame( { @@ -473,7 +473,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - # Without 'by' argument result1 = [ (name, data.shape) - for name, data in df.groupby_dynamic("datetime", every=every, closed="left") + for name, data in df.group_by_dynamic("datetime", every=every, closed="left") ] expected1 = [ (datetime(2020, 1, 1, 10, tzinfo=tzinfo), (2, 3)), @@ -484,7 +484,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - # With 'by' argument result2 = [ (name, data.shape) - for name, data in df.groupby_dynamic( + for name, data in df.group_by_dynamic( "datetime", every=every, closed="left", by="a" ) ] @@ -498,7 +498,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - @pytest.mark.parametrize("every", ["1h", timedelta(hours=1)]) @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: ldf = pl.LazyFrame( { "time": pl.date_range( @@ -511,7 +511,7 @@ def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) - } ) df = ( - ldf.groupby_dynamic("time", every=every, closed="right") + ldf.group_by_dynamic("time", every=every, closed="right") .agg( [ pl.col("time").min().alias("time_min"), @@ -541,20 +541,20 @@ def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) - @pytest.mark.slow() @pytest.mark.parametrize("dtype", [pl.Int32, pl.UInt32]) -def test_overflow_mean_partitioned_groupby_5194(dtype: pl.PolarsDataType) -> None: +def test_overflow_mean_partitioned_group_by_5194(dtype: pl.PolarsDataType) -> None: df = pl.DataFrame( [ pl.Series("data", [10_00_00_00] * 100_000, dtype=dtype), pl.Series("group", [1, 2] * 50_000, dtype=dtype), ] ) - assert df.groupby("group").agg(pl.col("data").mean()).sort(by="group").to_dict( + assert df.group_by("group").agg(pl.col("data").mean()).sort(by="group").to_dict( False ) == {"group": [1, 2], "data": [10000000.0, 10000000.0]} @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_groupby_dynamic_elementwise_following_mean_agg_6904( +def test_group_by_dynamic_elementwise_following_mean_agg_6904( time_zone: str | None, ) -> None: df = ( @@ -569,7 +569,7 @@ def test_groupby_dynamic_elementwise_following_mean_agg_6904( 
.with_columns(pl.col("a").dt.replace_time_zone(time_zone)) .lazy() .set_sorted("a") - .groupby_dynamic("a", every="10s", period="100s") + .group_by_dynamic("a", every="10s", period="100s") .agg([pl.col("b").mean().sin().alias("c")]) .collect() ) @@ -587,7 +587,7 @@ def test_groupby_dynamic_elementwise_following_mean_agg_6904( ) -def test_groupby_multiple_column_reference() -> None: +def test_group_by_multiple_column_reference() -> None: # Issue #7181 df = pl.DataFrame( { @@ -595,7 +595,7 @@ def test_groupby_multiple_column_reference() -> None: "val": [1, 20, 100, 2000, 10000, 200000], } ) - res = df.groupby("gr").agg( + res = df.group_by("gr").agg( pl.col("val") + pl.col("val").shift().fill_null(0), ) @@ -618,14 +618,14 @@ def test_groupby_multiple_column_reference() -> None: ("quantile", [0.5], [1.0, None], pl.Float64), ], ) -def test_groupby_empty_groups( +def test_group_by_empty_groups( aggregation: str, args: list[object], expected_values: list[object], expected_dtype: pl.DataType, ) -> None: df = pl.DataFrame({"a": [1, 2], "b": [1, 2]}) - result = df.groupby("b", maintain_order=True).agg( + result = df.group_by("b", maintain_order=True).agg( getattr(pl.col("a").filter(pl.col("b") != 2), aggregation)(*args) ) expected = pl.DataFrame({"b": [1, 2], "a": expected_values}).with_columns( @@ -743,7 +743,7 @@ def test_perfect_hash_table_null_values_8663() -> None: dtype=pl.Categorical, ) - assert s.to_frame("a").groupby("a").agg(pl.col("a").alias("agg")).to_dict( + assert s.to_frame("a").group_by("a").agg(pl.col("a").alias("agg")).to_dict( False ) == { "a": [ @@ -833,9 +833,69 @@ def test_perfect_hash_table_null_values_8663() -> None: } -def test_groupby_partitioned_ending_cast(monkeypatch: Any) -> None: +def test_group_by_partitioned_ending_cast(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_FORCE_PARTITION", "1") df = pl.DataFrame({"a": [1] * 5, "b": [1] * 5}) - out = df.groupby(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) + out = df.group_by(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) expected = pl.DataFrame({"a": [1], "b": [1], "num": [5]}) assert_frame_equal(out, expected) + + +def test_groupby_deprecated() -> None: + df = pl.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + + with pytest.deprecated_call(): + result = df.groupby("a").agg(pl.sum("b")) + with pytest.deprecated_call(): + result_lazy = df.lazy().groupby("a").agg(pl.sum("b")).collect() + + expected = df.group_by("a").agg(pl.sum("b")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) + + +def test_groupby_rolling_deprecated() -> None: + df = pl.DataFrame( + { + "date": pl.date_range( + datetime(2020, 1, 1), datetime(2020, 1, 5), eager=True + ), + "value": [1, 2, 3, 4, 5], + } + ) + + with pytest.deprecated_call(): + result = df.groupby_rolling("date", period="2d").agg(pl.sum("value")) + with pytest.deprecated_call(): + result_lazy = ( + df.lazy() + .groupby_rolling("date", period="2d") + .agg(pl.sum("value")) + .collect() + ) + + expected = df.group_by_rolling("date", period="2d").agg(pl.sum("value")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) + + +def test_groupby_dynamic_deprecated() -> None: + df = pl.DataFrame( + { + "date": pl.date_range( + datetime(2020, 1, 1), datetime(2020, 1, 5), eager=True + ), + "value": [1, 2, 3, 4, 5], + } + ) + + with pytest.deprecated_call(): + result = df.groupby_dynamic("date", 
every="2d").agg(pl.sum("value")) + with pytest.deprecated_call(): + result_lazy = ( + df.lazy().groupby_dynamic("date", every="2d").agg(pl.sum("value")).collect() + ) + + expected = df.group_by_dynamic("date", every="2d").agg(pl.sum("value")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) diff --git a/py-polars/tests/unit/operations/test_groupby_rolling.py b/py-polars/tests/unit/operations/test_group_by_rolling.py similarity index 84% rename from py-polars/tests/unit/operations/test_groupby_rolling.py rename to py-polars/tests/unit/operations/test_group_by_rolling.py index 36be0b12bf1e..37b44a3f6252 100644 --- a/py-polars/tests/unit/operations/test_groupby_rolling.py +++ b/py-polars/tests/unit/operations/test_group_by_rolling.py @@ -24,7 +24,7 @@ def good_agg_parameters() -> list[pl.Expr | list[pl.Expr]]: ] -def test_groupby_rolling_apply() -> None: +def test_group_by_rolling_apply() -> None: df = pl.DataFrame( { "a": [1, 2, 3, 4, 5], @@ -45,11 +45,11 @@ def apply(df: pl.DataFrame) -> pl.DataFrame: ] ) - out = df.groupby_rolling("a", period="2i").apply(apply, schema=df.schema) + out = df.group_by_rolling("a", period="2i").apply(apply, schema=df.schema) assert_frame_equal(out, expected) -def test_rolling_groupby_overlapping_groups() -> None: +def test_rolling_group_by_overlapping_groups() -> None: # this first aggregates overlapping groups so they cannot be naively flattened df = pl.DataFrame({"a": [41, 60, 37, 51, 52, 39, 40]}) @@ -57,7 +57,7 @@ def test_rolling_groupby_overlapping_groups() -> None: ( df.with_row_count() .with_columns(pl.col("row_nr").cast(pl.Int32)) - .groupby_rolling( + .group_by_rolling( index_column="row_nr", period="5i", ) @@ -73,7 +73,7 @@ def test_rolling_groupby_overlapping_groups() -> None: @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_rolling_agg_input_types(lazy: bool) -> None: +def test_group_by_rolling_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted( "index_column" ) @@ -81,7 +81,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby_rolling( + result = df_or_lazy.group_by_rolling( index_column="index_column", period="2i" ).agg(bad_param) if lazy: @@ -90,7 +90,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: expected = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 4, 4, 3]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby_rolling( + result = df_or_lazy.group_by_rolling( index_column="index_column", period="2i" ).agg(good_param) if lazy: @@ -98,7 +98,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: assert_frame_equal(result, expected) -def test_groupby_rolling_negative_offset_3914() -> None: +def test_group_by_rolling_negative_offset_3914() -> None: df = pl.DataFrame( { "datetime": pl.date_range( @@ -106,7 +106,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: ), } ) - assert df.groupby_rolling(index_column="datetime", period="2d", offset="-4d").agg( + assert df.group_by_rolling(index_column="datetime", period="2d", offset="-4d").agg( pl.count().alias("count") )["count"].to_list() == [0, 0, 1, 2, 2] @@ -116,7 +116,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: } ) - assert df.groupby_rolling(index_column="ints", period="2i", offset="-5i").agg( + assert 
df.group_by_rolling(index_column="ints", period="2i", offset="-5i").agg( [pl.col("ints").alias("matches")] )["matches"].to_list() == [ [], @@ -143,7 +143,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: @pytest.mark.parametrize("time_zone", [None, "US/Central"]) -def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> None: +def test_group_by_rolling_negative_offset_crossing_dst(time_zone: str | None) -> None: df = pl.DataFrame( { "datetime": pl.date_range( @@ -156,9 +156,9 @@ def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> "value": [1, 4, 9, 155], } ) - result = df.groupby_rolling(index_column="datetime", period="2d", offset="-1d").agg( - pl.col("value") - ) + result = df.group_by_rolling( + index_column="datetime", period="2d", offset="-1d" + ).agg(pl.col("value")) expected = pl.DataFrame( { "datetime": pl.date_range( @@ -188,7 +188,7 @@ def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> ("1d", "none", [[9], [155], [], []]), ], ) -def test_groupby_rolling_non_negative_offset_9077( +def test_group_by_rolling_non_negative_offset_9077( time_zone: str | None, offset: str, closed: ClosedInterval, @@ -206,7 +206,7 @@ def test_groupby_rolling_non_negative_offset_9077( "value": [1, 4, 9, 155], } ) - result = df.groupby_rolling( + result = df.group_by_rolling( index_column="datetime", period="2d", offset=offset, closed=closed ).agg(pl.col("value")) expected = pl.DataFrame( @@ -224,7 +224,7 @@ def test_groupby_rolling_non_negative_offset_9077( assert_frame_equal(result, expected) -def test_groupby_rolling_dynamic_sortedness_check() -> None: +def test_group_by_rolling_dynamic_sortedness_check() -> None: # when the by argument is passed, the sortedness flag # will be unset as the take shuffles data, so we must explicitly # check the sortedness @@ -236,12 +236,12 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: ) with pytest.raises(pl.ComputeError, match=r"input data is not sorted"): - df.groupby_dynamic("idx", every="2i", by="group").agg( + df.group_by_dynamic("idx", every="2i", by="group").agg( pl.col("idx").alias("idx1") ) with pytest.raises(pl.ComputeError, match=r"input data is not sorted"): - df.groupby_rolling("idx", period="2i", by="group").agg( + df.group_by_rolling("idx", period="2i", by="group").agg( pl.col("idx").alias("idx1") ) @@ -250,17 +250,17 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: pl.InvalidOperationError, match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): - df.groupby_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) + df.group_by_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) # no `by` argument with pytest.raises( pl.InvalidOperationError, match=r"argument in operation 'group_by_rolling' is not explicitly sorted", ): - df.groupby_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) + df.group_by_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) -def test_groupby_rolling_empty_groups_9973() -> None: +def test_group_by_rolling_empty_groups_9973() -> None: dt1 = date(2001, 1, 1) dt2 = date(2001, 1, 2) @@ -287,7 +287,7 @@ def test_groupby_rolling_empty_groups_9973() -> None: } ) - out = data.groupby_rolling( + out = data.group_by_rolling( index_column="date", by="id", period="2d", diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index a1ffc113d9d5..24d28cc9c759 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ 
b/py-polars/tests/unit/operations/test_join.py @@ -65,7 +65,7 @@ def test_join_same_cat_src() -> None: data={"column": ["a", "a", "b"], "more": [1, 2, 3]}, schema=[("column", pl.Categorical), ("more", pl.Int32)], ) - df_agg = df.groupby("column").agg(pl.col("more").mean()) + df_agg = df.group_by("column").agg(pl.col("more").mean()) assert df.join(df_agg, on="column").to_dict(False) == { "column": ["a", "a", "b"], "more": [1, 2, 3], @@ -434,7 +434,7 @@ def test_semi_join_projection_pushdown_6455() -> None: } ).lazy() - latest = df.groupby("id").agg(pl.col("timestamp").max()) + latest = df.group_by("id").agg(pl.col("timestamp").max()) df = df.join(latest, on=["id", "timestamp"], how="semi") assert df.select(["id", "value"]).collect().to_dict(False) == { "id": [1, 2], diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 8021d360e92c..6ec901d661fb 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -54,7 +54,7 @@ def test_asof_join_projection_resolution_4606() -> None: a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy() b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy() joined_tbl = a.join_asof(b, on=pl.col("a").set_sorted(), by="b") - assert joined_tbl.groupby("a").agg( + assert joined_tbl.group_by("a").agg( [pl.col("c").sum().alias("c")] ).collect().columns == ["a", "c"] diff --git a/py-polars/tests/unit/operations/test_profile.py b/py-polars/tests/unit/operations/test_profile.py index ef7e8b1fd170..df70655b1c19 100644 --- a/py-polars/tests/unit/operations/test_profile.py +++ b/py-polars/tests/unit/operations/test_profile.py @@ -5,7 +5,7 @@ def test_profile_columns() -> None: ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}) # profile lazyframe operation/plan - lazy = ldf.groupby("a").agg(pl.implode("b")) + lazy = ldf.group_by("a").agg(pl.implode("b")) profiling_info = lazy.profile() # ┌──────────────┬───────┬─────┐ # │ node ┆ start ┆ end │ @@ -13,7 +13,7 @@ def test_profile_columns() -> None: # │ str ┆ u64 ┆ u64 │ # ╞══════════════╪═══════╪═════╡ # │ optimization ┆ 0 ┆ 69 │ - # │ groupby(a) ┆ 69 ┆ 342 │ + # │ group_by(a) ┆ 69 ┆ 342 │ # └──────────────┴───────┴─────┘ assert len(profiling_info) == 2 assert profiling_info[1].columns == ["node", "start", "end"] diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py index 47022b7d66c8..a92dfbe69677 100644 --- a/py-polars/tests/unit/operations/test_random.py +++ b/py-polars/tests/unit/operations/test_random.py @@ -6,15 +6,15 @@ from polars.testing import assert_frame_equal, assert_series_equal -def test_shuffle_groupby_reseed() -> None: +def test_shuffle_group_by_reseed() -> None: def unique_shuffle_groups(n: int, seed: int | None) -> int: ls = [1, 2, 3] * n # 1, 2, 3, 1, 2, 3... groups = sorted(list(range(n)) * 3) # 0, 0, 0, 1, 1, 1, ... df = pl.DataFrame({"l": ls, "group": groups}) - shuffled = df.groupby("group", maintain_order=True).agg( + shuffled = df.group_by("group", maintain_order=True).agg( pl.col("l").shuffle(seed) ) - num_unique = shuffled.groupby("l").agg(pl.lit(0)).select(pl.count()) + num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.count()) return int(num_unique[0, 0]) assert unique_shuffle_groups(50, None) > 1 # Astronomically unlikely. 
diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 27bcdae19bf6..46ff6c6e5181 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -44,13 +44,13 @@ def example_df() -> pl.DataFrame: ["1d", "2d", "3d", timedelta(days=1), timedelta(days=2), timedelta(days=3)], ) @pytest.mark.parametrize("closed", ["left", "right", "none", "both"]) -def test_rolling_kernels_and_groupby_rolling( +def test_rolling_kernels_and_group_by_rolling( example_df: pl.DataFrame, period: str | timedelta, closed: ClosedInterval ) -> None: out1 = example_df.select( [ pl.col("dt"), - # this differs from groupby aggregation because the empty window is + # this differs from group_by aggregation because the empty window is # null here # where the sum aggregation of an empty set is 0 pl.col("values") @@ -64,7 +64,7 @@ def test_rolling_kernels_and_groupby_rolling( ) out2 = ( example_df.set_sorted("dt") - .groupby_rolling("dt", period=period, closed=closed) + .group_by_rolling("dt", period=period, closed=closed) .agg( [ pl.col("values").sum().alias("sum"), @@ -153,7 +153,7 @@ def test_rolling_negative_offset( "value": [1, 2, 3, 4], } ) - result = df.groupby_rolling("ts", period="2d", offset=offset, closed=closed).agg( + result = df.group_by_rolling("ts", period="2d", offset=offset, closed=closed).agg( pl.col("value") ) expected = pl.DataFrame( @@ -269,7 +269,7 @@ def test_rolling_extrema() -> None: } -def test_rolling_groupby_extrema() -> None: +def test_rolling_group_by_extrema() -> None: # ensure we hit different branches so create df = pl.DataFrame( @@ -279,7 +279,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").reverse().alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -318,7 +318,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -356,7 +356,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").sort().alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -387,7 +387,7 @@ def test_rolling_slice_pushdown() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "a", "b"], "c": [1, 3, 5]}).lazy() df = ( df.sort("a") - .groupby_rolling( + .group_by_rolling( "a", by="b", period="2i", @@ -407,11 +407,11 @@ def test_rolling_slice_pushdown() -> None: } -def test_groupby_dynamic_slice_pushdown() -> None: +def test_group_by_dynamic_slice_pushdown() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "a", "b"], "c": [1, 3, 5]}).lazy() df = ( df.sort("a") - .groupby_dynamic( + .group_by_dynamic( "a", by="b", every="2i", @@ -439,7 +439,7 @@ def test_overlapping_groups_4628() -> None: } ) assert ( - df.groupby_rolling(index_column=pl.col("index").set_sorted(), period="3i").agg( + df.group_by_rolling(index_column=pl.col("index").set_sorted(), period="3i").agg( [ pl.col("val").diff(n=1).alias("val.diff"), (pl.col("val") - pl.col("val").shift(1)).alias("val - val.shift"), @@ -512,7 +512,7 @@ def test_rolling_var_numerical_stability_5197() -> None: (timedelta(days=3), timedelta(days=-1)), ], ) -def test_dynamic_groupby_timezone_awareness( +def test_dynamic_group_by_timezone_awareness( every: str | timedelta, offset: str | timedelta ) -> None: df = pl.DataFrame( @@ -531,7 +531,7 @@ def 
test_dynamic_groupby_timezone_awareness( ) assert ( - df.groupby_dynamic( + df.group_by_dynamic( "datetime", every=every, offset=offset, @@ -543,13 +543,13 @@ def test_dynamic_groupby_timezone_awareness( @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: # start by datapoint start = datetime(2022, 12, 16, tzinfo=tzinfo) stop = datetime(2022, 12, 16, hour=3, tzinfo=tzinfo) df = pl.DataFrame({"date": pl.date_range(start, stop, "30m", eager=True)}) - assert df.groupby_dynamic( + assert df.group_by_dynamic( "date", every="31m", include_boundaries=True, @@ -591,7 +591,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: {"date": pl.date_range(start, stop, "12h", eager=True)} ).with_columns(pl.col("date").dt.weekday().alias("day")) - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", period="3d", @@ -616,7 +616,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: "data_day": [1, 1], } # start by saturday - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", period="3d", @@ -642,7 +642,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: } -def test_groupby_dynamic_by_monday_and_offset_5444() -> None: +def test_group_by_dynamic_by_monday_and_offset_5444() -> None: df = pl.DataFrame( { "date": [ @@ -659,7 +659,7 @@ def test_groupby_dynamic_by_monday_and_offset_5444() -> None: } ).with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").set_sorted()) - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", offset="1d", by="label", start_by="monday" ).agg(pl.col("value").sum()) @@ -677,13 +677,15 @@ def test_groupby_dynamic_by_monday_and_offset_5444() -> None: # test empty result_empty = ( df.filter(pl.col("date") == date(1, 1, 1)) - .groupby_dynamic("date", every="1w", offset="1d", by="label", start_by="monday") + .group_by_dynamic( + "date", every="1w", offset="1d", by="label", start_by="monday" + ) .agg(pl.col("value").sum()) ) assert result_empty.schema == result.schema -def test_groupby_rolling_iter() -> None: +def test_group_by_rolling_iter() -> None: df = pl.DataFrame( { "date": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 5)], @@ -695,7 +697,7 @@ def test_groupby_rolling_iter() -> None: # Without 'by' argument result1 = [ (name, data.shape) - for name, data in df.groupby_rolling(index_column="date", period="2d") + for name, data in df.group_by_rolling(index_column="date", period="2d") ] expected1 = [ (date(2020, 1, 1), (1, 3)), @@ -707,7 +709,7 @@ def test_groupby_rolling_iter() -> None: # With 'by' argument result2 = [ (name, data.shape) - for name, data in df.groupby_rolling(index_column="date", period="2d", by="a") + for name, data in df.group_by_rolling(index_column="date", period="2d", by="a") ] expected2 = [ ((1, date(2020, 1, 1)), (1, 3)), @@ -717,18 +719,18 @@ def test_groupby_rolling_iter() -> None: assert result2 == expected2 -def test_groupby_rolling_negative_period() -> None: +def test_group_by_rolling_negative_period() -> None: df = pl.DataFrame({"ts": [datetime(2020, 1, 1)], "value": [1]}).with_columns( pl.col("ts").set_sorted() ) with pytest.raises( ComputeError, match="rolling window period should be strictly positive" ): - df.groupby_rolling("ts", period="-1d", offset="-1d").agg(pl.col("value")) + df.group_by_rolling("ts", period="-1d", 
offset="-1d").agg(pl.col("value")) with pytest.raises( ComputeError, match="rolling window period should be strictly positive" ): - df.lazy().groupby_rolling("ts", period="-1d", offset="-1d").agg( + df.lazy().group_by_rolling("ts", period="-1d", offset="-1d").agg( pl.col("value") ).collect() with pytest.raises(ComputeError, match="window size should be strictly positive"): @@ -747,10 +749,10 @@ def test_rolling_skew_window_offset() -> None: ] == 0.6612545648596286 -def test_rolling_kernels_groupby_dynamic_7548() -> None: +def test_rolling_kernels_group_by_dynamic_7548() -> None: assert pl.DataFrame( {"time": pl.arange(0, 4, eager=True), "value": pl.arange(0, 4, eager=True)} - ).groupby_dynamic("time", every="1i", period="3i").agg( + ).group_by_dynamic("time", every="1i", period="3i").agg( pl.col("value"), pl.col("value").min().alias("min_value"), pl.col("value").max().alias("max_value"), diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index 9d7eb6d41382..ff9346273d5d 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -314,7 +314,7 @@ def test_sorted_flag_unset_by_arithmetic_4937() -> None: } ) - assert df.sort("price").groupby("ts").agg( + assert df.sort("price").group_by("ts").agg( [ (pl.col("price") * pl.col("mask")).max().alias("pmax"), (pl.col("price") * pl.col("mask")).min().alias("pmin"), @@ -332,7 +332,7 @@ def test_unset_sorted_flag_after_extend() -> None: df1.extend(df2) assert not df1["Add"].flags["SORTED_ASC"] - df = df1.groupby("Add").agg([pl.col("Batch").min()]).sort("Add") + df = df1.group_by("Add").agg([pl.col("Batch").min()]).sort("Add") assert df["Add"].flags["SORTED_ASC"] assert df.to_dict(False) == {"Add": [37, 41], "Batch": [48, 49]} @@ -356,12 +356,12 @@ def test_sort_slice_fast_path_5245() -> None: } -def test_explicit_list_agg_sort_in_groupby() -> None: +def test_explicit_list_agg_sort_in_group_by() -> None: df = pl.DataFrame({"A": ["a", "a", "a", "b", "b", "a"], "B": [1, 2, 3, 4, 5, 6]}) # this was col().implode().sort() before we changed the logic - result = df.groupby("A").agg(pl.col("B").sort(descending=True)).sort("A") - expected = df.groupby("A").agg(pl.col("B").sort(descending=True)).sort("A") + result = df.group_by("A").agg(pl.col("B").sort(descending=True)).sort("A") + expected = df.group_by("A").agg(pl.col("B").sort(descending=True)).sort("A") assert_frame_equal(result, expected) @@ -388,7 +388,7 @@ def test_sorted_join_query_5406() -> None: df1 = df.sort(by=["Datetime", "RowId"]) filter1 = ( - df1.groupby(["Datetime", "Group"]) + df1.group_by(["Datetime", "Group"]) .agg([pl.all().sort_by("Value", descending=True).first()]) .sort(["Datetime", "RowId"]) ) @@ -535,7 +535,7 @@ def test_sort_by_logical() -> None: "num": [3, 4, 1], } ) - assert df.groupby("name").agg([pl.col("num").sort_by(["dt1", "dt2"])]).sort( + assert df.group_by("name").agg([pl.col("num").sort_by(["dt1", "dt2"])]).sort( "name" ).to_dict(False) == {"name": ["a", "b"], "num": [[3, 1], [4]]} @@ -647,11 +647,11 @@ def test_sort_top_k_fast_path() -> None: } -def test_sorted_flag_groupby_dynamic() -> None: +def test_sorted_flag_group_by_dynamic() -> None: df = pl.DataFrame({"ts": [date(2020, 1, 1), date(2020, 1, 2)], "val": [1, 2]}) assert ( ( - df.groupby_dynamic(pl.col("ts").set_sorted(), every="1d").agg( + df.group_by_dynamic(pl.col("ts").set_sorted(), every="1d").agg( pl.col("val").sum() ) ) diff --git a/py-polars/tests/unit/streaming/test_streaming.py 
b/py-polars/tests/unit/streaming/test_streaming.py index 8a1af3fc9b74..08203d438cc4 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -25,7 +25,7 @@ def test_streaming_categoricals_5921() -> None: pl.DataFrame({"X": ["a", "a", "a", "b", "b"], "Y": [2, 2, 2, 1, 1]}) .lazy() .with_columns(pl.col("X").cast(pl.Categorical)) - .groupby("X") + .group_by("X") .agg(pl.col("Y").min()) .sort("Y", descending=True) .collect(streaming=True) @@ -34,7 +34,7 @@ def test_streaming_categoricals_5921() -> None: out_eager = ( pl.DataFrame({"X": ["a", "a", "a", "b", "b"], "Y": [2, 2, 2, 1, 1]}) .with_columns(pl.col("X").cast(pl.Categorical)) - .groupby("X") + .group_by("X") .agg(pl.col("Y").min()) .sort("Y", descending=True) ) @@ -48,7 +48,7 @@ def test_streaming_block_on_literals_6054() -> None: df = pl.DataFrame({"col_1": [0] * 5 + [1] * 5}) s = pl.Series("col_2", list(range(10))) - assert df.lazy().with_columns(s).groupby("col_1").agg(pl.all().first()).collect( + assert df.lazy().with_columns(s).group_by("col_1").agg(pl.all().first()).collect( streaming=True ).sort("col_1").to_dict(False) == {"col_1": [0, 1], "col_2": [0, 5]} @@ -99,14 +99,14 @@ def test_streaming_literal_expansion() -> None: "y": ["a", "b"], "z": [1, 2], } - assert q.groupby(["x", "y"]).agg(pl.mean("z")).sort("y").collect( + assert q.group_by(["x", "y"]).agg(pl.mean("z")).sort("y").collect( streaming=True ).to_dict(False) == { "x": ["constant", "constant"], "y": ["a", "b"], "z": [1.0, 2.0], } - assert q.groupby(["x"]).agg(pl.mean("z")).collect().to_dict(False) == { + assert q.group_by(["x"]).agg(pl.mean("z")).collect().to_dict(False) == { "x": ["constant"], "z": [1.5], } @@ -187,7 +187,7 @@ def test_streaming_sortedness_propagation_9494() -> None: ) .lazy() .sort("when") - .groupby_dynamic("when", every="1mo") + .group_by_dynamic("when", every="1mo") .agg(pl.col("what").sum()) .collect(streaming=True) ).to_dict(False) == {"when": [date(2023, 5, 1), date(2023, 6, 1)], "what": [3, 3]} @@ -226,12 +226,12 @@ def test_streaming_generic_left_and_inner_join_from_disk(tmp_path: Path) -> None def test_streaming_9776() -> None: df = pl.DataFrame({"col_1": ["a"] * 1000, "ID": [None] + ["a"] * 999}) ordered = ( - df.groupby("col_1", "ID", maintain_order=True) + df.group_by("col_1", "ID", maintain_order=True) .count() .filter(pl.col("col_1") == "a") ) unordered = ( - df.groupby("col_1", "ID", maintain_order=False) + df.group_by("col_1", "ID", maintain_order=False) .count() .filter(pl.col("col_1") == "a") ) @@ -317,7 +317,7 @@ def test_null_sum_streaming_10455() -> None: "y": [None] * 10, } ) - assert df.lazy().groupby("x").sum().collect(streaming=True).to_dict(False) == { + assert df.lazy().group_by("x").sum().collect(streaming=True).to_dict(False) == { "x": [1], "y": [0.0], } @@ -331,7 +331,7 @@ def test_boolean_agg_schema() -> None: } ).lazy() - agg_df = df.groupby("x").agg(pl.col("y").max().alias("max_y")) + agg_df = df.group_by("x").agg(pl.col("y").max().alias("max_y")) for streaming in [True, False]: assert ( diff --git a/py-polars/tests/unit/streaming/test_streaming_cse.py b/py-polars/tests/unit/streaming/test_streaming_cse.py index f9aa25924217..a0bd1b0b77f1 100644 --- a/py-polars/tests/unit/streaming/test_streaming_cse.py +++ b/py-polars/tests/unit/streaming/test_streaming_cse.py @@ -48,7 +48,7 @@ def test_cse_expr_selection_streaming(monkeypatch: Any, capfd: Any) -> None: @pytest.mark.skip(reason="activate once fixed") -def test_cse_expr_groupby() -> None: +def 
test_cse_expr_group_by() -> None: q = pl.LazyFrame( { "a": [1, 2, 3, 4], @@ -60,7 +60,7 @@ def test_cse_expr_groupby() -> None: derived = pl.col("a") * pl.col("b") q = ( - q.groupby("a") + q.group_by("a") .agg(derived.sum().alias("sum"), derived.min().alias("min")) .sort("min") ) diff --git a/py-polars/tests/unit/streaming/test_streaming_groupby.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py similarity index 88% rename from py-polars/tests/unit/streaming/test_streaming_groupby.py rename to py-polars/tests/unit/streaming/test_streaming_group_by.py index a60d7854033c..48798fbea054 100644 --- a/py-polars/tests/unit/streaming/test_streaming_groupby.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -13,7 +13,7 @@ @pytest.mark.slow() -def test_streaming_groupby_sorted_fast_path_nulls_10273() -> None: +def test_streaming_group_by_sorted_fast_path_nulls_10273() -> None: df = pl.Series( name="x", values=( @@ -25,14 +25,14 @@ def test_streaming_groupby_sorted_fast_path_nulls_10273() -> None: assert ( df.set_sorted("x") .lazy() - .groupby("x") + .group_by("x") .agg(pl.count()) .collect(streaming=True) .sort("x") ).to_dict(False) == {"x": [None, 0, 1, 2, 3], "count": [100, 100, 100, 100, 100]} -def test_streaming_groupby_types() -> None: +def test_streaming_group_by_types() -> None: df = pl.DataFrame( { "person_id": [1, 1], @@ -47,7 +47,7 @@ def test_streaming_groupby_types() -> None: out = ( ( df.lazy() - .groupby(by) + .group_by(by) .agg( [ pl.col("person_name").first().alias("str_first"), @@ -107,7 +107,7 @@ def test_streaming_groupby_types() -> None: with pytest.raises(pl.DuplicateError): ( df.lazy() - .groupby("person_id") + .group_by("person_id") .agg( [ pl.col("person_name").first().alias("str_first"), @@ -123,7 +123,7 @@ def test_streaming_groupby_types() -> None: ) -def test_streaming_groupby_min_max() -> None: +def test_streaming_group_by_min_max() -> None: df = pl.DataFrame( { "person_id": [1, 2, 3, 4, 5, 6], @@ -132,7 +132,7 @@ def test_streaming_groupby_min_max() -> None: ) out = ( df.lazy() - .groupby("year") + .group_by("year") .agg([pl.min("person_id").alias("min"), pl.max("person_id").alias("max")]) .collect() .sort("year") @@ -144,22 +144,22 @@ def test_streaming_groupby_min_max() -> None: def test_streaming_non_streaming_gb() -> None: n = 100 df = pl.DataFrame({"a": np.random.randint(0, 20, n)}) - q = df.lazy().groupby("a").agg(pl.count()).sort("a") + q = df.lazy().group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").cast(pl.Utf8)) - q = q.groupby("a").agg(pl.count()).sort("a") + q = q.group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").alias("b")) q = ( - q.groupby(["a", "b"]) + q.group_by(["a", "b"]) .agg(pl.count(), pl.col("a").sum().alias("sum_a")) .sort("a") ) assert_frame_equal(q.collect(streaming=True), q.collect()) -def test_streaming_groupby_sorted_fast_path() -> None: +def test_streaming_group_by_sorted_fast_path() -> None: a = np.random.randint(0, 20, 80) df = pl.DataFrame( { @@ -175,7 +175,7 @@ def test_streaming_groupby_sorted_fast_path() -> None: for df_ in [df, df_sorted]: out = ( df_.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.first("a").alias("first"), @@ -202,14 +202,16 @@ def random_integers() -> pl.Series: @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q1(monkeypatch: Any, random_integers: pl.Series) -> None: +def 
test_streaming_group_by_ooc_q1( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") result = ( s.to_frame() .lazy() - .groupby("a") + .group_by("a") .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -226,7 +228,9 @@ def test_streaming_groupby_ooc_q1(monkeypatch: Any, random_integers: pl.Series) @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) -> None: +def test_streaming_group_by_ooc_q2( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") @@ -234,7 +238,7 @@ def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) s.cast(str) .to_frame() .lazy() - .groupby("a") + .group_by("a") .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -251,14 +255,16 @@ def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q3(monkeypatch: Any, random_integers: pl.Series) -> None: +def test_streaming_group_by_ooc_q3( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") result = ( pl.DataFrame({"a": s, "b": s}) .lazy() - .groupby(["a", "b"]) + .group_by(["a", "b"]) .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -275,14 +281,14 @@ def test_streaming_groupby_ooc_q3(monkeypatch: Any, random_integers: pl.Series) assert_frame_equal(result, expected) -def test_streaming_groupby_struct_key() -> None: +def test_streaming_group_by_struct_key() -> None: df = pl.DataFrame( {"A": [1, 2, 3, 2], "B": ["google", "ms", "apple", "ms"], "C": [2, 3, 4, 3]} ) df1 = df.lazy().with_columns(pl.struct(["A", "C"]).alias("tuples")) - assert df1.groupby("tuples").agg(pl.count(), pl.col("B").first()).sort("B").collect( - streaming=True - ).to_dict(False) == { + assert df1.group_by("tuples").agg(pl.count(), pl.col("B").first()).sort( + "B" + ).collect(streaming=True).to_dict(False) == { "tuples": [{"A": 3, "C": 4}, {"A": 1, "C": 2}, {"A": 2, "C": 3}], "count": [1, 1, 2], "B": ["apple", "google", "ms"], @@ -290,7 +296,7 @@ def test_streaming_groupby_struct_key() -> None: @pytest.mark.slow() -def test_streaming_groupby_all_numeric_types_stability_8570() -> None: +def test_streaming_group_by_all_numeric_types_stability_8570() -> None: m = 1000 n = 1000 @@ -310,14 +316,14 @@ def test_streaming_groupby_all_numeric_types_stability_8570() -> None: dfd = ( dfc.lazy() .with_columns(pl.col("z").cast(dtype)) - .groupby(keys) + .group_by(keys) .agg(pl.col("z").sum().alias("z_sum")) .collect(streaming=True) ) assert dfd["z_sum"].sum() == dfc["z"].sum() -def test_streaming_groupby_categorical_aggregate() -> None: +def test_streaming_group_by_categorical_aggregate() -> None: with pl.StringCache(): out = ( pl.LazyFrame( @@ -335,7 +341,7 @@ def test_streaming_groupby_categorical_aggregate() -> None: ), } ) - .groupby(["a", "b"]) + .group_by(["a", "b"]) .agg([pl.col("a").first().alias("sum")]) .collect(streaming=True) ) @@ -356,11 +362,11 @@ def test_streaming_groupby_categorical_aggregate() -> None: } -def test_streaming_groupby_list_9758() -> None: +def test_streaming_group_by_list_9758() -> None: payload = {"a": [[1, 2]]} assert ( pl.LazyFrame(payload) - .groupby("a") + .group_by("a") .first() .collect(streaming=True) 
.to_dict(False) @@ -368,7 +374,7 @@ def test_streaming_groupby_list_9758() -> None: ) -def test_streaming_restart_non_streamable_groupby() -> None: +def test_streaming_restart_non_streamable_group_by() -> None: df = pl.DataFrame({"id": [1], "id2": [1], "id3": [1], "value": [1]}) res = ( df.lazy() @@ -377,7 +383,7 @@ def test_streaming_restart_non_streamable_groupby() -> None: (pl.col("id3") > pl.col("id3_right")) & (pl.col("id3") - pl.col("id3_right") < 30) ) - .groupby(["id2", "id3", "id3_right"]) + .group_by(["id2", "id3", "id3_right"]) .agg( pl.col("value").apply(lambda x: x).sum() * pl.col("value").sum() ) # non-streamable UDF + nested_agg @@ -386,7 +392,7 @@ def test_streaming_restart_non_streamable_groupby() -> None: assert """--- PIPELINE""" in res.explain(streaming=True) -def test_groupby_min_max_string_type() -> None: +def test_group_by_min_max_string_type() -> None: table = pl.from_dict({"a": [1, 1, 2, 2, 2], "b": ["a", "b", "c", "d", None]}) expected = {"a": [1, 2], "min": ["a", "c"], "max": ["b", "d"]} @@ -394,7 +400,7 @@ def test_groupby_min_max_string_type() -> None: for streaming in [True, False]: assert ( table.lazy() - .groupby("a") + .group_by("a") .agg([pl.min("b").alias("min"), pl.max("b").alias("max")]) .collect(streaming=streaming) .sort("a") diff --git a/py-polars/tests/unit/streaming/test_streaming_unique.py b/py-polars/tests/unit/streaming/test_streaming_unique.py index fce6a8402b03..c79a734464a3 100644 --- a/py-polars/tests/unit/streaming/test_streaming_unique.py +++ b/py-polars/tests/unit/streaming/test_streaming_unique.py @@ -34,7 +34,7 @@ def test_streaming_out_of_core_unique( # TODO: Re-enable this check when this issue is fixed: https://github.com/pola-rs/polars/issues/10466 _ = capfd.readouterr().err - # assert "OOC groupby started" in err + # assert "OOC group_by started" in err def test_streaming_unique(monkeypatch: Any, capfd: Any) -> None: diff --git a/py-polars/tests/unit/test_context.py b/py-polars/tests/unit/test_context.py index b384e2c19497..15cf03150a0c 100644 --- a/py-polars/tests/unit/test_context.py +++ b/py-polars/tests/unit/test_context.py @@ -9,7 +9,7 @@ def test_context_ignore_5867() -> None: .with_context(outer) ) assert ( - df.groupby("Category", maintain_order=True) + df.group_by("Category", maintain_order=True) .agg([(pl.col("Counts")).sum()]) .collect() .to_dict(False) diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index dd0e534e52fd..58232f9bc79e 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -49,7 +49,7 @@ def test_cse_schema_6081() -> None: orient="row", ).lazy() - min_value_by_group = df.groupby(["date", "id"]).agg( + min_value_by_group = df.group_by(["date", "id"]).agg( pl.col("value").min().alias("min_value") ) @@ -88,7 +88,7 @@ def test_cse_9630() -> None: joined_df2.select("key", pl.col("y").alias("value")), ] ) - .groupby("key") + .group_by("key") .agg( [ pl.col("value"), @@ -123,7 +123,7 @@ def test_schema_row_count_cse() -> None: csv_a.seek(0) df_a = pl.scan_csv(csv_a.name).with_row_count("Idx") - assert df_a.join(df_a, on="B").groupby( + assert df_a.join(df_a, on="B").group_by( "A", maintain_order=True ).all().collect().to_dict(False) == { "A": ["Gr1"], @@ -199,7 +199,7 @@ def test_windows_cse_excluded() -> None: @pytest.mark.skip() -def test_cse_groupby_10215() -> None: +def test_cse_group_by_10215() -> None: q = ( pl.DataFrame( { @@ -208,7 +208,7 @@ def test_cse_groupby_10215() -> None: } ) .lazy() - .groupby( + .group_by( "b", ) .agg( @@ -295,7 
+295,7 @@ def test_cse_10452() -> None: assert q.collect(comm_subexpr_elim=True).to_dict(False) == {"b": [13, 14, 15]} -def test_cse_groupby_ternary_10490() -> None: +def test_cse_group_by_ternary_10490() -> None: df = pl.DataFrame( { "a": [1, 1, 2, 2], @@ -306,7 +306,7 @@ def test_cse_groupby_ternary_10490() -> None: assert ( df.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.when(pl.col(col).is_null().all()).then(None).otherwise(1).alias(col) diff --git a/py-polars/tests/unit/test_datatypes.py b/py-polars/tests/unit/test_datatypes.py index 3d73b1b4ca4f..c6fc2ba3ff04 100644 --- a/py-polars/tests/unit/test_datatypes.py +++ b/py-polars/tests/unit/test_datatypes.py @@ -141,7 +141,7 @@ def test_conversion_dtype() -> None: pl.col("some_partition_column"), ] ) - .groupby(["some_partition_column"], maintain_order=True) + .group_by(["some_partition_column"], maintain_order=True) .agg([pl.col(["struct"])]) ) diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index 12cd4f84680c..6f3f47addbad 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -65,19 +65,19 @@ def test_empty_sort_by_args() -> None: def test_empty_9137() -> None: out = ( pl.DataFrame({"id": [], "value": []}) - .groupby("id") + .group_by("id") .agg(pl.col("value").pow(2).mean()) ) assert out.shape == (0, 2) assert out.dtypes == [pl.Float32, pl.Float32] -def test_empty_groupby_apply_err() -> None: +def test_empty_group_by_apply_err() -> None: df = pl.DataFrame(schema={"x": pl.Int64}) with pytest.raises( pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" ): - df.groupby("x").apply(lambda x: x) + df.group_by("x").apply(lambda x: x) def test_empty_list_namespace_output_9585() -> None: diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index f79c7752e0b8..037af5c92799 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -15,11 +15,11 @@ from polars.type_aliases import ConcatMethod -def test_error_on_empty_groupby() -> None: +def test_error_on_empty_group_by() -> None: with pytest.raises( pl.ComputeError, match="at least one key is required in a group_by operation" ): - pl.DataFrame({"x": [0, 0, 1, 1]}).groupby([]).agg(pl.count()) + pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.count()) def test_error_on_reducing_map() -> None: @@ -33,7 +33,7 @@ def test_error_on_reducing_map() -> None: r"the input length \(1\); consider using `apply` instead" ), ): - df.groupby("id").agg(pl.map(["t", "y"], np.trapz)) + df.group_by("id").agg(pl.map(["t", "y"], np.trapz)) df = pl.DataFrame({"x": [1, 2, 3, 4], "group": [1, 2, 1, 2]}) with pytest.raises( @@ -136,7 +136,7 @@ def test_projection_update_schema_missing_column() -> None: pl.DataFrame({"colA": ["a", "b", "c"], "colB": [1, 2, 3]}) .lazy() .filter(~pl.col("colC").is_null()) - .groupby(["colA"]) + .group_by(["colA"]) .agg([pl.col("colB").sum().alias("result")]) .collect() ) @@ -204,7 +204,7 @@ def test_error_on_double_agg() -> None: "b": [1, 2, 3, 4, 5], } ) - .groupby("a") + .group_by("a") .agg([getattr(pl.col("b").min(), e)()]) ) @@ -381,7 +381,7 @@ def test_sort_by_different_lengths() -> None: pl.ComputeError, match=r"the expression in `sort_by` argument must result in the same length", ): - df.groupby("group").agg( + df.group_by("group").agg( [ pl.col("col1").sort_by(pl.col("col2").unique()), ] @@ -391,7 +391,7 @@ def test_sort_by_different_lengths() -> None: pl.ComputeError, match=r"the expression in `sort_by` 
argument must result in the same length", ): - df.groupby("group").agg( + df.group_by("group").agg( [ pl.col("col1").sort_by(pl.col("col2").arg_unique()), ] @@ -568,7 +568,7 @@ def test_invalid_inner_type_cast_list() -> None: ), ], ) -def test_groupby_dynamic_validation(every: str, match: str) -> None: +def test_group_by_dynamic_validation(every: str, match: str) -> None: df = pl.DataFrame( { "index": [0, 0, 1, 1], @@ -578,7 +578,7 @@ def test_groupby_dynamic_validation(every: str, match: str) -> None: ) with pytest.raises(pl.ComputeError, match=match): - df.groupby_dynamic("index", by="group", every=every, period="2i").agg( + df.group_by_dynamic("index", by="group", every=every, period="2i").agg( pl.col("weight") ) @@ -602,12 +602,12 @@ def test_invalid_getitem_key_err() -> None: df["x", "y"] # type: ignore[index] -def test_invalid_groupby_arg() -> None: +def test_invalid_group_by_arg() -> None: df = pl.DataFrame({"a": [1]}) with pytest.raises( TypeError, match="specifying aggregations as a dictionary is not supported" ): - df.groupby(1).agg({"a": "sum"}) + df.group_by(1).agg({"a": "sum"}) def test_no_sorted_err() -> None: @@ -620,7 +620,7 @@ def test_no_sorted_err() -> None: pl.InvalidOperationError, match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): - df.groupby_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) + df.group_by_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) def test_serde_validation() -> None: @@ -678,6 +678,6 @@ def test_sort_by_err_9259() -> None: schema={"a": pl.Float32, "b": pl.Float32, "c": pl.Float32}, ) with pytest.raises(pl.ComputeError): - df.lazy().groupby("c").agg( + df.lazy().group_by("c").agg( [pl.col("a").sort_by(pl.col("b").filter(pl.col("b") > 100)).sum()] ).collect() diff --git a/py-polars/tests/unit/test_expr_multi_cols.py b/py-polars/tests/unit/test_expr_multi_cols.py index a5d21f4cee3a..f3cc51ef36f4 100644 --- a/py-polars/tests/unit/test_expr_multi_cols.py +++ b/py-polars/tests/unit/test_expr_multi_cols.py @@ -81,7 +81,7 @@ def test_multiple_columns_length_9137() -> None: # list is larger than groups cmp_list = ["a", "b", "c"] - assert df.groupby("a").agg(pl.col("b").is_in(cmp_list)).to_dict(False) == { + assert df.group_by("a").agg(pl.col("b").is_in(cmp_list)).to_dict(False) == { "a": [1], "b": [[True, False]], } diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index 5b2e878e82c0..a3233d797048 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -93,7 +93,7 @@ def test_prefix(fruits_cars: pl.DataFrame) -> None: def test_cumcount() -> None: df = pl.DataFrame([["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], schema=["A"]) - out = df.groupby("A", maintain_order=True).agg( + out = df.group_by("A", maintain_order=True).agg( [pl.col("A").cumcount(reverse=False).alias("foo")] ) @@ -103,10 +103,10 @@ def test_cumcount() -> None: def test_filter_where() -> None: df = pl.DataFrame({"a": [1, 2, 3, 1, 2, 3], "b": [4, 5, 6, 7, 8, 9]}) - result_where = df.groupby("a", maintain_order=True).agg( + result_where = df.group_by("a", maintain_order=True).agg( pl.col("b").where(pl.col("b") > 4).alias("c") ) - result_filter = df.groupby("a", maintain_order=True).agg( + result_filter = df.group_by("a", maintain_order=True).agg( pl.col("b").filter(pl.col("b") > 4).alias("c") ) expected = pl.DataFrame({"a": [1, 2, 3], "c": [[7], [5, 8], [6, 9]]}) @@ -127,7 +127,7 @@ def test_count_expr() -> None: assert out.shape == (1, 1) assert cast(int, out.item()) == 
5 - out = df.groupby("b", maintain_order=True).agg(pl.count()) + out = df.group_by("b", maintain_order=True).agg(pl.count()) assert out["b"].to_list() == ["a", "b"] assert out["count"].to_list() == [4, 1] @@ -169,7 +169,7 @@ def test_entropy() -> None: "id": [1, 2, 1, 4, 5, 4, 6], } ) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("id").entropy(normalize=True) ) expected = pl.DataFrame( @@ -178,7 +178,7 @@ def test_entropy() -> None: assert_frame_equal(result, expected) -def test_dot_in_groupby() -> None: +def test_dot_in_group_by() -> None: df = pl.DataFrame( { "group": ["a", "a", "a", "b", "b", "b"], @@ -187,7 +187,7 @@ def test_dot_in_groupby() -> None: } ) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("x").dot("y").alias("dot") ) expected = pl.DataFrame({"group": ["a", "b"], "dot": [6, 15]}) @@ -364,7 +364,7 @@ def test_rank_so_4109() -> None: } ).sort(by=["id", "rank"]) - assert df.groupby("id").agg( + assert df.group_by("id").agg( [ pl.col("rank").alias("original"), pl.col("rank").rank(method="dense").alias("dense"), diff --git a/py-polars/tests/unit/test_fmt.py b/py-polars/tests/unit/test_fmt.py index ed9bc408ab5a..57d4f541e639 100644 --- a/py-polars/tests/unit/test_fmt.py +++ b/py-polars/tests/unit/test_fmt.py @@ -146,7 +146,7 @@ def test_date_list_fmt() -> None: df = df.with_columns(pl.col("mydate").str.strptime(pl.Date, "%Y-%m-%d")) assert ( - str(df.groupby("index", maintain_order=True).agg(pl.col("mydate"))["mydate"]) + str(df.group_by("index", maintain_order=True).agg(pl.col("mydate"))["mydate"]) == """shape: (3,) Series: 'mydate' [list[date]] [ diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index c43fc195ff56..a9361633075b 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -1074,7 +1074,7 @@ def test_to_init_repr() -> None: def test_untrusted_categorical_input() -> None: df = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) - assert pl.from_pandas(df).groupby("x").count().to_dict(False) == { + assert pl.from_pandas(df).group_by("x").count().to_dict(False) == { "x": ["x"], "count": [1], } diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 90151c9bebb2..cb78de901ffe 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -39,7 +39,7 @@ def test_lazy() -> None: ).collect() # test if pl.list is available, this is `to_list` re-exported as list - eager = ldf.groupby("a").agg(pl.implode("b")).collect() + eager = ldf.group_by("a").agg(pl.implode("b")).collect() assert sorted(eager.rows()) == [(1, [[1.0]]), (2, [[2.0]]), (3, [[3.0]])] @@ -152,10 +152,10 @@ def test_or() -> None: assert out.rows() == [(1, 1.0), (3, 3.0)] -def test_groupby_apply() -> None: +def test_group_by_apply() -> None: ldf = ( pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 3.0]}) - .groupby("a") + .group_by("a") .apply(lambda df: df * 2.0, schema={"a": pl.Float64, "b": pl.Float64}) ) out = ldf.collect() @@ -194,7 +194,7 @@ def test_apply_custom_function() -> None: # two ways to determine the length groups. 
df = ( - ldf.groupby("fruits") + ldf.group_by("fruits") .agg( [ pl.col("cars") @@ -221,16 +221,16 @@ def test_apply_custom_function() -> None: assert_frame_equal(df, expected) -def test_groupby() -> None: +def test_group_by() -> None: ldf = pl.LazyFrame({"a": [1.0, None, 3.0, 4.0], "groups": ["a", "a", "b", "b"]}) expected = pl.DataFrame({"groups": ["a", "b"], "a": [1.0, 3.5]}) - out = ldf.groupby("groups").agg(pl.mean("a")).collect() + out = ldf.group_by("groups").agg(pl.mean("a")).collect() assert_frame_equal(out.sort(by="groups"), expected) # refer to column via pl.Expr - out = ldf.groupby(pl.col("groups")).agg(pl.mean("a")).collect() + out = ldf.group_by(pl.col("groups")).agg(pl.mean("a")).collect() assert_frame_equal(out.sort(by="groups"), expected) @@ -391,7 +391,7 @@ def test_fold_filter() -> None: assert out.rows() == [(1, 0), (2, 1), (3, 2)] -def test_head_groupby() -> None: +def test_head_group_by() -> None: commodity_prices = { "commodity": [ "Wheat", @@ -434,7 +434,7 @@ def test_head_groupby() -> None: keys = ["commodity", "location"] out = ( ldf.sort(by="price", descending=True) - .groupby(keys, maintain_order=True) + .group_by(keys, maintain_order=True) .agg([pl.col("*").exclude(keys).head(2).keep_name()]) .explode(pl.col("*").exclude(keys)) ) @@ -450,12 +450,12 @@ def test_head_groupby() -> None: ldf = pl.LazyFrame( {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]} ) - out = ldf.groupby("letters").tail(2).sort("letters") + out = ldf.group_by("letters").tail(2).sort("letters") assert_frame_equal( out.collect(), pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}), ) - out = ldf.groupby("letters").head(2).sort("letters") + out = ldf.group_by("letters").head(2).sort("letters") assert_frame_equal( out.collect(), pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}), @@ -555,10 +555,10 @@ def test_sort() -> None: assert_series_equal(ldf.collect()["a"], pl.Series("a", [1, 2, 2, 3])) -def test_custom_groupby() -> None: +def test_custom_group_by() -> None: ldf = pl.LazyFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]}) out = ( - ldf.groupby("b", maintain_order=True) + ldf.group_by("b", maintain_order=True) .agg([pl.col("a").apply(lambda x: x.sum(), return_dtype=pl.Int64)]) .collect() ) @@ -894,7 +894,7 @@ def test_argminmax() -> None: assert out["min"][0] == 0 out = ( - ldf.groupby("b", maintain_order=True) + ldf.group_by("b", maintain_order=True) .agg([pl.col("a").arg_min().alias("min"), pl.col("a").arg_max().alias("max")]) .collect() ) @@ -981,7 +981,7 @@ def test_spearman_corr() -> None: ) out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr(pl.col("prediction"), pl.col("target"), method="spearman").alias( "c" ), @@ -992,7 +992,7 @@ def test_spearman_corr() -> None: # we can also pass in column names directly out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr("prediction", "target", method="spearman").alias("c"), ) ).collect()["c"] @@ -1029,7 +1029,7 @@ def test_pearson_corr() -> None: ) out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr(pl.col("prediction"), pl.col("target"), method="pearson").alias( "c" ), @@ -1039,7 +1039,7 @@ def test_pearson_corr() -> None: # we can also pass in column names directly out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr("prediction", 
"target", method="pearson").alias("c"), ) ).collect()["c"] @@ -1178,7 +1178,7 @@ def test_group_lengths() -> None: } ) - result = ldf.groupby(["group"], maintain_order=True).agg( + result = ldf.group_by(["group"], maintain_order=True).agg( [ (pl.col("id").unique_counts() / pl.col("id").len()) .sum() @@ -1205,7 +1205,7 @@ def test_quantile_filtered_agg() -> None: "value": [1, 2, 3, 4, 1, 2, 3, 4], } ) - .groupby("group") + .group_by("group") .agg(pl.col("value").filter(pl.col("value") < 2).quantile(0.5)) .collect()["value"] .to_list() diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index 4312815eeaa6..2e276b837b01 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -10,7 +10,7 @@ def test_projection_on_semi_join_4789() -> None: ab = lfa.join(lfb, on="p", how="semi").inspect() - intermediate_agg = (ab.groupby("a").agg([pl.col("a").alias("seq")])).select( + intermediate_agg = (ab.group_by("a").agg([pl.col("a").alias("seq")])).select( ["a", "seq"] ) @@ -25,7 +25,7 @@ def test_melt_projection_pd_block_4997() -> None: .with_row_count() .lazy() .melt(id_vars="row_nr") - .groupby("row_nr") + .group_by("row_nr") .agg(pl.col("variable").alias("result")) .collect() ).to_dict(False) == {"row_nr": [0], "result": [["col1", "col2"]]} @@ -43,13 +43,13 @@ def test_double_projection_pushdown() -> None: ) -def test_groupby_projection_pushdown() -> None: +def test_group_by_projection_pushdown() -> None: assert ( "PROJECT 2/3 COLUMNS" in ( pl.DataFrame({"c0": [], "c1": [], "c2": []}) .lazy() - .groupby("c0") + .group_by("c0") .agg( [ pl.col("c1").sum().alias("sum(c1)"), @@ -132,14 +132,14 @@ def test_double_projection_union() -> None: } ).lazy() - # in this query the groupby projects only 2 columns, that's one + # in this query the group_by projects only 2 columns, that's one # less than the upstream projection so the union will fail if # the select node does not prune one column q = lf1.select(["a", "b", "c"]) q = pl.concat([q, lf2]) - q = q.groupby("c", maintain_order=True).agg([pl.col("a")]) + q = q.group_by("c", maintain_order=True).agg([pl.col("a")]) assert q.collect().to_dict(False) == { "c": [1, 2, 3], "a": [[1, 2, 5, 7], [3, 4, 6], [8]], @@ -253,7 +253,7 @@ def test_distinct_projection_pd_7578() -> None: } ) - q = df.lazy().unique().groupby("bar").agg(pl.count()) + q = df.lazy().unique().group_by("bar").agg(pl.count()) assert q.collect().sort("bar").to_dict(False) == { "bar": ["a", "b"], "count": [3, 2], diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 946c0f42e6f6..4c35ffa6e270 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -30,10 +30,10 @@ def test_sort_by_bools() -> None: assert out.shape == (3, 4) -def test_repeat_expansion_in_groupby() -> None: +def test_repeat_expansion_in_group_by() -> None: out = ( pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]}) - .groupby("g", maintain_order=True) + .group_by("g", maintain_order=True) .agg(pl.repeat(1, pl.count()).cumsum()) .to_dict(False) ) @@ -48,7 +48,7 @@ def test_agg_after_head() -> None: expected = pl.DataFrame({"a": [1, 2, 3], "b": [6, 9, 21]}) for maintain_order in [True, False]: - out = df.groupby("a", maintain_order=maintain_order).agg( + out = df.group_by("a", maintain_order=maintain_order).agg( [pl.col("b").head(3).sum()] ) @@ -71,7 +71,7 @@ def test_overflow_uint16_agg_mean() -> None: pl.col("col3").cast(pl.UInt16), ] ) - .groupby(["col1"]) + .group_by(["col1"]) 
.agg(pl.col("col3").mean()) .to_dict(False) ) == {"col1": ["A"], "col3": [64.0]} @@ -86,7 +86,7 @@ def test_binary_on_list_agg_3345() -> None: ) assert ( - df.groupby(["group"], maintain_order=True) + df.group_by(["group"], maintain_order=True) .agg( [ ( @@ -109,12 +109,12 @@ def test_maintain_order_after_sampling() -> None: "value": [1, 3, 2, 3, 4, 5, 3, 4], } ) - assert df.groupby("type", maintain_order=True).agg(pl.col("value").sum()).to_dict( + assert df.group_by("type", maintain_order=True).agg(pl.col("value").sum()).to_dict( False ) == {"type": ["A", "B", "C", "D"], "value": [5, 8, 5, 7]} -def test_sorted_groupby_optimization(monkeypatch: Any) -> None: +def test_sorted_group_by_optimization(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_NO_STREAMING_GROUPBY", "1") df = pl.DataFrame({"a": np.random.randint(0, 5, 20)}) @@ -124,11 +124,11 @@ def test_sorted_groupby_optimization(monkeypatch: Any) -> None: for descending in [True, False]: sorted_implicit = ( df.with_columns(pl.col("a").sort(descending=descending)) - .groupby("a") + .group_by("a") .agg(pl.count()) ) sorted_explicit = ( - df.groupby("a").agg(pl.count()).sort("a", descending=descending) + df.group_by("a").agg(pl.count()).sort("a", descending=descending) ) assert_frame_equal(sorted_explicit, sorted_implicit) @@ -147,7 +147,7 @@ def test_median_on_shifted_col_3522() -> None: assert diffs.select(pl.col("foo").median()).to_series()[0] == 36828.5 -def test_groupby_agg_equals_zero_3535() -> None: +def test_group_by_agg_equals_zero_3535() -> None: # setup test frame df = pl.DataFrame( data=[ @@ -165,7 +165,7 @@ def test_groupby_agg_equals_zero_3535() -> None: ], ) # group by the key, aggregating the two numeric cols - assert df.groupby(pl.col("key"), maintain_order=True).agg( + assert df.group_by(pl.col("key"), maintain_order=True).agg( [pl.col("val1").sum(), pl.col("val2").sum()] ).to_dict(False) == { "key": ["aa", "bb", "cc"], @@ -190,7 +190,7 @@ def demean_dot() -> pl.Expr: "y": [2, 0, 2, 0], } ) - .groupby("key") + .group_by("key") .agg( [ demean_dot(), @@ -228,7 +228,7 @@ def test_opaque_filter_on_lists_3784() -> None: ).lazy() df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("str_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("str_list")]) pre = "A" succ = "B" @@ -263,7 +263,7 @@ def map_expr(name: str) -> pl.Expr: assert ( pl.DataFrame({"groups": [1, 2, 3, 4], "values": [None, None, 1, 2]}) - .groupby("groups", maintain_order=True) + .group_by("groups", maintain_order=True) .agg([map_expr("values")]) ).to_dict(False) == { "groups": [1, 2, 3, 4], diff --git a/py-polars/tests/unit/test_rows.py b/py-polars/tests/unit/test_rows.py index 0e01a91e1cbc..cfd364ee06a1 100644 --- a/py-polars/tests/unit/test_rows.py +++ b/py-polars/tests/unit/test_rows.py @@ -93,7 +93,7 @@ def test_rows_by_key() -> None: "b": [("b", "q", 2.5, 8), ("b", "q", 3.0, 7)], } assert df.rows_by_key("w", include_key=True) == { - key: grp.rows() for key, grp in df.groupby("w") + key: grp.rows() for key, grp in df.group_by("w") } assert df.rows_by_key("w", include_key=True, unique=True) == { "a": ("a", "k", 4.5, 6), @@ -135,7 +135,7 @@ def test_rows_by_key() -> None: ], } assert df.rows_by_key("w", named=True, include_key=True) == { - key: grp.rows(named=True) for key, grp in df.groupby("w") + key: grp.rows(named=True) for key, grp in df.group_by("w") } assert df.rows_by_key("w", named=True, include_key=True, unique=True) == { "a": {"w": "a", "x": "k", "y": 4.5, "z": 
6}, diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index eb12a3e6ad80..56de4581014a 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -14,7 +14,7 @@ def test_schema_on_agg() -> None: assert ( df.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.col("b").min().alias("min"), @@ -97,7 +97,7 @@ def test_from_dicts_nested_nulls() -> None: def test_group_schema_err() -> None: df = pl.DataFrame({"foo": [None, 1, 2], "bar": [1, 2, 3]}).lazy() with pytest.raises(pl.ColumnNotFoundError): - df.groupby("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema + df.group_by("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema def test_schema_inference_from_rows() -> None: @@ -391,7 +391,7 @@ def sub_col_min(column: str, min_column: str) -> pl.Expr: q = ( df.lazy() - .groupby("group") + .group_by("group") .agg( [ sub_col_min("vals_num", "vals_num").alias("sub_num"), @@ -439,8 +439,8 @@ def test_schemas( for key, dtype in expected_select.items(): assert schema[key] == dtype - # test groupby schema - schema = df.groupby(pl.lit(1)).agg(expr).schema + # test group_by schema + schema = df.group_by(pl.lit(1)).agg(expr).schema for key, dtype in expected_gb.items(): assert schema[key] == dtype @@ -511,7 +511,7 @@ def test_lit_iter_schema() -> None: } ) - assert df.groupby("key").agg(pl.col("dates").unique() + timedelta(days=1)).to_dict( + assert df.group_by("key").agg(pl.col("dates").unique() + timedelta(days=1)).to_dict( False ) == { "key": ["A"], diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index 1d97e484b703..b1d2f6a3eed1 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -447,9 +447,9 @@ def test_selector_expr_dispatch() -> None: ) -def test_regex_expansion_groupby_9947() -> None: +def test_regex_expansion_group_by_9947() -> None: df = pl.DataFrame({"g": [3], "abc": [1], "abcd": [3]}) - assert df.groupby("g").agg(pl.col("^ab.*$")).columns == ["g", "abc", "abcd"] + assert df.group_by("g").agg(pl.col("^ab.*$")).columns == ["g", "abc", "abcd"] def test_regex_expansion_exclude_10002() -> None: diff --git a/py-polars/tests/unit/test_show_graph.py b/py-polars/tests/unit/test_show_graph.py index 09a9b9484933..f46d135e0792 100644 --- a/py-polars/tests/unit/test_show_graph.py +++ b/py-polars/tests/unit/test_show_graph.py @@ -10,6 +10,6 @@ def test_show_graph() -> None: "c": [6, 5, 4, 3, 2, 1], } ) - query = ldf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort("a") + query = ldf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort("a") out = query.show_graph(raw_output=True) assert isinstance(out, str) diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index c3e5318a2fae..becaa874058c 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -412,7 +412,7 @@ def test_sql_trig() -> None: assert_frame_equal(left=df_result, right=res, atol=1e-5) -def test_sql_groupby(foods_ipc_path: Path) -> None: +def test_sql_group_by(foods_ipc_path: Path) -> None: lf = pl.scan_ipc(foods_ipc_path) c = pl.SQLContext(eager_execution=True) From 97ab4cd94a295180e64bdbaa6dda28ff0f27dcfd Mon Sep 17 00:00:00 2001 From: Ion Koutsouris Date: Tue, 22 Aug 2023 11:19:01 +0200 Subject: [PATCH 35/55] fix(python): Correctly handle time zones in `write_delta` (#10633) Co-authored-by: Stijn de Gooijer --- py-polars/polars/dataframe/frame.py | 3 ++- py-polars/polars/io/_utils.py 
| 2 +- py-polars/polars/io/delta.py | 7 ++++--- py-polars/tests/unit/io/test_delta.py | 30 +++++++++++++++++++++++++-- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 9c4f3656f7a9..c7d630b207de 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3516,7 +3516,8 @@ def write_delta( to which they can be cast. This affects the following data types: - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information - :class:`Utf8`, :class:`Binary`, and :class:`List` ('large' types) Polars columns are always nullable. To write data to a delta table with diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 4a59dd65353c..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index e04d1a01037f..d1a590aec96f 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -338,8 +338,6 @@ def _convert_pa_schema_to_delta(schema: pa.schema) -> pa.schema: pa.uint16(): pa.int16(), pa.uint32(): pa.int32(), pa.uint64(): pa.int64(), - pa.timestamp("ns"): pa.timestamp("us"), - pa.timestamp("ms"): pa.timestamp("us"), pa.large_string(): pa.string(), pa.large_binary(): pa.binary(), } @@ -350,7 +348,10 @@ def dtype_to_delta_dtype(dtype: pa.DataType) -> pa.DataType: return list_to_delta_dtype(dtype) elif isinstance(dtype, pa.StructType): return struct_to_delta_dtype(dtype) - + elif isinstance(dtype, pa.TimestampType): + # TODO: Support time zones when implemented by delta-rs. 
See: + # https://github.com/delta-io/delta-rs/issues/1598 + return pa.timestamp("us") try: return dtype_map[dtype] except KeyError: diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 581068f48b8a..6ab087171ea4 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -197,7 +197,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: pl.Series( "date_ns", [datetime(2010, 1, 1, 0, 0)], - dtype=pl.Datetime(time_unit="ns"), + dtype=pl.Datetime(time_unit="ns", time_zone="ETC"), ), pl.Series( "date_us", @@ -262,7 +262,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: [ pl.Field( "date_range", - pl.List(pl.Datetime(time_unit="ms", time_zone=None)), + pl.List(pl.Datetime(time_unit="ms", time_zone="UTC")), ), pl.Field( "date_us", pl.List(pl.Datetime(time_unit="ms", time_zone=None)) @@ -343,3 +343,29 @@ def test_write_delta_with_schema_10540(tmp_path: Path) -> None: pa_schema = pa.schema([("a", pa.int64())]) df.write_delta(tmp_path, delta_write_options={"schema": pa_schema}) + + +@pytest.mark.parametrize( + "expr", + [ + pl.datetime(2010, 1, 1, time_unit="us", time_zone="UTC"), + pl.datetime(2010, 1, 1, time_unit="ns", time_zone="EST"), + pl.datetime(2010, 1, 1, time_unit="ms", time_zone="Europe/Amsterdam"), + ], +) +def test_write_delta_with_tz_in_df(expr: pl.Expr, tmp_path: Path) -> None: + df = pl.select(expr) + + pa_schema = pa.schema([("datetime", pa.timestamp("us"))]) + + df.write_delta(tmp_path, mode="append") + # write second time because delta-rs also casts timestamp with tz to timestamp no tz + df.write_delta(tmp_path, mode="append") + + tbl = DeltaTable(tmp_path) + assert pa_schema == tbl.schema().to_pyarrow() + + result = pl.read_delta(str(tmp_path), version=0) + + expected = df.cast(pl.Datetime) + assert_frame_equal(result, expected) From dc2e61777112ff4643f21d7b8028800b715e365f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 11:33:29 +0200 Subject: [PATCH 36/55] feat(rust, python): add `truncate_ragged_lines` (#10660) --- crates/polars-error/Cargo.toml | 3 +++ crates/polars-error/src/constants.rs | 10 +++++++ crates/polars-error/src/lib.rs | 1 + crates/polars-io/Cargo.toml | 1 + crates/polars-io/src/csv/parser.rs | 16 ++++++++++-- crates/polars-io/src/csv/read.rs | 23 +++++++++++----- .../src/csv/read_impl/batched_mmap.rs | 3 +++ .../src/csv/read_impl/batched_read.rs | 3 +++ crates/polars-io/src/csv/read_impl/mod.rs | 20 +++++++++----- crates/polars-lazy/Cargo.toml | 2 +- crates/polars-lazy/src/frame/csv.rs | 10 +++++++ .../src/physical_plan/executors/scan/csv.rs | 1 + .../polars-pipe/src/executors/sources/csv.rs | 1 + .../polars-plan/src/logical_plan/builder.rs | 2 ++ .../polars-plan/src/logical_plan/options.rs | 1 + crates/polars/tests/it/io/csv.rs | 9 ++++--- py-polars/polars/dataframe/frame.py | 3 +++ py-polars/polars/io/csv/batched_reader.py | 2 ++ py-polars/polars/io/csv/functions.py | 8 ++++++ py-polars/polars/lazyframe/frame.py | 2 ++ py-polars/src/batched_csv.rs | 4 ++- py-polars/src/dataframe.rs | 4 ++- py-polars/src/lazyframe.rs | 4 ++- py-polars/tests/unit/io/test_csv.py | 26 +++++++++++++++++-- 24 files changed, 135 insertions(+), 24 deletions(-) create mode 100644 crates/polars-error/src/constants.rs diff --git a/crates/polars-error/Cargo.toml b/crates/polars-error/Cargo.toml index 47bf990d3e48..ce622dcccbe9 100644 --- a/crates/polars-error/Cargo.toml +++ b/crates/polars-error/Cargo.toml @@ -12,3 +12,6 @@ description = "Error 
definitions for the Polars DataFrame library" arrow = { workspace = true } regex = { workspace = true, optional = true } thiserror = { workspace = true } + +[features] +python = [] diff --git a/crates/polars-error/src/constants.rs b/crates/polars-error/src/constants.rs new file mode 100644 index 000000000000..473e9edfe55b --- /dev/null +++ b/crates/polars-error/src/constants.rs @@ -0,0 +1,10 @@ +//! Constant that help with creating error messages dependent on the host language. +#[cfg(feature = "python")] +pub static TRUE: &str = "True"; +#[cfg(feature = "python")] +pub static FALSE: &str = "False"; + +#[cfg(not(feature = "python"))] +pub static TRUE: &str = "true"; +#[cfg(not(feature = "python"))] +pub static FALSE: &str = "false"; diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index 3241c9faa54d..6cf86706ab44 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -1,3 +1,4 @@ +pub mod constants; mod warning; use std::borrow::Cow; diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 5587fbb55ef3..9b9822ec2adc 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -95,6 +95,7 @@ gcp = ["object_store/gcp", "cloud", "polars-core/gcp"] partition = ["polars-core/partition_by"] temporal = ["dtype-datetime", "dtype-date", "dtype-time"] simd = [] +python = ["polars-error/python"] [package.metadata.docs.rs] all-features = true diff --git a/crates/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs index d08dff9c540b..b89d5cbcb297 100644 --- a/crates/polars-io/src/csv/parser.rs +++ b/crates/polars-io/src/csv/parser.rs @@ -354,11 +354,12 @@ pub(super) fn parse_lines<'a>( comment_char: Option, quote_char: Option, eol_char: u8, - null_values: Option<&NullValuesCompiled>, missing_is_null: bool, + ignore_errors: bool, + mut truncate_ragged_lines: bool, + null_values: Option<&NullValuesCompiled>, projection: &[usize], buffers: &mut [Buffer<'a>], - ignore_errors: bool, n_lines: usize, // length of original schema schema_len: usize, @@ -368,6 +369,12 @@ pub(super) fn parse_lines<'a>( !projection.is_empty(), "at least one column should be projected" ); + // During projection pushdown we are not checking other csv fields. + // This would be very expensive and we don't care as we only want + // the projected columns. + if projection.len() != schema_len { + truncate_ragged_lines = true + } // we use the pointers to track the no of bytes read. let start = bytes.as_ptr() as usize; @@ -487,6 +494,11 @@ pub(super) fn parse_lines<'a>( if bytes.get(read_sol - 1) == Some(&eol_char) { bytes = &bytes[read_sol..]; } else { + if !truncate_ragged_lines && read_sol < bytes.len() { + polars_bail!(ComputeError: r#"found more fields than defined in 'Schema' + +Consider setting 'truncate_ragged_lines={}'."#, polars_error::constants::TRUE) + } let bytes_rem = skip_this_line( &bytes[read_sol - 1..], quote_char, diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs index ebaf2c4cd91c..9bf55ca3e06a 100644 --- a/crates/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -100,8 +100,6 @@ where { /// File or Stream object reader: R, - /// Aggregates chunk afterwards to a single chunk. - rechunk: bool, /// Stop reading from the csv after this number of rows is reached n_rows: Option, // used by error ignore logic @@ -112,8 +110,6 @@ where /// Optional column names to project/ select. 
columns: Option>, delimiter: Option, - has_header: bool, - ignore_errors: bool, pub(crate) schema: Option, encoding: CsvEncoding, n_threads: Option, @@ -122,17 +118,22 @@ where dtype_overwrite: Option<&'a [DataType]>, sample_size: usize, chunk_size: usize, - low_memory: bool, comment_char: Option, - eol_char: u8, null_values: Option, - missing_is_null: bool, predicate: Option>, quote_char: Option, skip_rows_after_header: usize, try_parse_dates: bool, row_count: Option, + /// Aggregates chunk afterwards to a single chunk. + rechunk: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, + missing_is_null: bool, + low_memory: bool, + has_header: bool, + ignore_errors: bool, + eol_char: u8, } impl<'a, R> CsvReader<'a, R> @@ -324,6 +325,12 @@ where self.predicate = predicate; self } + + /// Truncate lines that are longer than the schema. + pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { + self.truncate_ragged_lines = toggle; + self + } } impl<'a> CsvReader<'a, File> { @@ -374,6 +381,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> { std::mem::take(&mut self.row_count), self.try_parse_dates, self.raise_if_empty, + self.truncate_ragged_lines, ) } @@ -558,6 +566,7 @@ where try_parse_dates: false, row_count: None, raise_if_empty: true, + truncate_ragged_lines: false, } } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index 20f6f96018fb..a659f31d6c3c 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -161,6 +161,7 @@ impl<'a> CoreReader<'a> { missing_is_null: self.missing_is_null, to_cast: self.to_cast, ignore_errors: self.ignore_errors, + truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, delimiter: self.delimiter, @@ -186,6 +187,7 @@ pub struct BatchedCsvReaderMmap<'a> { eol_char: u8, null_values: Option, missing_is_null: bool, + truncate_ragged_lines: bool, to_cast: Vec, ignore_errors: bool, n_rows: Option, @@ -244,6 +246,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, self.chunk_size, stop_at_nbytes, self.starting_point_offset, diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 2c8a74a23969..88249222dcb4 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -244,6 +244,7 @@ impl<'a> CoreReader<'a> { missing_is_null: self.missing_is_null, to_cast: self.to_cast, ignore_errors: self.ignore_errors, + truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, delimiter: self.delimiter, @@ -271,6 +272,7 @@ pub struct BatchedCsvReaderRead<'a> { missing_is_null: bool, to_cast: Vec, ignore_errors: bool, + truncate_ragged_lines: bool, n_rows: Option, encoding: CsvEncoding, delimiter: u8, @@ -341,6 +343,7 @@ impl<'a> BatchedCsvReaderRead<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, self.chunk_size, stop_at_n_bytes, self.starting_point_offset, diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index 62aa3578aabf..6702d4779184 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -115,6 +115,7 @@ pub(crate) struct CoreReader<'a> { predicate: Option>, to_cast: Vec, row_count: Option, 
+ truncate_ragged_lines: bool, } impl<'a> fmt::Debug for CoreReader<'a> { @@ -206,6 +207,7 @@ impl<'a> CoreReader<'a> { row_count: Option, try_parse_dates: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PolarsResult> { #[cfg(any(feature = "decompress", feature = "decompress-fast"))] let mut reader_bytes = reader_bytes; @@ -303,6 +305,7 @@ impl<'a> CoreReader<'a> { predicate, to_cast, row_count, + truncate_ragged_lines, }) } @@ -609,11 +612,12 @@ impl<'a> CoreReader<'a> { self.comment_char, self.quote_char, self.eol_char, - self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, + ignore_errors, + self.null_values.as_ref(), projection, &mut buffers, - ignore_errors, chunk_size, self.schema.len(), &self.schema, @@ -683,6 +687,7 @@ impl<'a> CoreReader<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, usize::MAX, stop_at_nbytes, starting_point_offset, @@ -725,11 +730,12 @@ impl<'a> CoreReader<'a> { self.comment_char, self.quote_char, self.eol_char, - self.null_values.as_ref(), self.missing_is_null, + self.ignore_errors, + self.truncate_ragged_lines, + self.null_values.as_ref(), &projection, &mut buffers, - self.ignore_errors, remaining_rows - 1, self.schema.len(), self.schema.as_ref(), @@ -811,6 +817,7 @@ fn read_chunk( encoding: CsvEncoding, null_values: Option<&NullValuesCompiled>, missing_is_null: bool, + truncate_ragged_lines: bool, chunk_size: usize, stop_at_nbytes: usize, starting_point_offset: Option, @@ -842,11 +849,12 @@ fn read_chunk( comment_char, quote_char, eol_char, - null_values, missing_is_null, + ignore_errors, + truncate_ragged_lines, + null_values, projection, &mut buffers, - ignore_errors, chunk_size, schema.len(), schema, diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index eb432e860d98..31414d8cd7f3 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -110,7 +110,7 @@ list_eval = [] cumulative_eval = [] chunked_ids = ["polars-plan/chunked_ids", "polars-core/chunked_ids"] list_to_struct = ["polars-plan/list_to_struct"] -python = ["pyo3", "polars-plan/python", "polars-core/python"] +python = ["pyo3", "polars-plan/python", "polars-core/python", "polars-io/python"] row_hash = ["polars-plan/row_hash"] string_justify = ["polars-plan/string_justify"] string_from_radix = ["polars-plan/string_from_radix"] diff --git a/crates/polars-lazy/src/frame/csv.rs b/crates/polars-lazy/src/frame/csv.rs index 5067226ebe57..1e1e97240dbf 100644 --- a/crates/polars-lazy/src/frame/csv.rs +++ b/crates/polars-lazy/src/frame/csv.rs @@ -26,6 +26,7 @@ pub struct LazyCsvReader<'a> { eol_char: u8, null_values: Option, missing_is_null: bool, + truncate_ragged_lines: bool, infer_schema_length: Option, rechunk: bool, skip_rows_after_header: usize, @@ -61,6 +62,7 @@ impl<'a> LazyCsvReader<'a> { row_count: None, try_parse_dates: false, raise_if_empty: true, + truncate_ragged_lines: false, } } @@ -208,6 +210,13 @@ impl<'a> LazyCsvReader<'a> { self } + /// Truncate lines that are longer than the schema. + #[must_use] + pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { + self.truncate_ragged_lines = toggle; + self + } + /// Modify a schema before we run the lazy scanning. /// /// Important! Run this function latest in the builder! @@ -280,6 +289,7 @@ impl LazyFileListReader for LazyCsvReader<'_> { self.row_count, self.try_parse_dates, self.raise_if_empty, + self.truncate_ragged_lines, )? 
.build() .into(); diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs index bd4f8b20d631..80b2b2e3aa95 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -45,6 +45,7 @@ impl CsvExec { .with_rechunk(self.file_options.rechunk) .with_row_count(std::mem::take(&mut self.file_options.row_count)) .with_try_parse_dates(self.options.try_parse_dates) + .truncate_ragged_lines(self.options.truncate_ragged_lines) .raise_if_empty(self.options.raise_if_empty) .finish() } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 46b6362e6856..a9e9f5352d1d 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -80,6 +80,7 @@ impl CsvSource { .with_chunk_size(chunk_size) .with_row_count(file_options.row_count) .with_try_parse_dates(options.try_parse_dates) + .truncate_ragged_lines(options.truncate_ragged_lines) .raise_if_empty(options.raise_if_empty); let reader = Box::new(reader); diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index 902ef9b6b91d..276f2aeb76d9 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -252,6 +252,7 @@ impl LogicalPlanBuilder { row_count: Option, try_parse_dates: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PolarsResult { let path = path.into(); let mut file = polars_utils::open_file(&path).map_err(|e| { @@ -346,6 +347,7 @@ impl LogicalPlanBuilder { encoding, try_parse_dates, raise_if_empty, + truncate_ragged_lines, }, }, } diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index 9aef73892951..2c9a44446fe9 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -35,6 +35,7 @@ pub struct CsvParserOptions { pub encoding: CsvEncoding, pub try_parse_dates: bool, pub raise_if_empty: bool, + pub truncate_ragged_lines: bool, } #[cfg(feature = "parquet")] diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 74a66e320640..4c48d71921c6 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -568,7 +568,7 @@ fn test_comment_lines() -> PolarsResult<()> { #[test] fn test_null_values_argument() -> PolarsResult<()> { let csv = r"1,a,foo -null-value,b,bar, +null-value,b,bar 3,null-value,ham "; @@ -826,7 +826,10 @@ fn test_scientific_floats() -> PolarsResult<()> { fn test_tsv_header_offset() -> PolarsResult<()> { let csv = "foo\tbar\n\t1000011\t1\n\t1000026\t2\n\t1000949\t2"; let file = Cursor::new(csv); - let df = CsvReader::new(file).with_delimiter(b'\t').finish()?; + let df = CsvReader::new(file) + .truncate_ragged_lines(true) + .with_delimiter(b'\t') + .finish()?; assert_eq!(df.shape(), (3, 2)); assert_eq!(df.dtypes(), &[DataType::Utf8, DataType::Int64]); @@ -925,7 +928,7 @@ foo,bar .finish()?; assert_eq!(df.get_column_names(), &["foo", "bar"]); assert_eq!(df.shape(), (1, 2)); - let df = CsvReader::new(file).finish()?; + let df = CsvReader::new(file).truncate_ragged_lines(true).finish()?; assert_eq!(df.shape(), (5, 1)); Ok(()) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index c7d630b207de..96c70447637e 100644 --- 
a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -679,6 +679,7 @@ def _read_csv( sample_size: int = 1024, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> DataFrame: """ Read a CSV file into a DataFrame. @@ -751,6 +752,7 @@ def _read_csv( row_count_offset=row_count_offset, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) if columns is None: return scan.collect() @@ -792,6 +794,7 @@ def _read_csv( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) return self diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 27d55afb55e4..9f848981ec15 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -53,6 +53,7 @@ def __init__( eol_char: str = "\n", new_columns: Sequence[str] | None = None, raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ): path: str | None if isinstance(source, (str, Path)): @@ -100,6 +101,7 @@ def __init__( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) self.new_columns = new_columns diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 42039f416e8c..548a90d89a56 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -47,6 +47,7 @@ def read_csv( sample_size: int = 1024, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> DataFrame: """ Read a CSV file into a DataFrame. @@ -157,6 +158,8 @@ def read_csv( raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. + truncate_ragged_lines + Truncate lines that are longer than the schema. Returns ------- @@ -379,6 +382,7 @@ def read_csv( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) if new_columns: @@ -704,6 +708,7 @@ def scan_csv( eol_char: str = "\n", new_columns: Sequence[str] | None = None, raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> LazyFrame: """ Lazily read from a CSV file or multiple files via glob patterns. @@ -788,6 +793,8 @@ def scan_csv( raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, an empty LazyFrame (with no columns) is returned instead. + truncate_ragged_lines + Truncate lines that are longer than the schema. Returns ------- @@ -901,4 +908,5 @@ def with_column_names(_cols: list[str]) -> list[str]: try_parse_dates=try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 75703adbd228..25622fb163a2 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -343,6 +343,7 @@ def _scan_csv( try_parse_dates: bool = False, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = True, ) -> Self: """ Lazily read from a CSV file or multiple files via glob patterns. 
@@ -385,6 +386,7 @@ def _scan_csv( try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) return self diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs index 7161ace32551..6114fb43a675 100644 --- a/py-polars/src/batched_csv.rs +++ b/py-polars/src/batched_csv.rs @@ -31,7 +31,7 @@ impl PyBatchedCsv { projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_count, - sample_size, eol_char, raise_if_empty) + sample_size, eol_char, raise_if_empty, truncate_ragged_lines) )] fn new( infer_schema_length: Option, @@ -60,6 +60,7 @@ impl PyBatchedCsv { sample_size: usize, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -118,6 +119,7 @@ impl PyBatchedCsv { .with_skip_rows_after_header(skip_rows_after_header) .with_row_count(row_count) .sample_size(sample_size) + .truncate_ragged_lines(truncate_ragged_lines) .raise_if_empty(raise_if_empty); let reader = if low_memory { diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 355b24d5bec8..3ce6754aefab 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -139,7 +139,7 @@ impl PyDataFrame { skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_count, sample_size, eol_char, raise_if_empty) + row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines) )] pub fn read_csv( py_f: &PyAny, @@ -169,6 +169,7 @@ impl PyDataFrame { sample_size: usize, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -229,6 +230,7 @@ impl PyDataFrame { .with_row_count(row_count) .sample_size(sample_size) .raise_if_empty(raise_if_empty) + .truncate_ragged_lines(truncate_ragged_lines) .finish() .map_err(PyPolarsErr::from)?; Ok(df.into()) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index ae7df9579ff0..be3c62ce4d97 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -147,7 +147,7 @@ impl PyLazyFrame { #[pyo3(signature = (path, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, - encoding, row_count, try_parse_dates, eol_char, raise_if_empty, + encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines ) )] fn new_from_csv( @@ -173,6 +173,7 @@ impl PyLazyFrame { try_parse_dates: bool, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -207,6 +208,7 @@ impl PyLazyFrame { .with_try_parse_dates(try_parse_dates) .with_null_values(null_values) .with_missing_is_null(!missing_utf8_is_empty_string) + .truncate_ragged_lines(truncate_ragged_lines) .raise_if_empty(raise_if_empty); if let Some(lambda) = with_schema_modify { diff 
--git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index ca6574a31061..dd973193ea26 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -445,7 +445,7 @@ def test_compressed_csv(io_files_path: Path) -> None: """\ a,b,c 1,a,1.0 - 2,b,2.0, + 2,b,2.0 3,c,3.0 """ ) @@ -462,7 +462,7 @@ def test_compressed_csv(io_files_path: Path) -> None: # now from disk csv_file = io_files_path / "gzipped.csv" - out = pl.read_csv(str(csv_file)) + out = pl.read_csv(str(csv_file), truncate_ragged_lines=True) assert_frame_equal(out, expected) # now with column projection @@ -1472,3 +1472,25 @@ def test_ignore_errors_date_parser() -> None: dtypes={"date": pl.Date}, ignore_errors=False, ) + + +def test_csv_ragged_lines() -> None: + expected = {"column_1": ["A", "B", "C"]} + assert ( + pl.read_csv( + io.StringIO("A\nB,ragged\nC"), has_header=False, truncate_ragged_lines=True + ).to_dict(False) + == expected + ) + assert ( + pl.read_csv( + io.StringIO("A\nB\nC,ragged"), has_header=False, truncate_ragged_lines=True + ).to_dict(False) + == expected + ) + + for s in ["A\nB,ragged\nC", "A\nB\nC,ragged"]: + with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): + pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) + with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): + pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) From 767ebe8e48f7cf4fa085d2e45d8569ec3d311b7e Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 22 Aug 2023 14:03:02 +0400 Subject: [PATCH 37/55] feat(python): support `DataFrame` init from queries against user-instantiated database connections (#10649) --- py-polars/docs/source/reference/io.rst | 1 + py-polars/polars/__init__.py | 2 + py-polars/polars/convert.py | 12 +- py-polars/polars/io/__init__.py | 3 +- py-polars/polars/io/database.py | 300 +++++++++++++++-- py-polars/polars/type_aliases.py | 30 ++ py-polars/polars/utils/_construction.py | 8 +- py-polars/pyproject.toml | 2 +- py-polars/tests/unit/io/test_database.py | 241 -------------- py-polars/tests/unit/io/test_database_read.py | 307 ++++++++++++++++++ .../tests/unit/io/test_database_write.py | 103 ++++++ 11 files changed, 739 insertions(+), 270 deletions(-) delete mode 100644 py-polars/tests/unit/io/test_database.py create mode 100644 py-polars/tests/unit/io/test_database_read.py create mode 100644 py-polars/tests/unit/io/test_database_write.py diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index d83afcffd10e..243f0fe075f0 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -43,6 +43,7 @@ Database :toctree: api/ read_database + read_database_uri DataFrame.write_database JSON diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 01f03a68d68e..159394cec4d0 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -155,6 +155,7 @@ read_csv, read_csv_batched, read_database, + read_database_uri, read_delta, read_excel, read_ipc, @@ -248,6 +249,7 @@ "read_csv", "read_csv_batched", "read_database", + "read_database_uri", "read_delta", "read_excel", "read_ipc", diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 16ca69601012..bd8f8e86672a 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -2,7 +2,7 @@ import io import re -from itertools import zip_longest +from itertools import chain, 
zip_longest from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence, overload import polars._reexport as pl @@ -516,7 +516,7 @@ def from_arrow( | pa.Array | pa.ChunkedArray | pa.RecordBatch - | Iterable[pa.RecordBatch] + | Iterable[pa.RecordBatch | pa.Table] ), schema: SchemaDefinition | None = None, *, @@ -532,7 +532,7 @@ def from_arrow( Parameters ---------- data : :class:`pyarrow.Table`, :class:`pyarrow.Array`, one or more :class:`pyarrow.RecordBatch` - Data representing an Arrow Table, Array, or sequence of RecordBatches. + Data representing an Arrow Table, Array, or sequence of RecordBatches or Tables. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict The DataFrame schema may be declared in several ways: @@ -609,7 +609,11 @@ def from_arrow( data = [data] if isinstance(data, Iterable): return pl.DataFrame._from_arrow( - data=pa.Table.from_batches(data), + data=pa.Table.from_batches( + chain.from_iterable( + (b.to_batches() if isinstance(b, pa.Table) else [b]) for b in data + ) + ), rechunk=rechunk, schema=schema, schema_overrides=schema_overrides, diff --git a/py-polars/polars/io/__init__.py b/py-polars/polars/io/__init__.py index 7243007e82fc..995bc4552c55 100644 --- a/py-polars/polars/io/__init__.py +++ b/py-polars/polars/io/__init__.py @@ -2,7 +2,7 @@ from polars.io.avro import read_avro from polars.io.csv import read_csv, read_csv_batched, scan_csv -from polars.io.database import read_database +from polars.io.database import read_database, read_database_uri from polars.io.delta import read_delta, scan_delta from polars.io.excel import read_excel from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc @@ -16,6 +16,7 @@ "read_csv", "read_csv_batched", "read_database", + "read_database_uri", "read_delta", "read_excel", "read_ipc", diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index e6cd357c56f5..49abbb505f28 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -3,20 +3,276 @@ import re import sys from importlib import import_module -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Iterable, Sequence, TypedDict from polars.convert import from_arrow -from polars.utils.deprecation import deprecate_renamed_parameter +from polars.utils.deprecation import ( + deprecate_renamed_parameter, + issue_deprecation_warning, +) if TYPE_CHECKING: + from types import TracebackType + + if sys.version_info >= (3, 11): + from typing import Self + else: + from typing_extensions import Self + from polars import DataFrame - from polars.type_aliases import DbReadEngine + from polars.dependencies import pyarrow as pa + from polars.type_aliases import ConnectionOrCursor, Cursor, DbReadEngine + + +class _DriverProperties_(TypedDict): + fetch_all: str + fetch_batches: str | None + exact_batch_size: bool | None + + +_ARROW_DRIVER_REGISTRY_: dict[str, _DriverProperties_] = { + "adbc_.*": { + "fetch_all": "fetch_arrow_table", + "fetch_batches": None, + "exact_batch_size": None, + }, + "databricks": { + "fetch_all": "fetchall_arrow", + "fetch_batches": "fetchmany_arrow", + "exact_batch_size": True, + }, + "snowflake": { + "fetch_all": "fetch_arrow_all", + "fetch_batches": "fetch_arrow_batches", + "exact_batch_size": False, + }, + "turbodbc": { + "fetch_all": "fetchallarrow", + "fetch_batches": "fetcharrowbatches", + "exact_batch_size": False, + }, +} + + +class ConnectionExecutor: + """Abstraction for querying databases with user-supplied connection objects.""" + + 
acquired_cursor = False + + def __init__(self, connection: ConnectionOrCursor) -> None: + self.driver = type(connection).__module__.split(".", 1)[0].lower() + self.cursor = self._normalise_cursor(connection) + self.result: Any = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + # iif we created it, close the cursor (NOT the connection) + if self.acquired_cursor: + self.cursor.close() + + def __repr__(self) -> str: + return f"<{type(self).__name__} module={self.driver!r}>" + + def _normalise_cursor(self, conn: ConnectionOrCursor) -> Cursor: + """Normalise a connection object such that we have the query executor.""" + if self.driver == "sqlalchemy" and type(conn).__name__ == "Engine": + # sqlalchemy engine; direct use is deprecated, so get the connection + self.acquired_cursor = True + return conn.connect() # type: ignore[union-attr] + elif hasattr(conn, "cursor"): + # connection has a dedicated cursor; prefer over direct execute + cursor = cursor() if callable(cursor := conn.cursor) else cursor + self.acquired_cursor = True + return cursor + elif hasattr(conn, "execute"): + # can execute directly (given cursor, sqlalchemy connection, etc) + return conn # type: ignore[return-value] + + raise TypeError( + f"Unrecognised connection {conn!r}; unable to find 'execute' method" + ) + + @staticmethod + def _fetch_arrow( + result: Cursor, fetch_method: str, batch_size: int | None + ) -> Iterable[pa.RecordBatch | pa.Table]: + """Iterate over the result set, fetching arrow data in batches.""" + size = (batch_size,) if batch_size else () + while result: # type: ignore[truthy-bool] + result = getattr(result, fetch_method)(*size) + yield result + + @staticmethod + def _fetchall_rows(result: Cursor) -> Iterable[Sequence[Any]]: + """Fetch row data in a single call, returning the complete result set.""" + rows = result.fetchall() + return ( + [tuple(row) for row in rows] + if rows and not isinstance(rows[0], (list, tuple)) + else rows + ) + + def _fetchmany_rows( + self, result: Cursor, batch_size: int | None + ) -> Iterable[Sequence[Any]]: + """Fetch row data incrementally, yielding over the complete result set.""" + while True: + rows = result.fetchmany(batch_size) + if not rows: + break + elif not isinstance(rows[0], (list, tuple)): + for row in rows: + yield tuple(row) + else: + yield from rows + + def _from_arrow(self, batch_size: int | None) -> DataFrame | None: + """Return resultset data in Arrow format for frame init.""" + from polars import DataFrame + + for driver, driver_properties in _ARROW_DRIVER_REGISTRY_.items(): + if re.match(f"^{driver}$", self.driver): + size = batch_size if driver_properties["exact_batch_size"] else None + fetch_batches = driver_properties["fetch_batches"] + return DataFrame( + self._fetch_arrow(self.result, fetch_batches, size) + if batch_size and fetch_batches is not None + else getattr(self.result, driver_properties["fetch_all"])() + ) + + if self.driver == "duckdb": + exec_kwargs = {"rows_per_batch": batch_size} if batch_size else {} + return DataFrame(self.result.arrow(**exec_kwargs)) + + return None + + def _from_rows(self, batch_size: int | None) -> DataFrame | None: + """Return resultset data row-wise for frame init.""" + from polars import DataFrame + + if hasattr(self.result, "fetchall"): + description = ( + self.result.cursor.description + if self.driver == "sqlalchemy" + else self.result.description + ) + 
column_names = [desc[0] for desc in description] + return DataFrame( + data=( + self._fetchall_rows(self.result) + if not batch_size + else self._fetchmany_rows(self.result, batch_size) + ), + schema=column_names, + orient="row", + ) + return None + + def execute(self, query: str) -> Self: + """Execute a query and reference the result set data.""" + if self.driver == "sqlalchemy": + from sqlalchemy.sql import text + + query = text(query) # type: ignore[assignment] + + if (result := self.cursor.execute(query)) is None: + result = self.cursor # some cursors execute in-place + + self.result = result + return self + + def to_frame(self, batch_size: int | None = None) -> DataFrame: + """ + Convert the result set to a DataFrame. + + Wherever possible we try to return arrow-native data directly; only + fall back to initialising with row-level data if no other option. + """ + if self.result is None: + raise RuntimeError("Cannot return a frame before executing a query") + + for frame_init in ( + self._from_arrow, # init from arrow-native data (most efficient option) + self._from_rows, # row-wise fallback covering sqlalchemy, dbapi2, pyodbc + ): + frame = frame_init(batch_size) + if frame is not None: + return frame + + raise NotImplementedError( + f"Currently no support for {self.driver!r} connection {self.cursor!r}" + ) @deprecate_renamed_parameter("connection_uri", "connection", version="0.18.9") -def read_database( +def read_database( # noqa: D417 + query: str, + connection: ConnectionOrCursor, + batch_size: int | None = None, + **kwargs: Any, +) -> DataFrame: + """ + Read the results of a SQL query into a DataFrame, given a connection object. + + Parameters + ---------- + query + String SQL query to execute. + connection + An instantiated connection (or cursor/client object) that the query can be + executed against. + batch_size + The number of rows to fetch each time as data is collected; if this option is + supported by the backend it will be passed to the underlying query execution + method (if the backend does not have such support it is ignored without error). + + Notes + ----- + This function supports a wide range of native database drivers (ranging from SQLite + to Snowflake), as well as libraries such as ADBC, SQLAlchemy and various flavours + of ODBC. If the backend supports returning Arrow data directly then this facility + will be used to efficiently instantiate the DataFrame; otherwise, the DataFrame + is initialised from row-wise data. + + Examples + -------- + Instantiate a DataFrame from a SQL query against a user-supplied connection: + + >>> df = pl.read_database( + ... query="SELECT * FROM test_data", + ... connection=conn, + ... ) # doctest: +SKIP + + See Also + -------- + read_database_uri : Create a DataFrame from a SQL query using a URI string. 
+ + """ + if isinstance(connection, str): + issue_deprecation_warning( + message="Use of a string URI with 'read_database' is deprecated; use 'read_database_uri' instead", + version="0.19.0", + ) + return read_database_uri(query, uri=connection, **kwargs) + elif kwargs: + raise ValueError( + f"'read_database' does not support arbitrary **kwargs: found {kwargs!r}" + ) + + with ConnectionExecutor(connection) as cx: + return cx.execute(query).to_frame(batch_size) + + +def read_database_uri( query: list[str] | str, - connection: str, + uri: str, *, partition_on: str | None = None, partition_range: tuple[int, int] | None = None, @@ -25,13 +281,13 @@ def read_database( engine: DbReadEngine | None = None, ) -> DataFrame: """ - Read the results of a SQL query into a DataFrame. + Read the results of a SQL query into a DataFrame, given a URI. Parameters ---------- query Raw SQL query (or queries). - connection + uri A connectorx or ADBC connection URI string that starts with the backend's driver name, for example: @@ -73,18 +329,18 @@ def read_database( Examples -------- - Read a DataFrame from a SQL query using a single thread: + Create a DataFrame from a SQL query using a single thread: >>> uri = "postgresql://username:password@server:port/database" >>> query = "SELECT * FROM lineitem" - >>> pl.read_database(query, uri) # doctest: +SKIP + >>> pl.read_database_uri(query, uri) # doctest: +SKIP - Read a DataFrame in parallel using 10 threads by automatically partitioning the - provided SQL on the partition column: + Create a DataFrame in parallel using 10 threads by automatically partitioning + the provided SQL on the partition column: >>> uri = "postgresql://username:password@server:port/database" >>> query = "SELECT * FROM lineitem" - >>> pl.read_database( + >>> pl.read_database_uri( ... query, ... uri, ... partition_on="partition_col", @@ -92,28 +348,32 @@ def read_database( ... engine="connectorx", ... ) # doctest: +SKIP - Read a DataFrame in parallel using 2 threads by explicitly providing two SQL - queries: + Create a DataFrame in parallel using 2 threads by explicitly providing two + SQL queries: >>> uri = "postgresql://username:password@server:port/database" >>> queries = [ ... "SELECT * FROM lineitem WHERE partition_col <= 10", ... "SELECT * FROM lineitem WHERE partition_col > 10", ... ] - >>> pl.read_database(queries, uri, engine="connectorx") # doctest: +SKIP + >>> pl.read_database_uri(queries, uri, engine="connectorx") # doctest: +SKIP Read data from Snowflake using the ADBC driver: - >>> df = pl.read_database( + >>> df = pl.read_database_uri( ... "SELECT * FROM test_table", ... "snowflake://user:pass@company-org/testdb/public?warehouse=test&role=myrole", ... engine="adbc", ... ) # doctest: +SKIP + See Also + -------- + read_database : Create a DataFrame from a SQL query using a connection object. 
+ """ # noqa: W505 - if not isinstance(connection, str): + if not isinstance(uri, str): raise TypeError( - f"expected connection to be a URI string; found {type(connection).__name__!r}" + f"expected connection to be a URI string; found {type(uri).__name__!r}" ) elif engine is None: engine = "connectorx" @@ -121,7 +381,7 @@ def read_database( if engine == "connectorx": return _read_sql_connectorx( query, - connection, + connection_uri=uri, partition_on=partition_on, partition_range=partition_range, partition_num=partition_num, @@ -130,7 +390,7 @@ def read_database( elif engine == "adbc": if not isinstance(query, str): raise ValueError("only a single SQL query string is accepted for adbc") - return _read_sql_adbc(query, connection) + return _read_sql_adbc(query, uri) else: raise ValueError( f"engine must be one of {{'connectorx', 'adbc'}}, got {engine!r}" diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 14597c0c6bb7..e87d7ade9b12 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -10,6 +10,7 @@ List, Literal, Mapping, + Protocol, Sequence, Tuple, Type, @@ -193,3 +194,32 @@ # typevars for core polars types PolarsType = TypeVar("PolarsType", "DataFrame", "LazyFrame", "Series", "Expr") FrameType = TypeVar("FrameType", "DataFrame", "LazyFrame") + + +# minimal protocol definitions that can reasonably represent +# an executable connection, cursor, or equivalent object +class BasicConnection(Protocol): # noqa: D101 + def close(self) -> None: + """Close the connection.""" + + def cursor(self, *args: Any, **kwargs: Any) -> Any: + """Return a cursor object.""" + + +class BasicCursor(Protocol): # noqa: D101 + def close(self) -> None: + """Close the cursor.""" + + def execute(self, *args: Any, **kwargs: Any) -> Any: + """Execute a query.""" + + +class Cursor(BasicCursor): # noqa: D101 + def fetchall(self, *args: Any, **kwargs: Any) -> Any: + """Fetch all results.""" + + def fetchmany(self, *args: Any, **kwargs: Any) -> Any: + """Fetch results in batches.""" + + +ConnectionOrCursor = Union[BasicConnection, BasicCursor, Cursor] diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index 9f847b6aa0d8..0fe0ce966e99 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -103,8 +103,9 @@ def type_hints(obj: type) -> dict[str, Any]: def is_namedtuple(cls: Any, annotated: bool = False) -> bool: """Check whether given class derives from NamedTuple.""" if all(hasattr(cls, attr) for attr in ("_fields", "_field_defaults", "_replace")): - if len(cls.__annotations__) == len(cls._fields) if annotated else True: - return all(isinstance(fld, str) for fld in cls._fields) + if not isinstance(cls._fields, property): + if not annotated or len(cls.__annotations__) == len(cls._fields): + return all(isinstance(fld, str) for fld in cls._fields) return False @@ -1491,7 +1492,8 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr if not original_schema: original_schema = list(df.schema.items()) if chunk_size != adaptive_chunk_size: - chunk_size = adaptive_chunk_size = n_chunk_elems // len(df.columns) + if (n_columns := len(df.columns)) > 0: + chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns else: df.vstack(frame_chunk, in_place=True) n_chunks += 1 diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index f30c9c05f3b8..350b82bf15f4 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -81,7 +81,7 
@@ module = [ "polars.polars", "pyarrow.*", "pydantic", - "sqlalchemy", + "sqlalchemy.*", "xlsx2csv", "xlsxwriter.*", "zoneinfo", diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py deleted file mode 100644 index 7e874032f2f5..000000000000 --- a/py-polars/tests/unit/io/test_database.py +++ /dev/null @@ -1,241 +0,0 @@ -from __future__ import annotations - -import sqlite3 -import sys -from datetime import date -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -if TYPE_CHECKING: - from polars.type_aliases import DbReadEngine, DbWriteEngine, DbWriteMode - - -@pytest.fixture() -def sample_df() -> pl.DataFrame: - return pl.DataFrame( - { - "id": [1, 2], - "name": ["misc", "other"], - "value": [100.0, -99.0], - "date": ["2020-01-01", "2021-12-31"], - } - ) - - -def create_temp_sqlite_db(test_db: str) -> None: - Path(test_db).unlink(missing_ok=True) - - # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or - # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that - # causes float rounding < py 3.11, hence we are only testing/storing simple values - # in this test db for now. as support improves, we can add/test additional dtypes). - - conn = sqlite3.connect(test_db) - # ┌─────┬───────┬───────┬────────────┐ - # │ id ┆ name ┆ value ┆ date │ - # │ --- ┆ --- ┆ --- ┆ --- │ - # │ i64 ┆ str ┆ f64 ┆ date │ - # ╞═════╪═══════╪═══════╪════════════╡ - # │ 1 ┆ misc ┆ 100.0 ┆ 2020-01-01 │ - # │ 2 ┆ other ┆ -99.0 ┆ 2021-12-31 │ - # └─────┴───────┴───────┴────────────┘ - conn.executescript( - """ - CREATE TABLE test_data ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - value FLOAT, - date DATE - ); - INSERT INTO test_data(name,value,date) - VALUES ('misc',100.0,'2020-01-01'), ('other',-99.5,'2021-12-31'); - """ - ) - conn.close() - - -@pytest.mark.write_disk() -@pytest.mark.parametrize( - ("engine", "expected_dtypes", "expected_dates"), - [ - pytest.param( - "connectorx", - { - "id": pl.Int64, - "name": pl.Utf8, - "value": pl.Float64, - "date": pl.Date, - }, - [date(2020, 1, 1), date(2021, 12, 31)], - ), - pytest.param( - "adbc", - { - "id": pl.Int64, - "name": pl.Utf8, - "value": pl.Float64, - "date": pl.Utf8, - }, - ["2020-01-01", "2021-12-31"], - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - ], -) -def test_read_database( - engine: DbReadEngine, - expected_dtypes: dict[str, pl.DataType], - expected_dates: list[date | str], - tmp_path: Path, -) -> None: - tmp_path.mkdir(exist_ok=True) - - test_db = str(tmp_path / "test.db") - create_temp_sqlite_db(test_db) - - df = pl.read_database( - connection=f"sqlite:///{test_db}", - query="SELECT * FROM test_data", - engine=engine, - ) - assert df.schema == expected_dtypes - assert df.shape == (2, 4) - assert df["date"].to_list() == expected_dates - - -@pytest.mark.parametrize( - ("engine", "query", "database", "errclass", "err"), - [ - pytest.param( - "not_engine", - "SELECT * FROM test_data", - "sqlite", - ValueError, - "engine must be one of {'connectorx', 'adbc'}, got 'not_engine'", - id="Not an available sql engine", - ), - pytest.param( - "adbc", - ["SELECT * FROM test_data", "SELECT * FROM test_data"], - "sqlite", - ValueError, - "only a single SQL query string is accepted for adbc", - id="Unavailable list of queries for adbc", - ), - 
pytest.param( - "adbc", - "SELECT * FROM test_data", - "mysql", - ImportError, - "ADBC mysql driver not detected", - id="Unavailable adbc driver", - ), - pytest.param( - "adbc", - "SELECT * FROM test_data", - sqlite3.connect(":memory:"), - TypeError, - "expected connection to be a URI string", - id="Invalid connection URI", - ), - ], -) -def test_read_database_exceptions( - engine: DbReadEngine, - query: str, - database: Any, - errclass: type, - err: str, - tmp_path: Path, -) -> None: - conn = f"{database}://test" if isinstance(database, str) else database - with pytest.raises(errclass, match=err): - pl.read_database( - connection=conn, - query=query, - engine=engine, - ) - - -@pytest.mark.write_disk() -@pytest.mark.parametrize( - ("engine", "mode"), - [ - pytest.param( - "adbc", - "create", - id="adbc_create", - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - pytest.param( - "adbc", - "append", - id="adbc_append", - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - pytest.param( - "sqlalchemy", - "create", - id="sa_create", - ), - pytest.param( - "sqlalchemy", - "append", - id="sa_append", - ), - ], -) -def test_write_database( - engine: DbWriteEngine, mode: DbWriteMode, sample_df: pl.DataFrame, tmp_path: Path -) -> None: - tmp_path.mkdir(exist_ok=True) - tmp_db = f"test_{engine}.db" - test_db = str(tmp_path / tmp_db) - - # note: test a table name that requires quotes to ensure that we handle - # it correctly (also supply an explicit db schema with/without quotes) - tbl_name = '"test-data"' - - sample_df.write_database( - table_name=f"main.{tbl_name}", - connection=f"sqlite:///{test_db}", - if_exists="replace", - engine=engine, - ) - if mode == "append": - sample_df.write_database( - table_name=f'"main".{tbl_name}', - connection=f"sqlite:///{test_db}", - if_exists="append", - engine=engine, - ) - sample_df = pl.concat([sample_df, sample_df]) - - result = pl.read_database(f"SELECT * FROM {tbl_name}", f"sqlite:///{test_db}") - sample_df = sample_df.with_columns(pl.col("date").cast(pl.Utf8)) - assert_frame_equal(sample_df, result) - - # check that some invalid parameters raise errors - for invalid_params in ( - {"table_name": "w.x.y.z"}, - {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, - ): - with pytest.raises((ValueError, NotImplementedError)): - sample_df.write_database( - connection=f"sqlite:///{test_db}", - engine=engine, - **invalid_params, # type: ignore[arg-type] - ) diff --git a/py-polars/tests/unit/io/test_database_read.py b/py-polars/tests/unit/io/test_database_read.py new file mode 100644 index 000000000000..c3f89f53762d --- /dev/null +++ b/py-polars/tests/unit/io/test_database_read.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +import os +import sqlite3 +import sys +from contextlib import suppress +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import pytest +from sqlalchemy import create_engine + +import polars as pl + +if TYPE_CHECKING: + from polars.type_aliases import DbReadEngine + + +def adbc_sqlite_connect(*args: Any, **kwargs: Any) -> Any: + with suppress(ModuleNotFoundError): # not available on 3.8/windows + from adbc_driver_sqlite.dbapi import connect + + return connect(*args, **kwargs) + + +@pytest.fixture() +def sample_df() -> pl.DataFrame: + return 
pl.DataFrame( + { + "id": [1, 2], + "name": ["misc", "other"], + "value": [100.0, -99.0], + "date": ["2020-01-01", "2021-12-31"], + } + ) + + +def create_temp_sqlite_db(test_db: str) -> None: + Path(test_db).unlink(missing_ok=True) + + # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or + # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that + # causes float rounding < py 3.11, hence we are only testing/storing simple values + # in this test db for now. as support improves, we can add/test additional dtypes). + + conn = sqlite3.connect(test_db) + # ┌─────┬───────┬───────┬────────────┐ + # │ id ┆ name ┆ value ┆ date │ + # │ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ str ┆ f64 ┆ date │ + # ╞═════╪═══════╪═══════╪════════════╡ + # │ 1 ┆ misc ┆ 100.0 ┆ 2020-01-01 │ + # │ 2 ┆ other ┆ -99.0 ┆ 2021-12-31 │ + # └─────┴───────┴───────┴────────────┘ + conn.executescript( + """ + CREATE TABLE test_data ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + value FLOAT, + date DATE + ); + INSERT INTO test_data(name,value,date) + VALUES ('misc',100.0,'2020-01-01'), ('other',-99.5,'2021-12-31'); + """ + ) + conn.close() + + +@pytest.mark.write_disk() +@pytest.mark.parametrize( + ("read_method", "engine_or_connection_init", "expected_dtypes", "expected_dates"), + [ + pytest.param( + "read_database_uri", + "connectorx", + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="uri: connectorx", + ), + pytest.param( + "read_database_uri", + "adbc", + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Utf8, + }, + ["2020-01-01", "2021-12-31"], + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + id="uri: adbc", + ), + pytest.param( + "read_database", + lambda path: sqlite3.connect(path, detect_types=True), + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="conn: sqlite3", + ), + pytest.param( + "read_database", + lambda path: create_engine( + f"sqlite:///{path}", + connect_args={"detect_types": sqlite3.PARSE_DECLTYPES}, + ).connect(), + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="conn: sqlalchemy", + ), + pytest.param( + "read_database", + adbc_sqlite_connect, + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Utf8, + }, + ["2020-01-01", "2021-12-31"], + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + id="conn: adbc", + ), + ], +) +def test_read_database( + read_method: str, + engine_or_connection_init: Any, + expected_dtypes: dict[str, pl.DataType], + expected_dates: list[date | str], + tmp_path: Path, +) -> None: + tmp_path.mkdir(exist_ok=True) + test_db = str(tmp_path / "test.db") + create_temp_sqlite_db(test_db) + + if read_method == "read_database_uri": + # instantiate the connection ourselves, using connectorx/adbc + df = pl.read_database_uri( + uri=f"sqlite:///{test_db}", + query="SELECT * FROM test_data", + engine=str(engine_or_connection_init), # type: ignore[arg-type] + ) + elif "adbc" in os.environ["PYTEST_CURRENT_TEST"]: + # externally instantiated adbc connections + with engine_or_connection_init(test_db) as conn, 
conn.cursor(): + df = pl.read_database(connection=conn, query="SELECT * FROM test_data") + else: + # other user-supplied connections + df = pl.read_database( + connection=engine_or_connection_init(test_db), + query="SELECT * FROM test_data", + ) + + assert df.schema == expected_dtypes + assert df.shape == (2, 4) + assert df["date"].to_list() == expected_dates + + +def test_read_database_mocked() -> None: + class MockConnection: + def __init__(self, driver: str) -> None: + self.__class__.__module__ = driver + self._cursor = MockCursor() + + def close(self) -> None: + pass + + def cursor(self) -> Any: + return self._cursor + + class MockCursor: + def __init__(self) -> None: + self.called: list[str] = [] + + def __getattr__(self, item: str) -> Any: + if "fetch" in item: + self.called.append(item) + return lambda *args, **kwargs: [] + super().__getattr__(item) # type: ignore[misc] + + def close(self) -> Any: + pass + + def execute(self, query: str) -> Any: + return self + + # since we don't have access to snowflake/databricks/etc from CI we + # mock them so we can check that we're calling the right methods + for driver, batch_size, expected_call in ( + ("snowflake", None, "fetch_arrow_all"), + ("snowflake", 10_000, "fetch_arrow_batches"), + ("databricks", None, "fetchall_arrow"), + ("databricks", 25_000, "fetchmany_arrow"), + ("turbodbc", None, "fetchallarrow"), + ("turbodbc", 50_000, "fetcharrowbatches"), + ("adbc_driver_postgresql", None, "fetch_arrow_table"), + ("adbc_driver_postgresql", 75_000, "fetch_arrow_table"), + ): + mc = MockConnection(driver) + pl.read_database( + connection=mc, + query="SELECT * FROM test_data", + batch_size=batch_size, + ) + assert expected_call in mc.cursor().called + + +@pytest.mark.parametrize( + ("read_method", "engine", "query", "database", "errclass", "err"), + [ + pytest.param( + "read_database_uri", + "not_an_engine", + "SELECT * FROM test_data", + "sqlite", + ValueError, + "engine must be one of {'connectorx', 'adbc'}, got 'not_an_engine'", + id="Not an available sql engine", + ), + pytest.param( + "read_database_uri", + "adbc", + ["SELECT * FROM test_data", "SELECT * FROM test_data"], + "sqlite", + ValueError, + "only a single SQL query string is accepted for adbc", + id="Unavailable list of queries for adbc", + ), + pytest.param( + "read_database_uri", + "adbc", + "SELECT * FROM test_data", + "mysql", + ImportError, + "ADBC mysql driver not detected", + id="Unavailable adbc driver", + ), + pytest.param( + "read_database_uri", + "adbc", + "SELECT * FROM test_data", + sqlite3.connect(":memory:"), + TypeError, + "expected connection to be a URI string", + id="Invalid connection URI", + ), + pytest.param( + "read_database", + None, + "SELECT * FROM imaginary_table", + sqlite3.connect(":memory:"), + sqlite3.OperationalError, + "no such table: imaginary_table", + id="Invalid read DB kwargs", + ), + pytest.param( + "read_database", + None, + "SELECT * FROM imaginary_table", + sys.getsizeof, # not a connection + TypeError, + "Unrecognised connection .* unable to find 'execute' method", + id="Invalid read DB kwargs", + ), + ], +) +def test_read_database_exceptions( + read_method: str, + engine: DbReadEngine | None, + query: str, + database: Any, + errclass: type, + err: str, + tmp_path: Path, +) -> None: + if read_method == "read_database_uri": + conn = f"{database}://test" if isinstance(database, str) else database + params = {"uri": conn, "query": query, "engine": engine} + else: + params = {"connection": database, "query": query} + + read_database = 
getattr(pl, read_method) + with pytest.raises(errclass, match=err): + read_database(**params) diff --git a/py-polars/tests/unit/io/test_database_write.py b/py-polars/tests/unit/io/test_database_write.py new file mode 100644 index 000000000000..9c7efbd9ce23 --- /dev/null +++ b/py-polars/tests/unit/io/test_database_write.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + +if TYPE_CHECKING: + from pathlib import Path + + from polars.type_aliases import DbWriteEngine, DbWriteMode + + +@pytest.fixture() +def sample_df() -> pl.DataFrame: + return pl.DataFrame( + { + "id": [1, 2], + "name": ["misc", "other"], + "value": [100.0, -99.0], + "date": ["2020-01-01", "2021-12-31"], + } + ) + + +@pytest.mark.write_disk() +@pytest.mark.parametrize( + ("engine", "mode"), + [ + pytest.param( + "adbc", + "create", + id="adbc_create", + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + ), + pytest.param( + "adbc", + "append", + id="adbc_append", + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + ), + pytest.param( + "sqlalchemy", + "create", + id="sa_create", + ), + pytest.param( + "sqlalchemy", + "append", + id="sa_append", + ), + ], +) +def test_write_database( + engine: DbWriteEngine, mode: DbWriteMode, sample_df: pl.DataFrame, tmp_path: Path +) -> None: + tmp_path.mkdir(exist_ok=True) + tmp_db = f"test_{engine}.db" + test_db = str(tmp_path / tmp_db) + + # note: test a table name that requires quotes to ensure that we handle + # it correctly (also supply an explicit db schema with/without quotes) + tbl_name = '"test-data"' + + sample_df.write_database( + table_name=f"main.{tbl_name}", + connection=f"sqlite:///{test_db}", + if_exists="replace", + engine=engine, + ) + if mode == "append": + sample_df.write_database( + table_name=f'"main".{tbl_name}', + connection=f"sqlite:///{test_db}", + if_exists="append", + engine=engine, + ) + sample_df = pl.concat([sample_df, sample_df]) + + result = pl.read_database_uri(f"SELECT * FROM {tbl_name}", f"sqlite:///{test_db}") + sample_df = sample_df.with_columns(pl.col("date").cast(pl.Utf8)) + assert_frame_equal(sample_df, result) + + # check that some invalid parameters raise errors + for invalid_params in ( + {"table_name": "w.x.y.z"}, + {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, + ): + with pytest.raises((ValueError, NotImplementedError)): + sample_df.write_database( + connection=f"sqlite:///{test_db}", + engine=engine, + **invalid_params, # type: ignore[arg-type] + ) From 7242fc1174a2937d31147d4c269512627955555a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 12:31:45 +0200 Subject: [PATCH 38/55] feat(python): csv: add schema argument (#10665) --- crates/polars-io/src/csv/read.rs | 4 ++-- crates/polars-io/src/csv/read_impl/batched_mmap.rs | 2 +- crates/polars-io/src/csv/read_impl/batched_read.rs | 2 +- crates/polars-lazy/src/frame/csv.rs | 6 +++--- crates/polars-pipe/src/executors/sources/csv.rs | 2 +- crates/polars/tests/it/io/csv.rs | 6 +++--- py-polars/polars/dataframe/frame.py | 3 +++ py-polars/polars/io/_utils.py | 2 +- py-polars/polars/io/csv/functions.py | 12 ++++++++++++ py-polars/polars/lazyframe/frame.py | 2 ++ py-polars/src/dataframe.rs | 4 +++- 
py-polars/src/lazyframe.rs | 4 +++- py-polars/tests/unit/io/test_csv.py | 13 +++++++++++++ 13 files changed, 48 insertions(+), 14 deletions(-) diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs index 9bf55ca3e06a..5f0b3a228596 100644 --- a/crates/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -181,8 +181,8 @@ where /// in the csv parser and expects a complete Schema. /// /// It is recommended to use [with_dtypes](Self::with_dtypes) instead. - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); + pub fn with_schema(mut self, schema: Option) -> Self { + self.schema = schema; self } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index a659f31d6c3c..18824d5e08f1 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -308,7 +308,7 @@ pub fn to_batched_owned_mmap( ) -> OwnedBatchedCsvReaderMmap { // make sure that the schema is bound to the schema we have // we will keep ownership of the schema so that the lifetime remains bound to ourselves - let reader = reader.with_schema(schema.clone()); + let reader = reader.with_schema(Some(schema.clone())); // extend the lifetime // the lifetime was bound to schema, which we own and will store on the heap let reader = unsafe { diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 88249222dcb4..af3831f00b70 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -405,7 +405,7 @@ pub fn to_batched_owned_read( ) -> OwnedBatchedCsvReader { // make sure that the schema is bound to the schema we have // we will keep ownership of the schema so that the lifetime remains bound to ourselves - let reader = reader.with_schema(schema.clone()); + let reader = reader.with_schema(Some(schema.clone())); // extend the lifetime // the lifetime was bound to schema, which we own and will store on the heap let reader = unsafe { diff --git a/crates/polars-lazy/src/frame/csv.rs b/crates/polars-lazy/src/frame/csv.rs index 1e1e97240dbf..be497c336388 100644 --- a/crates/polars-lazy/src/frame/csv.rs +++ b/crates/polars-lazy/src/frame/csv.rs @@ -106,8 +106,8 @@ impl<'a> LazyCsvReader<'a> { /// Set the CSV file's schema #[must_use] - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); + pub fn with_schema(mut self, schema: Option) -> Self { + self.schema = schema; self } @@ -261,7 +261,7 @@ impl<'a> LazyCsvReader<'a> { } } - Ok(self.with_schema(Arc::new(schema))) + Ok(self.with_schema(Some(Arc::new(schema)))) } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index a9e9f5352d1d..8a6338827828 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -62,7 +62,7 @@ impl CsvSource { let reader = CsvReader::from_path(&path) .unwrap() .has_header(options.has_header) - .with_schema(self.schema.clone()) + .with_schema(Some(self.schema.clone())) .with_delimiter(options.delimiter) .with_ignore_errors(options.ignore_errors) .with_skip_rows(options.skip_rows) diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 4c48d71921c6..9df2115ed8d8 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -387,7 +387,7 @@ fn 
test_empty_bytes_to_dataframe() { let result = CsvReader::new(file) .has_header(false) .with_columns(Some(schema.iter_names().map(|s| s.to_string()).collect())) - .with_schema(Arc::new(schema)) + .with_schema(Some(Arc::new(schema))) .finish(); assert!(result.is_ok()) } @@ -416,11 +416,11 @@ fn test_missing_value() { let file = Cursor::new(csv); let df = CsvReader::new(file) .has_header(true) - .with_schema(Arc::new(Schema::from_iter([ + .with_schema(Some(Arc::new(Schema::from_iter([ Field::new("foo", DataType::UInt32), Field::new("bar", DataType::UInt32), Field::new("ham", DataType::UInt32), - ]))) + ])))) .finish() .unwrap(); assert_eq!(df.column("ham").unwrap().len(), 3) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 96c70447637e..a402ff35619e 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -662,6 +662,7 @@ def _read_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, + schema: None | SchemaDict = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -740,6 +741,7 @@ def _read_csv( quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes_dict, + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, @@ -795,6 +797,7 @@ def _read_csv( eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, + schema=schema, ) return self diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index ec3301bbd930..4a59dd65353c 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) # noqa: PTH207 + next(glob.iglob(file, recursive=True)) return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 548a90d89a56..57d03aebba0e 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -28,6 +28,7 @@ def read_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -83,6 +84,10 @@ def read_csv( Start reading after ``skip_rows`` lines. dtypes Overwrite dtypes for specific or all columns during schema inference. + schema + Provide the schema. This means that polars doesn't do schema inference. + This argument expects the complete schema, whereas ``dtypes`` can be used + to partially overwrite a schema. null_values Values to interpret as null values. 
You can provide a: @@ -365,6 +370,7 @@ def read_csv( quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, @@ -691,6 +697,7 @@ def scan_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: SchemaDict | Sequence[PolarsDataType] | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -741,6 +748,10 @@ def scan_csv( Overwrite dtypes during inference; should be a {colname:dtype,} dict or, if providing a list of strings to ``new_columns``, a list of dtypes of the same length. + schema + Provide the schema. This means that polars doesn't do schema inference. + This argument expects the complete schema, whereas ``dtypes`` can be used + to partially overwrite a schema. null_values Values to interpret as null values. You can provide a: @@ -892,6 +903,7 @@ def with_column_names(_cols: list[str]) -> list[str]: quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, # type: ignore[arg-type] + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 25622fb163a2..eca59c91c51a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -327,6 +327,7 @@ def _scan_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: SchemaDict | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -387,6 +388,7 @@ def _scan_csv( eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, + schema=schema, ) return self diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 3ce6754aefab..5292ece6c3fd 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -139,7 +139,7 @@ impl PyDataFrame { skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines) + row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, schema) )] pub fn read_csv( py_f: &PyAny, @@ -170,6 +170,7 @@ impl PyDataFrame { eol_char: &str, raise_if_empty: bool, truncate_ragged_lines: bool, + schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -219,6 +220,7 @@ impl PyDataFrame { .with_path(path) .with_dtypes(overwrite_dtype.map(Arc::new)) .with_dtypes_slice(overwrite_dtype_slice.as_deref()) + .with_schema(schema.map(|schema| Arc::new(schema.0))) .low_memory(low_memory) .with_null_values(null_values) .with_missing_is_null(!missing_utf8_is_empty_string) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index be3c62ce4d97..288332db7d65 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -147,7 +147,7 @@ impl PyLazyFrame { #[pyo3(signature = (path, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, 
missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, - encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines + encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, schema ) )] fn new_from_csv( @@ -174,6 +174,7 @@ impl PyLazyFrame { eol_char: &str, raise_if_empty: bool, truncate_ragged_lines: bool, + schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -197,6 +198,7 @@ impl PyLazyFrame { .with_n_rows(n_rows) .with_cache(cache) .with_dtype_overwrite(overwrite_dtype.as_ref()) + .with_schema(schema.map(|schema| Arc::new(schema.0))) .low_memory(low_memory) .with_comment_char(comment_char) .with_quote_char(quote_char) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index dd973193ea26..dc24c2091d6e 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1494,3 +1494,16 @@ def test_csv_ragged_lines() -> None: pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) + + +def test_provide_schema() -> None: + # can be used to overload schema with ragged csv files + assert pl.read_csv( + io.StringIO("A\nB,ragged\nC"), + has_header=False, + schema={"A": pl.Utf8, "B": pl.Utf8, "C": pl.Utf8}, + ).to_dict(False) == { + "A": ["A", "B", "C"], + "B": [None, "ragged", None], + "C": [None, None, None], + } From ac12d3b14ba606dd91337962f966df77fec55b64 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 22 Aug 2023 16:44:41 +0400 Subject: [PATCH 39/55] docs(python): add "see also" entries to ne/eq_missing and update related examples (#10667) --- py-polars/polars/api.py | 48 +++--- py-polars/polars/config.py | 8 +- py-polars/polars/dataframe/frame.py | 58 ++++---- py-polars/polars/expr/datetime.py | 8 +- py-polars/polars/expr/expr.py | 88 +++++------ py-polars/polars/expr/string.py | 40 ++--- py-polars/polars/functions/lazy.py | 8 +- py-polars/polars/io/_utils.py | 2 +- py-polars/polars/io/csv/functions.py | 8 +- py-polars/polars/io/database.py | 16 +- py-polars/polars/lazyframe/frame.py | 8 +- py-polars/polars/selectors.py | 210 +++++++++++++-------------- py-polars/polars/series/datetime.py | 8 +- py-polars/polars/series/list.py | 8 +- py-polars/polars/series/series.py | 64 +++++++- py-polars/polars/series/string.py | 30 ++-- py-polars/polars/sql/context.py | 48 +++--- 17 files changed, 357 insertions(+), 303 deletions(-) diff --git a/py-polars/polars/api.py b/py-polars/polars/api.py index 903b34ac8535..7c8d368bed7f 100644 --- a/py-polars/polars/api.py +++ b/py-polars/polars/api.py @@ -78,6 +78,12 @@ def register_expr_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_dataframe_namespace: Register functionality on a DataFrame. + register_lazyframe_namespace: Register functionality on a LazyFrame. + register_series_namespace: Register functionality on a Series. 
+ Examples -------- >>> @pl.api.register_expr_namespace("pow_n") @@ -114,12 +120,6 @@ def register_expr_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ 64.001 ┆ 128 ┆ 64 ┆ 64 │ └────────┴───────────┴───────────┴──────────────┘ - See Also - -------- - register_dataframe_namespace: Register functionality on a DataFrame. - register_lazyframe_namespace: Register functionality on a LazyFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.Expr) @@ -133,6 +133,12 @@ def register_dataframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_lazyframe_namespace: Register functionality on a LazyFrame. + register_series_namespace: Register functionality on a Series. + Examples -------- >>> @pl.api.register_dataframe_namespace("split") @@ -214,12 +220,6 @@ def register_dataframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ yz ┆ 6 ┆ 7 ┆ 8 │ └─────┴─────┴─────┴─────┘] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_lazyframe_namespace: Register functionality on a LazyFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.DataFrame) @@ -233,6 +233,12 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_dataframe_namespace: Register functionality on a DataFrame. + register_series_namespace: Register functionality on a Series. + Examples -------- >>> @pl.api.register_lazyframe_namespace("types") @@ -319,12 +325,6 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ 6 ┆ 7 ┆ 8 │ └─────┴─────┴─────┘] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_dataframe_namespace: Register functionality on a DataFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.LazyFrame) @@ -338,6 +338,12 @@ def register_series_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_dataframe_namespace: Register functionality on a DataFrame. + register_lazyframe_namespace: Register functionality on a LazyFrame. + Examples -------- >>> @pl.api.register_series_namespace("math") @@ -374,11 +380,5 @@ def register_series_namespace(name: str) -> Callable[[type[NS]], type[NS]]: 125 ] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_dataframe_namespace: Register functionality on a DataFrame. - register_lazyframe_namespace: Register functionality on a LazyFrame. - """ return _create_namespace(name, pl.Series) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 2634882b4599..6ada4300c22d 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -671,6 +671,10 @@ def set_tbl_hide_dtype_separator(cls, active: bool = True) -> type[Config]: """ Hide the '---' separator between the column names and column types. 
+ See Also + -------- + set_tbl_column_data_type_inline + Examples -------- >>> df = pl.DataFrame({"abc": [1.0, 2.5, 5.0], "xyz": [True, False, True]}) @@ -687,10 +691,6 @@ def set_tbl_hide_dtype_separator(cls, active: bool = True) -> type[Config]: # │ 5.0 ┆ true │ └─────┴───────┘ # └─────┴───────┘ - See Also - -------- - set_tbl_column_data_type_inline - """ os.environ["POLARS_FMT_TABLE_HIDE_COLUMN_SEPARATOR"] = str(int(active)) return cls diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a402ff35619e..383354272843 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1201,6 +1201,10 @@ def dtypes(self) -> list[PolarsDataType]: The datatypes can also be found in column headers when printing the DataFrame. + See Also + -------- + schema : Returns a {colname:dtype} mapping. + Examples -------- >>> df = pl.DataFrame( @@ -1224,10 +1228,6 @@ def dtypes(self) -> list[PolarsDataType]: │ 3 ┆ 8.0 ┆ c │ └─────┴─────┴─────┘ - See Also - -------- - schema : Returns a {colname:dtype} mapping. - """ return self._df.dtypes() @@ -1835,6 +1835,10 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: column Optional column index or name. + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -1845,10 +1849,6 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: >>> df.item(2, "b") 6 - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - """ if row is None and column is None: if self.shape != (1, 1): @@ -8816,6 +8816,12 @@ def row( You should NEVER use this method to iterate over a DataFrame; if you require row-iteration you should strongly prefer use of ``iter_rows()`` instead. + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + Examples -------- Specify an index to return the row at the given index as a tuple. @@ -8841,12 +8847,6 @@ def row( >>> df.row(by_predicate=(pl.col("ham") == "b")) (2, 7, 'b') - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - """ if index is not None and by_predicate is not None: raise ValueError( @@ -8927,6 +8927,11 @@ def rows( ------- list of tuples (default) or dictionaries of row values + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + Examples -------- >>> df = pl.DataFrame( @@ -8944,11 +8949,6 @@ def rows( {'x': 'b', 'y': 3, 'z': 6}, {'x': 'a', 'y': 4, 'z': 9}] - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - """ if named: # Load these into the local namespace for a minor performance boost @@ -8997,6 +8997,11 @@ def rows_by_key( truncated to microseconds on conversion to Python. If this matters to your use-case you should export to a different format (such as Arrow or NumPy). + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). 
+ iter_rows : Row iterator over frame data (does not materialise all rows). + Examples -------- >>> df = pl.DataFrame( @@ -9049,11 +9054,6 @@ def rows_by_key( {'w': 'b', 'x': 'q', 'y': 3.0, 'z': 7}], ('a', 'k'): [{'w': 'a', 'x': 'k', 'y': 4.5, 'z': 6}]}) - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - """ from polars.selectors import expand_selector, is_selector @@ -9168,6 +9168,11 @@ def iter_rows( ------- iterator of tuples (default) or dictionaries (if named) of python row values + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + Examples -------- >>> df = pl.DataFrame( @@ -9181,11 +9186,6 @@ def iter_rows( >>> [row["b"] for row in df.iter_rows(named=True)] [2, 4, 6] - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. - """ # load into the local namespace for a (minor) performance boost in the hot loops columns, get_row, dict_, zip_ = self.columns, self.row, dict, zip diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 31fc8f90f118..ca5ea8c9f7d9 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -461,6 +461,10 @@ def strftime(self, format: str) -> Expr: `_ for specification. Example: ``"%y-%m-%d"``. + See Also + -------- + to_string : The identical expression for which ``strftime`` is an alias. + Examples -------- >>> from datetime import datetime @@ -489,10 +493,6 @@ def strftime(self, format: str) -> Expr: │ 2020-05-01 00:00:00 ┆ 2020/05/01 00:00:00 │ └─────────────────────┴─────────────────────┘ - See Also - -------- - to_string : The identical expression for which ``strftime`` is an alias. - """ return self.to_string(format) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 2999118b0bb9..eec96107c24c 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -444,6 +444,11 @@ def arg_true(self) -> Self: Modifies number of rows returned, so will fail in combination with other expressions. Use as only expression in `select` / `with_columns`. + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) @@ -459,11 +464,6 @@ def arg_true(self) -> Self: │ 3 │ └─────┘ - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - """ return self._from_pyexpr(py_arg_where(self._pyexpr)) @@ -4225,21 +4225,22 @@ def eq_missing(self, other: Any) -> Self: ... } ... ) >>> df.with_columns( - ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), - ... ) - shape: (6, 3) - ┌──────┬──────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞══════╪══════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - │ null ┆ 5.0 ┆ false │ - │ null ┆ null ┆ true │ - └──────┴──────┴────────┘ + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ """ return self._from_pyexpr(self._pyexpr.eq_missing(self._to_expr(other)._pyexpr)) @@ -4439,21 +4440,22 @@ def ne_missing(self, other: Any) -> Self: ... } ... ) >>> df.with_columns( - ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), - ... ) - shape: (6, 3) - ┌──────┬──────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞══════╪══════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - │ null ┆ 5.0 ┆ true │ - │ null ┆ null ┆ false │ - └──────┴──────┴────────┘ + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ """ return self._from_pyexpr(self._pyexpr.neq_missing(self._to_expr(other)._pyexpr)) @@ -4514,6 +4516,10 @@ def floordiv(self, other: Any) -> Self: other Numeric literal or expression value. + See Also + -------- + truediv + Examples -------- >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) @@ -4534,10 +4540,6 @@ def floordiv(self, other: Any) -> Self: │ 5 ┆ 2.5 ┆ 2 │ └─────┴─────┴──────┘ - See Also - -------- - truediv - """ return self.__floordiv__(other) @@ -4650,6 +4652,10 @@ def truediv(self, other: Any) -> Self: 0/0: Invalid operation - mathematically undefined, returns NaN. n/0: On finite operands gives an exact infinite result, eg: ±infinity. + See Also + -------- + floordiv + Examples -------- >>> df = pl.DataFrame( @@ -4672,10 +4678,6 @@ def truediv(self, other: Any) -> Self: │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ └─────┴──────┴──────┴───────┘ - See Also - -------- - floordiv - """ return self.__truediv__(other) diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 11338913e987..5897eb77488e 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -801,6 +801,11 @@ def contains( `_ for additional information about the use of inline expression modifiers. + See Also + -------- + starts_with : Check if string values start with a substring. + ends_with : Check if string values end with a substring. + Examples -------- >>> df = pl.DataFrame({"a": ["Crab", "cat and dog", "rab$bit", None]}) @@ -821,11 +826,6 @@ def contains( │ null ┆ null ┆ null │ └─────────────┴───────┴─────────┘ - See Also - -------- - starts_with : Check if string values start with a substring. - ends_with : Check if string values end with a substring. - """ pattern = parse_as_expression(pattern, str_as_lit=True) return wrap_expr(self._pyexpr.str_contains(pattern, literal, strict)) @@ -839,6 +839,11 @@ def ends_with(self, suffix: str | Expr) -> Expr: suffix Suffix substring. 
+ See Also + -------- + contains : Check if string contains a substring that matches a regex. + starts_with : Check if string values start with a substring. + Examples -------- >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]}) @@ -868,11 +873,6 @@ def ends_with(self, suffix: str | Expr) -> Expr: │ mango │ └────────┘ - See Also - -------- - contains : Check if string contains a substring that matches a regex. - starts_with : Check if string values start with a substring. - """ suffix = parse_as_expression(suffix, str_as_lit=True) return wrap_expr(self._pyexpr.str_ends_with(suffix)) @@ -886,6 +886,11 @@ def starts_with(self, prefix: str | Expr) -> Expr: prefix Prefix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + ends_with : Check if string values end with a substring. + Examples -------- >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]}) @@ -915,11 +920,6 @@ def starts_with(self, prefix: str | Expr) -> Expr: │ apple │ └────────┘ - See Also - -------- - contains : Check if string contains a substring that matches a regex. - ends_with : Check if string values end with a substring. - """ prefix = parse_as_expression(prefix, str_as_lit=True) return wrap_expr(self._pyexpr.str_starts_with(prefix)) @@ -941,6 +941,11 @@ def json_extract( How many rows to parse to determine the schema. If ``None`` all rows are used. + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + Examples -------- >>> df = pl.DataFrame( @@ -959,11 +964,6 @@ def json_extract( │ {2,false} │ └─────────────┘ - See Also - -------- - json_path_match : Extract the first match of json string with provided JSONPath - expression. - """ if dtype is not None: dtype = py_type_to_dtype(dtype) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 78862b54abd5..2b8e399f9b36 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1924,6 +1924,10 @@ def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series Evaluate immediately and return a ``Series``. If set to ``False`` (default), return an expression instead. + See Also + -------- + Series.arg_true : Return indices where Series is True + Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) @@ -1939,10 +1943,6 @@ def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series 3 ] - See Also - -------- - Series.arg_true : Return indices where Series is True - """ if eager: if not isinstance(condition, pl.Series): diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 4a59dd65353c..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 57d03aebba0e..177c8828cffe 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -170,6 +170,10 @@ def read_csv( ------- DataFrame + See Also + -------- + scan_csv : Lazily read from a CSV file or multiple files via glob patterns. 
+ Notes ----- This operation defaults to a `rechunk` operation at the end, meaning that @@ -177,10 +181,6 @@ def read_csv( Set `rechunk=False` if you are benchmarking the csv-reader. A `rechunk` is an expensive operation. - See Also - -------- - scan_csv : Lazily read from a CSV file or multiple files via glob patterns. - """ _check_arg_is_1byte("separator", separator, False) _check_arg_is_1byte("comment_char", comment_char, False) diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index 49abbb505f28..8babe712ce21 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -241,6 +241,10 @@ def read_database( # noqa: D417 will be used to efficiently instantiate the DataFrame; otherwise, the DataFrame is initialised from row-wise data. + See Also + -------- + read_database_uri : Create a DataFrame from a SQL query using a URI string. + Examples -------- Instantiate a DataFrame from a SQL query against a user-supplied connection: @@ -250,10 +254,6 @@ def read_database( # noqa: D417 ... connection=conn, ... ) # doctest: +SKIP - See Also - -------- - read_database_uri : Create a DataFrame from a SQL query using a URI string. - """ if isinstance(connection, str): issue_deprecation_warning( @@ -327,6 +327,10 @@ def read_database_uri( For ``adbc`` you will need to have installed ``pyarrow`` and the ADBC driver associated with the backend you are connecting to, eg: ``adbc-driver-postgresql``. + See Also + -------- + read_database : Create a DataFrame from a SQL query using a connection object. + Examples -------- Create a DataFrame from a SQL query using a single thread: @@ -366,10 +370,6 @@ def read_database_uri( ... engine="adbc", ... ) # doctest: +SKIP - See Also - -------- - read_database : Create a DataFrame from a SQL query using a connection object. - """ # noqa: W505 if not isinstance(uri, str): raise TypeError( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index eca59c91c51a..3788181da79a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -652,6 +652,10 @@ def dtypes(self) -> list[PolarsDataType]: """ Get dtypes of columns in LazyFrame. + See Also + -------- + schema : Returns a {colname:dtype} mapping. + Examples -------- >>> lf = pl.LazyFrame( @@ -664,10 +668,6 @@ def dtypes(self) -> list[PolarsDataType]: >>> lf.dtypes [Int64, Float64, Utf8] - See Also - -------- - schema : Returns a {colname:dtype} mapping. - """ return self._ldf.dtypes() diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index c792735f9b64..a3da7242c453 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -325,6 +325,11 @@ def all() -> SelectorType: """ Select all columns. + See Also + -------- + first : Select the first column in the current scope. + last : Select the last column in the current scope. + Examples -------- >>> from datetime import date @@ -363,11 +368,6 @@ def all() -> SelectorType: │ 2024-01-01 │ └────────────┘ - See Also - -------- - first : Select the first column in the current scope. - last : Select the last column in the current scope. - """ return _selector_proxy_(F.all(), name="all") @@ -378,6 +378,13 @@ def by_dtype( """ Select all columns matching the given dtypes. + See Also + -------- + integer : Select all integer columns. + float : Select all float columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. 
+ Examples -------- >>> from datetime import date @@ -431,13 +438,6 @@ def by_dtype( │ foo ┆ -3265500 │ └───────┴──────────┘ - See Also - -------- - integer : Select all integer columns. - float : Select all float columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - """ all_dtypes: list[PolarsDataType] = [] for tp in dtypes: @@ -465,6 +465,10 @@ def by_name(*names: str | Collection[str]) -> SelectorType: *names One or more names of columns to select. + See Also + -------- + by_dtype : Select all columns matching the given dtypes. + Examples -------- >>> import polars.selectors as cs @@ -503,10 +507,6 @@ def by_name(*names: str | Collection[str]) -> SelectorType: │ 5.5 ┆ true │ └─────┴───────┘ - See Also - -------- - by_dtype : Select all columns matching the given dtypes. - """ all_names = [] for nm in names: @@ -534,6 +534,12 @@ def contains(substring: str | Collection[str]) -> SelectorType: substring Substring(s) that matching column names should contain. + See Also + -------- + matches : Select all columns that match the given regex pattern. + ends_with : Select columns that end with the given substring(s). + starts_with : Select columns that start with the given substring(s). + Examples -------- >>> import polars.selectors as cs @@ -585,12 +591,6 @@ def contains(substring: str | Collection[str]) -> SelectorType: │ y ┆ true │ └─────┴───────┘ - See Also - -------- - matches : Select all columns that match the given regex pattern. - ends_with : Select columns that end with the given substring(s). - starts_with : Select columns that start with the given substring(s). - """ escaped_substring = _re_string(substring) raw_params = f"^.*{escaped_substring}.*$" @@ -861,6 +861,12 @@ def ends_with(*suffix: str) -> SelectorType: """ Select columns that end with the given substring(s). + See Also + -------- + contains : Select columns that contain the given literal substring(s). + matches : Select all columns that match the given regex pattern. + starts_with : Select columns that start with the given substring(s). + Parameters ---------- suffix @@ -917,12 +923,6 @@ def ends_with(*suffix: str) -> SelectorType: │ y ┆ 456 ┆ true │ └─────┴─────┴───────┘ - See Also - -------- - contains : Select columns that contain the given literal substring(s). - matches : Select all columns that match the given regex pattern. - starts_with : Select columns that start with the given substring(s). - """ escaped_suffix = _re_string(suffix) raw_params = f"^.*{escaped_suffix}$" @@ -938,6 +938,11 @@ def first() -> SelectorType: """ Select the first column in the current scope. + See Also + -------- + all : Select all columns. + last : Select the last column in the current scope. + Examples -------- >>> import polars.selectors as cs @@ -976,11 +981,6 @@ def first() -> SelectorType: │ 456 ┆ 5.5 ┆ 1 │ └─────┴─────┴─────┘ - See Also - -------- - all : Select all columns. - last : Select the last column in the current scope. - """ return _selector_proxy_(F.first(), name="first") @@ -989,6 +989,13 @@ def float() -> SelectorType: """ Select all float columns. + See Also + -------- + integer : Select all integer columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1028,13 +1035,6 @@ def float() -> SelectorType: │ y ┆ 456 │ └─────┴─────┘ - See Also - -------- - integer : Select all integer columns. - numeric : Select all numeric columns. 
- temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(FLOAT_DTYPES), @@ -1046,6 +1046,14 @@ def integer() -> SelectorType: """ Select all integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1084,14 +1092,6 @@ def integer() -> SelectorType: │ y ┆ 5.5 │ └─────┴─────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(INTEGER_DTYPES), @@ -1103,6 +1103,14 @@ def signed_integer() -> SelectorType: """ Select all signed integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer: Select all integer columns. + numeric : Select all numeric columns. + unsigned_integer: Select all unsigned integer columns. + Examples -------- >>> import polars.selectors as cs @@ -1153,14 +1161,6 @@ def signed_integer() -> SelectorType: │ -456 ┆ 6789 ┆ 4321 │ └──────┴──────┴──────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer: Select all integer columns. - numeric : Select all numeric columns. - unsigned_integer: Select all unsigned integer columns. - """ return _selector_proxy_( F.col(SIGNED_INTEGER_DTYPES), @@ -1172,6 +1172,14 @@ def unsigned_integer() -> SelectorType: """ Select all unsigned integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer: Select all integer columns. + numeric : Select all numeric columns. + signed_integer: Select all signed integer columns. + Examples -------- >>> import polars.selectors as cs @@ -1224,14 +1232,6 @@ def unsigned_integer() -> SelectorType: │ -456 ┆ 6789 ┆ 4321 │ └──────┴──────┴──────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer: Select all integer columns. - numeric : Select all numeric columns. - signed_integer: Select all signed integer columns. - """ return _selector_proxy_( F.col(UNSIGNED_INTEGER_DTYPES), @@ -1243,6 +1243,11 @@ def last() -> SelectorType: """ Select the last column in the current scope. + See Also + -------- + all : Select all columns. + first : Select the first column in the current scope. + Examples -------- >>> import polars.selectors as cs @@ -1281,11 +1286,6 @@ def last() -> SelectorType: │ y ┆ 456 ┆ 5.5 │ └─────┴─────┴─────┘ - See Also - -------- - all : Select all columns. - first : Select the first column in the current scope. - """ return _selector_proxy_(F.last(), name="last") @@ -1294,6 +1294,12 @@ def matches(pattern: str) -> SelectorType: """ Select all columns that match the given regex pattern. + See Also + -------- + contains : Select all columns that contain the given substring. + ends_with : Select all columns that end with the given substring(s). + starts_with : Select all columns that start with the given substring(s). + Parameters ---------- pattern @@ -1338,12 +1344,6 @@ def matches(pattern: str) -> SelectorType: │ y ┆ 1 │ └─────┴─────┘ - See Also - -------- - contains : Select all columns that contain the given substring. 
- ends_with : Select all columns that end with the given substring(s). - starts_with : Select all columns that start with the given substring(s). - """ if pattern == ".*": return all() @@ -1368,6 +1368,14 @@ def numeric() -> SelectorType: """ Select all numeric columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer : Select all integer columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1407,14 +1415,6 @@ def numeric() -> SelectorType: │ y │ └─────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer : Select all integer columns. - temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(NUMERIC_DTYPES), @@ -1431,6 +1431,12 @@ def starts_with(*prefix: str) -> SelectorType: prefix Substring(s) that matching column names should start with. + See Also + -------- + contains : Select all columns that contain the given substring. + ends_with : Select all columns that end with the given substring(s). + matches : Select all columns that match the given regex pattern. + Examples -------- >>> import polars.selectors as cs @@ -1482,12 +1488,6 @@ def starts_with(*prefix: str) -> SelectorType: │ 2.0 ┆ 8 │ └─────┴─────┘ - See Also - -------- - contains : Select all columns that contain the given substring. - ends_with : Select all columns that end with the given substring(s). - matches : Select all columns that match the given regex pattern. - """ escaped_prefix = _re_string(prefix) raw_params = f"^{escaped_prefix}.*$" @@ -1503,6 +1503,14 @@ def string(include_categorical: bool = False) -> SelectorType: """ Select all Utf8 (and, optionally, Categorical) string columns. + See Also + -------- + by_dtype : Select all columns of a given dtype. + float : Select all float columns. + integer : Select all integer columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + Examples -------- >>> import polars.selectors as cs @@ -1544,14 +1552,6 @@ def string(include_categorical: bool = False) -> SelectorType: │ yy ┆ b ┆ 6 ┆ 7.0 │ └─────┴─────┴─────┴──────┘ - See Also - -------- - by_dtype : Select all columns of a given dtype. - float : Select all float columns. - integer : Select all integer columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - """ string_dtypes: list[PolarsDataType] = [Utf8] if include_categorical: @@ -1568,6 +1568,14 @@ def temporal() -> SelectorType: """ Select all temporal columns. + See Also + -------- + by_dtype : Select all columns of a given dtype. + float : Select all float columns. + integer : Select all integer columns. + numeric : Select all numeric columns. + string : Select all string columns. + Examples -------- >>> from datetime import date, time @@ -1619,14 +1627,6 @@ def temporal() -> SelectorType: │ 2.3456 │ └────────┘ - See Also - -------- - by_dtype : Select all columns of a given dtype. - float : Select all float columns. - integer : Select all integer columns. - numeric : Select all numeric columns. - string : Select all string columns. 
- """ return _selector_proxy_( F.col(TEMPORAL_DTYPES), diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 9db12fa7960d..d3c9e08c0d26 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -186,6 +186,10 @@ def strftime(self, format: str) -> Series: `_ for specification. Example: ``"%y-%m-%d"``. + See Also + -------- + to_string : The identical Series method for which ``strftime`` is an alias. + Examples -------- >>> from datetime import datetime @@ -202,10 +206,6 @@ def strftime(self, format: str) -> Series: "2020/05/01" ] - See Also - -------- - to_string : The identical Series method for which ``strftime`` is an alias. - """ return self.to_string(format) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 76170741c761..a97c6a36a0c5 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -603,6 +603,10 @@ def set_difference(self, other: Series) -> Series: other Right hand side of the set operation. + See Also + -------- + polars.Series.list.diff: Calculates the n-th discrete difference of every sublist. + Examples -------- >>> a = pl.Series([[1, 2, 3], [], [None, 3], [5, 6, 7]]) @@ -617,10 +621,6 @@ def set_difference(self, other: Series) -> Series: [5, 7] ] - See Also - -------- - polars.Series.list.diff: Calculates the n-th discrete difference of every sublist. - """ # noqa: W505. def set_intersection(self, other: Series) -> Series: diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index e3b1025f9f95..efb12d163b21 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -608,16 +608,42 @@ def eq_missing(self, other: Expr) -> Expr: # type: ignore[misc] def eq_missing(self, other: Any) -> Self | Expr: """ - Method equivalent of equality operator ``expr == other`` where `None` == None`. + Method equivalent of equality operator ``series == other`` where `None` == None`. - This differs from default ``ne`` where null values are propagated. + This differs from the standard ``ne`` where null values are propagated. Parameters ---------- other A literal or expression value to compare with. - """ + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: 'a' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: 'a' [bool] + [ + false + true + true + ] + + """ # noqa: W505 def ne(self, other: Any) -> Self | Expr: """Method equivalent of operator expression ``series != other``.""" @@ -633,16 +659,42 @@ def ne_missing(self, other: Any) -> Self: def ne_missing(self, other: Any) -> Self | Expr: """ - Method equivalent of equality operator ``expr != other`` where `None` == None`. + Method equivalent of equality operator ``series != other`` where `None` == None`. - This differs from default ``ne`` where null values are propagated. + This differs from the standard ``ne`` where null values are propagated. Parameters ---------- other A literal or expression value to compare with. 
- """ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: 'a' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: 'a' [bool] + [ + true + false + false + ] + + """ # noqa: W505 def ge(self, other: Any) -> Self | Expr: """Method equivalent of operator expression ``series >= other``.""" diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index c77bf0b1bc4d..1ae8d9c9002e 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -449,6 +449,11 @@ def ends_with(self, suffix: str | Expr) -> Series: suffix Suffix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + starts_with : Check if string values start with a substring. + Examples -------- >>> s = pl.Series("fruits", ["apple", "mango", None]) @@ -461,11 +466,6 @@ def ends_with(self, suffix: str | Expr) -> Series: null ] - See Also - -------- - contains : Check if string contains a substring that matches a regex. - starts_with : Check if string values start with a substring. - """ def starts_with(self, prefix: str | Expr) -> Series: @@ -477,6 +477,11 @@ def starts_with(self, prefix: str | Expr) -> Series: prefix Prefix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + ends_with : Check if string values end with a substring. + Examples -------- >>> s = pl.Series("fruits", ["apple", "mango", None]) @@ -489,11 +494,6 @@ def starts_with(self, prefix: str | Expr) -> Series: null ] - See Also - -------- - contains : Check if string contains a substring that matches a regex. - ends_with : Check if string values end with a substring. - """ def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Series: @@ -555,6 +555,11 @@ def json_extract( How many rows to parse to determine the schema. If ``None`` all rows are used. + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + Examples -------- >>> s = pl.Series("json", ['{"a":1, "b": true}', None, '{"a":2, "b": false}']) @@ -567,11 +572,6 @@ def json_extract( {2,false} ] - See Also - -------- - json_path_match : Extract the first match of json string with provided JSONPath - expression. - """ def json_path_match(self, json_path: str) -> Series: diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index f4b39bb71c44..7e8ff0a6285b 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -285,6 +285,12 @@ def register(self, name: str, frame: DataFrame | LazyFrame) -> Self: frame eager/lazy frame to associate with this table name. + See Also + -------- + register_globals + register_many + unregister + Examples -------- >>> df = pl.DataFrame({"hello": ["world"]}) @@ -299,12 +305,6 @@ def register(self, name: str, frame: DataFrame | LazyFrame) -> Self: │ world │ └───────┘ - See Also - -------- - register_globals - register_many - unregister - """ if isinstance(frame, DataFrame): frame = frame.lazy() @@ -317,6 +317,12 @@ def register_globals(self, n: int | None = None) -> Self: Automatically maps variable names to table names. 
+ See Also + -------- + register + register_many + unregister + Parameters ---------- n @@ -349,12 +355,6 @@ def register_globals(self, n: int | None = None) -> Self: │ 1 ┆ x ┆ null │ └─────┴──────┴──────┘ - See Also - -------- - register - register_many - unregister - """ return self.register_many( frames=_get_stack_locals(of_type=(DataFrame, LazyFrame), n_objects=n) @@ -375,6 +375,12 @@ def register_many( **named_frames Named eager/lazy frames, provided as kwargs. + See Also + -------- + register + register_globals + unregister + Examples -------- >>> lf1 = pl.LazyFrame({"a": [1, 2, 3], "b": ["m", "n", "o"]}) @@ -393,12 +399,6 @@ def register_many( >>> ctx.register_many(tbl3=lf3, tbl4=lf4).tables() ['tbl1', 'tbl2', 'tbl3', 'tbl4'] - See Also - -------- - register - register_globals - unregister - """ frames = dict(frames or {}) frames.update(named_frames) @@ -438,6 +438,12 @@ def unregister(self, names: str | Collection[str]) -> Self: >>> ctx.tables() ['tbl0'] + See Also + -------- + register + register_globals + register_many + Examples -------- >>> df0 = pl.DataFrame({"ints": [9, 8, 7, 6, 5]}) @@ -457,12 +463,6 @@ def unregister(self, names: str | Collection[str]) -> Self: >>> ctx.unregister("test2").tables() [] - See Also - -------- - register - register_globals - register_many - """ if isinstance(names, str): names = [names] From 628c9eb0f4e269e89dd6d03e1574679e7a65b455 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 17:57:22 +0200 Subject: [PATCH 40/55] feat(rust, python): support broadcasting in list set operations (#10668) --- .../polars-ops/src/chunked_array/list/sets.rs | 199 ++++++++++++++---- .../polars-plan/src/dsl/function_expr/list.rs | 2 +- crates/polars-plan/src/dsl/list.rs | 6 +- py-polars/src/expr/list.rs | 6 +- py-polars/tests/unit/namespaces/test_list.py | 21 ++ 5 files changed, 184 insertions(+), 50 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index fe3fff9a3a78..6029da06248c 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -36,17 +36,19 @@ impl<'a> MaterializeValues> for MutableBinaryArray { } } -fn set_operation( +fn set_operation( set: &mut PlIndexSet, set2: &mut PlIndexSet, a: I, - b: I, + b: J, out: &mut R, set_op: SetOperation, + broadcast_rhs: bool, ) -> usize where K: Eq + Hash + Copy, I: IntoIterator, + J: IntoIterator, R: MaterializeValues, { set.clear(); @@ -55,9 +57,12 @@ where match set_op { SetOperation::Intersection => { - set2.clear(); set.extend(a); - set2.extend(b); + // If broadcast `set2` should already be filled. + if !broadcast_rhs { + set2.clear(); + set2.extend(b); + } out.extend_buf(set.intersection(set2).copied()) }, SetOperation::Union => { @@ -73,11 +78,14 @@ where out.extend_buf(set.drain(..)) }, SetOperation::SymmetricDifference => { - set2.clear(); + // If broadcast `set2` should already be filled. 
+ if !broadcast_rhs { + set2.clear(); + set2.extend(b); + } // We could speed this up, but implementing ourselves, but we need to have a clonable // iterator as we need 2 passes set.extend(a); - set2.extend(b); out.extend_buf(set.symmetric_difference(set2).copied()) }, } @@ -115,14 +123,15 @@ fn primitive( offsets_b: &[i64], set_op: SetOperation, validity: Option, -) -> ListArray +) -> PolarsResult> where T: NativeType + Hash + Copy + Eq, { - assert_eq!(offsets_a.len(), offsets_b.len()); + let broadcast_lhs = offsets_a.len() == 2; + let broadcast_rhs = offsets_b.len() == 2; let mut set = Default::default(); - let mut set2 = Default::default(); + let mut set2: PlIndexSet> = Default::default(); let mut values_out = MutablePrimitiveArray::with_capacity(std::cmp::max( *offsets_a.last().unwrap(), @@ -131,7 +140,15 @@ where let mut offsets = Vec::with_capacity(std::cmp::max(offsets_a.len(), offsets_b.len())); offsets.push(0i64); - for i in 1..offsets_a.len() { + if broadcast_rhs { + set2.extend(b.into_iter().map(copied_opt)); + } + let offsets_slice = if offsets_a.len() > offsets_b.len() { + offsets_a + } else { + offsets_b + }; + for i in 1..offsets_slice.len() { unsafe { let start_a = *offsets_a.get_unchecked(i - 1) as usize; let end_a = *offsets_a.get_unchecked(i) as usize; @@ -139,20 +156,67 @@ where let start_b = *offsets_b.get_unchecked(i - 1) as usize; let end_b = *offsets_b.get_unchecked(i) as usize; - // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount - let a_iter = a - .into_iter() - .skip(start_a) - .take(end_a - start_a) - .map(copied_opt); - let b_iter = b - .into_iter() - .skip(start_b) - .take(end_b - start_b) - .map(copied_opt); - - let offset = - set_operation(&mut set, &mut set2, a_iter, b_iter, &mut values_out, set_op); + // The branches are the same every loop. + // We rely on branch prediction here. 
+ let offset = if broadcast_rhs { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a + .into_iter() + .skip(start_a) + .take(end_a - start_a) + .map(copied_opt); + let b_iter = b.into_iter().map(copied_opt); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + true, + ) + } else if broadcast_lhs { + let a_iter = a.into_iter().map(copied_opt); + + let b_iter = b + .into_iter() + .skip(start_b) + .take(end_b - start_b) + .map(copied_opt); + + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + } else { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a + .into_iter() + .skip(start_a) + .take(end_a - start_a) + .map(copied_opt); + + let b_iter = b + .into_iter() + .skip(start_b) + .take(end_b - start_b) + .map(copied_opt); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + }; + offsets.push(offset as i64); } } @@ -160,7 +224,7 @@ where let dtype = ListArray::::default_datatype(values_out.data_type().clone()); let values: PrimitiveArray = values_out.into(); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } fn binary( @@ -171,11 +235,11 @@ fn binary( set_op: SetOperation, validity: Option, as_utf8: bool, -) -> ListArray { - assert_eq!(offsets_a.len(), offsets_b.len()); - +) -> PolarsResult> { + let broadcast_lhs = offsets_a.len() == 2; + let broadcast_rhs = offsets_b.len() == 2; let mut set = Default::default(); - let mut set2 = Default::default(); + let mut set2: PlIndexSet> = Default::default(); let mut values_out = MutableBinaryArray::with_capacity(std::cmp::max( *offsets_a.last().unwrap(), @@ -184,7 +248,15 @@ fn binary( let mut offsets = Vec::with_capacity(std::cmp::max(offsets_a.len(), offsets_b.len())); offsets.push(0i64); - for i in 1..offsets_a.len() { + if broadcast_rhs { + set2.extend(b); + } + let offsets_slice = if offsets_a.len() > offsets_b.len() { + offsets_a + } else { + offsets_b + }; + for i in 1..offsets_slice.len() { unsafe { let start_a = *offsets_a.get_unchecked(i - 1) as usize; let end_a = *offsets_a.get_unchecked(i) as usize; @@ -192,12 +264,47 @@ fn binary( let start_b = *offsets_b.get_unchecked(i - 1) as usize; let end_b = *offsets_b.get_unchecked(i) as usize; - // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount - let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); - let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); - - let offset = - set_operation(&mut set, &mut set2, a_iter, b_iter, &mut values_out, set_op); + // The branches are the same every loop. + // We rely on branch prediction here. 
+ let offset = if broadcast_rhs { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); + let b_iter = b.into_iter(); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + true, + ) + } else if broadcast_lhs { + let a_iter = a.into_iter(); + let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + } else { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); + let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + }; offsets.push(offset as i64); } } @@ -214,10 +321,10 @@ fn binary( ) }; let dtype = ListArray::::default_datatype(values.data_type().clone()); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } else { let dtype = ListArray::::default_datatype(values.data_type().clone()); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } } @@ -234,7 +341,7 @@ fn array_set_operation( a: &ListArray, b: &ListArray, set_op: SetOperation, -) -> ListArray { +) -> PolarsResult> { let offsets_a = a.offsets().as_slice(); let offsets_b = b.offsets().as_slice(); @@ -266,7 +373,7 @@ fn array_set_operation( binary(a, b, offsets_a, offsets_b, set_op, validity, false) }, ArrowDataType::Boolean => { - todo!("boolean type not yet supported in list union operations") + polars_bail!(InvalidOperation: "boolean type not yet supported in list 'set' operations") }, _ => { with_match_physical_integer_type!(dtype.into(), |$T| { @@ -279,13 +386,19 @@ fn array_set_operation( } } -pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { +pub fn list_set_operation( + a: &ListChunked, + b: &ListChunked, + set_op: SetOperation, +) -> PolarsResult { + polars_ensure!(a.len() == b.len() || b.len() == 1 || a.len() == 1, ShapeMismatch: "column lengths don't match"); + // we use the unsafe variant because we want to keep the nested logical types type. 
unsafe { - arity::binary_unchecked_same_type( + arity::try_binary_unchecked_same_type( a, b, - |a, b| array_set_operation(a, b, set_op).boxed(), + |a, b| array_set_operation(a, b, set_op).map(|arr| arr.boxed()), false, false, ) diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index 1aed68b37cd9..a2f4dca9f007 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -258,7 +258,7 @@ pub(super) fn sum(s: &Series) -> PolarsResult { pub(super) fn set_operation(s: &[Series], set_type: SetOperation) -> PolarsResult { let s0 = &s[0]; let s1 = &s[1]; - Ok(list_set_operation(s0.list()?, s1.list()?, set_type).into_series()) + list_set_operation(s0.list()?, s1.list()?, set_type).map(|ca| ca.into_series()) } #[cfg(feature = "list_any_all")] diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 6299dc3d58ef..8665c76c03d5 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -349,21 +349,21 @@ impl ListNameSpace { /// Return the SET DIFFERENCE between both list arrays. #[cfg(feature = "list_sets")] - pub fn difference>(self, other: E) -> Expr { + pub fn set_difference>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::Difference) } /// Return the SET INTERSECTION between both list arrays. #[cfg(feature = "list_sets")] - pub fn intersection>(self, other: E) -> Expr { + pub fn set_intersection>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::Intersection) } /// Return the SET SYMMETRIC DIFFERENCE between both list arrays. #[cfg(feature = "list_sets")] - pub fn symmetric_difference>(self, other: E) -> Expr { + pub fn set_symmetric_difference>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::SymmetricDifference) } diff --git a/py-polars/src/expr/list.rs b/py-polars/src/expr/list.rs index b005e1bf1881..1838bd291903 100644 --- a/py-polars/src/expr/list.rs +++ b/py-polars/src/expr/list.rs @@ -158,10 +158,10 @@ impl PyExpr { fn list_set_operation(&self, other: PyExpr, operation: Wrap) -> Self { let e = self.inner.clone().list(); match operation.0 { - SetOperation::Intersection => e.intersection(other.inner), - SetOperation::Difference => e.difference(other.inner), + SetOperation::Intersection => e.set_intersection(other.inner), + SetOperation::Difference => e.set_difference(other.inner), SetOperation::Union => e.union(other.inner), - SetOperation::SymmetricDifference => e.symmetric_difference(other.inner), + SetOperation::SymmetricDifference => e.set_symmetric_difference(other.inner), } .into() } diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 7a9cbd7505b8..786e2f6290de 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ -514,6 +514,27 @@ def test_list_set_operations() -> None: assert r2 == exp +def test_list_set_operations_broadcast() -> None: + df = pl.DataFrame( + { + "a": [[2, 3, 3], [3, 1], [1, 2, 3]], + } + ) + + assert df.with_columns( + pl.col("a").list.set_intersection(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[2], [1], [1, 2]]} + assert df.with_columns( + pl.col("a").list.set_union(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[2, 3, 1], [3, 1, 2], [1, 2, 3]]} + assert df.with_columns( + 
pl.col("a").list.set_difference(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[3], [3], [3]]} + assert df.with_columns( + pl.lit(pl.Series("a", [[1, 2]])).list.set_difference("a") + ).to_dict(False) == {"a": [[1], [2], []]} + + def test_list_take_oob_10079() -> None: df = pl.DataFrame( { From d0cd5234125b7e9430ff00a0629c636dc57edf20 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 18:10:19 +0200 Subject: [PATCH 41/55] refactor(python): Remove `deprecate_renamed_methods` util (#10537) --- .../reference/dataframe/descriptive.rst | 2 + .../source/reference/expressions/list.rst | 4 + .../reference/lazyframe/miscellaneous.rst | 1 + .../reference/lazyframe/modify_select.rst | 2 + .../docs/source/reference/series/list.rst | 4 + py-polars/polars/dataframe/frame.py | 45 ++++++++++-- py-polars/polars/expr/list.py | 68 ++++++++++++----- py-polars/polars/lazyframe/frame.py | 46 +++++++++--- py-polars/polars/series/list.py | 68 ++++++++++++----- py-polars/polars/utils/deprecation.py | 73 +------------------ py-polars/tests/unit/test_serde.py | 2 +- .../tests/unit/utils/test_deprecation.py | 25 ------- 12 files changed, 189 insertions(+), 151 deletions(-) diff --git a/py-polars/docs/source/reference/dataframe/descriptive.rst b/py-polars/docs/source/reference/dataframe/descriptive.rst index eab9cb1248f7..42e7b0ed7789 100644 --- a/py-polars/docs/source/reference/dataframe/descriptive.rst +++ b/py-polars/docs/source/reference/dataframe/descriptive.rst @@ -6,6 +6,8 @@ Descriptive .. autosummary:: :toctree: api/ + DataFrame.approx_n_unique + DataFrame.approx_unique DataFrame.describe DataFrame.glimpse DataFrame.estimated_size diff --git a/py-polars/docs/source/reference/expressions/list.rst b/py-polars/docs/source/reference/expressions/list.rst index 9d87a6c831a3..2710b8d56c80 100644 --- a/py-polars/docs/source/reference/expressions/list.rst +++ b/py-polars/docs/source/reference/expressions/list.rst @@ -17,11 +17,13 @@ The following methods are available under the `expr.list` attribute. Expr.list.contains Expr.list.count_match Expr.list.diff + Expr.list.difference Expr.list.eval Expr.list.explode Expr.list.first Expr.list.get Expr.list.head + Expr.list.intersection Expr.list.join Expr.list.last Expr.list.lengths @@ -37,7 +39,9 @@ The following methods are available under the `expr.list` attribute. Expr.list.slice Expr.list.sort Expr.list.sum + Expr.list.symmetric_difference Expr.list.tail Expr.list.take Expr.list.to_struct + Expr.list.union Expr.list.unique diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index 35e385f3430a..77051b2dd589 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -24,3 +24,4 @@ Read/write logical plan LazyFrame.from_json LazyFrame.read_json LazyFrame.serialize + LazyFrame.write_json diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index 2257467fb127..5fa34dea8ad0 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -6,6 +6,8 @@ Manipulation/selection .. 
autosummary:: :toctree: api/ + LazyFrame.approx_n_unique + LazyFrame.approx_unique LazyFrame.bottom_k LazyFrame.clear LazyFrame.clone diff --git a/py-polars/docs/source/reference/series/list.rst b/py-polars/docs/source/reference/series/list.rst index 9f29aab5181d..46942ab076b9 100644 --- a/py-polars/docs/source/reference/series/list.rst +++ b/py-polars/docs/source/reference/series/list.rst @@ -17,12 +17,14 @@ The following methods are available under the `Series.list` attribute. Series.list.contains Series.list.count_match Series.list.diff + Series.list.difference Series.list.eval Series.list.explode Series.list.first Series.list.get Series.list.head Series.list.join + Series.list.intersection Series.list.last Series.list.lengths Series.list.max @@ -37,7 +39,9 @@ The following methods are available under the `Series.list` attribute. Series.list.slice Series.list.sort Series.list.sum + Series.list.symmetric_difference Series.list.tail Series.list.take Series.list.to_struct + Series.list.union Series.list.unique diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 383354272843..72ee90dce7aa 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -85,7 +85,6 @@ from polars.utils.deprecation import ( deprecate_function, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, ) from polars.utils.various import ( @@ -179,10 +178,6 @@ P = ParamSpec("P") -@deprecate_renamed_methods( - mapping={"approx_unique": "approx_n_unique"}, - versions={"approx_unique": "0.18.12"}, -) class DataFrame: """ Two-dimensional data structure representing data as a table with rows and columns. @@ -8560,9 +8555,47 @@ def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> i struct_fields = F.all() if (subset is None) else subset expr = F.struct(struct_fields) # type: ignore[call-overload] - df = self.lazy().select(expr.n_unique()).collect() + df = self.lazy().select(expr.n_unique()).collect(no_optimization=True) return 0 if df.is_empty() else df.row(0)[0] + def approx_n_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + """ + return self.lazy().approx_n_unique().collect(no_optimization=True) + + @deprecate_renamed_function("approx_n_unique", version="0.18.12") + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + return self.approx_n_unique() + def rechunk(self) -> Self: """ Rechunk the data in this DataFrame to a contiguous allocation. 
diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 269366c413e8..5f0e491c2abb 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -7,7 +7,7 @@ from polars import functions as F from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.deprecation import deprecate_renamed_methods +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: from datetime import date, datetime, time @@ -16,20 +16,6 @@ from polars.type_aliases import IntoExpr, NullBehavior, ToStructStrategy -@deprecate_renamed_methods( - { - "difference": "set_difference", - "symmetric_difference": "set_symmetric_difference", - "intersection": "set_intersection", - "union": "set_union", - }, - versions={ - "difference": "0.18.10", - "symmetric_difference": "0.18.10", - "intersection": "0.18.10", - "union": "0.18.10", - }, -) class ExprListNameSpace: """Namespace for list related expressions.""" @@ -893,7 +879,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr: """ return wrap_expr(self._pyexpr.list_eval(expr._pyexpr, parallel)) - def set_union(self, other: Expr | IntoExpr) -> Expr: + def set_union(self, other: IntoExpr) -> Expr: """ Compute the SET UNION between the elements in this list and the elements of ``other``. @@ -929,7 +915,7 @@ def set_union(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "union")) - def set_difference(self, other: Expr | IntoExpr) -> Expr: + def set_difference(self, other: IntoExpr) -> Expr: """ Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. @@ -967,7 +953,7 @@ def set_difference(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "difference")) - def set_intersection(self, other: Expr | IntoExpr) -> Expr: + def set_intersection(self, other: IntoExpr) -> Expr: """ Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. @@ -1003,7 +989,7 @@ def set_intersection(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "intersection")) - def set_symmetric_difference(self, other: Expr | IntoExpr) -> Expr: + def set_symmetric_difference(self, other: IntoExpr) -> Expr: """ Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. @@ -1038,3 +1024,47 @@ def set_symmetric_difference(self, other: Expr | IntoExpr) -> Expr: """ # noqa: W505. other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "symmetric_difference")) + + @deprecate_renamed_function("set_union", version="0.18.10") + def union(self, other: IntoExpr) -> Expr: + """ + Compute the SET UNION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_union``. + + """ # noqa: W505 + return self.set_union(other) + + @deprecate_renamed_function("set_difference", version="0.18.10") + def difference(self, other: IntoExpr) -> Expr: + """ + Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_difference``. 
+ + """ # noqa: W505 + return self.set_difference(other) + + @deprecate_renamed_function("set_intersection", version="0.18.10") + def intersection(self, other: IntoExpr) -> Expr: + """ + Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_intersection``. + + """ # noqa: W505 + return self.set_intersection(other) + + @deprecate_renamed_function("set_symmetric_difference", version="0.18.10") + def symmetric_difference(self, other: IntoExpr) -> Expr: + """ + Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_symmetric_difference``. + + """ # noqa: W505 + return self.set_symmetric_difference(other) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 3788181da79a..fefe12c27ef0 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -60,7 +60,6 @@ from polars.utils.deprecation import ( deprecate_function, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, ) from polars.utils.various import ( @@ -116,16 +115,6 @@ P = ParamSpec("P") -@deprecate_renamed_methods( - mapping={ - "approx_unique": "approx_n_unique", - "write_json": "serialize", - }, - versions={ - "approx_unique": "0.18.12", - "write_json": "0.18.12", - }, -) class LazyFrame: """ Representation of a Lazy computation graph/query against a DataFrame. @@ -869,6 +858,30 @@ def serialize(self, file: IOBase | str | Path | None = None) -> str | None: self._ldf.serialize(file) return None + @overload + def write_json(self, file: None = ...) -> str: + ... + + @overload + def write_json(self, file: IOBase | str | Path) -> None: + ... + + @deprecate_renamed_function("serialize", version="0.18.12") + def write_json(self, file: IOBase | str | Path | None = None) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + return self.serialize(file) + def pipe( self, function: Callable[Concatenate[LazyFrame, P], T], @@ -4150,6 +4163,17 @@ def approx_n_unique(self) -> Self: """ return self.select(F.all().approx_n_unique()) + @deprecate_renamed_function("approx_n_unique", version="0.18.12") + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + return self.approx_n_unique() + def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ Add a column at index 0 that counts the rows. 
diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index a97c6a36a0c5..0288c60a4471 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -5,7 +5,7 @@ from polars import functions as F from polars.series.utils import expr_dispatch from polars.utils._wrap import wrap_s -from polars.utils.deprecation import deprecate_renamed_methods +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: from datetime import date, datetime, time @@ -16,20 +16,6 @@ @expr_dispatch -@deprecate_renamed_methods( - { - "difference": "set_difference", - "symmetric_difference": "set_symmetric_difference", - "intersection": "set_intersection", - "union": "set_union", - }, - versions={ - "difference": "0.18.10", - "symmetric_difference": "0.18.10", - "intersection": "0.18.10", - "union": "0.18.10", - }, -) class ListNameSpace: """Namespace for list related methods.""" @@ -592,7 +578,7 @@ def set_union(self, other: Series) -> Series: [5, 6, 7, 8] ] - """ # noqa: W505. + """ # noqa: W505 def set_difference(self, other: Series) -> Series: """ @@ -621,7 +607,7 @@ def set_difference(self, other: Series) -> Series: [5, 7] ] - """ # noqa: W505. + """ # noqa: W505 def set_intersection(self, other: Series) -> Series: """ @@ -646,7 +632,7 @@ def set_intersection(self, other: Series) -> Series: [6] ] - """ # noqa: W505. + """ # noqa: W505 def set_symmetric_difference(self, other: Series) -> Series: """ @@ -657,4 +643,48 @@ def set_symmetric_difference(self, other: Series) -> Series: other Right hand side of the set operation. - """ # noqa: W505. + """ # noqa: W505 + + @deprecate_renamed_function("set_union", version="0.18.10") + def union(self, other: Series) -> Series: + """ + Compute the SET UNION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_union``. + + """ # noqa: W505 + return self.set_union(other) + + @deprecate_renamed_function("set_difference", version="0.18.10") + def difference(self, other: Series) -> Series: + """ + Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_difference``. + + """ # noqa: W505 + return self.set_difference(other) + + @deprecate_renamed_function("set_intersection", version="0.18.10") + def intersection(self, other: Series) -> Series: + """ + Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_intersection``. + + """ # noqa: W505 + return self.set_intersection(other) + + @deprecate_renamed_function("set_symmetric_difference", version="0.18.10") + def symmetric_difference(self, other: Series) -> Series: + """ + Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_symmetric_difference``. 
+ + """ # noqa: W505 + return self.set_symmetric_difference(other) diff --git a/py-polars/polars/utils/deprecation.py b/py-polars/polars/utils/deprecation.py index 1307b4a49c4e..db1948cf8913 100644 --- a/py-polars/polars/utils/deprecation.py +++ b/py-polars/polars/utils/deprecation.py @@ -2,8 +2,8 @@ import inspect import warnings -from functools import partial, wraps -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from functools import wraps +from typing import TYPE_CHECKING, Callable, TypeVar from polars.utils.various import find_stacklevel @@ -58,15 +58,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: def deprecate_renamed_function( new_name: str, *, version: str ) -> Callable[[Callable[P, T]], Callable[P, T]]: - """ - Decorator to mark a function as deprecated due to being renamed. - - Notes - ----- - For deprecating renamed class methods, use the ``deprecate_renamed_methods`` - class decorator instead. - - """ + """Decorator to mark a function as deprecated due to being renamed.""" return deprecate_function(f"It has been renamed to `{new_name}`.", version=version) @@ -119,65 +111,6 @@ def _rename_keyword_argument( kwargs[new_name] = kwargs.pop(old_name) -def deprecate_renamed_methods( - mapping: dict[str, str | tuple[str, dict[str, Any]]], *, versions: dict[str, str] -) -> Callable[[type[T]], type[T]]: - """ - Class decorator to mark methods as deprecated due to being renamed. - - This allows for the deprecated method to be deleted. It will remain available - to users, but will no longer show up in auto-complete suggestions. - - If the arguments of the method are being renamed as well, use in conjunction with - `deprecate_renamed_parameter`. - - If the new method has different default values for some keyword arguments, supply - the old default values as a dictionary in the mapping like so:: - - @deprecate_renamed_methods( - {"old_method": ("new_method", {"flag": False})}, - versions={"old_method": "1.0.0"}, - ) - class Foo: - def new_method(flag=True): - ... - - Parameters - ---------- - mapping - Mapping of deprecated method names to new method names. - versions - For each deprecated method name, the Polars version number in which it was - deprecated. This argument is used to help developers determine when to remove - the deprecated functionality. - - """ - - def _redirecting_getattr_(obj: T, item: Any) -> Any: - if isinstance(item, str) and item in mapping: - new_item = mapping[item] - new_item_name = new_item if isinstance(new_item, str) else new_item[0] - class_name = type(obj).__name__ - issue_deprecation_warning( - f"`{class_name}.{item}` is deprecated." 
- f" It has been renamed to `{class_name}.{new_item_name}`.", - version=versions[item], - ) - item = new_item_name - - attr = obj.__getattribute__(item) - if isinstance(new_item, tuple): - attr = partial(attr, **new_item[1]) - return attr - - def decorate(cls: type[T]) -> type[T]: - # note: __getattr__ is only invoked if item isn't found on the class - cls.__getattr__ = _redirecting_getattr_ # type: ignore[attr-defined] - return cls - - return decorate - - def deprecate_nonkeyword_arguments( allowed_args: list[str] | None = None, message: str | None = None, *, version: str ) -> Callable[[Callable[P, T]], Callable[P, T]]: diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index f5bee11eb449..4d6a245c20a0 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -30,7 +30,7 @@ def test_lazyframe_deprecated_serde() -> None: lf = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}).lazy().select(pl.col("a")) with pytest.deprecated_call(): - json = lf.write_json() # type: ignore[attr-defined] + json = lf.write_json() with pytest.deprecated_call(): result_from = pl.LazyFrame.from_json(json) with pytest.deprecated_call(): diff --git a/py-polars/tests/unit/utils/test_deprecation.py b/py-polars/tests/unit/utils/test_deprecation.py index 7929f1d466ca..4a489660ba3e 100644 --- a/py-polars/tests/unit/utils/test_deprecation.py +++ b/py-polars/tests/unit/utils/test_deprecation.py @@ -9,7 +9,6 @@ deprecate_function, deprecate_nonkeyword_arguments, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, issue_deprecation_warning, warn_closed_future_change, @@ -52,30 +51,6 @@ def hello(oof: str, rab: str, ham: str) -> None: assert "rab" in str(recwarn[1].message) -def test_deprecate_renamed_methods() -> None: - # one-to-one redirection - @deprecate_renamed_methods({"foo": "bar"}, versions={"foo": "1.0.0"}) - class DemoClass1: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - with pytest.deprecated_call(): - result = DemoClass1().foo() # type: ignore[attr-defined] - assert result == "baz" - - # redirection with **kwargs - @deprecate_renamed_methods( - {"foo": ("bar", {"upper": True})}, versions={"foo": "1.0.0"} - ) - class DemoClass2: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - with pytest.deprecated_call(): - result = DemoClass2().foo() # type: ignore[attr-defined] - assert result == "BAZ" - - class Foo: # noqa: D101 @deprecate_nonkeyword_arguments(allowed_args=["self", "baz"], version="0.1.2") def bar( # noqa: D102 From 641c5d7d1acf74c4150b3938769ba6bf0dc8dccf Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 12:29:20 +0800 Subject: [PATCH 42/55] fix(rust, python): Sorted Utf8Chunked max_str and min_str should consider null value (#10675) --- .../src/chunked_array/ops/aggregate/mod.rs | 32 ++++++++++++++++--- py-polars/tests/unit/series/test_series.py | 8 +++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index 5ed07614951a..f020ed81f7c9 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -450,8 +450,20 @@ impl Utf8Chunked { return None; } match self.is_sorted_flag() { - IsSorted::Ascending => self.get(self.len() - 1), - IsSorted::Descending => self.get(0), + IsSorted::Ascending => { + 
self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, IsSorted::Not => self .downcast_iter() .filter_map(compute::aggregate::max_string) @@ -463,8 +475,20 @@ impl Utf8Chunked { return None; } match self.is_sorted_flag() { - IsSorted::Ascending => self.get(0), - IsSorted::Descending => self.get(self.len() - 1), + IsSorted::Ascending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, IsSorted::Not => self .downcast_iter() .filter_map(compute::aggregate::min_string) diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index d8b6336ac9ed..4db1b6fd85af 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -980,6 +980,14 @@ def test_fill_null() -> None: assert out.dtypes == [pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64] +def test_utf8_series_min_max_10674() -> None: + utf8_series = pl.Series("b", ["a", None, "c", None, "e"], dtype=pl.Utf8) + assert utf8_series.min() == "a" + assert utf8_series.max() == "e" + assert utf8_series.sort(descending=False).min() == "a" + assert utf8_series.sort(descending=True).max() == "e" + + def test_fill_nan() -> None: nan = float("nan") a = pl.Series("a", [1.0, nan, 2.0, nan, 3.0]) From 37531d5409d34ca76f0624dd2a05df94a5661b62 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 13:59:46 +0800 Subject: [PATCH 43/55] feat(rust, python): Support min and max strategy for binary & str columns fill null (#10673) --- .../src/chunked_array/ops/aggregate/mod.rs | 74 +++++++++++++++---- .../src/chunked_array/ops/fill_null.rs | 6 ++ py-polars/tests/unit/series/test_series.py | 8 ++ 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index f020ed81f7c9..048c4267a913 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -138,14 +138,14 @@ where IsSorted::Ascending => { self.last_non_null().and_then(|idx| { // Safety: - // first_non_null returns in bound index + // last_non_null returns in bound index unsafe { self.get_unchecked(idx) } }) }, IsSorted::Descending => { self.first_non_null().and_then(|idx| { // Safety: - // last returns in bound index + // first_non_null returns in bound index unsafe { self.get_unchecked(idx) } }) }, @@ -509,27 +509,69 @@ impl ChunkAggSeries for Utf8Chunked { } } +impl BinaryChunked { + pub(crate) fn max_binary(&self) -> Option<&[u8]> { + if self.is_empty() { + return None; + } + match self.is_sorted_flag() { + IsSorted::Ascending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Not => self + .downcast_iter() + 
.filter_map(compute::aggregate::max_binary) + .fold_first_(|acc, v| if acc > v { acc } else { v }), + } + } + + pub(crate) fn min_binary(&self) -> Option<&[u8]> { + if self.is_empty() { + return None; + } + match self.is_sorted_flag() { + IsSorted::Ascending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Not => self + .downcast_iter() + .filter_map(compute::aggregate::min_binary) + .fold_first_(|acc, v| if acc < v { acc } else { v }), + } + } +} + impl ChunkAggSeries for BinaryChunked { fn sum_as_series(&self) -> Series { BinaryChunked::full_null(self.name(), 1).into_series() } fn max_as_series(&self) -> Series { - Series::new( - self.name(), - &[self - .downcast_iter() - .filter_map(compute::aggregate::max_binary) - .fold_first_(|acc, v| if acc > v { acc } else { v })], - ) + Series::new(self.name(), [self.max_binary()]) } fn min_as_series(&self) -> Series { - Series::new( - self.name(), - &[self - .downcast_iter() - .filter_map(compute::aggregate::min_binary) - .fold_first_(|acc, v| if acc < v { acc } else { v })], - ) + Series::new(self.name(), [self.min_binary()]) } } diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index 0e6539f7e5c2..440fb6591ea8 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -363,6 +363,12 @@ fn fill_null_binary(ca: &BinaryChunked, strategy: FillNullStrategy) -> PolarsRes out.rename(ca.name()); Ok(out) }, + FillNullStrategy::Min => { + ca.fill_null_with_values(ca.min_binary().ok_or_else(err_fill_null)?) + }, + FillNullStrategy::Max => { + ca.fill_null_with_values(ca.max_binary().ok_or_else(err_fill_null)?) 
+ }, strat => polars_bail!(InvalidOperation: "fill-null strategy {:?} is not supported", strat), } } diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 4db1b6fd85af..c72bd054f8ce 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -927,6 +927,14 @@ def test_fill_null() -> None: assert a.fill_null(strategy="backward").to_list() == [0.0, 1.0, 2.0, 2.0, 3.0, 3.0] assert a.fill_null(strategy="mean").to_list() == [0.0, 1.0, 1.5, 2.0, 1.5, 3.0] + b = pl.Series("b", ["a", None, "c", None, "e"]) + assert b.fill_null(strategy="min").to_list() == ["a", "a", "c", "a", "e"] + assert b.fill_null(strategy="max").to_list() == ["a", "e", "c", "e", "e"] + + c = pl.Series("c", [b"a", None, b"c", None, b"e"]) + assert c.fill_null(strategy="min").to_list() == [b"a", b"a", b"c", b"a", b"e"] + assert c.fill_null(strategy="max").to_list() == [b"a", b"e", b"c", b"e", b"e"] + df = pl.DataFrame( [ pl.Series("i32", [1, 2, None], dtype=pl.Int32), From 67d5328b77d846927ebb165880792de4c812dc54 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 08:01:38 +0200 Subject: [PATCH 44/55] docs(python): Fix minor issue with `sink_parquet` docs (#10669) --- py-polars/polars/lazyframe/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index fefe12c27ef0..e5b90b51d568 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1838,15 +1838,13 @@ def sink_parquet( - "gzip" : min-level: 0, max-level: 10. - "brotli" : min-level: 0, max-level: 11. - "zstd" : min-level: 1, max-level: 22. - statistics Write statistics to the parquet headers. This requires extra compute. row_group_size Size of the row groups in number of rows. If None (default), the chunks of the `DataFrame` are used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. If None and ``use_pyarrow=True``, the row group size - will be the minimum of the DataFrame size and 64 * 1024 * 1024. + writing speeds. data_pagesize_limit Size limit of individual data pages. 
If not set defaults to 1024 * 1024 bytes From 76820a8c122eb2e853c5b88926a22f044c2c003f Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 23:38:17 +0800 Subject: [PATCH 45/55] fix(rust, python): Set the correct fast_explode flag for ListUtf8ChunkedBuilder (#10684) --- .../src/chunked_array/builder/list/binary.rs | 3 +++ py-polars/tests/unit/operations/test_explode.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/crates/polars-core/src/chunked_array/builder/list/binary.rs b/crates/polars-core/src/chunked_array/builder/list/binary.rs index 0f9610db3d00..00564cdc94d4 100644 --- a/crates/polars-core/src/chunked_array/builder/list/binary.rs +++ b/crates/polars-core/src/chunked_array/builder/list/binary.rs @@ -48,6 +48,9 @@ impl ListUtf8ChunkedBuilder { #[inline] pub(crate) fn append(&mut self, ca: &Utf8Chunked) { + if ca.is_empty() { + self.fast_explode = false; + } let value_builder = self.builder.mut_values(); value_builder.try_extend(ca).unwrap(); self.builder.try_push_valid().unwrap(); diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 4c39eeeefc24..2e5aa0679188 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -315,3 +315,17 @@ def test_explode_array() -> None: for ex in ("a", ~cs.integer()): out = df.explode(ex).collect() # type: ignore[arg-type] assert_frame_equal(out, expected) + + +def test_utf8_list_agg_explode() -> None: + df = pl.DataFrame({"a": [[None], ["b"]]}) + + df = df.select( + pl.col("a").list.eval(pl.element().filter(pl.element().is_not_null())) + ) + assert not df["a"].flags["FAST_EXPLODE"] + + df2 = pl.DataFrame({"a": [[], ["b"]]}) + + assert_frame_equal(df, df2) + assert_frame_equal(df.explode("a"), df2.explode("a")) From 9bfa5b6db9b112328c657651c2def6543ae46473 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 17:44:37 +0200 Subject: [PATCH 46/55] test(python): Update for new pyarrow `13.0.0` behavior (#10691) --- py-polars/polars/dataframe/frame.py | 4 +--- py-polars/tests/unit/test_interop.py | 14 +++++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 72ee90dce7aa..8f021b3eaef6 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -9276,14 +9276,12 @@ def iter_slices(self, n_rows: int = 10_000) -> Iterator[DataFrame]: >>> for frame in df.iter_slices(n_rows=15_000): ... record_batch = frame.to_arrow().to_batches()[0] - ... print(record_batch, "\n<< ", len(record_batch)) + ... print(f"{record_batch.schema}\n<< {len(record_batch)}") ... 
- pyarrow.RecordBatch a: int32 b: date32[day] c: large_string << 15000 - pyarrow.RecordBatch a: int32 b: date32[day] c: large_string diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index a9361633075b..039afb6d8a0b 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -569,13 +569,13 @@ def test_to_pandas() -> None: ) pd_out = df.to_pandas() pd_out_dtypes_expected = [ - np.uint8, - np.float64, - np.float64, - np.dtype("datetime64[ns]"), - np.object_, - np.object_, - np.dtype("datetime64[ns]"), + np.dtype(np.uint8), + np.dtype(np.float64), + np.dtype(np.float64), + np.dtype("datetime64[ms]"), + np.dtype(np.object_), + np.dtype(np.object_), + np.dtype("datetime64[us]"), pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False), pd.CategoricalDtype(categories=["e", "f"], ordered=False), ] From f80e6e018624b8d40b6144179d95d9b74ed607a6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 18:49:54 +0200 Subject: [PATCH 47/55] feat(python): Explicitly implement `Protocol` for interchange classes (#10688) --- py-polars/polars/interchange/buffer.py | 4 +- py-polars/polars/interchange/column.py | 4 +- py-polars/polars/interchange/dataframe.py | 7 +- py-polars/polars/interchange/protocol.py | 142 +++++++++++++++++++--- 4 files changed, 137 insertions(+), 20 deletions(-) diff --git a/py-polars/polars/interchange/buffer.py b/py-polars/polars/interchange/buffer.py index 5ee3b55d7db6..46c6bf12dc8f 100644 --- a/py-polars/polars/interchange/buffer.py +++ b/py-polars/polars/interchange/buffer.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from polars.interchange.protocol import DlpackDeviceType, DtypeKind +from polars.interchange.protocol import Buffer, DlpackDeviceType, DtypeKind from polars.interchange.utils import polars_dtype_to_dtype if TYPE_CHECKING: @@ -11,7 +11,7 @@ from polars import Series -class PolarsBuffer: +class PolarsBuffer(Buffer): """ A buffer object backed by a Polars Series consisting of a single chunk. diff --git a/py-polars/polars/interchange/column.py b/py-polars/polars/interchange/column.py index 8cf81b0b33c8..c7a945b1977f 100644 --- a/py-polars/polars/interchange/column.py +++ b/py-polars/polars/interchange/column.py @@ -4,7 +4,7 @@ from polars.datatypes import Categorical from polars.interchange.buffer import PolarsBuffer -from polars.interchange.protocol import ColumnNullType, DtypeKind, Endianness +from polars.interchange.protocol import Column, ColumnNullType, DtypeKind, Endianness from polars.interchange.utils import polars_dtype_to_dtype from polars.utils._wrap import wrap_s @@ -16,7 +16,7 @@ from polars.interchange.protocol import CategoricalDescription, ColumnBuffers, Dtype -class PolarsColumn: +class PolarsColumn(Column): """ A column object backed by a Polars Series. diff --git a/py-polars/polars/interchange/dataframe.py b/py-polars/polars/interchange/dataframe.py index 2d43a1353901..56ed4337d6f0 100644 --- a/py-polars/polars/interchange/dataframe.py +++ b/py-polars/polars/interchange/dataframe.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING from polars.interchange.column import PolarsColumn +from polars.interchange.protocol import DataFrame as InterchangeDataFrame if TYPE_CHECKING: from collections.abc import Iterator @@ -13,7 +14,7 @@ from polars import DataFrame -class PolarsDataFrame: +class PolarsDataFrame(InterchangeDataFrame): """ A dataframe object backed by a Polars DataFrame. 
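# Editor's note (not part of the patch): the hunks above and below make the
# concrete interchange classes inherit explicitly from the `Protocol` classes
# declared in `polars/interchange/protocol.py`. Below is a minimal, self-contained
# sketch of that pattern; the names `Frame` and `PolarsFrame` are hypothetical
# stand-ins for illustration only, not the real interchange surface.
from __future__ import annotations

from typing import Protocol


class Frame(Protocol):
    """Protocol: declares the required methods, with docstring-only bodies."""

    def num_rows(self) -> int | None:
        """Return the number of rows, if known."""

    def num_columns(self) -> int:
        """Return the number of columns."""


class PolarsFrame(Frame):
    """Concrete class that subclasses the protocol explicitly.

    Runtime behaviour is unchanged, but static type checkers can now verify
    that the implementation matches the protocol's method signatures.
    """

    def __init__(self, shape: tuple[int, int]) -> None:
        self._shape = shape

    def num_rows(self) -> int | None:
        return self._shape[0]

    def num_columns(self) -> int:
        return self._shape[1]


if __name__ == "__main__":
    frame = PolarsFrame((3, 2))
    assert frame.num_rows() == 3
    assert frame.num_columns() == 2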
@@ -27,6 +28,8 @@ class PolarsDataFrame: """ + version = 0 + def __init__(self, df: DataFrame, *, allow_copy: bool = True): self._df = df self._allow_copy = allow_copy @@ -124,7 +127,7 @@ def get_columns(self) -> Iterator[PolarsColumn]: def select_columns(self, indices: Sequence[int]) -> PolarsDataFrame: """ - Create a new DataFrame by selecting a subset of columns by index. + Create a new dataframe by selecting a subset of columns by index. Parameters ---------- diff --git a/py-polars/polars/interchange/protocol.py b/py-polars/polars/interchange/protocol.py index de51804c2f63..4d7a85bfe83b 100644 --- a/py-polars/polars/interchange/protocol.py +++ b/py-polars/polars/interchange/protocol.py @@ -1,7 +1,16 @@ from __future__ import annotations from enum import IntEnum -from typing import TYPE_CHECKING, Literal, Tuple, TypedDict +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + Literal, + Protocol, + Sequence, + Tuple, + TypedDict, +) if TYPE_CHECKING: import sys @@ -15,6 +24,19 @@ from typing_extensions import TypeAlias +class DlpackDeviceType(IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + class DtypeKind(IntEnum): """ Integer enum for data types. @@ -105,19 +127,6 @@ class CategoricalDescription(TypedDict): categories: PolarsColumn -class DlpackDeviceType(IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - - class Endianness: """Enum indicating the byte-order of a data type.""" @@ -125,3 +134,108 @@ class Endianness: BIG = ">" NATIVE = "=" NA = "|" + + +class Buffer(Protocol): + """Interchange buffer object.""" + + @property + def bufsize(self) -> int: + """Buffer size in bytes.""" + + @property + def ptr(self) -> int: + """Pointer to start of the buffer as an integer.""" + + def __dlpack__(self) -> Any: + """Represent this structure as DLPack interface.""" + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """Device type and device ID for where the data in the buffer resides.""" + + +class Column(Protocol): + """Interchange column object.""" + + def size(self) -> int: + """Size of the column in elements.""" + + @property + def offset(self) -> int: + """Offset of the first element with respect to the start of the underlying buffer.""" # noqa: W505 + + @property + def dtype(self) -> Dtype: + """Data type of the column.""" + + @property + def describe_categorical(self) -> CategoricalDescription: + """Description of the categorical data type of the column.""" + + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """Description of the null representation the column uses.""" + + @property + def null_count(self) -> int | None: + """Number of null elements, if known.""" + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the column.""" + + def num_chunks(self) -> int: + """Return the number of chunks the column consists of.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]: + """Return an iterator yielding the column chunks.""" + + def get_buffers(self) -> ColumnBuffers: + """Return a dictionary containing the underlying buffers.""" + + +class DataFrame(Protocol): + """Interchange dataframe object.""" + + @property + def version(self) -> int: + """Version of the protocol.""" + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True 
+ ) -> DataFrame: + """Construct a new dataframe object, potentially changing the parameters.""" + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the dataframe.""" + + def num_columns(self) -> int: + """Return the number of columns in the dataframe.""" + + def num_rows(self) -> int | None: + """Return the number of rows in the dataframe, if available.""" + + def num_chunks(self) -> int: + """Return the number of chunks the dataframe consists of..""" + + def column_names(self) -> Iterable[str]: + """Return the column names.""" + + def get_column(self, i: int) -> Column: + """Return the column at the indicated position.""" + + def get_column_by_name(self, name: str) -> Column: + """Return the column with the given name.""" + + def get_columns(self) -> Iterable[Column]: + """Return an iterator yielding the columns.""" + + def select_columns(self, indices: Sequence[int]) -> DataFrame: + """Create a new dataframe by selecting a subset of columns by index.""" + + def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + """Create a new dataframe by selecting a subset of columns by name.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]: + """Return an iterator yielding the chunks of the dataframe.""" From 18736fa0e7956a0abaa91cf0ac766995a0cb8fae Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 03:42:49 +0800 Subject: [PATCH 48/55] fix(rust, python): re-sort buffer when update window swap the whole buffer (#10696) --- .../src/kernels/rolling/window.rs | 3 +- .../tests/unit/datatypes/test_temporal.py | 31 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/kernels/rolling/window.rs b/crates/polars-arrow/src/kernels/rolling/window.rs index c9083c6cf9ba..8f4c0be1b9fb 100644 --- a/crates/polars-arrow/src/kernels/rolling/window.rs +++ b/crates/polars-arrow/src/kernels/rolling/window.rs @@ -30,7 +30,8 @@ impl<'a, T: NativeType + IsFloat + PartialOrd> SortedBuf<'a, T> { if start >= self.last_end { self.buf.clear(); let new_window = self.slice.get_unchecked(start..end); - self.buf.extend_from_slice(new_window) + self.buf.extend_from_slice(new_window); + sort_buf(&mut self.buf); } else { // remove elements that should leave the window for idx in self.last_start..start { diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 1b6767dc07bc..269b3947bc7d 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -637,6 +637,37 @@ def test_explode_date() -> None: ] +def test_groupy_by_dynamic_median_10695() -> None: + df = pl.DataFrame( + { + "timestamp": pl.date_range( + datetime(2023, 8, 22, 15, 44, 30), + datetime(2023, 8, 22, 15, 48, 50), + "20s", + eager=True, + ), + "foo": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + } + ) + + assert df.group_by_dynamic( + index_column="timestamp", + every="60s", + period="3m", + ).agg( + pl.col("foo").median() + ).to_dict(False) == { + "timestamp": [ + datetime(2023, 8, 22, 15, 44), + datetime(2023, 8, 22, 15, 45), + datetime(2023, 8, 22, 15, 46), + datetime(2023, 8, 22, 15, 47), + datetime(2023, 8, 22, 15, 48), + ], + "foo": [1.0, 1.0, 1.0, 1.0, 1.0], + } + + def test_group_by_dynamic_when_conversion_crosses_dates_7274() -> None: df = ( pl.DataFrame( From 068231592ca1f35b3f363371bba135a03485c709 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 13:30:47 +0800 Subject: [PATCH 49/55] fix(rust, 
python): Reused input series in rolling_apply should not be orderly (#10694) --- .../src/chunked_array/ops/rolling_window.rs | 6 ++++-- py-polars/tests/unit/operations/test_rolling.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/rolling_window.rs b/crates/polars-core/src/chunked_array/ops/rolling_window.rs index 487ecc503624..0bc0267b40dc 100644 --- a/crates/polars-core/src/chunked_array/ops/rolling_window.rs +++ b/crates/polars-core/src/chunked_array/ops/rolling_window.rs @@ -119,9 +119,10 @@ mod inner_mod { unsafe { *ptr = arr_window; } + // reset flags as we reuse this container + series_container.clear_settings(); // ensure the length is correct series_container._get_inner_mut().compute_len(); - let s = if size == options.window_size { f(&series_container.multiply(&weights_series).unwrap()) } else { @@ -166,9 +167,10 @@ mod inner_mod { unsafe { *ptr = arr_window; } + // reset flags as we reuse this container + series_container.clear_settings(); // ensure the length is correct series_container._get_inner_mut().compute_len(); - let s = f(&series_container); let out = self.unpack_series_matching_type(&s)?; builder.append_option(out.get(0)); diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 46ff6c6e5181..4eb6b6c06e8a 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -792,6 +792,20 @@ def test_rolling_window_size_9160() -> None: ).to_list() == [1] +def test_rolling_apply_clear_reuse_series_state_10681() -> None: + df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [0, 1, 11.0, 7, 4, 2, 3, 8]}) + assert df.with_columns( + pl.col("b") + .rolling_apply(lambda s: s.min(), window_size=3, min_periods=2) + .over("a") + .alias("min") + ).to_dict(False) == { + "a": [1, 1, 1, 1, 2, 2, 2, 2], + "b": [0.0, 1.0, 11.0, 7.0, 4.0, 2.0, 3.0, 8.0], + "min": [None, 0.0, 0.0, 1.0, None, 2.0, 2.0, 2.0], + } + + def test_rolling_empty_window_9406() -> None: datecol = pl.Series( "d", From 576ed0d0cd1d710d42b4ea5363f1eee6226be31b Mon Sep 17 00:00:00 2001 From: Marshall Date: Thu, 24 Aug 2023 01:32:05 -0400 Subject: [PATCH 50/55] fix(python): raise exception with invalid `on` arg type for join_asof (#10690) --- py-polars/polars/dataframe/frame.py | 15 +++++++++++ .../tests/unit/operations/test_join_asof.py | 26 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 8f021b3eaef6..47a7af4260ca 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5792,6 +5792,21 @@ def join_asof( f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) + if on is not None: + if not isinstance(on, (str, pl.Expr)): + raise TypeError( + f"expected `on` to be str or Expr, got {type(on).__name__!r}" + ) + else: + if not isinstance(left_on, (str, pl.Expr)): + raise TypeError( + f"expected `left_on` to be str or Expr, got {type(left_on).__name__!r}" + ) + elif not isinstance(right_on, (str, pl.Expr)): + raise TypeError( + f"expected `right_on` to be str or Expr, got {type(right_on).__name__!r}" + ) + return ( self.lazy() .join_asof( diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 6ec901d661fb..2c43c4d10cfc 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ 
b/py-polars/tests/unit/operations/test_join_asof.py @@ -1024,3 +1024,29 @@ def test_join_asof_by_argument_parsing() -> None: ) assert_frame_equal(by_list2, by_list) assert_frame_equal(by_tuple2, by_list) + + +def test_join_asof_invalid_args() -> None: + df1 = pl.DataFrame( + { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + ).set_sorted("a") + df2 = pl.DataFrame( + { + "a": [1, 2, 3], + "c": [1, 2, 3], + } + ).set_sorted("a") + + with pytest.raises(TypeError, match="expected `on` to be str or Expr, got 'list'"): + df1.join_asof(df2, on=["a"]) # type: ignore[arg-type] + with pytest.raises( + TypeError, match="expected `left_on` to be str or Expr, got 'list'" + ): + df1.join_asof(df2, left_on=["a"], right_on="a") # type: ignore[arg-type] + with pytest.raises( + TypeError, match="expected `right_on` to be str or Expr, got 'list'" + ): + df1.join_asof(df2, left_on="a", right_on=["a"]) # type: ignore[arg-type] From a9a87a4ac46138fe0a901910b7c0cb6ed5924373 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 13:43:12 +0800 Subject: [PATCH 51/55] fix(rust, python): Cast small int type when scan csv in streaming mode. (#10679) --- crates/polars-pipe/src/executors/sources/csv.rs | 2 +- py-polars/tests/unit/streaming/test_streaming_io.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 8a6338827828..1053ff1d236c 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -62,7 +62,7 @@ impl CsvSource { let reader = CsvReader::from_path(&path) .unwrap() .has_header(options.has_header) - .with_schema(Some(self.schema.clone())) + .with_dtypes(Some(self.schema.clone())) .with_delimiter(options.delimiter) .with_ignore_errors(options.ignore_errors) .with_skip_rows(options.skip_rows) diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index 888faeb6d2c4..de91a0a36b9e 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -27,3 +27,12 @@ def test_scan_slice_streaming(io_files_path: Path) -> None: foods_file_path = io_files_path / "foods1.csv" df = pl.scan_csv(foods_file_path).head(5).collect(streaming=True) assert df.shape == (5, 4) + + +@pytest.mark.parametrize("dtype", [pl.Int8, pl.UInt8, pl.Int16, pl.UInt16]) +def test_scan_csv_overwrite_small_dtypes( + io_files_path: Path, dtype: pl.DataType +) -> None: + file_path = io_files_path / "foods1.csv" + df = pl.scan_csv(file_path, dtypes={"sugars_g": dtype}).collect(streaming=True) + assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float64, dtype] From abab4970d8e4895309f3f66ccf4c1d8e84cbfdfd Mon Sep 17 00:00:00 2001 From: Vasanthakumar Vijayasekaran Date: Thu, 24 Aug 2023 17:48:53 +0530 Subject: [PATCH 52/55] fix(rust): fix bug when providing custom labels and opting for duplicates in qcut (#10686) --- crates/polars-ops/src/series/ops/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 947aa40891f1..ac49dffb7216 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -127,7 +127,7 @@ pub fn qcut( Some( ll.into_iter() .enumerate() - .filter(|(i, _)| *i == 0 || *i == blen || qbreaks[*i] != qbreaks[i - 1]) + .filter(|(i, _)| *i == 0 || *i == blen - 1 || qbreaks[*i] != qbreaks[i - 1]) 
.unzip::<_, _, Vec<_>, Vec<_>>() .1, ) From a4427dff5aae48d6c8b52ca6c1f3d7e2fb670aed Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 24 Aug 2023 19:12:44 +0200 Subject: [PATCH 53/55] ci: Clear GitHub Actions caches weekly (#10715) --- .github/workflows/clear-caches.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/clear-caches.yml diff --git a/.github/workflows/clear-caches.yml b/.github/workflows/clear-caches.yml new file mode 100644 index 000000000000..f6a001c35419 --- /dev/null +++ b/.github/workflows/clear-caches.yml @@ -0,0 +1,19 @@ +# Clearing caches regularly takes care of Rust caches growing to problematic size over time + +name: Clear caches + +on: + schedule: + - cron: '0 4 * * MON' + workflow_dispatch: + +jobs: + clear-caches: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Clear all caches + run: gh cache delete --all + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 21e8cf0f3f6a781cecf4ee9504e99c1edf3648fc Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 24 Aug 2023 18:15:07 +0100 Subject: [PATCH 54/55] perf(rust, python): parse time zones outside of downcast_iter() in replace_time_zone (#10713) --- crates/polars-arrow/src/kernels/time.rs | 29 ++++--------------- .../datetime/replace_time_zone.rs | 18 ++++++++++-- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/crates/polars-arrow/src/kernels/time.rs b/crates/polars-arrow/src/kernels/time.rs index b9cc64d612cf..b9774d932020 100644 --- a/crates/polars-arrow/src/kernels/time.rs +++ b/crates/polars-arrow/src/kernels/time.rs @@ -7,7 +7,6 @@ use arrow::temporal_conversions::{ }; use chrono::{LocalResult, NaiveDateTime, TimeZone}; use chrono_tz::Tz; -use polars_error::polars_bail; use crate::error::PolarsResult; @@ -37,11 +36,11 @@ fn convert_to_naive_local( } } -fn convert_to_timestamp( - from_tz: Tz, - to_tz: Tz, +pub fn replace_time_zone( arr: &PrimitiveArray, tu: TimeUnit, + from_tz: &Tz, + to_tz: &Tz, use_earliest: Option, ) -> PolarsResult> { let res = match tu { @@ -49,7 +48,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_ms_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_millis()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_millis()) }, ArrowDataType::Int64, ), @@ -57,7 +56,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_us_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_micros()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_micros()) }, ArrowDataType::Int64, ), @@ -65,7 +64,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_ns_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_nanos()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_nanos()) }, ArrowDataType::Int64, ), @@ -73,19 +72,3 @@ fn convert_to_timestamp( }; Ok(res?) 
} - -pub fn replace_time_zone( - arr: &PrimitiveArray, - tu: TimeUnit, - from: &str, - to: &str, - use_earliest: Option, -) -> PolarsResult> { - match from.parse::() { - Ok(from_tz) => match to.parse::() { - Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu, use_earliest), - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", to), - }, - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", from), - } -} diff --git a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs index 77954a64e065..82d2785f82a0 100644 --- a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs +++ b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs @@ -1,16 +1,28 @@ +use chrono_tz::Tz; use polars_arrow::kernels::replace_time_zone as replace_time_zone_kernel; use polars_core::prelude::*; +fn parse_time_zone(s: &str) -> PolarsResult { + s.parse() + .map_err(|e| polars_err!(ComputeError: format!("unable to parse time zone: '{s}': {e}"))) +} + pub fn replace_time_zone( ca: &DatetimeChunked, time_zone: Option<&str>, use_earliest: Option, ) -> PolarsResult { let out: PolarsResult<_> = { - let from = ca.time_zone().as_deref().unwrap_or("UTC"); - let to = time_zone.unwrap_or("UTC"); + let from_tz = parse_time_zone(ca.time_zone().as_deref().unwrap_or("UTC"))?; + let to_tz = parse_time_zone(time_zone.unwrap_or("UTC"))?; let chunks = ca.downcast_iter().map(|arr| { - replace_time_zone_kernel(arr, ca.time_unit().to_arrow(), from, to, use_earliest) + replace_time_zone_kernel( + arr, + ca.time_unit().to_arrow(), + &from_tz, + &to_tz, + use_earliest, + ) }); let out = ChunkedArray::try_from_chunk_iter(ca.name(), chunks)?; Ok(out.into_datetime(ca.time_unit(), time_zone.map(|x| x.to_string()))) From 5d1b28a227cc6da70e94d359bd5e862e49f2a7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9?= Date: Fri, 25 Aug 2023 04:20:33 +0200 Subject: [PATCH 55/55] doc(rust): Fix typo in `upsample` docs (#8285) --- crates/polars-time/src/upsample.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index d6cde94ee98d..bd170a6bc213 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -19,7 +19,7 @@ pub trait PolarsUpsample { /// * `every` - interval will start 'every' duration /// * `offset` - change the start of the date_range by this offset. /// - /// The `period` and `offset` arguments are created with + /// The `every` and `offset` arguments are created with /// the following string language: /// - 1ns (1 nanosecond) /// - 1us (1 microsecond) @@ -33,11 +33,14 @@ pub trait PolarsUpsample { /// - 1q (1 calendar quarter) /// - 1y (1 calendar year) /// - 1i (1 index count) + /// /// Or combine them: /// "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + /// /// Suffix with `"_saturating"` to saturate dates with days too /// large for their month to the last day of the month (e.g. /// 2022-02-29 to 2022-02-28). + /// /// By "calendar day", we mean the corresponding time on the next /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter", @@ -59,7 +62,7 @@ pub trait PolarsUpsample { /// * `every` - interval will start 'every' duration /// * `offset` - change the start of the date_range by this offset. 
/// - /// The `period` and `offset` arguments are created with + /// The `every` and `offset` arguments are created with /// the following string language: /// - 1ns (1 nanosecond) /// - 1us (1 microsecond) @@ -73,11 +76,14 @@ pub trait PolarsUpsample { /// - 1q (1 calendar quarter) /// - 1y (1 calendar year) /// - 1i (1 index count) + /// /// Or combine them: /// "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + /// /// Suffix with `"_saturating"` to saturate dates with days too /// large for their month to the last day of the month (e.g. /// 2022-02-29 to 2022-02-28). + /// /// By "calendar day", we mean the corresponding time on the next /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter",
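# Editor's note (not part of the patch): a hedged usage sketch of the duration
# strings documented in the `upsample` doc comment above, shown from the Python
# side. It assumes a Polars version from around this patch series; exact output
# formatting may differ.
from datetime import datetime

import polars as pl

df = pl.DataFrame(
    {
        "time": [datetime(2021, 2, 1), datetime(2021, 4, 1), datetime(2021, 6, 1)],
        "value": [1, 2, 3],
    }
).sort("time")

# `every="1mo"` uses the calendar-aware "1 calendar month" interval described in
# the doc comment; rows are inserted for the missing months (March, May) with
# null values, which can then be filled, e.g. with a forward fill.
upsampled = df.upsample(time_column="time", every="1mo")
print(upsampled.fill_null(strategy="forward"))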