From 2dbf5cde8dc0cebd8c3092824ff85d37aaba524c Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 17 Aug 2023 23:10:39 +0200 Subject: [PATCH 01/55] chore(rust): bump MSRV to 1.65 (#10568) --- README.md | 2 +- crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a3d6ed5225b..1fee568eb25c 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ point to the `main` branch of this repo. polars = { git = "https://github.com/pola-rs/polars", rev = "" } ``` -Required Rust version `>=1.62` +Required Rust version `>=1.65`. ## Contributing diff --git a/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs b/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs index d9c8927ffdae..efeeb9e183a2 100644 --- a/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs +++ b/crates/polars-arrow/src/kernels/rolling/no_nulls/min_max.rs @@ -168,7 +168,7 @@ macro_rules! minmax_window { }; let empty_overlap = old_last_end <= start; - if entering.is_some_and(|em| $new_is_m(&self.m, em.1) || empty_overlap) { + if entering.map(|em| $new_is_m(&self.m, em.1) || empty_overlap) == Some(true) { // The entering extremum "beats" the previous extremum so we can ignore the overlap self.update_m_and_m_idx(entering.unwrap()); return self.m; From 2a2e25bc8353d4d3de655a078213a8693c6e5b28 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 18 Aug 2023 13:03:24 +0200 Subject: [PATCH 02/55] perf(rust, python): use binary abstraction for atan2 (#10588) --- crates/polars-arrow/src/kernels/atan2.rs | 14 ++++++++++++++ crates/polars-arrow/src/kernels/mod.rs | 1 + .../src/dsl/function_expr/trigonometry.rs | 17 ++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 crates/polars-arrow/src/kernels/atan2.rs diff --git a/crates/polars-arrow/src/kernels/atan2.rs b/crates/polars-arrow/src/kernels/atan2.rs new file mode 100644 index 000000000000..50670dad5696 --- /dev/null +++ b/crates/polars-arrow/src/kernels/atan2.rs @@ -0,0 +1,14 @@ +use arrow::array::PrimitiveArray; +use arrow::compute::arity::binary; +use arrow::types::NativeType; +use num_traits::Float; + +pub fn atan2( + arr_1: &PrimitiveArray, + arr_2: &PrimitiveArray, +) -> PrimitiveArray +where + T: Float, +{ + binary(arr_1, arr_2, arr_1.data_type().clone(), |a, b| a.atan2(b)) +} diff --git a/crates/polars-arrow/src/kernels/mod.rs b/crates/polars-arrow/src/kernels/mod.rs index 64049c4608a5..29f56bc513a3 100644 --- a/crates/polars-arrow/src/kernels/mod.rs +++ b/crates/polars-arrow/src/kernels/mod.rs @@ -4,6 +4,7 @@ use arrow::array::BooleanArray; use arrow::bitmap::utils::BitChunks; #[cfg(feature = "simd")] pub mod agg_mean; +pub mod atan2; #[cfg(feature = "dtype-array")] pub mod comparison; pub mod concatenate; diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 8891fd4daa8f..a07598204858 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -1,6 +1,7 @@ use num::Float; -use polars_arrow::utils::CustomIterTools; +use polars_arrow::kernels::atan2::atan2 as atan2_kernel; use polars_core::export::num; +use polars_core::utils::align_chunks_binary; use super::*; @@ -128,15 +129,13 @@ where Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) } else { + let (ca_1, ca_2) = align_chunks_binary(y, x); + let chunks = ca_1 + .downcast_iter() + 
.zip(ca_2.downcast_iter()) + .map(|(arr_1, arr_2)| atan2_kernel(arr_1, arr_2)); Ok(Some( - y.into_iter() - .zip(x) - .map(|(opt_y, opt_x)| match (opt_y, opt_x) { - (Some(y), Some(x)) => Some(y.atan2(x)), - _ => None, - }) - .collect_trusted::>() - .into_series(), + ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), )) } } From b91cd2d3fa42f80319d23d057fd9691ba474bd61 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 18 Aug 2023 07:54:50 -0400 Subject: [PATCH 03/55] fix(rust): join_asof missing `tolerance` implementation, address edge-cases (#10482) --- .../polars-core/src/frame/asof_join/asof.rs | 95 +++- .../polars-core/src/frame/asof_join/groups.rs | 85 +++- crates/polars-core/src/frame/asof_join/mod.rs | 12 +- py-polars/polars/dataframe/frame.py | 9 +- py-polars/polars/lazyframe/frame.py | 13 +- .../tests/unit/operations/test_join_asof.py | 442 +++++++++++++++++- 6 files changed, 629 insertions(+), 27 deletions(-) diff --git a/crates/polars-core/src/frame/asof_join/asof.rs b/crates/polars-core/src/frame/asof_join/asof.rs index 89a189aca03a..7edbd1372bae 100644 --- a/crates/polars-core/src/frame/asof_join/asof.rs +++ b/crates/polars-core/src/frame/asof_join/asof.rs @@ -1,5 +1,5 @@ use std::fmt::Debug; -use std::ops::Sub; +use std::ops::{Add, Sub}; use num_traits::Bounded; use polars_arrow::index::IdxSize; @@ -182,6 +182,94 @@ pub(super) fn join_asof_backward( out } +pub(super) fn join_asof_nearest_with_tolerance< + T: PartialOrd + Copy + Debug + Sub + Add + Bounded, +>( + left: &[T], + right: &[T], + tolerance: T, +) -> Vec> { + let n_left = left.len(); + + if left.is_empty() { + return Vec::new(); + } + let mut out = Vec::with_capacity(n_left); + if right.is_empty() { + out.extend(std::iter::repeat(None).take(n_left)); + return out; + } + + // If we know the first/last values, we can leave early in many cases. + let n_right = right.len(); + let first_left = left[0]; + let last_left = left[n_left - 1]; + let r_lower_bound = right[0] - tolerance; + let r_upper_bound = right[n_right - 1] + tolerance; + + // If the left and right hand side are disjoint partitions, we can early exit. + if (r_lower_bound > last_left) || (r_upper_bound < first_left) { + out.extend(std::iter::repeat(None).take(n_left)); + return out; + } + + for &val_l in left { + // Detect early exit cases + if val_l < r_lower_bound { + // The left value is too low. + out.push(None); + continue; + } else if val_l > r_upper_bound { + // The left value is too high. Subsequent left values are guaranteed to + // be too high as well, so we can early return. + out.extend(std::iter::repeat(None).take(n_left - out.len())); + return out; + } + + // The left value is contained within the RHS window, so we might have a match. + let mut offset: IdxSize = 0; + let mut dist = tolerance; + let mut found_window = false; + let val_l_upper_bound = val_l + tolerance; + for &val_r in right { + // We haven't reached the window yet; go to next RHS value. + if val_l > val_r + tolerance { + offset += 1; + continue; + } + + // We passed the window without a match, so leave immediately. + if !found_window && (val_r > val_l_upper_bound) { + out.push(None); + break; + } + + // We made it to the window: matches are now possible, start measuring distance. + found_window = true; + let current_dist = if val_l > val_r { + val_l - val_r + } else { + val_r - val_l + }; + if current_dist <= dist { + dist = current_dist; + if offset == (n_right - 1) as IdxSize { + // We're the last item, it's a match. 
+ out.push(Some(offset)); + break; + } + } else { + // We'ved moved farther away, so the last element was the match. + out.push(Some(offset - 1)); + break; + } + offset += 1; + } + } + + out +} + pub(super) fn join_asof_nearest + Bounded>( left: &[T], right: &[T], @@ -189,9 +277,9 @@ pub(super) fn join_asof_nearest + let mut out = Vec::with_capacity(left.len()); let mut offset = 0 as IdxSize; let max_value = ::max_value(); - let mut dist: T = max_value; for &val_l in left { + let mut dist: T = max_value; loop { match right.get(offset as usize) { Some(&val_r) => { @@ -209,9 +297,6 @@ pub(super) fn join_asof_nearest + // distance has increased, we're now farther away, so previous element was closest out.push(Some(offset - 1)); - // reset distance - dist = max_value; - // The next left-item may match on the same item, so we need to rewind the offset offset -= 1; break; diff --git a/crates/polars-core/src/frame/asof_join/groups.rs b/crates/polars-core/src/frame/asof_join/groups.rs index 722e5b779f28..ae27b92fb685 100644 --- a/crates/polars-core/src/frame/asof_join/groups.rs +++ b/crates/polars-core/src/frame/asof_join/groups.rs @@ -1,6 +1,6 @@ use std::fmt::Debug; use std::hash::Hash; -use std::ops::Sub; +use std::ops::{Add, Sub}; use ahash::RandomState; use arrow::types::NativeType; @@ -91,6 +91,69 @@ pub(super) unsafe fn join_asof_forward_with_indirection_and_tolerance< (None, offsets.len()) } +pub(super) unsafe fn join_asof_nearest_with_indirection_and_tolerance< + T: PartialOrd + Copy + Debug + Sub + Add, +>( + val_l: T, + right: &[T], + offsets: &[IdxSize], + tolerance: T, +) -> (Option, usize) { + if offsets.is_empty() { + return (None, 0); + } + + // If we know the first/last values, we can leave early in many cases. + let n_right = offsets.len(); + let r_upper_bound = right[offsets[n_right - 1] as usize] + tolerance; + + // The left value is too high. Subsequent values are guaranteed to be too + // high as well, so we can early return. + if val_l > r_upper_bound { + return (None, n_right - 1); + } + + let mut dist: T = tolerance; + let mut prev_offset: IdxSize = 0; + let mut found_window = false; + for (idx, &offset) in offsets.iter().enumerate() { + let val_r = *right.get_unchecked(offset as usize); + + // We haven't reached the window yet; go to next RHS value. + if val_l > val_r + tolerance { + prev_offset = offset; + continue; + } + + // We passed the window without a match, so leave immediately. + if !found_window && (val_r > val_l + tolerance) { + return (None, n_right - 1); + } + + // We made it to the window: matches are now possible, start measuring distance. + found_window = true; + let current_dist = if val_l > val_r { + val_l - val_r + } else { + val_r - val_l + }; + if current_dist <= dist { + dist = current_dist; + if idx == (n_right - 1) { + // We're the last item, it's a match. + return (Some(offset), idx); + } + prev_offset = offset; + } else { + // We'ved moved farther away, so the last element was the match. + return (Some(prev_offset), idx - 1); + } + } + + // This should be unreachable. 
+ (None, 0) +} + pub(super) unsafe fn join_asof_backward_with_indirection( val_l: T, right: &[T], @@ -167,8 +230,6 @@ pub(super) unsafe fn join_asof_nearest_with_indirection< // candidate for match dist = dist_curr; } else { - // note for a nearest-match, we can re-match on the same val_r next time, - // so we need to rewind the idx by 1 return (Some(prev_offset), idx - 1); } prev_offset = offset; @@ -274,7 +335,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; @@ -408,7 +473,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; @@ -534,7 +603,11 @@ where (None, AsofStrategy::Forward) => { (join_asof_forward_with_indirection, T::Native::zero(), true) }, - (_, AsofStrategy::Nearest) => { + (Some(tolerance), AsofStrategy::Nearest) => { + let tol = tolerance.extract::().unwrap(); + (join_asof_nearest_with_indirection_and_tolerance, tol, false) + }, + (None, AsofStrategy::Nearest) => { (join_asof_nearest_with_indirection, T::Native::zero(), false) }, }; diff --git a/crates/polars-core/src/frame/asof_join/mod.rs b/crates/polars-core/src/frame/asof_join/mod.rs index 30954abb14f9..c496c670d696 100644 --- a/crates/polars-core/src/frame/asof_join/mod.rs +++ b/crates/polars-core/src/frame/asof_join/mod.rs @@ -103,8 +103,16 @@ where ) }, }, - AsofStrategy::Nearest => { - join_asof_nearest(ca.cont_slice().unwrap(), other.cont_slice().unwrap()) + AsofStrategy::Nearest => match tolerance { + None => join_asof_nearest(ca.cont_slice().unwrap(), other.cont_slice().unwrap()), + Some(tolerance) => { + let tolerance = tolerance.extract::().unwrap(); + join_asof_nearest_with_tolerance( + self.cont_slice().unwrap(), + other.cont_slice().unwrap(), + tolerance, + ) + }, }, }; Ok(out) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 736db0185bba..fdf85a85c60c 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5522,7 +5522,7 @@ def join_asof( by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", - tolerance: str | int | float | None = None, + tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, ) -> DataFrame: @@ -5543,7 +5543,8 @@ def join_asof( 'on' key is greater than or equal to the left's key. - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left's key. + is nearest to the left's key. String keys are not currently supported for a + nearest search. The default is "backward". @@ -5571,8 +5572,8 @@ def join_asof( tolerance Numeric tolerance. By setting this the join will only be done if the near keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use the following string - language: + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 7b9f3d0d26df..7079e1ea4f56 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2941,7 +2941,7 @@ def join_asof( by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = "backward", suffix: str = "_right", - tolerance: str | int | float | None = None, + tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, ) -> Self: @@ -2961,8 +2961,9 @@ def join_asof( - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left's key. + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left's key. String keys are not currently supported for a + nearest search. The default is "backward". @@ -2990,8 +2991,8 @@ def join_asof( tolerance Numeric tolerance. By setting this the join will only be done if the near keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time" you use the following string - language: + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) @@ -3091,6 +3092,8 @@ def join_asof( tolerance_num: float | int | None = None if isinstance(tolerance, str): tolerance_str = tolerance + elif isinstance(tolerance, timedelta): + tolerance_str = _timedelta_to_pl_duration(tolerance) else: tolerance_num = tolerance diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index f30827ac36bf..8021d360e92c 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -1,4 +1,4 @@ -from datetime import date, datetime +from datetime import date, datetime, timedelta from typing import Any import numpy as np @@ -426,6 +426,7 @@ def test_asof_join_sorted_by_group(capsys: Any) -> None: def test_asof_join_nearest() -> None: + # Generic join_asof df1 = pl.DataFrame( { "asof_key": [-1, 1, 2, 4, 6], @@ -435,20 +436,170 @@ def test_asof_join_nearest() -> None: df2 = pl.DataFrame( { - "asof_key": [1, 2, 4, 5], + "asof_key": [-1, 2, 4, 5], "b": [1, 2, 3, 4], } ).sort(by="asof_key") expected = pl.DataFrame( - {"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 3, 4]} + {"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 3, 4]} ) out = df1.join_asof(df2, on="asof_key", strategy="nearest") assert_frame_equal(out, expected) + # Edge case: last item of right matches multiples on left + df1 = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "a": [1, 2, 3, 4, 5], + } + ).set_sorted("asof_key") + + df2 = pl.DataFrame( + { + "asof_key": [1, 2, 3, 10], + "b": [1, 2, 3, 4], + } + ).set_sorted("asof_key") + + expected = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "a": [1, 2, 3, 4, 5], + "b": [4, 4, 4, 4, 4], + } + ) + + out = df1.join_asof(df2, on="asof_key", strategy="nearest") + assert_frame_equal(out, expected) 
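# An illustrative, pure-Python sketch of the matching rule that the tolerance
# tests below exercise: for each left key, take the index of the nearest right
# key, but only when the distance is within `tolerance`. This is a sketch, not
# part of the patch; `nearest_within_tolerance` is a hypothetical helper name,
# both key sequences are assumed sorted ascending, and ties on distance go to
# the later right key (roughly the rule implemented by the Rust kernels above).
def nearest_within_tolerance(left, right, tolerance):
    out = []
    for val_l in left:
        best, best_dist = None, tolerance
        for idx, val_r in enumerate(right):
            dist = abs(val_l - val_r)
            if dist <= best_dist:
                # Still inside the window and at least as close: keep this index.
                best, best_dist = idx, dist
            elif val_r > val_l:
                # Right keys only move farther away from here on; stop scanning.
                break
        out.append(best)
    return out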
+ + +def test_asof_join_nearest_with_tolerance() -> None: + a = b = [1, 2, 3, 4, 5] + + nones = pl.Series([None, None, None, None, None], dtype=pl.Int64) + + # Case 1: complete miss + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [7, 8, 9, 10, 11], + "b": b, + } + ).set_sorted("asof_key") + expected = df1.with_columns(nones.alias("b")) + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + assert_frame_equal(out, expected) + + # Case 2: complete miss in other direction + df1 = pl.DataFrame({"asof_key": [7, 8, 9, 10, 11], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [1, 2, 3, 4, 5], + "b": b, + } + ).set_sorted("asof_key") + expected = df1.with_columns(nones.alias("b")) + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + assert_frame_equal(out, expected) + + # Case 3: match first item + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [6, 7, 8, 9, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([None, None, None, None, 1]).alias("b")) + assert_frame_equal(out, expected) + + # Case 4: match last item + df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [-4, -3, -2, -1, 0], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([5, None, None, None, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 5: match multiples, pick closer + df1 = pl.DataFrame( + {"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a} + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [0, 2, 2.4, 3.4, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1) + expected = df1.with_columns(pl.Series([2, 2, 4, 4, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 6: use 0 tolerance + df1 = pl.DataFrame( + {"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a} + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": [0, 2, 2.4, 3.4, 10], + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=0) + expected = df1.with_columns(pl.Series([None, 2, None, None, None]).alias("b")) + assert_frame_equal(out, expected) + + # Case 7: test with datetime + df1 = pl.DataFrame( + { + "asof_key": pl.Series( + [ + datetime(2023, 1, 1), + datetime(2023, 1, 2), + datetime(2023, 1, 3), + datetime(2023, 1, 4), + datetime(2023, 1, 6), + ] + ), + "a": a, + } + ).set_sorted("asof_key") + df2 = pl.DataFrame( + { + "asof_key": pl.Series( + [ + datetime(2022, 1, 1), + datetime(2022, 1, 2), + datetime(2022, 1, 3), + datetime( + 2023, 1, 2, 21, 30, 0 + ), # should match with 2023-01-02, 2023-01-03, and 2021-01-04 + datetime(2023, 1, 7), + ] + ), + "b": b, + } + ).set_sorted("asof_key") + out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance="1d4h") + expected = df1.with_columns(pl.Series([None, 4, 4, 4, 5]).alias("b")) + assert_frame_equal(out, expected) + + # Case 8: test using timedelta tolerance + out = df1.join_asof( + df2, on="asof_key", strategy="nearest", tolerance=timedelta(days=1, hours=4) + ) + assert_frame_equal(out, expected) + def 
test_asof_join_nearest_by() -> None: + # Generic join_asof df1 = pl.DataFrame( { "asof_key": [-1, 1, 2, 6, 1], @@ -459,7 +610,7 @@ def test_asof_join_nearest_by() -> None: df2 = pl.DataFrame( { - "asof_key": [1, 2, 5, 1], + "asof_key": [-1, 2, 5, 1], "group": [1, 1, 2, 2], "b": [1, 2, 3, 4], } @@ -469,11 +620,37 @@ def test_asof_join_nearest_by() -> None: { "asof_key": [-1, 1, 2, 6, 1], "group": [1, 1, 1, 2, 2], + "a": [1, 2, 3, 5, 2], + "b": [1, 2, 2, 4, 3], + } + ).sort(by=["group", "asof_key"]) + + # Edge case: last item of right matches multiples on left + df1 = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "group": [1, 1, 1, 2, 2], "a": [1, 2, 3, 2, 5], - "b": [1, 1, 2, 3, 4], } ).sort(by=["group", "asof_key"]) + df2 = pl.DataFrame( + { + "asof_key": [-1, 1, 1, 10], + "group": [1, 1, 2, 2], + "b": [1, 2, 3, 4], + } + ).sort(by=["group", "asof_key"]) + + expected = pl.DataFrame( + { + "asof_key": [9, 9, 10, 10, 10], + "group": [1, 1, 1, 2, 2], + "a": [1, 2, 3, 2, 5], + "b": [2, 2, 2, 4, 4], + } + ) + out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest") assert_frame_equal(out, expected) @@ -503,6 +680,261 @@ def test_asof_join_nearest_by() -> None: assert_frame_equal(out, expected) +def test_asof_join_nearest_by_with_tolerance() -> None: + df1 = pl.DataFrame( + { + "group": [ + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 6, + ], + "asof_key": pl.Series( + [ + 1, + 2, + 3, + 4, + 5, + 7, + 8, + 9, + 10, + 11, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + dtype=pl.Float32, + ), + "a": [ + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + } + ) + + df2 = pl.DataFrame( + { + "group": [ + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 6, + ], + "asof_key": pl.Series( + [ + 7, + 8, + 9, + 10, + 11, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 5, + -3, + -2, + -1, + 0, + 0, + 2, + 2.4, + 3.4, + 10, + -3, + 3, + 8, + 9, + 10, + ], + dtype=pl.Float32, + ), + "b": [ + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + 1, + 2, + 3, + 4, + 5, + ], + } + ) + + expected = df1.with_columns( + pl.Series( + [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 1, + 5, + None, + None, + 1, + 1, + 2, + 2, + 4, + 4, + None, + None, + 2, + 2, + 2, + None, + ] + ).alias("b") + ) + df1 = df1.sort(by=["group", "asof_key"]) + df2 = df2.sort(by=["group", "asof_key"]) + expected = expected.sort(by=["group", "a"]) + + out = df1.join_asof( + df2, by="group", on="asof_key", strategy="nearest", tolerance=1.0 + ).sort(by=["group", "a"]) + assert_frame_equal(out, expected) + + def test_asof_join_nearest_by_date() -> None: df1 = pl.DataFrame( { From d7bc251c09c59d88ad7a7b8334f95a2054876343 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sat, 19 Aug 2023 10:02:34 +0400 Subject: [PATCH 04/55] chore(python): ensure that `make requirements` fully refreshes unpinned packages/deps (#10591) --- py-polars/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/Makefile b/py-polars/Makefile index dbd16f1428ff..c673c9ed7482 100644 --- a/py-polars/Makefile +++ 
b/py-polars/Makefile @@ -17,9 +17,9 @@ endif .PHONY: requirements requirements: .venv ## Install/refresh all project requirements $(VENV_BIN)/python -m pip install --upgrade pip - $(VENV_BIN)/pip install -r requirements-dev.txt - $(VENV_BIN)/pip install -r requirements-lint.txt - $(VENV_BIN)/pip install -r docs/requirements-docs.txt + $(VENV_BIN)/pip install --upgrade -r requirements-dev.txt + $(VENV_BIN)/pip install --upgrade -r requirements-lint.txt + $(VENV_BIN)/pip install --upgrade -r docs/requirements-docs.txt .PHONY: build build: .venv ## Compile and install Polars for development From 7dff4c3ef75b75eb93606be4cc43e6bb632a11c0 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 19 Aug 2023 10:32:36 +0200 Subject: [PATCH 05/55] refactor(rust): make binary chunkedarray functions DRY (#10607) --- .../src/chunked_array/arithmetic/decimal.rs | 1 + .../src/chunked_array/arithmetic/mod.rs | 16 +- .../src/chunked_array/arithmetic/numeric.rs | 10 +- .../polars-core/src/chunked_array/bitwise.rs | 30 +-- .../src/chunked_array/comparison/mod.rs | 218 ++++++------------ .../src/chunked_array/ops/apply.rs | 2 +- .../src/chunked_array/ops/arity.rs | 153 ++++++++++++ .../src/chunked_array/ops/downcast.rs | 6 + .../src/chunked_array/ops/filter.rs | 97 ++++---- .../src/chunked_array/ops/min_max_binary.rs | 41 +--- .../polars-core/src/chunked_array/ops/mod.rs | 1 + .../src/chunked_array/ops/repeat_by.rs | 66 +++--- .../src/series/arithmetic/borrowed.rs | 101 +++----- .../polars-ops/src/chunked_array/list/sets.rs | 23 +- .../polars-ops/src/series/ops/floor_divide.rs | 9 +- .../polars-plan/src/dsl/function_expr/pow.rs | 9 +- .../src/dsl/function_expr/trigonometry.rs | 8 +- 17 files changed, 378 insertions(+), 413 deletions(-) create mode 100644 crates/polars-core/src/chunked_array/ops/arity.rs diff --git a/crates/polars-core/src/chunked_array/arithmetic/decimal.rs b/crates/polars-core/src/chunked_array/arithmetic/decimal.rs index 4c755ec06d25..9341407d9565 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/decimal.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/decimal.rs @@ -2,6 +2,7 @@ use polars_arrow::compute::arithmetics::decimal; use super::*; use crate::prelude::DecimalChunked; +use crate::utils::align_chunks_binary; // TODO: remove impl ArrayArithmetics for i128 { diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 6460435caccf..6a51424b4e21 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -15,7 +15,7 @@ use polars_arrow::utils::combine_validities_and; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::{align_chunks_binary, align_chunks_binary_owned}; +use crate::utils::align_chunks_binary_owned; pub trait ArrayArithmetics where @@ -148,12 +148,7 @@ impl Add for &BinaryChunked { }; } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(a, b)| concat_binary(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, concat_binary) } } @@ -202,12 +197,7 @@ impl Add for &BooleanChunked { if self.len() == 1 { return rhs.add(self); } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(a, b)| add_boolean(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, add_boolean) } } diff --git 
a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index 6e4216a1ecdc..ebfb835c715f 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -12,15 +12,7 @@ where F: Fn(T::Native, T::Native) -> T::Native, { let mut ca = match (lhs.len(), rhs.len()) { - (a, b) if a == b => { - let (lhs, rhs) = align_chunks_binary(lhs, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| Box::new(kernel(lhs, rhs)) as ArrayRef) - .collect(); - unsafe { lhs.copy_with_chunks(chunks, false, false) } - }, + (a, b) if a == b => arity::binary_mut(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), // broadcast right path (_, 1) => { let opt_rhs = rhs.get(0); diff --git a/crates/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs index 7bf9f6457c61..ea9372ef3adc 100644 --- a/crates/polars-core/src/chunked_array/bitwise.rs +++ b/crates/polars-core/src/chunked_array/bitwise.rs @@ -6,7 +6,6 @@ use polars_arrow::utils::combine_validities_and; use super::arithmetic::arithmetic_helper; use super::*; -use crate::utils::align_chunks_binary; impl BitAnd for &ChunkedArray where @@ -73,12 +72,7 @@ impl BitOr for &BooleanChunked { _ => {}, } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| compute::boolean_kleene::or(lhs, rhs)); - BooleanChunked::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, compute::boolean_kleene::or) } } @@ -123,16 +117,11 @@ impl BitXor for &BooleanChunked { _ => {}, } - let (l, r) = align_chunks_binary(self, rhs); - let chunks = l - .downcast_iter() - .zip(r.downcast_iter()) - .map(|(l_arr, r_arr)| { - let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); - let values = l_arr.values() ^ r_arr.values(); - BooleanArray::from_data_default(values, validity) - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, |l_arr, r_arr| { + let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); + let values = l_arr.values() ^ r_arr.values(); + BooleanArray::from_data_default(values, validity) + }) } } @@ -169,12 +158,7 @@ impl BitAnd for &BooleanChunked { _ => {}, } - let (lhs, rhs) = align_chunks_binary(self, rhs); - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs, rhs)| compute::boolean_kleene::and(lhs, rhs)); - BooleanChunked::from_chunk_iter(self.name(), chunks) + arity::binary_mut(self, rhs, compute::boolean_kleene::and) } } diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 202473428f8a..29350564f8e7 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -14,30 +14,14 @@ use polars_arrow::prelude::FromData; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::align_chunks_binary; impl ChunkedArray where T: PolarsNumericType, { - /// First ensure that the chunks of lhs and rhs match and then iterates over the chunks and applies - /// the comparison operator. 
- fn comparison( - &self, - rhs: &ChunkedArray, - f: impl Fn(&PrimitiveArray, &PrimitiveArray) -> BooleanArray, - ) -> BooleanChunked { - let chunks = self - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(left, right)| f(left, right)); - ChunkedArray::from_chunk_iter("", chunks) - } - // Also includes validity in comparison. pub fn not_equal_and_validity(&self, rhs: &ChunkedArray) -> BooleanChunked { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq_and_validity(x, y)) + arity::binary_mut_with_options(self, rhs, |a, b| comparison::neq_and_validity(a, b), "") } } @@ -64,11 +48,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::eq(a, b), ""), } } @@ -89,11 +69,12 @@ where rhs.is_null() } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::eq_and_validity(x, y)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |a, b| comparison::eq_and_validity(a, b), + "", + ), } } @@ -114,11 +95,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::neq(a, b), ""), } } @@ -139,11 +116,12 @@ where rhs.is_not_null() } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::neq_and_validity(x, y)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |a, b| comparison::neq_and_validity(a, b), + "", + ), } } @@ -164,11 +142,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::gt(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::gt(a, b), ""), } } @@ -189,11 +163,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::gt_eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::gt_eq(a, b), ""), } } @@ -214,11 +184,7 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::lt(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::lt(a, b), ""), } } @@ -239,27 +205,11 @@ where BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |x, y| comparison::lt_eq(x, y)) - }, + _ => arity::binary_mut_with_options(self, rhs, |a, b| comparison::lt_eq(a, b), ""), } } } -fn compare_bools( - lhs: &BooleanChunked, - rhs: &BooleanChunked, - f: impl Fn(&BooleanArray, &BooleanArray) -> BooleanArray, -) -> BooleanChunked { - let chunks = lhs - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(l, r)| f(l, r)); - ChunkedArray::from_chunk_iter("", chunks) -} - impl ChunkCompare<&BooleanChunked> for BooleanChunked { type Item = BooleanChunked; @@ -278,11 +228,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.equal(self), - _ => { - // Same length. 
- let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::eq(lhs, rhs), ""), } } @@ -321,11 +267,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.equal_missing(self), - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::eq_and_validity(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::eq_and_validity(lhs, rhs), + "", + ), } } @@ -345,9 +292,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { }, (1, _) => rhs.not_equal(self), _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::neq(lhs, rhs)) + arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::neq(lhs, rhs), "") }, } } @@ -381,13 +326,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { } }, (1, _) => rhs.not_equal_missing(self), - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| { - comparison::neq_and_validity(lhs, rhs) - }) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::neq_and_validity(lhs, rhs), + "", + ), } } @@ -414,11 +358,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::gt(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::gt(lhs, rhs), ""), } } @@ -445,11 +385,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::gt_eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::gt_eq(lhs, rhs), + "", + ), } } @@ -476,11 +417,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. - let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::lt(lhs, rhs)) - }, + _ => arity::binary_mut_with_options(self, rhs, |lhs, rhs| comparison::lt(lhs, rhs), ""), } } @@ -507,11 +444,12 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { BooleanChunked::full_null("", rhs.len()) } }, - _ => { - // Same length. 
- let (lhs, rhs) = align_chunks_binary(self, rhs); - compare_bools(&lhs, &rhs, |lhs, rhs| comparison::lt_eq(lhs, rhs)) - }, + _ => arity::binary_mut_with_options( + self, + rhs, + |lhs, rhs| comparison::lt_eq(lhs, rhs), + "", + ), } } } @@ -551,20 +489,6 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked { } } -impl BinaryChunked { - fn comparison( - &self, - rhs: &BinaryChunked, - f: impl Fn(&BinaryArray, &BinaryArray) -> BooleanArray, - ) -> BooleanChunked { - let chunks = self - .downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(left, right)| f(left, right)); - ChunkedArray::from_chunk_iter("", chunks) - } -} - impl ChunkCompare<&BinaryChunked> for BinaryChunked { type Item = BooleanChunked; @@ -583,8 +507,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", rhs.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::eq) + arity::binary_mut_with_options(self, rhs, comparison::binary::eq, "") } } @@ -603,8 +526,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { rhs.is_null() } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::eq_and_validity) + arity::binary_mut_with_options(self, rhs, comparison::binary::eq_and_validity, "") } } @@ -623,8 +545,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", rhs.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::neq) + arity::binary_mut_with_options(self, rhs, comparison::binary::neq, "") } } @@ -643,8 +564,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { rhs.is_not_null() } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, comparison::binary::neq_and_validity) + arity::binary_mut_with_options(self, rhs, comparison::binary::neq_and_validity, "") } } @@ -663,8 +583,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::gt(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::gt, "") } } @@ -683,8 +602,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::gt_eq(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::gt_eq, "") } } @@ -703,8 +621,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::lt(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::lt, "") } } @@ -723,8 +640,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { BooleanChunked::full_null("", self.len()) } } else { - let (lhs, rhs) = align_chunks_binary(self, rhs); - lhs.comparison(&rhs, |l, r| comparison::lt_eq(l, r)) + arity::binary_mut_with_options(self, rhs, comparison::binary::lt_eq, "") } } } @@ -905,12 +821,12 @@ impl ChunkCompare<&StructChunked> for StructChunked { impl ChunkCompare<&ArrayChunked> for ArrayChunked { type Item = BooleanChunked; fn equal(&self, rhs: &ArrayChunked) -> BooleanChunked { - let (a, b) = align_chunks_binary(self, rhs); - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| polars_arrow::kernels::comparison::fixed_size_list_eq(a, b)); - 
ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut_with_options( + self, + rhs, + polars_arrow::kernels::comparison::fixed_size_list_eq, + "", + ) } fn equal_missing(&self, rhs: &ArrayChunked) -> BooleanChunked { @@ -919,12 +835,12 @@ impl ChunkCompare<&ArrayChunked> for ArrayChunked { } fn not_equal(&self, rhs: &ArrayChunked) -> BooleanChunked { - let (a, b) = align_chunks_binary(self, rhs); - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| polars_arrow::kernels::comparison::fixed_size_list_neq(a, b)); - ChunkedArray::from_chunk_iter(self.name(), chunks) + arity::binary_mut_with_options( + self, + rhs, + polars_arrow::kernels::comparison::fixed_size_list_neq, + "", + ) } fn not_equal_missing(&self, rhs: &ArrayChunked) -> Self::Item { diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 533c49f82d43..bff8409d0052 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -13,7 +13,7 @@ use crate::prelude::*; use crate::series::IsSorted; use crate::utils::{CustomIterTools, NoNull}; -fn collect_array>( +pub(super) fn collect_array>( iter: I, validity: Option, ) -> PrimitiveArray { diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs new file mode 100644 index 000000000000..287ab18adb89 --- /dev/null +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -0,0 +1,153 @@ +use arrow::array::{Array, PrimitiveArray}; +use polars_arrow::utils::combine_validities_and; + +use crate::chunked_array::ops::apply::collect_array; +use crate::datatypes::{ + HasUnderlyingArray, PolarsNumericType, StaticArray, StaticallyMatchesPolarsType, +}; +use crate::prelude::{ChunkedArray, PolarsDataType}; +use crate::utils::align_chunks_binary; + +#[inline] +pub fn binary_elementwise( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Option, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + lhs_arr + .iter() + .zip(rhs_arr.iter()) + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)) + .collect::>() + }); + ChunkedArray::from_chunk_iter(lhs.name(), iter) +} + +#[inline] +pub fn binary_elementwise_values( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + ) -> V::Native, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); + + let iter = lhs_arr + .values_iter() + .zip(rhs_arr.values_iter()) + .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); + collect_array(iter, validity) + }); + ChunkedArray::from_chunk_iter(lhs.name(), iter) +} + +/// Applies a 
kernel that produces `Array` types. +#[inline] +pub fn binary_mut_with_options( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + name: &str, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Arr, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); + ChunkedArray::from_chunk_iter(name, iter) +} + +/// Applies a kernel that produces `Array` types. +pub fn binary_mut( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + op: F, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Arr, +{ + binary_mut_with_options(lhs, rhs, op, lhs.name()) +} + +/// Applies a kernel that produces `ArrayRef` of the same type. +/// +/// # Safety +/// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. +#[inline] +pub unsafe fn binary_mut_unchecked_same_type( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + keep_sorted: bool, + keep_fast_explode: bool, +) -> ChunkedArray +where + T: PolarsDataType, + U: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Box, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let chunks = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)) + .collect(); + lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode) +} diff --git a/crates/polars-core/src/chunked_array/ops/downcast.rs b/crates/polars-core/src/chunked_array/ops/downcast.rs index 1b29f939f94c..66197f51efb7 100644 --- a/crates/polars-core/src/chunked_array/ops/downcast.rs +++ b/crates/polars-core/src/chunked_array/ops/downcast.rs @@ -18,6 +18,7 @@ impl<'a, T> Chunks<'a, T> { } } + #[inline] pub fn get(&self, index: usize) -> Option<&'a T> { self.chunks.get(index).map(|arr| { let arr = &**arr; @@ -25,6 +26,7 @@ impl<'a, T> Chunks<'a, T> { }) } + #[inline] pub unsafe fn get_unchecked(&self, index: usize) -> &'a T { let arr = self.chunks.get_unchecked(index); let arr = &**arr; @@ -35,6 +37,7 @@ impl<'a, T> Chunks<'a, T> { self.chunks.len() } + #[inline] pub fn last(&self) -> Option<&'a T> { self.chunks.last().map(|arr| { let arr = &**arr; @@ -48,6 +51,7 @@ impl ChunkedArray where Self: HasUnderlyingArray, { + #[inline] pub fn downcast_iter( &self, ) -> impl Iterator::ArrayT> + DoubleEndedIterator { @@ -62,6 +66,7 @@ where /// The caller must ensure: /// * the length remains correct. /// * the flags (sorted, etc) remain correct. 
+ #[inline] pub unsafe fn downcast_iter_mut( &mut self, ) -> impl Iterator::ArrayT> + DoubleEndedIterator { @@ -72,6 +77,7 @@ where }) } + #[inline] pub fn downcast_chunks(&self) -> Chunks<'_, ::ArrayT> { Chunks::new(&self.chunks) } diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 4f46af830179..408902b3258b 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -5,7 +5,6 @@ use arrow::compute::filter::filter as filter_fn; #[cfg(feature = "object")] use crate::chunked_array::object::builder::ObjectChunkedBuilder; use crate::prelude::*; -use crate::utils::align_chunks_binary; macro_rules! check_filter_len { ($self:expr, $filter:expr) => {{ @@ -30,14 +29,15 @@ where }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -51,14 +51,15 @@ impl ChunkFilter for BooleanChunked { }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -79,15 +80,15 @@ impl ChunkFilter for BinaryChunked { }; } check_filter_len!(self, filter); - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - unsafe { Ok(self.copy_with_chunks(chunks, true, true)) } + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -103,19 +104,15 @@ impl ChunkFilter for ListChunked { )), }; } - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - // inner type may be categorical or logical type so we clone the state. - let mut ca = self.clone(); - ca.chunks = chunks; - ca.compute_len(); - Ok(ca) + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } @@ -132,19 +129,15 @@ impl ChunkFilter for ArrayChunked { )), }; } - let (left, filter) = align_chunks_binary(self, filter); - - let chunks = left - .downcast_iter() - .zip(filter.downcast_iter()) - .map(|(left, mask)| filter_fn(left, mask).unwrap()) - .collect::>(); - - // inner type may be categorical or logical type so we clone the state. 
- let mut ca = self.clone(); - ca.chunks = chunks; - ca.compute_len(); - Ok(ca) + Ok(unsafe { + arity::binary_mut_unchecked_same_type( + self, + filter, + |left, mask| filter_fn(left, mask).unwrap(), + true, + true, + ) + }) } } diff --git a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs index 1270db482f16..bfb8dcc1d014 100644 --- a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs +++ b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs @@ -1,45 +1,20 @@ -use arrow::array::PrimitiveArray; -use polars_arrow::prelude::FromData; - use crate::datatypes::PolarsNumericType; use crate::prelude::*; use crate::series::arithmetic::coerce_lhs_rhs; -use crate::utils::align_chunks_binary; - -fn cmp_binary(left: &ChunkedArray, right: &ChunkedArray, op: F) -> ChunkedArray -where - T: PolarsNumericType, - F: Fn(T::Native, T::Native) -> T::Native, -{ - let (left, right) = align_chunks_binary(left, right); - let chunks = left - .downcast_iter() - .zip(right.downcast_iter()) - .map(|(left, right)| { - let values = left - .values() - .iter() - .zip(right.values().iter()) - .map(|(l, r)| op(*l, *r)) - .collect::>(); - PrimitiveArray::from_data_default(values.into(), None) - }); - ChunkedArray::from_chunk_iter(left.name(), chunks) -} fn min_binary(left: &ChunkedArray, right: &ChunkedArray) -> ChunkedArray where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l, r| { + let op = |l: &T::Native, r: &T::Native| { if l < r { - l + *l } else { - r + *r } }; - cmp_binary(left, right, op) + arity::binary_elementwise_values(left, right, op) } fn max_binary(left: &ChunkedArray, right: &ChunkedArray) -> ChunkedArray @@ -47,14 +22,14 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l, r| { + let op = |l: &T::Native, r: &T::Native| { if l > r { - l + *l } else { - r + *r } }; - cmp_binary(left, right, op) + arity::binary_elementwise_values(left, right, op) } pub(crate) fn min_max_binary_series( diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 719ede0ee8a6..3420a041813a 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -13,6 +13,7 @@ pub(crate) mod aggregate; pub(crate) mod any_value; pub(crate) mod append; mod apply; +pub mod arity; mod bit_repr; pub(crate) mod chunkops; pub(crate) mod compare_inner; diff --git a/crates/polars-core/src/chunked_array/ops/repeat_by.rs b/crates/polars-core/src/chunked_array/ops/repeat_by.rs index 31fb62c19971..3932b644ad9f 100644 --- a/crates/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/crates/polars-core/src/chunked_array/ops/repeat_by.rs @@ -30,16 +30,17 @@ where .collect::>(), )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); - - // SAFETY: length of iter is trusted. - let arr = unsafe { - LargeListArray::from_iter_primitive_trusted_len(iter, T::get_dtype().to_arrow()) - }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v.copied()).take(*by as usize)) + }); + + // SAFETY: length of iter is trusted. 
+ unsafe { + LargeListArray::from_iter_primitive_trusted_len(iter, T::get_dtype().to_arrow()) + } + })) } } impl RepeatBy for BooleanChunked { @@ -55,14 +56,14 @@ impl RepeatBy for BooleanChunked { )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_bool_trusted_len(iter) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_bool_trusted_len(iter) } + })) } } impl RepeatBy for Utf8Chunked { @@ -79,14 +80,14 @@ impl RepeatBy for Utf8Chunked { )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_utf8_trusted_len(iter, self.len()) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_utf8_trusted_len(iter, self.len()) } + })) } } @@ -102,13 +103,14 @@ impl RepeatBy for BinaryChunked { .collect::>(), )); } - let iter = self - .into_iter() - .zip(by) - .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); - - // SAFETY: length of iter is trusted. - let arr = unsafe { LargeListArray::from_iter_binary_trusted_len(iter, self.len()) }; - Ok(ChunkedArray::with_chunk(self.name(), arr)) + + Ok(arity::binary_mut(self, by, |arr, by| { + let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { + opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) + }); + + // SAFETY: length of iter is trusted. + unsafe { LargeListArray::from_iter_binary_trusted_len(iter, self.len()) } + })) } } diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index 2388bd5e0890..4ca8e3e7e00b 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -118,7 +118,6 @@ pub mod checked { use num_traits::{CheckedDiv, One, ToPrimitive, Zero}; use super::*; - use crate::utils::align_chunks_binary; pub trait NumOpsDispatchCheckedInner: PolarsDataType + Sized { /// Checked integer division. Computes self / rhs, returning None if rhs == 0 or the division results in overflow. @@ -161,24 +160,14 @@ pub mod checked { // Note that the physical type correctness is checked! // The ChunkedArray with the wrong dtype is dropped after this operation let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. 
- .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => l.checked_div(r), - _ => None, - }) + + Ok( + arity::binary_elementwise(lhs, rhs, |opt_l, opt_r| match (opt_l, opt_r) { + (Some(l), Some(r)) => l.checked_div(r), + _ => None, }) - .collect::>() - .into_series()) + .into_series(), + ) } } @@ -187,30 +176,22 @@ pub mod checked { // Safety: // see check_div for chunkedarray let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. - .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => { - if r.is_zero() { - None - } else { - Some(l / r) - } - }, - _ => None, - }) + + Ok( + arity::binary_elementwise::<_, _, Float32Type, _>(lhs, rhs, |opt_l, opt_r| match ( + opt_l, opt_r, + ) { + (Some(l), Some(r)) => { + if r.is_zero() { + None + } else { + Some(l / r) + } + }, + _ => None, }) - .collect::() - .into_series()) + .into_series(), + ) } } @@ -219,30 +200,22 @@ pub mod checked { // Safety: // see check_div let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - let (l, r) = align_chunks_binary(lhs, rhs); - - Ok((l) - .downcast_iter() - .zip(r.downcast_iter()) - .flat_map(|(l_arr, r_arr)| { - l_arr - .into_iter() - .zip(r_arr) - // we don't use a kernel, because the checked div also supplies nulls. - // so the usual bit combining is not enough. - .map(|(opt_l, opt_r)| match (opt_l, opt_r) { - (Some(l), Some(r)) => { - if r.is_zero() { - None - } else { - Some(l / r) - } - }, - _ => None, - }) + + Ok( + arity::binary_elementwise::<_, _, Float64Type, _>(lhs, rhs, |opt_l, opt_r| match ( + opt_l, opt_r, + ) { + (Some(l), Some(r)) => { + if r.is_zero() { + None + } else { + Some(l / r) + } + }, + _ => None, }) - .collect::() - .into_series()) + .into_series(), + ) } } diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index 1f3c3a7e8782..a442e820a420 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -10,7 +10,6 @@ use arrow::offset::OffsetsBuffer; use arrow::types::NativeType; use polars_arrow::utils::combine_validities_and; use polars_core::prelude::*; -use polars_core::utils::align_chunks_binary; use polars_core::with_match_physical_integer_type; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -281,16 +280,14 @@ fn array_set_operation( } pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { - let (a, b) = align_chunks_binary(a, b); - - // no downcasting needed as lists - // already have logical types - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| array_set_operation(a, b, set_op).boxed()) - .collect::>(); - - // safety: dtypes are correct - unsafe { a.with_chunks(chunks) } + // we use the unsafe variant because we want to keep the nested logical types type. 
+ unsafe { + arity::binary_mut_unchecked_same_type( + a, + b, + |a, b| array_set_operation(a, b, set_op).boxed(), + false, + false, + ) + } } diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 01e4e038d2c0..1bf7800070f8 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -6,7 +6,6 @@ use polars_core::export::num; use polars_core::prelude::*; #[cfg(feature = "dtype-struct")] use polars_core::series::arithmetic::_struct_arithmetic; -use polars_core::utils::align_chunks_binary; use polars_core::with_match_physical_numeric_polars_type; #[inline] @@ -79,13 +78,7 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) ChunkedArray::full_null(a.name(), a.len()) }; } - let (a, b) = align_chunks_binary(a, b); - - let chunks = a - .downcast_iter() - .zip(b.downcast_iter()) - .map(|(a, b)| floor_div_array(a, b)); - ChunkedArray::from_chunk_iter(a.name(), chunks) + arity::binary_mut(a, b, floor_div_array) } pub fn floor_div_series(a: &Series, b: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index c0e87fb43072..530041707365 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -2,7 +2,6 @@ use num::pow::Pow; use polars_arrow::kernels::pow::pow as pow_kernel; use polars_core::export::num; use polars_core::export::num::{Float, ToPrimitive}; -use polars_core::utils::align_chunks_binary; use super::*; @@ -64,13 +63,9 @@ where exponent.apply(|exp| Pow::pow(base, exp)).into_series(), )) } else { - let (ca_1, ca_2) = align_chunks_binary(base, exponent); - let chunks = ca_1 - .downcast_iter() - .zip(ca_2.downcast_iter()) - .map(|(arr_1, arr_2)| pow_kernel(arr_1, arr_2)); Ok(Some( - ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), + polars_core::chunked_array::ops::arity::binary_mut(base, exponent, pow_kernel) + .into_series(), )) } } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index a07598204858..99cd90cee546 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -1,7 +1,6 @@ use num::Float; use polars_arrow::kernels::atan2::atan2 as atan2_kernel; use polars_core::export::num; -use polars_core::utils::align_chunks_binary; use super::*; @@ -129,13 +128,8 @@ where Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) } else { - let (ca_1, ca_2) = align_chunks_binary(y, x); - let chunks = ca_1 - .downcast_iter() - .zip(ca_2.downcast_iter()) - .map(|(arr_1, arr_2)| atan2_kernel(arr_1, arr_2)); Ok(Some( - ChunkedArray::from_chunk_iter(ca_1.name(), chunks).into_series(), + polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), )) } } From 76557703cbf3eb697a6fa893734545d10bf89d1b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 19 Aug 2023 09:33:07 +0100 Subject: [PATCH 06/55] refactor(python): deprecate DataFrame.replace (#10600) --- py-polars/polars/dataframe/frame.py | 20 ++++++++++++++++---- py-polars/tests/unit/dataframe/test_df.py | 3 ++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index fdf85a85c60c..19c09a790fcd 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -83,6 +83,7 @@ 
from polars.utils._wrap import wrap_expr, wrap_ldf, wrap_s from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( + deprecate_function, deprecate_renamed_methods, deprecate_renamed_parameter, ) @@ -1033,6 +1034,11 @@ def _read_ndjson( ) return self + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + self._df.replace(column, new_column._s) + return self + @property def shape(self) -> tuple[int, int]: """ @@ -1701,7 +1707,7 @@ def __setitem__( self.replace_at_idx(col_selection, s) # df["foo"] elif isinstance(col_selection, str): - self.replace(col_selection, s) + self._replace(col_selection, s) else: raise TypeError( f"cannot use `__setitem__` on DataFrame" @@ -4380,6 +4386,13 @@ def frame_equal(self, other: DataFrame, *, null_equal: bool = True) -> bool: """ return self._df.frame_equal(other._df, null_equal) + @deprecate_function( + "DataFrame.replace is deprecated and will be removed in a future version. " + "Please use\n" + " df = df.with_columns(new_column.alias(column_name))\n" + "instead.", + version="0.19.0", + ) def replace(self, column: str, new_column: Series) -> Self: """ Replace a column by a new Series. @@ -4395,7 +4408,7 @@ def replace(self, column: str, new_column: Series) -> Self: -------- >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -4408,8 +4421,7 @@ def replace(self, column: str, new_column: Series) -> Self: └─────┴─────┘ """ - self._df.replace(column, new_column._s) - return self + return self._replace(column, new_column) def slice(self, offset: int, length: int | None = None) -> Self: """ diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index e8b921119912..0258dd0fb1e3 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -511,7 +511,8 @@ def test_sort_maintain_order() -> None: def test_replace() -> None: df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]}) s = pl.Series("c", [True, False, True]) - df.replace("a", s) + with pytest.deprecated_call(): + df.replace("a", s) assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]})) From 1095763fdf5e81f789dddde589161ea97c910cdc Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sat, 19 Aug 2023 19:00:36 +0800 Subject: [PATCH 07/55] fix(rust, python): Fix serialization for categorical chunked. 
(#10609) --- crates/polars-core/src/serde/chunked_array.rs | 2 +- py-polars/tests/unit/test_serde.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs index ad5f31213566..19e8a815b8bf 100644 --- a/crates/polars-core/src/serde/chunked_array.rs +++ b/crates/polars-core/src/serde/chunked_array.rs @@ -144,7 +144,7 @@ impl Serialize for CategoricalChunked { S: Serializer, { { - let mut state = serializer.serialize_map(Some(3))?; + let mut state = serializer.serialize_map(Some(4))?; state.serialize_entry("name", self.name())?; state.serialize_entry("datatype", self.dtype())?; state.serialize_entry("bit_settings", &self.get_flags())?; diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index 5f6c7a45baf1..f5bee11eb449 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -7,6 +7,7 @@ import pytest import polars as pl +from polars import StringCache from polars.testing import assert_frame_equal, assert_series_equal @@ -182,3 +183,10 @@ def inner_df_times2(df: pl.DataFrame) -> pl.DataFrame: q = pickle.loads(b) assert q.collect()["a"].to_list() == [2, 4, 6] + + +@StringCache() +def test_serde_categorical_series_10586() -> None: + s = pl.Series(["a", "b", "b", "a", "c"], dtype=pl.Categorical) + loaded_s = pickle.loads(pickle.dumps(s)) + assert_series_equal(loaded_s, s) From caa6f7a71d6c8568374446f734a22ed34c6ccbdd Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 13:18:24 +0200 Subject: [PATCH 08/55] feat!: Change behavior of `all` - fix Kleene logic implementation for `all`/`any` (#10564) --- Cargo.toml | 10 +- .../src/chunked_array/comparison/mod.rs | 55 ++++--- .../src/dsl/function_expr/boolean.rs | 32 ++-- crates/polars-plan/src/dsl/mod.rs | 36 +++-- py-polars/Cargo.lock | 15 +- py-polars/polars/expr/expr.py | 150 ++++++++++-------- py-polars/polars/expr/list.py | 2 +- .../polars/functions/aggregation/vertical.py | 46 ++++-- py-polars/polars/series/list.py | 2 +- py-polars/polars/series/series.py | 90 +++++++++-- py-polars/src/expr/general.rs | 10 +- py-polars/tests/unit/datatypes/test_bool.py | 6 - py-polars/tests/unit/datatypes/test_list.py | 2 +- py-polars/tests/unit/series/test_all_any.py | 75 +++++++++ 14 files changed, 368 insertions(+), 163 deletions(-) create mode 100644 py-polars/tests/unit/series/test_all_any.py diff --git a/Cargo.toml b/Cargo.toml index 492f2d9dd5a6..735b905a14b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,12 +53,10 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" -# git = "https://github.com/jorgecarleitao/arrow2" -# git = "https://github.com/ritchie46/arrow2" -# rev = "9beabec8cfb5502582d31ab898fdd36e7af0873c" -# path = "../arrow2" -# branch = "duration_json" -version = "0.17.4" +git = "https://github.com/jorgecarleitao/arrow2" +rev = "7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +# branch = "" +# version = "0.17.4" default-features = false features = [ "compute_aggregate", diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 29350564f8e7..31540fbba65a 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -884,31 +884,50 @@ impl Not for BooleanChunked { } impl BooleanChunked { - /// Check if all values are `true` - pub fn all(&self) -> bool { - 
self.downcast_iter().all(compute::boolean::all) - } - - /// Check if any value is `true` + /// Returns whether any of the values in the column are `true`. + /// + /// Null values are ignored. pub fn any(&self) -> bool { self.downcast_iter().any(compute::boolean::any) } - // Three-valued versions which can return None - pub fn all_3val(&self, drop_nulls: bool) -> Option { - if drop_nulls || self.null_count() == 0 { - Some(self.all()) - } else { - None + /// Returns whether all values in the array are `true`. + /// + /// Null values are ignored. + pub fn all(&self) -> bool { + self.downcast_iter().all(compute::boolean::all) + } + + /// Returns whether any of the values in the column are `true`. + /// + /// The output is unknown (`None`) if the array contains any null values and + /// no `true` values. + pub fn any_kleene(&self) -> Option { + let mut result = Some(false); + for arr in self.downcast_iter() { + match compute::boolean_kleene::any(arr) { + Some(true) => return Some(true), + None => result = None, + _ => (), + }; } + result } - pub fn any_3val(&self, drop_nulls: bool) -> Option { - let res = self.any(); - if drop_nulls || res { - Some(res) - } else { - None + + /// Returns whether all values in the column are `true`. + /// + /// The output is unknown (`None`) if the array contains any null values and + /// no `false` values. + pub fn all_kleene(&self) -> Option { + let mut result = Some(true); + for arr in self.downcast_iter() { + match compute::boolean_kleene::all(arr) { + Some(false) => return Some(false), + None => result = None, + _ => (), + }; } + result } } diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index 091922a6fa04..fc2eb6307c19 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -9,11 +9,11 @@ use crate::{map, wrap}; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum BooleanFunction { - All { - drop_nulls: bool, - }, Any { - drop_nulls: bool, + ignore_nulls: bool, + }, + All { + ignore_nulls: bool, }, IsNot, IsNull, @@ -77,8 +77,8 @@ impl From for SpecialEq> { fn from(func: BooleanFunction) -> Self { use BooleanFunction::*; match func { - All { drop_nulls } => map!(all, drop_nulls), - Any { drop_nulls } => map!(any, drop_nulls), + Any { ignore_nulls } => map!(any, ignore_nulls), + All { ignore_nulls } => map!(all, ignore_nulls), IsNot => map!(is_not), IsNull => map!(is_null), IsNotNull => map!(is_not_null), @@ -106,14 +106,22 @@ impl From for FunctionExpr { } } -fn all(s: &Series, drop_nulls: bool) -> PolarsResult { - let boolean = s.bool()?; - Ok(Series::new(s.name(), [boolean.all_3val(drop_nulls)])) +fn any(s: &Series, ignore_nulls: bool) -> PolarsResult { + let ca = s.bool()?; + if ignore_nulls { + Ok(Series::new(s.name(), [ca.any()])) + } else { + Ok(Series::new(s.name(), [ca.any_kleene()])) + } } -fn any(s: &Series, drop_nulls: bool) -> PolarsResult { - let boolean = s.bool()?; - Ok(Series::new(s.name(), [boolean.any_3val(drop_nulls)])) +fn all(s: &Series, ignore_nulls: bool) -> PolarsResult { + let ca = s.bool()?; + if ignore_nulls { + Ok(Series::new(s.name(), [ca.all()])) + } else { + Ok(Series::new(s.name(), [ca.all_kleene()])) + } } fn is_not(s: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index c8e8746eae6a..4e3c8520851a 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ 
b/crates/polars-plan/src/dsl/mod.rs @@ -1652,9 +1652,30 @@ impl Expr { .with_fmt("ewm_var") } - /// Check if any boolean value is `true` - pub fn any(self, drop_nulls: bool) -> Self { - self.apply_private(BooleanFunction::Any { drop_nulls }.into()) + /// Returns whether any of the values in the column are `true`. + /// + /// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls: + /// if the column contains any null values and no `true` values, the output + /// is null. + /// + /// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic + pub fn any(self, ignore_nulls: bool) -> Self { + self.apply_private(BooleanFunction::Any { ignore_nulls }.into()) + .with_function_options(|mut opt| { + opt.auto_explode = true; + opt + }) + } + + /// Returns whether all values in the column are `true`. + /// + /// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls: + /// if the column contains any null values and no `true` values, the output + /// is null. + /// + /// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic + pub fn all(self, ignore_nulls: bool) -> Self { + self.apply_private(BooleanFunction::All { ignore_nulls }.into()) .with_function_options(|mut opt| { opt.auto_explode = true; opt @@ -1668,15 +1689,6 @@ impl Expr { self.map_private(FunctionExpr::ShrinkType) } - /// Check if all boolean values are `true` - pub fn all(self, drop_nulls: bool) -> Self { - self.apply_private(BooleanFunction::All { drop_nulls }.into()) - .with_function_options(|mut opt| { - opt.auto_explode = true; - opt - }) - } - #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count /// Note that it is better to turn multithreaded off in the aggregation context diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 0479f717938a..286f80fe8ae0 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,8 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=7edf5f9e359e0ed02e9d0c6b9318b06964d805f0#7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" dependencies = [ "ahash", "arrow-format", @@ -123,7 +122,7 @@ dependencies = [ "num-traits", "parquet2", "regex", - "regex-syntax 0.6.29", + "regex-syntax", "rustc_version", "simdutf8", "streaming-iterator", @@ -1863,7 +1862,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.7.4", + "regex-syntax", ] [[package]] @@ -1874,15 +1873,9 @@ checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.4", + "regex-syntax", ] -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" version = "0.7.4" diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index f29a813614ab..29735133c9dc 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -317,14 +317,23 @@ def to_physical(self) -> Self: """ return self._from_pyexpr(self._pyexpr.to_physical()) - def any(self, drop_nulls: bool = True) -> Self: + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def any(self, *, ignore_nulls: bool = True) -> Self: 
""" - Check if any boolean value in a Boolean column is `True`. + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. Parameters ---------- - drop_nulls - If False, return None if there are nulls but no Trues. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- @@ -333,51 +342,59 @@ def any(self, drop_nulls: bool = True) -> Self: Examples -------- - >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) - >>> df.select(pl.all().any()) - shape: (1, 2) - ┌──────┬───────┐ - │ TF ┆ FF │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ false │ - └──────┴───────┘ - >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) - >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) - shape: (1, 2) - ┌───────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪══════╡ - │ false ┆ true │ - └───────┴──────┘ - >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) - shape: (1, 2) - ┌──────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪══════╡ - │ null ┆ true │ - └──────┴──────┘ + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.any(drop_nulls)) + return self._from_pyexpr(self._pyexpr.any(ignore_nulls)) - def all(self, drop_nulls: bool = True) -> Self: + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def all(self, *, ignore_nulls: bool = True) -> Self: """ - Check if all boolean values in a Boolean column are `True`. + Return whether all values in the column are ``True``. - This method is an expression - not to be confused with - :func:`polars.all` which is a function to select all columns. + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. Parameters ---------- - drop_nulls - If False, return None if there are any nulls. + ignore_nulls + Ignore null values (default). + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- @@ -387,39 +404,36 @@ def all(self, drop_nulls: bool = True) -> Self: Examples -------- >>> df = pl.DataFrame( - ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } ... 
) >>> df.select(pl.col("*").all()) shape: (1, 3) - ┌──────┬───────┬───────┐ - │ TT ┆ TF ┆ FF │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) - >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) - shape: (1, 2) - ┌───────┬───────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - └───────┴───────┘ - >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) - shape: (1, 2) - ┌──────┬──────┐ - │ x ┆ y │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪══════╡ - │ null ┆ null │ - └──────┴──────┘ + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ """ - return self._from_pyexpr(self._pyexpr.all(drop_nulls)) + return self._from_pyexpr(self._pyexpr.all(ignore_nulls)) def arg_true(self) -> Self: """ diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 6b5c426ebdab..1577353b722e 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -60,7 +60,7 @@ def all(self) -> Expr: │ true │ │ false │ │ false │ - │ false │ + │ true │ │ true │ │ null │ └───────┘ diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index 600b20cc3fca..ef1234b2bc26 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -15,20 +15,24 @@ @overload -def all(exprs: Series) -> bool: # type: ignore[misc] +def all(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] ... @overload def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr] | None = ..., + *more_exprs: IntoExpr, + ignore_nulls: bool = ..., ) -> Expr: ... @deprecate_renamed_parameter("columns", "exprs", version="0.18.7") def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = None, *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr] | None = None, + *more_exprs: IntoExpr, + ignore_nulls: bool = True, ) -> Expr | bool | None: """ Either return an expression representing all columns, or evaluate a bitwise AND operation. @@ -50,6 +54,14 @@ def all( parsed as column names, other non-expression inputs are parsed as literals. *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic See Also -------- @@ -96,27 +108,33 @@ def all( "passing a Series to `all` is deprecated. 
Use `Series.all()` instead.", version="0.18.7", ) - return exprs.all() + return exprs.all(ignore_nulls=ignore_nulls) elif isinstance(exprs, str): - return F.col(exprs).all() + return F.col(exprs).all(ignore_nulls=ignore_nulls) _warn_for_deprecated_horizontal_use("all") return F.all_horizontal(exprs, *more_exprs) @overload -def any(exprs: Series) -> bool: # type: ignore[misc] +def any(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] ... @overload -def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: +def any( + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + ignore_nulls: bool = ..., +) -> Expr: ... @deprecate_renamed_parameter("columns", "exprs", version="0.18.7") def any( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + ignore_nulls: bool = True, ) -> Expr | bool | None: """ Evaluate a bitwise OR operation. @@ -141,6 +159,14 @@ def any( parsed as column names, other non-expression inputs are parsed as literals. *more_exprs Additional columns to use in the aggregation, specified as positional arguments. + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Examples -------- @@ -167,9 +193,9 @@ def any( "passing a Series to `any` is deprecated. Use `Series.any()` instead.", version="0.18.7", ) - return exprs.any() + return exprs.any(ignore_nulls=ignore_nulls) elif isinstance(exprs, str): - return F.col(exprs).any() + return F.col(exprs).any(ignore_nulls=ignore_nulls) _warn_for_deprecated_horizontal_use("any") return F.any_horizontal(exprs, *more_exprs) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 69a9d8ffec69..7afd49f03a81 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -57,7 +57,7 @@ def all(self) -> Expr: │ true │ │ false │ │ false │ - │ false │ + │ true │ │ true │ │ null │ └───────┘ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 357bd3b87656..fcd637ca0f35 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1229,37 +1229,103 @@ def cbrt(self) -> Series: """ - def any(self, drop_nulls: bool = True) -> bool | None: + @overload + def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: + ... + + @overload + def any(self, *, ignore_nulls: bool) -> bool | None: + ... + + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def any(self, *, ignore_nulls: bool = True) -> bool | None: """ - Check if any boolean value in the column is `True`. + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- - Series - Series of data type :class:`Boolean`. 
+ bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None """ return ( self.to_frame() - .select(F.col(self.name).any(drop_nulls=drop_nulls)) - .to_series() + .select(F.col(self.name).any(ignore_nulls=ignore_nulls)) .item() ) - def all(self, drop_nulls: bool = True) -> bool | None: + @overload + def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: + ... + + @overload + def all(self, *, ignore_nulls: bool) -> bool | None: + ... + + @deprecate_renamed_parameter("drop_nulls", "ignore_nulls", version="0.19.0") + def all(self, *, ignore_nulls: bool = True) -> bool | None: """ - Check if all boolean values in the column are `True`. + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- - Series - Series of data type :class:`Boolean`. + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None """ return ( self.to_frame() - .select(F.col(self.name).all(drop_nulls=drop_nulls)) - .to_series() + .select(F.col(self.name).all(ignore_nulls=ignore_nulls)) .item() ) diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index f9e8e726b26d..a66318dafe18 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -1145,12 +1145,12 @@ impl PyExpr { .with_fmt("extend") .into() } - fn any(&self, drop_nulls: bool) -> Self { - self.inner.clone().any(drop_nulls).into() - } - fn all(&self, drop_nulls: bool) -> Self { - self.inner.clone().all(drop_nulls).into() + fn any(&self, ignore_nulls: bool) -> Self { + self.inner.clone().any(ignore_nulls).into() + } + fn all(&self, ignore_nulls: bool) -> Self { + self.inner.clone().all(ignore_nulls).into() } fn log(&self, base: f64) -> Self { diff --git a/py-polars/tests/unit/datatypes/test_bool.py b/py-polars/tests/unit/datatypes/test_bool.py index 95c670ae2ebc..34a4b0d589a6 100644 --- a/py-polars/tests/unit/datatypes/test_bool.py +++ b/py-polars/tests/unit/datatypes/test_bool.py @@ -61,9 +61,3 @@ def val(expr: pl.Expr) -> dict[str, list[bool]]: assert val(True | pl.col("x")) == {"literal": [True, True]} assert val(False ^ pl.col("x")) == {"literal": [False, True]} assert val(True ^ pl.col("x")) == {"literal": [True, False]} - - -def test_all_empty() -> None: - s = pl.Series([], dtype=pl.Boolean) - assert s.all() - assert not s.any() diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index d9af8d8296b0..962875782f73 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -311,7 +311,7 @@ def test_list_all() -> None: ] } ).select(pl.col("a").list.all()).to_dict(False) == { - "a": [True, False, True, False, False, False, True] + "a": [True, False, True, False, False, True, 
True] } diff --git a/py-polars/tests/unit/series/test_all_any.py b/py-polars/tests/unit/series/test_all_any.py new file mode 100644 index 000000000000..eae9989dee75 --- /dev/null +++ b/py-polars/tests/unit/series/test_all_any.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import pytest + +import polars as pl + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], False), + ([None], False), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], True), + ], +) +def test_any(data: list[bool | None], expected: bool) -> None: + assert pl.Series(data, dtype=pl.Boolean).any() is expected + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], False), + ([None], None), + ([False], False), + ([False, None], None), + ([True], True), + ([True, None], True), + ], +) +def test_any_kleene(data: list[bool | None], expected: bool | None) -> None: + assert pl.Series(data, dtype=pl.Boolean).any(ignore_nulls=False) is expected + + +def test_any_wrong_dtype() -> None: + with pytest.raises(pl.SchemaError, match="expected `Boolean`"): + pl.Series([0, 1, 0]).any() + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], True), + ([None], True), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], True), + ], +) +def test_all(data: list[bool | None], expected: bool) -> None: + assert pl.Series(data, dtype=pl.Boolean).all() is expected + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], True), + ([None], None), + ([False], False), + ([False, None], False), + ([True], True), + ([True, None], None), + ], +) +def test_all_kleene(data: list[bool | None], expected: bool | None) -> None: + assert pl.Series(data, dtype=pl.Boolean).all(ignore_nulls=False) is expected + + +def test_all_wrong_dtype() -> None: + with pytest.raises(pl.SchemaError, match="expected `Boolean`"): + pl.Series([0, 1, 0]).all() From 3beff9a5faf9db48e3ea6ff2235740bcb6adeadc Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 19 Aug 2023 13:32:57 +0200 Subject: [PATCH 09/55] ci: Enforce up-to-date `Cargo.lock` (#10555) --- .github/workflows/lint-py-polars.yml | 2 +- py-polars/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint-py-polars.yml b/.github/workflows/lint-py-polars.yml index 5265f3181b3b..47e05ff3bbfa 100644 --- a/.github/workflows/lint-py-polars.yml +++ b/.github/workflows/lint-py-polars.yml @@ -43,4 +43,4 @@ jobs: run: cargo fmt --all -- --check - name: Run clippy - run: cargo clippy -- -D warnings + run: cargo clippy --locked -- -D warnings diff --git a/py-polars/Makefile b/py-polars/Makefile index c673c9ed7482..e0fcee1b83a5 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -67,7 +67,7 @@ fmt: .venv ## Run autoformatting and linting .PHONY: clippy clippy: ## Run clippy - cargo clippy -- -D warnings + cargo clippy --locked -- -D warnings .PHONY: pre-commit pre-commit: fmt clippy ## Run all code quality checks From 88a8c3c6a7da8342a95956b77f571909b7337762 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Sat, 19 Aug 2023 16:16:24 +0100 Subject: [PATCH 10/55] docs(python): Add docstrings for `Expr.meta` namespace (#10617) Co-authored-by: Liam Brannigan --- py-polars/polars/expr/meta.py | 118 ++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 6 deletions(-) diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 3ac0568bf076..45205b54074d 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -28,19 
+28,63 @@ def __ne__(self, other: ExprMetaNameSpace | Expr) -> bool: # type: ignore[overr return not self == other def eq(self, other: ExprMetaNameSpace | Expr) -> bool: - """Indicate if this expression is the same as another expression.""" + """ + Indicate if this expression is the same as another expression. + + Examples + -------- + >>> foo_bar = pl.col("foo").alias("bar") + >>> foo = pl.col("foo") + >>> foo_bar.meta.eq(foo) + False + >>> foo_bar2 = pl.col("foo").alias("bar") + >>> foo_bar.meta.eq(foo_bar2) + True + + """ return self._pyexpr.meta_eq(other._pyexpr) def ne(self, other: ExprMetaNameSpace | Expr) -> bool: - """Indicate if this expression is NOT the same as another expression.""" + """ + Indicate if this expression is NOT the same as another expression. + + Examples + -------- + >>> foo_bar = pl.col("foo").alias("bar") + >>> foo = pl.col("foo") + >>> foo_bar.meta.ne(foo) + True + >>> foo_bar2 = pl.col("foo").alias("bar") + >>> foo_bar.meta.ne(foo_bar2) + False + + """ return not self.eq(other) def has_multiple_outputs(self) -> bool: - """Whether this expression expands into multiple expressions.""" + """ + Whether this expression expands into multiple expressions. + + Examples + -------- + >>> e = pl.col(["a", "b"]).alias("bar") + >>> e.meta.has_multiple_outputs() + True + + """ return self._pyexpr.meta_has_multiple_outputs() def is_regex_projection(self) -> bool: - """Whether this expression expands to columns that match a regex pattern.""" + """ + Whether this expression expands to columns that match a regex pattern. + + Examples + -------- + >>> e = pl.col("^.*$").alias("bar") + >>> e.meta.is_regex_projection() + True + + """ return self._pyexpr.meta_is_regex_projection() def output_name(self) -> str: @@ -50,6 +94,24 @@ def output_name(self) -> str: It may not always be possible to determine the output name, as that can depend on the schema of the context; in that case this will raise ``ComputeError``. + Examples + -------- + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.output_name() + 'foo' + >>> e_filter = pl.col("foo").filter(pl.col("bar") == 13) + >>> e_filter.meta.output_name() + 'foo' + >>> e_sum_over = pl.sum("foo").over("groups") + >>> e_sum_over.meta.output_name() + 'foo' + >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice.meta.output_name() + 'foo' + >>> e_count = pl.count() + >>> e_count.meta.output_name() + 'count' + """ return self._pyexpr.meta_output_name() @@ -64,15 +126,54 @@ def pop(self) -> list[Expr]: This is not the case when an expression has multiple inputs. For instance in a ``fold`` expression. + Examples + -------- + >>> e = pl.col("foo").alias("bar") + >>> first = e.meta.pop()[0] + >>> first.meta == pl.col("foo") + True + >>> first.meta == pl.col("bar") + False + """ return [wrap_expr(e) for e in self._pyexpr.meta_pop()] def root_names(self) -> list[str]: - """Get a list with the root column name.""" + """ + Get a list with the root column name. 
+ + Examples + -------- + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.root_names() + ['foo', 'bar'] + >>> e_filter = pl.col("foo").filter(pl.col("bar") == 13) + >>> e_filter.meta.root_names() + ['foo', 'bar'] + >>> e_sum_over = pl.sum("foo").over("groups") + >>> e_sum_over.meta.root_names() + ['foo', 'groups'] + >>> e_sum_slice = pl.sum("foo").slice(pl.count() - 10, pl.col("bar")) + >>> e_sum_slice.meta.root_names() + ['foo', 'bar'] + + """ return self._pyexpr.meta_root_names() def undo_aliases(self) -> Expr: - """Undo any renaming operation like ``alias`` or ``keep_name``.""" + """ + Undo any renaming operation like ``alias`` or ``keep_name``. + + Examples + -------- + >>> e = pl.col("foo").alias("bar") + >>> e.meta.undo_aliases().meta == pl.col("foo") + True + >>> e = pl.col("foo").sum().over("bar") + >>> e.keep_name().meta.undo_aliases().meta == e + True + + """ return wrap_expr(self._pyexpr.meta_undo_aliases()) def _as_selector(self) -> Expr: @@ -135,6 +236,11 @@ def tree_format(self, return_as_string: bool = False) -> str | None: return_as_string: If True, return as string rather than printing to stdout. + Examples + -------- + >>> e = (pl.col("foo") * pl.col("bar")).sum().over(pl.col("ham")) / 2 + >>> e.meta.tree_format(return_as_string=True) # doctest: +SKIP + """ s = self._pyexpr.meta_tree_format() if return_as_string: From 5bdca8978044eb069fe5c6f45b29ffca1f812d5c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 09:17:20 +0200 Subject: [PATCH 11/55] feat(rust, python): propagate null is in `is_in` and more generic array construction (#10614) --- crates/polars-core/Cargo.toml | 4 +- .../src/chunked_array/ops/apply.rs | 39 +++++++ .../src/chunked_array/ops/bit_repr.rs | 70 ++++++++++-- .../src/chunked_array/ops/is_in.rs | 104 ++++++++---------- .../src/chunked_array/ops/min_max_binary.rs | 12 +- .../polars-core/src/datatypes/from_values.rs | 72 ++++++++++++ crates/polars-core/src/datatypes/mod.rs | 2 + .../polars-core/src/datatypes/static_array.rs | 31 +++++- .../src/frame/groupby/into_groups.rs | 16 +-- .../src/series/arithmetic/borrowed.rs | 2 +- py-polars/tests/unit/operations/test_is_in.py | 24 +++- 11 files changed, 280 insertions(+), 96 deletions(-) create mode 100644 crates/polars-core/src/datatypes/from_values.rs diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index c051e09d7306..becd9894074d 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -59,7 +59,7 @@ lazy = [] # ~40% faster collect, needed until trustedlength iter stabilizes # more fast paths, slower compilation -performant = ["polars-arrow/performant"] +performant = ["polars-arrow/performant", "reinterpret"] # extra utilities for Utf8Chunked strings = ["regex", "polars-arrow/strings", "arrow/compute_substring", "polars-error/regex"] @@ -77,7 +77,7 @@ sort_multiple = [] rows = [] # operations -is_in = [] +is_in = ["reinterpret"] zip_with = [] round_series = [] checked_arithmetic = [] diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index bff8409d0052..48cfdee12d1d 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -13,6 +13,45 @@ use crate::prelude::*; use crate::series::IsSorted; use crate::utils::{CustomIterTools, NoNull}; +impl ChunkedArray +where + T: PolarsDataType, + Self: HasUnderlyingArray, +{ + pub fn apply_values<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + where + U: PolarsDataType, + F: 
FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> K + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.values_iter().map(op); + let array = K::array_from_values_iter(element_iter); + array.with_validity_typed(arr.validity().cloned()) + }); + + ChunkedArray::from_chunk_iter(self.name(), iter) + } + pub fn apply2<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + where + U: PolarsDataType, + F: FnMut( + Option<<::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Option + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.iter().map(op); + K::array_from_iter(element_iter) + }); + + ChunkedArray::from_chunk_iter(self.name(), iter) + } +} + pub(super) fn collect_array>( iter: I, validity: Option, diff --git a/crates/polars-core/src/chunked_array/ops/bit_repr.rs b/crates/polars-core/src/chunked_array/ops/bit_repr.rs index 2724e2813287..2133405b3eb4 100644 --- a/crates/polars-core/src/chunked_array/ops/bit_repr.rs +++ b/crates/polars-core/src/chunked_array/ops/bit_repr.rs @@ -22,17 +22,47 @@ fn reinterpret_chunked_array( ChunkedArray::from_chunk_iter(ca.name(), chunks) } -#[cfg(feature = "performant")] -impl Int16Chunked { - pub(crate) fn reinterpret_unsigned(&self) -> UInt16Chunked { - reinterpret_chunked_array(self) +#[cfg(all(feature = "reinterpret", feature = "dtype-i16", feature = "dtype-u16"))] +impl Reinterpret for Int16Chunked { + fn reinterpret_signed(&self) -> Series { + self.clone().into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt16Type>(self).into_series() } } -#[cfg(feature = "performant")] -impl Int8Chunked { - pub(crate) fn reinterpret_unsigned(&self) -> UInt8Chunked { - reinterpret_chunked_array(self) +#[cfg(all(feature = "reinterpret", feature = "dtype-u16", feature = "dtype-i16"))] +impl Reinterpret for UInt16Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int16Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + self.clone().into_series() + } +} + +#[cfg(all(feature = "reinterpret", feature = "dtype-i8", feature = "dtype-u8"))] +impl Reinterpret for Int8Chunked { + fn reinterpret_signed(&self) -> Series { + self.clone().into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt8Type>(self).into_series() + } +} + +#[cfg(all(feature = "reinterpret", feature = "dtype-u8", feature = "dtype-i8"))] +impl Reinterpret for UInt8Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int8Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + self.clone().into_series() } } @@ -120,7 +150,29 @@ impl Reinterpret for Int32Chunked { } fn reinterpret_unsigned(&self) -> Series { - self.bit_repr_large().into_series() + self.bit_repr_small().into_series() + } +} + +#[cfg(feature = "reinterpret")] +impl Reinterpret for Float32Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int32Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + reinterpret_chunked_array::<_, UInt32Type>(self).into_series() + } +} + +#[cfg(feature = "reinterpret")] +impl Reinterpret for Float64Chunked { + fn reinterpret_signed(&self) -> Series { + reinterpret_chunked_array::<_, Int64Type>(self).into_series() + } + + fn reinterpret_unsigned(&self) -> Series { + 
reinterpret_chunked_array::<_, UInt64Type>(self).into_series() } } diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs index f81a5b33e53e..a53b5de54d44 100644 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ b/crates/polars-core/src/chunked_array/ops/is_in.rs @@ -3,42 +3,29 @@ use std::hash::Hash; use crate::prelude::*; use crate::utils::{try_get_supertype, CustomIterTools}; -unsafe fn is_in_helper(ca: &ChunkedArray, other: &Series) -> PolarsResult +fn is_in_helper<'a, T>(ca: &'a ChunkedArray, other: &Series) -> PolarsResult where - T: PolarsNumericType, - P: Eq + Hash + Copy, + T: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, { let mut set = PlHashSet::with_capacity(other.len()); let other = ca.unpack_series_matching_type(other)?; other.downcast_iter().for_each(|iter| { - iter.into_iter().for_each(|opt_val| { - // Safety - // bit sizes are/ should be equal - let ptr = &opt_val.copied() as *const Option as *const Option
; - let opt_val = *ptr; - set.insert(opt_val); + iter.iter().for_each(|opt_val| { + if let Some(v) = opt_val { + set.insert(v); + } }) }); - - let name = ca.name(); - let mut ca: BooleanChunked = ca - .into_iter() - .map(|opt_val| { - // Safety - // bit sizes are/ should be equal - let ptr = &opt_val as *const Option as *const Option
; - let opt_val = *ptr; - set.contains(&opt_val) - }) - .collect_trusted(); - ca.rename(name); - Ok(ca) + Ok(ca.apply_values(|val| set.contains(&val))) } impl IsIn for ChunkedArray where - T: PolarsNumericType, + T: PolarsIntegerType, + T::Native: Hash + Eq, { fn is_in(&self, other: &Series) -> PolarsResult { // We check implicitly cast to supertype here @@ -88,24 +75,7 @@ where let right = other.cast(&st)?; return left.is_in(&right); } - // now that the types are equal, we coerce every 32 bit array to u32 - // and every 64 bit array to u64 (including floats) - // this allows hashing them and greatly reduces the number of code paths. - match self.dtype() { - DataType::UInt64 | DataType::Int64 | DataType::Float64 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt32 | DataType::Int32 | DataType::Float32 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt8 | DataType::Int8 => unsafe { - is_in_helper::(self, other) - }, - DataType::UInt16 | DataType::Int16 => unsafe { - is_in_helper::(self, other) - }, - dt => polars_bail!(opq = is_in, dt), - } + is_in_helper(self, other) } } .map(|mut ca| { @@ -114,6 +84,26 @@ where }) } } + +impl IsIn for Float32Chunked { + fn is_in(&self, other: &Series) -> PolarsResult { + let other = other.cast(&DataType::Float32)?; + let other = other.f32().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = self.reinterpret_unsigned(); + ca.is_in(&other) + } +} +impl IsIn for Float64Chunked { + fn is_in(&self, other: &Series) -> PolarsResult { + let other = other.cast(&DataType::Float64)?; + let other = other.f64().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = self.reinterpret_unsigned(); + ca.is_in(&other) + } +} + impl IsIn for Utf8Chunked { fn is_in(&self, other: &Series) -> PolarsResult { match other.dtype() { @@ -209,20 +199,7 @@ impl IsIn for BinaryChunked { Ok(ca) } DataType::Binary => { - let mut set = PlHashSet::with_capacity(other.len()); - - let other = other.binary()?; - other.downcast_iter().for_each(|iter| { - iter.into_iter().for_each(|opt_val| { - set.insert(opt_val); - }) - }); - let mut ca: BooleanChunked = self - .into_iter() - .map(|opt_val| set.contains(&opt_val)) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) + is_in_helper(self, other) } _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), } @@ -363,11 +340,11 @@ impl IsIn for StructChunked { } let mut anyvalues = Vec::with_capacity(other.len() * other.fields().len()); - // Safety: + // SAFETY: // the iterator is unsafe as the lifetime is tied to the iterator // so we copy to an owned buffer first - other.into_iter().for_each(|val| { - anyvalues.extend_from_slice(val); + other.into_iter().for_each(|vals| { + anyvalues.extend_from_slice(vals); }); // then we fill the set @@ -382,7 +359,14 @@ impl IsIn for StructChunked { // and then we check for membership let mut ca: BooleanChunked = self_ca .into_iter() - .map(|vals| set.contains(&vals)) + .map(|vals| { + // If all rows are null we see the struct row as missing. 
+ if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { + Some(set.contains(&vals)) + } else { + None + } + }) .collect(); ca.rename(self.name()); Ok(ca) diff --git a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs index bfb8dcc1d014..279a4ae0719f 100644 --- a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs +++ b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs @@ -7,11 +7,11 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l: &T::Native, r: &T::Native| { + let op = |l: T::Native, r: T::Native| { if l < r { - *l + l } else { - *r + r } }; arity::binary_elementwise_values(left, right, op) @@ -22,11 +22,11 @@ where T: PolarsNumericType, T::Native: PartialOrd, { - let op = |l: &T::Native, r: &T::Native| { + let op = |l: T::Native, r: T::Native| { if l > r { - *l + l } else { - *r + r } }; arity::binary_elementwise_values(left, right, op) diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs new file mode 100644 index 000000000000..7674adbec0e1 --- /dev/null +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -0,0 +1,72 @@ +use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; +use polars_arrow::array::utf8::Utf8FromIter; +use polars_arrow::trusted_len::TrustedLen; + +use crate::prelude::StaticArray; + +pub trait ArrayFromElementIter +where + Self: Sized, +{ + type ArrayType: StaticArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType; + + fn array_from_values_iter>(iter: I) -> Self::ArrayType; +} + +impl ArrayFromElementIter for bool { + type ArrayType = BooleanArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::from_trusted_len_values_iter_unchecked(iter) } + } +} + +macro_rules! 
impl_primitive { + ($tp:ty) => { + impl ArrayFromElementIter for $tp { + type ArrayType = PrimitiveArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } + } + } + }; +} + +impl_primitive!(u8); +impl_primitive!(u16); +impl_primitive!(u32); +impl_primitive!(u64); +impl_primitive!(i8); +impl_primitive!(i16); +impl_primitive!(i32); +impl_primitive!(i64); + +impl ArrayFromElementIter for &str { + type ArrayType = Utf8Array; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + let len = iter.size_hint().0; + Utf8Array::from_values_iter(iter, len, len * 24) + } +} diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index bdb5221d6b4d..120820365207 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -12,6 +12,7 @@ mod aliases; mod any_value; mod dtype; mod field; +mod from_values; mod static_array; mod time_unit; @@ -31,6 +32,7 @@ use arrow::types::simd::Simd; use arrow::types::NativeType; pub use dtype::*; pub use field::*; +pub use from_values::ArrayFromElementIter; use num_traits::{Bounded, FromPrimitive, Num, NumCast, Zero}; use polars_arrow::data_types::IsFloat; #[cfg(feature = "serde")] diff --git a/crates/polars-core/src/datatypes/static_array.rs b/crates/polars-core/src/datatypes/static_array.rs index 46dedae311bb..ecf54b7179e7 100644 --- a/crates/polars-core/src/datatypes/static_array.rs +++ b/crates/polars-core/src/datatypes/static_array.rs @@ -1,4 +1,5 @@ use arrow::bitmap::utils::{BitmapIter, ZipValidity}; +use arrow::bitmap::Bitmap; #[cfg(feature = "object")] use crate::chunked_array::object::ObjectArray; @@ -16,18 +17,22 @@ pub trait StaticArray: Array { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter>; fn values_iter(&self) -> Self::ValueIterT<'_>; + fn with_validity_typed(self, validity: Option) -> Self; } impl StaticArray for PrimitiveArray { - type ValueT<'a> = &'a T; - type ValueIterT<'a> = std::slice::Iter<'a, T>; + type ValueT<'a> = T; + type ValueIterT<'a> = std::iter::Copied>; fn values_iter(&self) -> Self::ValueIterT<'_> { - self.values_iter() + self.values_iter().copied() } fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { - self.iter() + ZipValidity::new_with_validity(self.values().iter().copied(), self.validity()) + } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) } } @@ -42,6 +47,9 @@ impl StaticArray for BooleanArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for Utf8Array { @@ -55,6 +63,9 @@ impl StaticArray for Utf8Array { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for BinaryArray { @@ -68,6 +79,9 @@ impl StaticArray for BinaryArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, 
BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } impl StaticArray for ListArray { @@ -81,6 +95,9 @@ impl StaticArray for ListArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } #[cfg(feature = "dtype-array")] @@ -95,6 +112,9 @@ impl StaticArray for FixedSizeListArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { self.iter() } + fn with_validity_typed(self, validity: Option) -> Self { + self.with_validity(validity) + } } #[cfg(feature = "object")] @@ -109,4 +129,7 @@ impl StaticArray for ObjectArray { fn iter(&self) -> ZipValidity, Self::ValueIterT<'_>, BitmapIter> { todo!() } + fn with_validity_typed(self, _validity: Option) -> Self { + todo!() + } } diff --git a/crates/polars-core/src/frame/groupby/into_groups.rs b/crates/polars-core/src/frame/groupby/into_groups.rs index 4c0141ca41f7..5518f1a760d0 100644 --- a/crates/polars-core/src/frame/groupby/into_groups.rs +++ b/crates/polars-core/src/frame/groupby/into_groups.rs @@ -172,30 +172,30 @@ where let ca = self.bit_repr_small(); num_groups_proxy(&ca, multithreaded, sorted) }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i8", feature = "dtype-u8"))] DataType::Int8 => { // convince the compiler that we are this type. let ca: &Int8Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; - let ca = ca.reinterpret_unsigned(); - num_groups_proxy(&ca, multithreaded, sorted) + let s = ca.reinterpret_unsigned(); + return s.group_tuples(multithreaded, sorted); }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i8", feature = "dtype-u8"))] DataType::UInt8 => { // convince the compiler that we are this type. let ca: &UInt8Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; num_groups_proxy(ca, multithreaded, sorted) }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i16", feature = "dtype-u16"))] DataType::Int16 => { // convince the compiler that we are this type. let ca: &Int16Chunked = unsafe { &*(self as *const ChunkedArray as *const ChunkedArray) }; - let ca = ca.reinterpret_unsigned(); - num_groups_proxy(&ca, multithreaded, sorted) + let s = ca.reinterpret_unsigned(); + return s.group_tuples(multithreaded, sorted); }, - #[cfg(feature = "performant")] + #[cfg(all(feature = "performant", feature = "dtype-i16", feature = "dtype-u16"))] DataType::UInt16 => { // convince the compiler that we are this type. 
let ca: &UInt16Chunked = unsafe { diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index 4ca8e3e7e00b..bc71816331f7 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -163,7 +163,7 @@ pub mod checked { Ok( arity::binary_elementwise(lhs, rhs, |opt_l, opt_r| match (opt_l, opt_r) { - (Some(l), Some(r)) => l.checked_div(r), + (Some(l), Some(r)) => l.checked_div(&r), _ => None, }) .into_series(), diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 7e7a23df0c63..9066f49aa171 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -44,14 +44,9 @@ def test_is_in_empty_list_4639() -> None: df = pl.DataFrame({"a": [1, None]}) empty_list: list[int] = [] - print(df.with_columns(pl.col("a").is_in(empty_list))) assert df.with_columns([pl.col("a").is_in(empty_list).alias("a_in_list")]).to_dict( False - ) == {"a": [1, None], "a_in_list": [False, False]} - # df = pl.DataFrame() - # assert df.with_columns( - # [pl.lit(None).cast(pl.Int64).is_in(empty_list).alias("in_empty_list")] - # ).to_dict(False) == {"in_empty_list": [False]} + ) == {"a": [1, None], "a_in_list": [False, None]} def test_is_in_struct() -> None: @@ -71,6 +66,23 @@ def test_is_in_struct() -> None: } +def test_is_in_null_prop() -> None: + assert pl.Series([None], dtype=pl.Float32).is_in(pl.Series([42])).item() is None + assert ( + pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Float32})) + .is_in(pl.Series([{"a": 42}])) + .item() + is None + ) + assert pl.Series([None], dtype=pl.Boolean).is_in(pl.Series([42])).item() is None + assert ( + pl.Series([{"a": None}], dtype=pl.Struct({"a": pl.Boolean})) + .is_in(pl.Series([{"a": 42}])) + .item() + is None + ) + + def test_is_in_df() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [ From 046588fed62790c56ffd7474c62a6d5bf033a760 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 10:12:02 +0200 Subject: [PATCH 12/55] fix(rust, python): fix int/float downcast in `is_in` (#10620) --- .../src/logical_plan/optimizer/type_coercion/mod.rs | 6 +++++- py-polars/tests/unit/operations/test_is_in.py | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs index 1ae3d357438c..5ae5f8c9eda1 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs @@ -369,7 +369,11 @@ impl OptimizationRule for TypeCoercionRule { // if right is another type, we cast it to left // we do not use super-type as an `is_in` operation should not // cast the whole column implicitly. - (a, b) if a != b => { + (a, b) + if a != b + // For integer/ float comparison we let them use supertypes. 
+ && !(a.is_integer() && b.is_float()) => + { AExpr::Cast { expr: other_node, data_type: type_left, diff --git a/py-polars/tests/unit/operations/test_is_in.py b/py-polars/tests/unit/operations/test_is_in.py index 9066f49aa171..7eefd3758921 100644 --- a/py-polars/tests/unit/operations/test_is_in.py +++ b/py-polars/tests/unit/operations/test_is_in.py @@ -83,6 +83,10 @@ def test_is_in_null_prop() -> None: ) +def test_is_in_9070() -> None: + assert not pl.Series([1]).is_in(pl.Series([1.99])).item() + + def test_is_in_df() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.col("a").is_in([1, 2]))["a"].to_list() == [ From c3faa051a44ebebeb00dd14e1a1647e8b1979ef1 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Sun, 20 Aug 2023 16:34:28 +0800 Subject: [PATCH 13/55] feature(rust, python): Multiple groupby key supports list type (#10615) --- .../src/chunked_array/ops/compare_inner.rs | 39 +++++ .../src/chunked_array/ops/take/take_random.rs | 12 +- .../polars-core/src/hashing/vector_hasher.rs | 133 ++++++++++++++---- .../src/series/implementations/binary.rs | 4 +- .../src/series/implementations/boolean.rs | 4 +- .../src/series/implementations/categorical.rs | 4 +- .../src/series/implementations/dates_time.rs | 4 +- .../src/series/implementations/datetime.rs | 4 +- .../src/series/implementations/duration.rs | 4 +- .../src/series/implementations/floats.rs | 4 +- .../src/series/implementations/list.rs | 24 ++++ .../src/series/implementations/mod.rs | 4 +- .../src/series/implementations/object.rs | 4 +- .../src/series/implementations/utf8.rs | 4 +- crates/polars-error/src/lib.rs | 5 + py-polars/tests/unit/datatypes/test_list.py | 19 +++ .../tests/unit/operations/test_unique.py | 9 ++ py-polars/tests/unit/test_errors.py | 5 - 18 files changed, 228 insertions(+), 58 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/compare_inner.rs b/crates/polars-core/src/chunked_array/ops/compare_inner.rs index 59d064a506c9..a637d423f705 100644 --- a/crates/polars-core/src/chunked_array/ops/compare_inner.rs +++ b/crates/polars-core/src/chunked_array/ops/compare_inner.rs @@ -12,6 +12,7 @@ use crate::chunked_array::ops::take::take_random::{ #[cfg(feature = "object")] use crate::chunked_array::ops::take::take_random::{ObjectTakeRandom, ObjectTakeRandomSingleChunk}; use crate::prelude::*; +use crate::utils::Wrap; pub trait PartialEqInner: Send + Sync { /// Safety: @@ -77,6 +78,18 @@ impl_traits!(BoolTakeRandomSingleChunk<'_>); impl_traits!(NumTakeRandomSingleChunk<'_, T>, T); impl_traits!(NumTakeRandomChunked<'_, T>, T); +impl<'a> PartialEqInner for ListTakeRandomSingleChunk<'a> { + unsafe fn eq_element_unchecked(&self, idx_a: usize, idx_b: usize) -> bool { + self.get_unchecked(idx_a).map(Wrap) == self.get_unchecked(idx_b).map(Wrap) + } +} + +impl<'a> PartialEqInner for ListTakeRandom<'a> { + unsafe fn eq_element_unchecked(&self, idx_a: usize, idx_b: usize) -> bool { + self.get_unchecked(idx_a).map(Wrap) == self.get_unchecked(idx_b).map(Wrap) + } +} + impl PartialEqInner for NumTakeRandomCont<'_, T> where T: Copy + PartialEq + Sync, @@ -123,6 +136,32 @@ where } } +impl<'a> IntoPartialEqInner<'a> for &'a ListChunked { + fn into_partial_eq_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = ListTakeRandomSingleChunk { + arr, + name: self.name(), + }; + Box::new(t) + }, + _ => { + let name = self.name(); + let inner_type = self.inner_dtype().to_physical(); + let t = ListTakeRandom { + inner_type, + name, + chunks: 
self.downcast_iter().collect(), + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + }, + } + } +} + impl<'a> IntoPartialEqInner<'a> for &'a Utf8Chunked { fn into_partial_eq_inner(self) -> Box { match self.chunks.len() { diff --git a/crates/polars-core/src/chunked_array/ops/take/take_random.rs b/crates/polars-core/src/chunked_array/ops/take/take_random.rs index 601ad468a330..43feaca04576 100644 --- a/crates/polars-core/src/chunked_array/ops/take/take_random.rs +++ b/crates/polars-core/src/chunked_array/ops/take/take_random.rs @@ -540,10 +540,10 @@ impl<'a> TakeRandom for BoolTakeRandomSingleChunk<'a> { } pub struct ListTakeRandom<'a> { - inner_type: DataType, - name: &'a str, - chunks: Vec<&'a ListArray>, - chunk_lens: Vec, + pub(crate) inner_type: DataType, + pub(crate) name: &'a str, + pub(crate) chunks: Vec<&'a ListArray>, + pub(crate) chunk_lens: Vec, } impl<'a> TakeRandom for ListTakeRandom<'a> { @@ -579,8 +579,8 @@ impl<'a> TakeRandom for ListTakeRandom<'a> { } pub struct ListTakeRandomSingleChunk<'a> { - arr: &'a ListArray, - name: &'a str, + pub(crate) arr: &'a ListArray, + pub(crate) name: &'a str, } impl<'a> TakeRandom for ListTakeRandomSingleChunk<'a> { diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index fd635e858d9c..537e71ee9e4c 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,6 +1,8 @@ use arrow::bitmap::utils::get_bit_unchecked; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; +#[cfg(feature = "groupby_list")] +use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::utils::CustomIterTools; use rayon::prelude::*; use xxhash_rust::xxh3::xxh3_64_with_seed; @@ -22,12 +24,16 @@ pub trait VecHash { /// Compute the hash for all values in the array. /// /// This currently only works with the AHash RandomState hasher builder. - fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) { - unimplemented!() + fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + polars_bail!(un_impl = vec_hash); } - fn vec_hash_combine(&self, _random_state: RandomState, _hashes: &mut [u64]) { - unimplemented!() + fn vec_hash_combine( + &self, + _random_state: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + polars_bail!(un_impl = vec_hash_combine); } } @@ -138,12 +144,18 @@ where macro_rules! 
vec_hash_int { ($ca:ident) => { impl VecHash for $ca { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - integer_vec_hash(self, random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + integer_vec_hash(self, random_state, buf); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - integer_vec_hash_combine(self, random_state, hashes) + fn vec_hash_combine( + &self, + random_state: RandomState, + hashes: &mut [u64], + ) -> PolarsResult<()> { + integer_vec_hash_combine(self, random_state, hashes); + Ok(()) } } }; @@ -159,12 +171,14 @@ vec_hash_int!(UInt16Chunked); vec_hash_int!(UInt8Chunked); impl VecHash for Utf8Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - self.as_binary().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.as_binary().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.as_binary().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.as_binary().vec_hash_combine(random_state, hashes)?; + Ok(()) } } @@ -183,14 +197,15 @@ pub fn _hash_binary_array(arr: &BinaryArray, random_state: RandomState, buf } impl VecHash for BinaryChunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { buf.clear(); buf.reserve(self.len()); self.downcast_iter() .for_each(|arr| _hash_binary_array(arr, random_state.clone(), buf)); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { let null_h = get_null_hash_value(random_state); let mut offset = 0; @@ -222,11 +237,12 @@ impl VecHash for BinaryChunked { } offset += arr.len(); }); + Ok(()) } } impl VecHash for BooleanChunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { buf.clear(); buf.reserve(self.len()); let true_h = random_state.hash_one(true); @@ -243,9 +259,10 @@ impl VecHash for BooleanChunked { })) } }); + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { let true_h = random_state.hash_one(true); let false_h = random_state.hash_one(false); let null_h = get_null_hash_value(random_state); @@ -283,24 +300,83 @@ impl VecHash for BooleanChunked { } offset += arr.len(); }); + Ok(()) } } impl VecHash for Float32Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - self.bit_repr_small().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.bit_repr_small().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.bit_repr_small().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.bit_repr_small() + .vec_hash_combine(random_state, hashes)?; + Ok(()) } } impl VecHash for Float64Chunked { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { - 
self.bit_repr_large().vec_hash(random_state, buf) + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { + self.bit_repr_large().vec_hash(random_state, buf)?; + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { - self.bit_repr_large().vec_hash_combine(random_state, hashes) + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.bit_repr_large() + .vec_hash_combine(random_state, hashes)?; + Ok(()) + } +} + +#[cfg(feature = "groupby_list")] +impl VecHash for ListChunked { + fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + polars_ensure!( + self.inner_dtype().to_physical().is_numeric(), + ComputeError: "grouping on list type is only allowed if the inner type is numeric" + ); + _buf.clear(); + _buf.reserve(self.len()); + let null_h = get_null_hash_value(_random_state.clone()); + + for arr in self.downcast_iter() { + _buf.extend( + numeric_list_bytes_iter(arr)?.map(|opt_bytes| match opt_bytes { + Some(s) => xxh3_64_with_seed(s, null_h), + None => null_h, + }), + ) + } + Ok(()) + } + + fn vec_hash_combine( + &self, + _random_state: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + polars_ensure!( + self.inner_dtype().to_physical().is_numeric(), + ComputeError: "grouping on list type is only allowed if the inner type is numeric" + ); + + let null_h = get_null_hash_value(_random_state); + + let mut offset = 0; + self.downcast_iter().try_for_each(|arr| { + numeric_list_bytes_iter(arr)? + .zip(&mut _hashes[offset..]) + .for_each(|(opt_bytes, h)| { + let l = match opt_bytes { + Some(s) => xxh3_64_with_seed(s, null_h), + None => null_h, + }; + *h = _boost_hash_combine(l, *h) + }); + offset += arr.len(); + PolarsResult::Ok(()) + })?; + Ok(()) } } @@ -309,7 +385,7 @@ impl VecHash for ObjectChunked where T: PolarsObject, { - fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) { + fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { // Note that we don't use the no null branch! This can break in unexpected ways. // for instance with threading we split an array in n_threads, this may lead to // splits that have no nulls and splits that have nulls. 
Then one array is hashed with @@ -326,9 +402,11 @@ where hasher.finish() })) }); + + Ok(()) } - fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { self.apply_to_slice( |opt_v, h| { let mut hasher = random_state.build_hasher(); @@ -336,7 +414,8 @@ where _boost_hash_combine(hasher.finish(), *h) }, hashes, - ) + ); + Ok(()) } } diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 2e1e8f42925c..25f9a44c17ab 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -50,12 +50,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index ce46082adcc9..11baec69aaf0 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -51,12 +51,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 5e134e0273be..4430a1db84a7 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -98,12 +98,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.logical().vec_hash(random_state, buf); + self.0.logical().vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.logical().vec_hash_combine(build_hasher, hashes); + self.0.logical().vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index 886d9198cd57..477af2cd0e7f 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -77,7 +77,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -86,7 +86,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index d070e75e8a8c..cab324d2fcbe 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -75,12 +75,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index a1738ebb9fd5..5caa0deacaa1 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -80,12 +80,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index b0a09e1e0e1b..dd28d742ca21 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -76,7 +76,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -85,7 +85,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index f5c55a051950..bb4fc35987ba 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -1,8 +1,12 @@ use std::any::Any; use std::borrow::Cow; +#[cfg(feature = "groupby_list")] +use ahash::RandomState; + use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; +use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; use crate::frame::groupby::*; @@ -48,6 +52,26 @@ impl private::PrivateSeries for SeriesWrap { fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) } + + #[cfg(feature = "groupby_list")] + fn vec_hash(&self, _build_hasher: RandomState, _buf: &mut Vec) -> PolarsResult<()> { + self.0.vec_hash(_build_hasher, _buf)?; + Ok(()) + } + + #[cfg(feature = "groupby_list")] + fn vec_hash_combine( + &self, + _build_hasher: RandomState, + _hashes: &mut [u64], + ) -> PolarsResult<()> { + self.0.vec_hash_combine(_build_hasher, _hashes)?; + Ok(()) + } + + fn into_partial_eq_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_eq_inner() + } } impl SeriesTrait for SeriesWrap { diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index cfdbb490a71e..9ca9bce80b91 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -139,7 +139,7 @@ macro_rules! impl_dyn_series { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } @@ -148,7 +148,7 @@ macro_rules! 
impl_dyn_series { build_hasher: RandomState, hashes: &mut [u64], ) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index 3a8b8594d41a..01eb9fdbe0e9 100644 --- a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -55,12 +55,12 @@ where } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index ddee3fb82253..a25684c1ffd1 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -51,12 +51,12 @@ impl private::PrivateSeries for SeriesWrap { } fn vec_hash(&self, random_state: RandomState, buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(random_state, buf); + self.0.vec_hash(random_state, buf)?; Ok(()) } fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { - self.0.vec_hash_combine(build_hasher, hashes); + self.0.vec_hash_combine(build_hasher, hashes)?; Ok(()) } diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index 730f484123d0..3241c9faa54d 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -140,6 +140,11 @@ macro_rules! 
polars_err { op = concat!("`", stringify!($op), "`"), got = $arg, expected = $expected ) }; + (un_impl = $op:ident) => { + $crate::polars_err!( + InvalidOperation: "{} operation is not implemented.", concat!("`", stringify!($op), "`") + ) + }; (op = $op:expr, $arg:expr) => { $crate::polars_err!( InvalidOperation: "{} operation not supported for dtype `{}`", $op, $arg diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 962875782f73..70f2117caad4 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -190,6 +190,25 @@ def test_groupby_list_column() -> None: } +def test_groupby_multiple_keys_contains_list_column() -> None: + df = ( + pl.DataFrame( + { + "a": ["x", "x", "y", "y"], + "b": [[1, 2], [1, 2], [3, 4, 5], [6]], + "c": [3, 2, 1, 0], + } + ) + .groupby(["a", "b"], maintain_order=True) + .agg(pl.all()) + ) + assert df.to_dict(False) == { + "a": ["x", "y", "y"], + "b": [[1, 2], [3, 4, 5], [6]], + "c": [[3, 2], [1], [0]], + } + + def test_fast_explode_flag() -> None: df1 = pl.DataFrame({"values": [[[1, 2]]]}) assert df1.clone().vstack(df1)["values"].flags["FAST_EXPLODE"] diff --git a/py-polars/tests/unit/operations/test_unique.py b/py-polars/tests/unit/operations/test_unique.py index 36ba61c701d3..1162a2b4e728 100644 --- a/py-polars/tests/unit/operations/test_unique.py +++ b/py-polars/tests/unit/operations/test_unique.py @@ -26,3 +26,12 @@ def test_unique_predicate_pd() -> None: ) expected = pl.DataFrame({"x": ["abc"], "y": ["xxx"], "z": [True]}) assert_frame_equal(result, expected) + + +def test_unique_on_list_df() -> None: + assert pl.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [[1, 1], [2], [3], [4, 4], [4, 4]]} + ).unique(maintain_order=True).to_dict(False) == { + "a": [1, 2, 3, 4], + "b": [[1, 1], [2], [3], [4, 4]], + } diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index fc0b5b6d6c06..a697301353d4 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -209,11 +209,6 @@ def test_error_on_double_agg() -> None: ) -def test_unique_on_list_df() -> None: - with pytest.raises(pl.InvalidOperationError): - pl.DataFrame({"a": [1, 2, 3, 4], "b": [[1, 1], [2], [3], [4, 4]]}).unique() - - def test_filter_not_of_type_bool() -> None: df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']}) with pytest.raises( From c3f01f8244d96738b408ac54289adb06219e24cf Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 12:09:08 +0200 Subject: [PATCH 14/55] refactor(rust): remove unused apply functions and add fallible generic apply functions (#10621) --- Cargo.toml | 2 +- .../src/chunked_array/arithmetic/mod.rs | 2 +- .../src/chunked_array/arithmetic/numeric.rs | 8 +- .../chunked_array/logical/categorical/mod.rs | 2 +- crates/polars-core/src/chunked_array/mod.rs | 24 +- .../polars-core/src/chunked_array/ops/abs.rs | 2 +- .../src/chunked_array/ops/aggregate/var.rs | 6 +- .../src/chunked_array/ops/apply.rs | 297 ++++-------------- .../src/chunked_array/ops/is_in.rs | 4 +- .../polars-core/src/chunked_array/ops/mod.rs | 39 +-- .../polars-core/src/datatypes/from_values.rs | 130 +++++++- crates/polars-core/src/functions.rs | 4 +- .../src/series/arithmetic/borrowed.rs | 26 +- crates/polars-core/src/series/ops/round.rs | 16 +- .../src/chunked_array/binary/namespace.rs | 8 +- .../src/chunked_array/strings/json_path.rs | 4 +- .../src/chunked_array/strings/namespace.rs | 8 +- 
crates/polars-ops/src/frame/pivot/mod.rs | 2 +- .../polars-ops/src/series/ops/floor_divide.rs | 8 +- crates/polars-ops/src/series/ops/log.rs | 16 +- .../polars-plan/src/dsl/function_expr/pow.rs | 14 +- .../polars-plan/src/dsl/function_expr/sign.rs | 2 +- .../src/dsl/function_expr/strings.rs | 26 +- .../src/dsl/function_expr/trigonometry.rs | 34 +- crates/polars-time/src/base_utc_offset.rs | 2 +- crates/polars-time/src/dst_offset.rs | 2 +- crates/polars/src/docs/eager.rs | 6 +- py-polars/Cargo.lock | 2 +- 28 files changed, 305 insertions(+), 391 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 735b905a14b0..da68e35af692 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" git = "https://github.com/jorgecarleitao/arrow2" -rev = "7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +rev = "2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" # branch = "" # version = "0.17.4" default-features = false diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 6a51424b4e21..101eab32120e 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -189,7 +189,7 @@ impl Add for &BooleanChunked { if rhs.len() == 1 { let rhs = rhs.get(0); return match rhs { - Some(rhs) => self.apply_cast_numeric(|v| v as IdxSize + rhs as IdxSize), + Some(rhs) => self.apply_values_generic(|v| v as IdxSize + rhs as IdxSize), None => IdxCa::full_null(self.name(), self.len()), }; } diff --git a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index ebfb835c715f..f0e5fa53ac12 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -18,14 +18,14 @@ where let opt_rhs = rhs.get(0); match opt_rhs { None => ChunkedArray::full_null(lhs.name(), lhs.len()), - Some(rhs) => lhs.apply(|lhs| operation(lhs, rhs)), + Some(rhs) => lhs.apply_values(|lhs| operation(lhs, rhs)), } }, (1, _) => { let opt_lhs = lhs.get(0); match opt_lhs { None => ChunkedArray::full_null(lhs.name(), rhs.len()), - Some(lhs) => rhs.apply(|rhs| operation(lhs, rhs)), + Some(lhs) => rhs.apply_values(|rhs| operation(lhs, rhs)), } }, _ => panic!("Cannot apply operation on arrays of different lengths"), @@ -253,7 +253,7 @@ where fn add(self, rhs: N) -> Self::Output { let adder: T::Native = NumCast::from(rhs).unwrap(); - let mut out = self.apply(|val| val + adder); + let mut out = self.apply_values(|val| val + adder); out.set_sorted_flag(self.is_sorted_flag()); out } @@ -268,7 +268,7 @@ where fn sub(self, rhs: N) -> Self::Output { let subber: T::Native = NumCast::from(rhs).unwrap(); - let mut out = self.apply(|val| val - subber); + let mut out = self.apply_values(|val| val - subber); out.set_sorted_flag(self.is_sorted_flag()); out } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index f809a474190e..6841d6da5ba1 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -74,7 +74,7 @@ impl CategoricalChunked { // we can skip the apply and only update the rev_map let local_ca = self .logical() - .apply_on_opt(|opt_v| opt_v.map(|v| *physical_map.get(&v).unwrap())); + .apply(|opt_v| opt_v.map(|v| 
*physical_map.get(&v).unwrap())); let mut out = unsafe { Self::from_cats_and_rev_map_unchecked(local_ca, local_rev_map.into()) }; diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 6bde255bb244..19f4600b7510 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -68,30 +68,8 @@ pub type ChunkIdIter<'a> = std::iter::Map, fn(&Ar /// /// ```rust /// # use polars_core::prelude::*; -/// fn apply_cosine(ca: &Float32Chunked) -> Float32Chunked { -/// ca.apply(|v| v.cos()) -/// } -/// ``` -/// -/// If we would like to cast the result we could use a Rust Iterator instead of an `apply` method. -/// Note that Iterators are slightly slower as the null values aren't ignored implicitly. -/// -/// ```rust -/// # use polars_core::prelude::*; -/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked { -/// ca.into_iter() -/// .map(|opt_v| { -/// opt_v.map(|v| v.cos() as f64) -/// }).collect() -/// } -/// ``` -/// -/// Another option is to first cast and then use an apply. -/// -/// ```rust -/// # use polars_core::prelude::*; /// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked { -/// ca.apply_cast_numeric(|v| v.cos() as f64) +/// ca.apply_values_generic(|v| v.cos() as f64) /// } /// ``` /// diff --git a/crates/polars-core/src/chunked_array/ops/abs.rs b/crates/polars-core/src/chunked_array/ops/abs.rs index 2e4da7d44c13..f0e035b6ecf0 100644 --- a/crates/polars-core/src/chunked_array/ops/abs.rs +++ b/crates/polars-core/src/chunked_array/ops/abs.rs @@ -9,6 +9,6 @@ where /// Convert all values to their absolute/positive value. #[must_use] pub fn abs(&self) -> Self { - self.apply(|v| v.abs()) + self.apply_values(|v| v.abs()) } } diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/var.rs b/crates/polars-core/src/chunked_array/ops/aggregate/var.rs index fa7bbe3102b3..d7f4a4828591 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/var.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/var.rs @@ -23,7 +23,7 @@ where let n_values = n_values as f64; let mean = self.mean()?; - let squared = self.apply_cast_numeric::<_, Float64Type>(|value| { + let squared: Float64Chunked = ChunkedArray::apply_values_generic(self, |value| { let tmp = value.to_f64().unwrap() - mean; tmp * tmp }); @@ -50,7 +50,7 @@ impl ChunkVar for Float32Chunked { let n_values = n_values as f32; let mean = self.mean()? as f32; - let squared = self.apply(|value| { + let squared = self.apply_values(|value| { let tmp = value - mean; tmp * tmp }); @@ -74,7 +74,7 @@ impl ChunkVar for Float64Chunked { let n_values = n_values as f64; let mean = self.mean()?; - let squared = self.apply(|value| { + let squared = self.apply_values(|value| { let tmp = value - mean; tmp * tmp }); diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 48cfdee12d1d..d2fad1295b85 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -1,6 +1,7 @@ //! Implementations of the ChunkApply Trait. 
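// A minimal usage sketch of the generic value-apply path on ChunkedArray introduced by this
// commit, mirroring the doc example in chunked_array/mod.rs above; it assumes a
// `Float32Chunked` named `ca` is in scope. The closure maps only the non-null values, the
// output dtype (here Float64) is inferred from the closure's return type, and the existing
// validity bitmap is carried over so nulls stay null:
//
//     fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float64Chunked {
//         ca.apply_values_generic(|v| v.cos() as f64)
//     }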
use std::borrow::Cow; use std::convert::TryFrom; +use std::error::Error; use arrow::array::{BooleanArray, PrimitiveArray}; use arrow::bitmap::utils::{get_bit_unchecked, set_bit_unchecked}; @@ -11,14 +12,14 @@ use polars_arrow::bitmap::unary_mut; use crate::prelude::*; use crate::series::IsSorted; -use crate::utils::{CustomIterTools, NoNull}; +use crate::utils::CustomIterTools; impl ChunkedArray where T: PolarsDataType, Self: HasUnderlyingArray, { - pub fn apply_values<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + pub fn apply_values_generic<'a, U, K, F>(&'a self, op: F) -> ChunkedArray where U: PolarsDataType, F: FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> K + Copy, @@ -33,7 +34,46 @@ where ChunkedArray::from_chunk_iter(self.name(), iter) } - pub fn apply2<'a, U, K, F>(&'a self, op: F) -> ChunkedArray + + pub fn try_apply_values_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> + where + U: PolarsDataType, + F: FnMut(<::ArrayT as StaticArray>::ValueT<'a>) -> Result + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.values_iter().map(op); + let array = K::try_array_from_values_iter(element_iter)?; + Ok(array.with_validity_typed(arr.validity().cloned())) + }); + + ChunkedArray::try_from_chunk_iter(self.name(), iter) + } + + pub fn try_apply_generic<'a, U, K, F, E>(&'a self, op: F) -> Result, E> + where + U: PolarsDataType, + F: FnMut( + Option<<::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Result, E> + + Copy, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, + { + let iter = self.downcast_iter().map(|arr| { + let element_iter = arr.iter().map(op); + let array = K::try_array_from_iter(element_iter)?; + Ok(array.with_validity_typed(arr.validity().cloned())) + }); + + ChunkedArray::try_from_chunk_iter(self.name(), iter) + } + + pub fn apply_generic<'a, U, K, F>(&'a self, op: F) -> ChunkedArray where U: PolarsDataType, F: FnMut( @@ -85,24 +125,6 @@ macro_rules! apply { }}; } -macro_rules! 
apply_enumerate { - ($self:expr, $f:expr) => {{ - if !$self.has_validity() { - $self - .into_no_null_iter() - .enumerate() - .map($f) - .collect_trusted() - } else { - $self - .into_iter() - .enumerate() - .map(|(idx, opt_v)| opt_v.map(|v| $f((idx, v)))) - .collect_trusted() - } - }}; -} - fn apply_in_place_impl(name: &str, chunks: Vec, f: F) -> ChunkedArray where F: Fn(S::Native) -> S::Native + Copy, @@ -190,7 +212,7 @@ where { type FuncRet = T::Native; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(T::Native) -> T::Native + Copy, { @@ -219,7 +241,7 @@ where Ok(ca) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { @@ -230,44 +252,6 @@ where Self::from_chunk_iter(self.name(), chunks) } - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, T::Native)) -> T::Native + Copy, - { - if !self.has_validity() { - let ca: NoNull<_> = self - .into_no_null_iter() - .enumerate() - .map(f) - .collect_trusted(); - ca.into_inner() - } else { - // we know that we only iterate over length == self.len() - unsafe { - self.downcast_iter() - .flatten() - .trust_my_length(self.len()) - .enumerate() - .map(|(idx, opt_v)| opt_v.map(|v| f((idx, *v)))) - .collect_trusted() - } - } - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - // we know that we only iterate over length == self.len() - unsafe { - self.downcast_iter() - .flatten() - .trust_my_length(self.len()) - .enumerate() - .map(|(idx, v)| f((idx, v.copied()))) - .collect_trusted() - } - } fn apply_to_slice(&'a self, f: F, slice: &mut [V]) where F: Fn(Option, &V) -> V, @@ -290,7 +274,7 @@ where impl<'a> ChunkApply<'a, bool> for BooleanChunked { type FuncRet = bool; - fn apply(&self, f: F) -> Self + fn apply_values(&self, f: F) -> Self where F: Fn(bool) -> bool + Copy, { @@ -351,25 +335,11 @@ impl<'a> ChunkApply<'a, bool> for BooleanChunked { Ok(ret) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { - self.into_iter().map(f).collect_trusted() - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, bool)) -> bool + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - self.into_iter().enumerate().map(f).collect_trusted() + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -426,50 +396,25 @@ impl BinaryChunked { impl<'a> ChunkApply<'a, &'a str> for Utf8Chunked { type FuncRet = Cow<'a, str>; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a str) -> Cow<'a, str> + Copy, { - use polars_arrow::array::utf8::Utf8FromIter; - let chunks = self.downcast_iter().map(|arr| { - let iter = arr.values_iter().map(f); - let size_hint = (arr.get_values_size() as f64 * 1.3) as usize; - let new = Utf8Array::::from_values_iter(iter, arr.len(), size_hint); - new.with_validity(arr.validity().cloned()) - }); - Utf8Chunked::from_chunk_iter(self.name(), chunks) + ChunkedArray::apply_values_generic(self, f) } fn try_apply(&'a self, f: F) -> PolarsResult where F: Fn(&'a str) -> PolarsResult> + Copy, { - try_apply!(self, f) + self.try_apply_values_generic(f) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a str>) -> Option> + Copy, { - let mut ca: Self = self.into_iter().map(f).collect_trusted(); - 
ca.rename(self.name()); - ca - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, &'a str)) -> Cow<'a, str> + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option<&'a str>)) -> Option> + Copy, - { - let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); - ca.rename(self.name()); - ca + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -494,43 +439,25 @@ impl<'a> ChunkApply<'a, &'a str> for Utf8Chunked { impl<'a> ChunkApply<'a, &'a [u8]> for BinaryChunked { type FuncRet = Cow<'a, [u8]>; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a [u8]) -> Cow<'a, [u8]> + Copy, { - apply!(self, f) + self.apply_values_generic(f) } fn try_apply(&'a self, f: F) -> PolarsResult where F: Fn(&'a [u8]) -> PolarsResult> + Copy, { - try_apply!(self, f) + self.try_apply_values_generic(f) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a [u8]>) -> Option> + Copy, { - let mut ca: Self = self.into_iter().map(f).collect_trusted(); - ca.rename(self.name()); - ca - } - - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, &'a [u8])) -> Cow<'a, [u8]> + Copy, - { - apply_enumerate!(self, f) - } - - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option<&'a [u8]>)) -> Option> + Copy, - { - let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); - ca.rename(self.name()); - ca + self.apply_generic(f) } fn apply_to_slice(&'a self, f: F, slice: &mut [T]) @@ -618,7 +545,7 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { type FuncRet = Series; /// Apply a closure `F` elementwise. - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(Series) -> Series + Copy, { @@ -666,7 +593,7 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { Ok(ca) } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy, { @@ -676,54 +603,6 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { self.into_iter().map(f).collect_trusted() } - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, Series)) -> Series + Copy, - { - if self.is_empty() { - return self.clone(); - } - let mut fast_explode = true; - let mut function = |(idx, s)| { - let out = f((idx, s)); - if out.is_empty() { - fast_explode = false; - } - out - }; - let mut ca: ListChunked = apply_enumerate!(self, function); - if fast_explode { - ca.set_fast_explode() - } - ca - } - - /// Apply a closure elementwise. The closure gets the index of the element as first argument. 
- fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy, - { - if self.is_empty() { - return self.clone(); - } - let mut fast_explode = true; - let function = |(idx, s)| { - let out = f((idx, s)); - if let Some(out) = &out { - if out.is_empty() { - fast_explode = false; - } - } - out - }; - let mut ca: ListChunked = self.into_iter().enumerate().map(function).collect_trusted(); - if fast_explode { - ca.set_fast_explode() - } - ca - } - fn apply_to_slice(&'a self, f: F, slice: &mut [T]) where F: Fn(Option, &T) -> T, @@ -752,7 +631,7 @@ where { type FuncRet = T; - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(&'a T) -> T + Copy, { @@ -768,7 +647,7 @@ where todo!() } - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option<&'a T>) -> Option + Copy, { @@ -777,20 +656,6 @@ where ca } - fn apply_with_idx(&'a self, _f: F) -> Self - where - F: Fn((usize, &'a T)) -> T + Copy, - { - todo!() - } - - fn apply_with_idx_on_opt(&'a self, _f: F) -> Self - where - F: Fn((usize, Option<&'a T>)) -> Option + Copy, - { - todo!() - } - fn apply_to_slice(&'a self, f: F, slice: &mut [V]) where F: Fn(Option<&'a T>, &V) -> V, @@ -808,41 +673,3 @@ where }); } } - -impl<'a, T: PolarsDataType> ChunkApplyCast<'a> for ChunkedArray -where - ChunkedArray: HasUnderlyingArray, -{ - fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray - where - F: Fn(<::ArrayT as StaticArray>::ValueT<'a>) -> R::Native - + Copy, - R: PolarsNumericType, - { - let chunks = self.downcast_iter().map(|array| { - let values = array.values_iter().map(f); - collect_array(values, array.validity().cloned()) - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) - } - - fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray - where - F: Fn( - Option<<::ArrayT as StaticArray>::ValueT<'a>>, - ) -> R::Native - + Copy, - R: PolarsNumericType, - { - let chunks = self.downcast_iter().map(|array| { - if array.null_count() == 0 { - let values = array.values_iter().map(|v| f(Some(v))); - collect_array(values, None) - } else { - let values = array.iter().map(f); - collect_array(values, None) - } - }); - ChunkedArray::from_chunk_iter(self.name(), chunks) - } -} diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs index a53b5de54d44..cf4becb1cb21 100644 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ b/crates/polars-core/src/chunked_array/ops/is_in.rs @@ -19,7 +19,7 @@ where } }) }); - Ok(ca.apply_values(|val| set.contains(&val))) + Ok(ca.apply_values_generic(|val| set.contains(&val))) } impl IsIn for ChunkedArray @@ -256,7 +256,7 @@ impl IsIn for BooleanChunked { } else { !(other.sum().unwrap() as usize + nc) == other.len() }; - Ok(self.apply(|v| if v { has_true } else { has_false })) + Ok(self.apply_values(|v| if v { has_true } else { has_false })) } _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), } diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 3420a041813a..6e0cc146a6b8 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -296,27 +296,6 @@ pub trait ChunkCast { unsafe fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult; } -pub trait ChunkApplyCast<'a>: HasUnderlyingArray { - /// Apply a closure elementwise and cast to a Numeric [`ChunkedArray`]. 
This is fastest when the null check branching is more expensive - /// than the closure application. - /// - /// Null values remain null. - fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray - where - F: Fn(<::ArrayT as StaticArray>::ValueT<'a>) -> R::Native - + Copy, - R: PolarsNumericType; - - /// Apply a closure on optional values and cast to Numeric ChunkedArray without null values. - fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray - where - F: Fn( - Option<<::ArrayT as StaticArray>::ValueT<'a>>, - ) -> R::Native - + Copy, - R: PolarsNumericType; -} - /// Fastest way to do elementwise operations on a [`ChunkedArray`] when the operation is cheaper than /// branching due to null checking. pub trait ChunkApply<'a, T> { @@ -332,11 +311,11 @@ pub trait ChunkApply<'a, T> { /// ``` /// use polars_core::prelude::*; /// fn double(ca: &UInt32Chunked) -> UInt32Chunked { - /// ca.apply(|v| v * 2) + /// ca.apply_values(|v| v * 2) /// } /// ``` #[must_use] - fn apply(&'a self, f: F) -> Self + fn apply_values(&'a self, f: F) -> Self where F: Fn(T) -> Self::FuncRet + Copy; @@ -347,22 +326,10 @@ pub trait ChunkApply<'a, T> { /// Apply a closure elementwise including null values. #[must_use] - fn apply_on_opt(&'a self, f: F) -> Self + fn apply(&'a self, f: F) -> Self where F: Fn(Option) -> Option + Copy; - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - #[must_use] - fn apply_with_idx(&'a self, f: F) -> Self - where - F: Fn((usize, T)) -> Self::FuncRet + Copy; - - /// Apply a closure elementwise. The closure gets the index of the element as first argument. - #[must_use] - fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where - F: Fn((usize, Option)) -> Option + Copy; - /// Apply a closure elementwise and write results to a mutable slice. 
fn apply_to_slice(&'a self, f: F, slice: &mut [S]) // (value of chunkedarray, value of slice) -> value of slice diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs index 7674adbec0e1..236495a27305 100644 --- a/crates/polars-core/src/datatypes/from_values.rs +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -1,5 +1,13 @@ -use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; -use polars_arrow::array::utf8::Utf8FromIter; +use std::borrow::Cow; +use std::error::Error; + +use arrow::array::{ + BinaryArray, BooleanArray, MutableBinaryArray, MutableBinaryValuesArray, MutablePrimitiveArray, + MutableUtf8Array, MutableUtf8ValuesArray, PrimitiveArray, Utf8Array, +}; +use arrow::bitmap::Bitmap; +use polars_arrow::array::utf8::{BinaryFromIter, Utf8FromIter}; +use polars_arrow::prelude::FromData; use polars_arrow::trusted_len::TrustedLen; use crate::prelude::StaticArray; @@ -13,6 +21,14 @@ where fn array_from_iter>>(iter: I) -> Self::ArrayType; fn array_from_values_iter>(iter: I) -> Self::ArrayType; + + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result; + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result; } impl ArrayFromElementIter for bool { @@ -27,6 +43,20 @@ impl ArrayFromElementIter for bool { // SAFETY: guarded by `TrustedLen` trait unsafe { BooleanArray::from_trusted_len_values_iter_unchecked(iter) } } + + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BooleanArray::try_from_trusted_len_iter_unchecked(iter) } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + let values = unsafe { Bitmap::try_from_trusted_len_iter_unchecked(iter) }?; + Ok(BooleanArray::from_data_default(values, None)) + } } macro_rules! impl_primitive { @@ -43,6 +73,20 @@ macro_rules! 
impl_primitive { // SAFETY: guarded by `TrustedLen` trait unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { + Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) + } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let values: Vec<_> = iter.collect::, _>>()?; + Ok(PrimitiveArray::from_vec(values)) + } } }; } @@ -55,6 +99,8 @@ impl_primitive!(i8); impl_primitive!(i16); impl_primitive!(i32); impl_primitive!(i64); +impl_primitive!(f32); +impl_primitive!(f64); impl ArrayFromElementIter for &str { type ArrayType = Utf8Array; @@ -64,9 +110,89 @@ impl ArrayFromElementIter for &str { unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } } + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + let len = iter.size_hint().0; + Utf8Array::from_values_iter(iter, len, len * 24) + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } +} + +impl ArrayFromElementIter for Cow<'_, str> { + type ArrayType = Utf8Array; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Utf8Array::from_trusted_len_iter_unchecked(iter) } + } + fn array_from_values_iter>(iter: I) -> Self::ArrayType { // SAFETY: guarded by `TrustedLen` trait let len = iter.size_hint().0; Utf8Array::from_values_iter(iter, len, len * 24) } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8Array::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableUtf8ValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } +} + +impl ArrayFromElementIter for Cow<'_, [u8]> { + type ArrayType = BinaryArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { BinaryArray::from_trusted_len_iter_unchecked(iter) } + } + + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + let len = iter.size_hint().0; + BinaryArray::from_values_iter(iter, len, len * 24) + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableBinaryArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } + + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let len = iter.size_hint().0; + let mut mutable = MutableBinaryValuesArray::::with_capacities(len, len * 24); + mutable.extend_fallible(iter)?; + Ok(mutable.into()) + } } diff --git a/crates/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs index 6c15c1bb910a..515913b43aad 100644 --- a/crates/polars-core/src/functions.rs +++ b/crates/polars-core/src/functions.rs @@ -49,8 +49,8 @@ where } else { let a_mean = a.mean()?; let b_mean = b.mean()?; - let a = a.apply_cast_numeric::<_, 
Float64Type>(|a| a.to_f64().unwrap() - a_mean); - let b = b.apply_cast_numeric(|b| b.to_f64().unwrap() - b_mean); + let a: Float64Chunked = a.apply_values_generic(|a| a.to_f64().unwrap() - a_mean); + let b: Float64Chunked = b.apply_values_generic(|b| b.to_f64().unwrap() - b_mean); let tmp = a * b; let n = tmp.len() - tmp.null_count(); diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index bc71816331f7..d34227aa4198 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -234,50 +234,50 @@ pub mod checked { UInt8 => s .u8() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u8().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u8().unwrap()))) .into_series(), #[cfg(feature = "dtype-i8")] Int8 => s .i8() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i8().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i8().unwrap()))) .into_series(), #[cfg(feature = "dtype-i16")] Int16 => s .i16() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i16().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i16().unwrap()))) .into_series(), #[cfg(feature = "dtype-u16")] UInt16 => s .u16() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u16().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u16().unwrap()))) .into_series(), UInt32 => s .u32() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u32().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u32().unwrap()))) .into_series(), Int32 => s .i32() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i32().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i32().unwrap()))) .into_series(), UInt64 => s .u64() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u64().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_u64().unwrap()))) .into_series(), Int64 => s .i64() .unwrap() - .apply_on_opt(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i64().unwrap()))) + .apply(|opt_v| opt_v.and_then(|v| v.checked_div(rhs.to_i64().unwrap()))) .into_series(), Float32 => s .f32() .unwrap() - .apply_on_opt(|opt_v| { + .apply(|opt_v| { opt_v.and_then(|v| { let res = rhs.to_f32().unwrap(); if res.is_zero() { @@ -291,7 +291,7 @@ pub mod checked { Float64 => s .f64() .unwrap() - .apply_on_opt(|opt_v| { + .apply(|opt_v| { opt_v.and_then(|v| { let res = rhs.to_f64().unwrap(); if res.is_zero() { @@ -698,21 +698,21 @@ where #[must_use] pub fn lhs_sub(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs - v) + self.apply_values(|v| lhs - v) } /// Apply lhs / self #[must_use] pub fn lhs_div(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs / v) + self.apply_values(|v| lhs / v) } /// Apply lhs % self #[must_use] pub fn lhs_rem(&self, lhs: N) -> Self { let lhs: T::Native = NumCast::from(lhs).expect("could not cast"); - self.apply(|v| lhs % v) + self.apply_values(|v| lhs % v) } } diff --git a/crates/polars-core/src/series/ops/round.rs b/crates/polars-core/src/series/ops/round.rs index c58006ba3dbe..edcd3f31cbca 100644 --- a/crates/polars-core/src/series/ops/round.rs +++ b/crates/polars-core/src/series/ops/round.rs @@ -8,26 +8,26 @@ impl 
Series { pub fn round(&self, decimals: u32) -> PolarsResult { if let Ok(ca) = self.f32() { if decimals == 0 { - let s = ca.apply(|val| val.round()).into_series(); + let s = ca.apply_values(|val| val.round()).into_series(); return Ok(s); } else { // Note we do the computation on f64 floats to not lose precision // when the computation is done, we cast to f32 let multiplier = 10.0.pow(decimals as f64); let s = ca - .apply(|val| ((val as f64 * multiplier).round() / multiplier) as f32) + .apply_values(|val| ((val as f64 * multiplier).round() / multiplier) as f32) .into_series(); return Ok(s); } } if let Ok(ca) = self.f64() { if decimals == 0 { - let s = ca.apply(|val| val.round()).into_series(); + let s = ca.apply_values(|val| val.round()).into_series(); return Ok(s); } else { let multiplier = 10.0.pow(decimals as f64); let s = ca - .apply(|val| (val * multiplier).round() / multiplier) + .apply_values(|val| (val * multiplier).round() / multiplier) .into_series(); return Ok(s); } @@ -38,11 +38,11 @@ impl Series { /// Floor underlying floating point array to the lowest integers smaller or equal to the float value. pub fn floor(&self) -> PolarsResult { if let Ok(ca) = self.f32() { - let s = ca.apply(|val| val.floor()).into_series(); + let s = ca.apply_values(|val| val.floor()).into_series(); return Ok(s); } if let Ok(ca) = self.f64() { - let s = ca.apply(|val| val.floor()).into_series(); + let s = ca.apply_values(|val| val.floor()).into_series(); return Ok(s); } polars_bail!(opq = floor, self.dtype()); @@ -51,11 +51,11 @@ impl Series { /// Ceil underlying floating point array to the highest integers smaller or equal to the float value. pub fn ceil(&self) -> PolarsResult { if let Ok(ca) = self.f32() { - let s = ca.apply(|val| val.ceil()).into_series(); + let s = ca.apply_values(|val| val.ceil()).into_series(); return Ok(s); } if let Ok(ca) = self.f64() { - let s = ca.apply(|val| val.ceil()).into_series(); + let s = ca.apply_values(|val| val.ceil()).into_series(); return Ok(s); } polars_bail!(opq = ceil, self.dtype()); diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs index 2a674925cb44..59c444b4ac1d 100644 --- a/crates/polars-ops/src/chunked_array/binary/namespace.rs +++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs @@ -60,7 +60,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { Ok(bytes.into()) }) } else { - Ok(ca.apply_on_opt(|opt_s| opt_s.and_then(|s| hex::decode(s).ok().map(Cow::Owned)))) + Ok(ca.apply(|opt_s| opt_s.and_then(|s| hex::decode(s).ok().map(Cow::Owned)))) } } @@ -68,7 +68,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { fn hex_encode(&self) -> Series { let ca = self.as_binary(); unsafe { - ca.apply(|s| hex::encode(s).into_bytes().into()) + ca.apply_values(|s| hex::encode(s).into_bytes().into()) .cast_unchecked(&DataType::Utf8) .unwrap() } @@ -88,7 +88,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { Ok(bytes.into()) }) } else { - Ok(ca.apply_on_opt(|opt_s| { + Ok(ca.apply(|opt_s| { opt_s.and_then(|s| general_purpose::STANDARD.decode(s).ok().map(Cow::Owned)) })) } @@ -98,7 +98,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { fn base64_encode(&self) -> Series { let ca = self.as_binary(); unsafe { - ca.apply(|s| general_purpose::STANDARD.encode(s).into_bytes().into()) + ca.apply_values(|s| general_purpose::STANDARD.encode(s).into_bytes().into()) .cast_unchecked(&DataType::Utf8) .unwrap() } diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs 
b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 0f10895c689b..39ebd91e4f46 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -45,7 +45,7 @@ pub trait Utf8JsonPathImpl: AsUtf8 { .map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression {}", e))?; Ok(self .as_utf8() - .apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))) + .apply(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))) } /// Returns the inferred DataType for JSON values for each row @@ -93,7 +93,7 @@ pub trait Utf8JsonPathImpl: AsUtf8 { .map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression: {}", e))?; Ok(self .as_utf8() - .apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))) + .apply(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))) } fn json_path_extract( diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 50374fd0a927..ab9b4919abed 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -29,7 +29,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { #[cfg(feature = "string_encoding")] fn hex_encode(&self) -> Utf8Chunked { let ca = self.as_utf8(); - ca.apply(|s| hex::encode(s).into()) + ca.apply_values(|s| hex::encode(s).into()) } #[cfg(not(feature = "binary_encoding"))] @@ -47,7 +47,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { #[cfg(feature = "string_encoding")] fn base64_encode(&self) -> Utf8Chunked { let ca = self.as_utf8(); - ca.apply(|s| general_purpose::STANDARD.encode(s).into()) + ca.apply_values(|s| general_purpose::STANDARD.encode(s).into()) } #[cfg(feature = "string_from_radix")] @@ -178,7 +178,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { let reg = Regex::new(pat)?; let f = |s: &'a str| reg.replace(s, val); let ca = self.as_utf8(); - Ok(ca.apply(f)) + Ok(ca.apply_values(f)) } /// Replace the leftmost literal (sub)string with another string @@ -235,7 +235,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { fn replace_all(&self, pat: &str, val: &str) -> PolarsResult { let ca = self.as_utf8(); let reg = Regex::new(pat)?; - Ok(ca.apply(|s| reg.replace_all(s, val))) + Ok(ca.apply_values(|s| reg.replace_all(s, val))) } /// Replace all matching literal (sub)strings with another string diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index c76474540547..9867dede28d9 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -241,7 +241,7 @@ fn pivot_impl( let headers = column_agg.unique_stable()?.cast(&DataType::Utf8)?; let mut headers = headers.utf8().unwrap().clone(); if values.len() > 1 { - headers = headers.apply(|v| Cow::from(format!("{value_col_name}{sep}{column_column_name}{sep}{v}"))) + headers = headers.apply_values(|v| Cow::from(format!("{value_col_name}{sep}{column_column_name}{sep}{v}"))) } let n_cols = headers.len(); diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 1bf7800070f8..933519cdd2a4 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -57,9 +57,9 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) let name = a.name(); return if let Some(a) = a.get(0) { let mut out = if b.null_count() == 0 { - b.apply(|b| floor_div_element(a, b)) + b.apply_values(|b| floor_div_element(a, b)) } 
else { - b.apply_on_opt(|b| b.map(|b| floor_div_element(a, b))) + b.apply(|b| b.map(|b| floor_div_element(a, b))) }; out.rename(name); out @@ -70,9 +70,9 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) if b.len() == 1 { return if let Some(b) = b.get(0) { if a.null_count() == 0 { - a.apply(|a| floor_div_element(a, b)) + a.apply_values(|a| floor_div_element(a, b)) } else { - a.apply_on_opt(|a| a.map(|a| floor_div_element(a, b))) + a.apply(|a| a.map(|a| floor_div_element(a, b))) } } else { ChunkedArray::full_null(a.name(), a.len()) diff --git a/crates/polars-ops/src/series/ops/log.rs b/crates/polars-ops/src/series/ops/log.rs index 174ca7bc33cc..d73fad24d27e 100644 --- a/crates/polars-ops/src/series/ops/log.rs +++ b/crates/polars-ops/src/series/ops/log.rs @@ -26,8 +26,12 @@ pub trait LogSeries: SeriesSealed { Int64 => log(s.i64().unwrap(), base).into_series(), UInt32 => log(s.u32().unwrap(), base).into_series(), UInt64 => log(s.u64().unwrap(), base).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.log(base as f32)).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.log(base)).into_series(), + Float32 => s + .f32() + .unwrap() + .apply_values(|v| v.log(base as f32)) + .into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.log(base)).into_series(), _ => s.cast(&DataType::Float64).unwrap().log(base), } } @@ -43,8 +47,8 @@ pub trait LogSeries: SeriesSealed { Int64 => log1p(s.i64().unwrap()).into_series(), UInt32 => log1p(s.u32().unwrap()).into_series(), UInt64 => log1p(s.u64().unwrap()).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.ln_1p()).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.ln_1p()).into_series(), + Float32 => s.f32().unwrap().apply_values(|v| v.ln_1p()).into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.ln_1p()).into_series(), _ => s.cast(&DataType::Float64).unwrap().log1p(), } } @@ -60,8 +64,8 @@ pub trait LogSeries: SeriesSealed { Int64 => exp(s.i64().unwrap()).into_series(), UInt32 => exp(s.u32().unwrap()).into_series(), UInt64 => exp(s.u64().unwrap()).into_series(), - Float32 => s.f32().unwrap().apply(|v| v.exp()).into_series(), - Float64 => s.f64().unwrap().apply(|v| v.exp()).into_series(), + Float32 => s.f32().unwrap().apply_values(|v| v.exp()).into_series(), + Float64 => s.f64().unwrap().apply_values(|v| v.exp()).into_series(), _ => s.cast(&DataType::Float64).unwrap().exp(), } } diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index 530041707365..cfb1fe14dff3 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -42,7 +42,7 @@ where a if a == 1.0 => base.clone().into_series(), // specialized sqrt will ensure (-inf)^0.5 = NaN // and will likely be faster as well. 
- a if a == 0.5 => base.apply(|v| v.sqrt()).into_series(), + a if a == 0.5 => base.apply_values(|v| v.sqrt()).into_series(), a if a.fract() == 0.0 && a < 10.0 && a > 1.0 => { let mut out = base.clone(); @@ -51,7 +51,9 @@ where } out.into_series() }, - _ => base.apply(|v| Pow::pow(v, exponent_value)).into_series(), + _ => base + .apply_values(|v| Pow::pow(v, exponent_value)) + .into_series(), }; Ok(Some(s)) } else if (base.len() == 1) && (exponent.len() != 1) { @@ -60,7 +62,9 @@ where .ok_or_else(|| polars_err!(ComputeError: "base is null"))?; Ok(Some( - exponent.apply(|exp| Pow::pow(base, exp)).into_series(), + exponent + .apply_values(|exp| Pow::pow(base, exp)) + .into_series(), )) } else { Ok(Some( @@ -129,7 +133,7 @@ where T::Native: num::pow::Pow + ToPrimitive + Float, ChunkedArray: IntoSeries, { - Ok(base.apply(|v| v.sqrt()).into_series()) + Ok(base.apply_values(|v| v.sqrt()).into_series()) } pub(super) fn cbrt(base: &Series) -> PolarsResult { @@ -156,5 +160,5 @@ where T::Native: num::pow::Pow + ToPrimitive + Float, ChunkedArray: IntoSeries, { - Ok(base.apply(|v| v.cbrt()).into_series()) + Ok(base.apply_values(|v| v.cbrt()).into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/sign.rs b/crates/polars-plan/src/dsl/function_expr/sign.rs index 6951f3a9cf45..41707664e3ac 100644 --- a/crates/polars-plan/src/dsl/function_expr/sign.rs +++ b/crates/polars-plan/src/dsl/function_expr/sign.rs @@ -27,7 +27,7 @@ where T::Native: num::Float, ChunkedArray: IntoSeries, { - ca.apply(signum_improved).into_series().cast(&Int64) + ca.apply_values(signum_improved).into_series().cast(&Int64) } // Wrapper for the signum function that handles +/-0.0 inputs differently diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 32abb7cf8596..287356050f66 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -331,15 +331,15 @@ pub(super) fn strip(s: &Series, matches: Option<&str>) -> PolarsResult { if matches.chars().count() == 1 { // Fast path for when a single character is passed Ok(ca - .apply(|s| Cow::Borrowed(s.trim_matches(matches.chars().next().unwrap()))) + .apply_values(|s| Cow::Borrowed(s.trim_matches(matches.chars().next().unwrap()))) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim())).into_series()) + Ok(ca.apply_values(|s| Cow::Borrowed(s.trim())).into_series()) } } @@ -350,15 +350,19 @@ pub(super) fn lstrip(s: &Series, matches: Option<&str>) -> PolarsResult if matches.chars().count() == 1 { // Fast path for when a single character is passed Ok(ca - .apply(|s| Cow::Borrowed(s.trim_start_matches(matches.chars().next().unwrap()))) + .apply_values(|s| { + Cow::Borrowed(s.trim_start_matches(matches.chars().next().unwrap())) + }) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_start_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_start_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim_start())).into_series()) + Ok(ca + .apply_values(|s| Cow::Borrowed(s.trim_start())) + .into_series()) } } @@ -368,15 +372,19 @@ pub(super) fn rstrip(s: &Series, matches: Option<&str>) -> PolarsResult if matches.chars().count() == 1 { // Fast path for when a single character is passed 
Ok(ca - .apply(|s| Cow::Borrowed(s.trim_end_matches(matches.chars().next().unwrap()))) + .apply_values(|s| { + Cow::Borrowed(s.trim_end_matches(matches.chars().next().unwrap())) + }) .into_series()) } else { Ok(ca - .apply(|s| Cow::Borrowed(s.trim_end_matches(|c| matches.contains(c)))) + .apply_values(|s| Cow::Borrowed(s.trim_end_matches(|c| matches.contains(c)))) .into_series()) } } else { - Ok(ca.apply(|s| Cow::Borrowed(s.trim_end())).into_series()) + Ok(ca + .apply_values(|s| Cow::Borrowed(s.trim_end())) + .into_series()) } } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 99cd90cee546..32a57a623e59 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -120,13 +120,13 @@ where .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 x value is null"))?; - Ok(Some(y.apply(|v| v.atan2(x_value)).into_series())) + Ok(Some(y.apply_values(|v| v.atan2(x_value)).into_series())) } else if y.len() == 1 { let y_value = y .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 y value is null"))?; - Ok(Some(x.apply(|v| y_value.atan2(v)).into_series())) + Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_series())) } else { Ok(Some( polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), @@ -168,7 +168,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cos()).into_series()) + Ok(ca.apply_values(|v| v.cos()).into_series()) } fn cot(ca: &ChunkedArray) -> PolarsResult @@ -177,7 +177,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cos() / v.sin()).into_series()) + Ok(ca.apply_values(|v| v.cos() / v.sin()).into_series()) } fn sin(ca: &ChunkedArray) -> PolarsResult @@ -186,7 +186,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.sin()).into_series()) + Ok(ca.apply_values(|v| v.sin()).into_series()) } fn tan(ca: &ChunkedArray) -> PolarsResult @@ -195,7 +195,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.tan()).into_series()) + Ok(ca.apply_values(|v| v.tan()).into_series()) } fn arccos(ca: &ChunkedArray) -> PolarsResult @@ -204,7 +204,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.acos()).into_series()) + Ok(ca.apply_values(|v| v.acos()).into_series()) } fn arcsin(ca: &ChunkedArray) -> PolarsResult @@ -213,7 +213,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.asin()).into_series()) + Ok(ca.apply_values(|v| v.asin()).into_series()) } fn arctan(ca: &ChunkedArray) -> PolarsResult @@ -222,7 +222,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.atan()).into_series()) + Ok(ca.apply_values(|v| v.atan()).into_series()) } fn cosh(ca: &ChunkedArray) -> PolarsResult @@ -231,7 +231,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.cosh()).into_series()) + Ok(ca.apply_values(|v| v.cosh()).into_series()) } fn sinh(ca: &ChunkedArray) -> PolarsResult @@ -240,7 +240,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.sinh()).into_series()) + Ok(ca.apply_values(|v| v.sinh()).into_series()) } fn tanh(ca: &ChunkedArray) -> PolarsResult @@ -249,7 +249,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.tanh()).into_series()) + Ok(ca.apply_values(|v| v.tanh()).into_series()) } fn arccosh(ca: &ChunkedArray) -> PolarsResult @@ -258,7 +258,7 @@ where T::Native: Float, 
ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.acosh()).into_series()) + Ok(ca.apply_values(|v| v.acosh()).into_series()) } fn arcsinh(ca: &ChunkedArray) -> PolarsResult @@ -267,7 +267,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.asinh()).into_series()) + Ok(ca.apply_values(|v| v.asinh()).into_series()) } fn arctanh(ca: &ChunkedArray) -> PolarsResult @@ -276,7 +276,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.atanh()).into_series()) + Ok(ca.apply_values(|v| v.atanh()).into_series()) } fn degrees(ca: &ChunkedArray) -> PolarsResult @@ -285,7 +285,7 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.to_degrees()).into_series()) + Ok(ca.apply_values(|v| v.to_degrees()).into_series()) } fn radians(ca: &ChunkedArray) -> PolarsResult @@ -294,5 +294,5 @@ where T::Native: Float, ChunkedArray: IntoSeries, { - Ok(ca.apply(|v| v.to_radians()).into_series()) + Ok(ca.apply_values(|v| v.to_radians()).into_series()) } diff --git a/crates/polars-time/src/base_utc_offset.rs b/crates/polars-time/src/base_utc_offset.rs index a5c944885df0..128bfe5d23cb 100644 --- a/crates/polars-time/src/base_utc_offset.rs +++ b/crates/polars-time/src/base_utc_offset.rs @@ -21,7 +21,7 @@ pub fn base_utc_offset( TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - ca.0.apply(|t| { + ca.0.apply_values(|t| { let ndt = timestamp_to_datetime(t); let dt = time_zone.from_utc_datetime(&ndt); dt.offset().base_utc_offset().num_milliseconds() diff --git a/crates/polars-time/src/dst_offset.rs b/crates/polars-time/src/dst_offset.rs index 58e4f34259ec..74c91c19ecec 100644 --- a/crates/polars-time/src/dst_offset.rs +++ b/crates/polars-time/src/dst_offset.rs @@ -18,7 +18,7 @@ pub fn dst_offset(ca: &DatetimeChunked, time_unit: &TimeUnit, time_zone: &Tz) -> TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - ca.0.apply(|t| { + ca.0.apply_values(|t| { let ndt = timestamp_to_datetime(t); let dt = time_zone.from_utc_datetime(&ndt); dt.offset().dst_offset().num_milliseconds() diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 01e1ccde3068..f8030818d94d 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -148,7 +148,7 @@ //! let ca = UInt32Chunked::new("foo", &[1, 2, 3]); //! //! // 1 / ca -//! let divide_one_by_ca = ca.apply(|rhs| 1 / rhs); +//! let divide_one_by_ca = ca.apply_values(|rhs| 1 / rhs); //! ``` //! //! ## Comparisons @@ -245,11 +245,11 @@ //! //! // apply a closure over all values //! let s = Series::new("foo", &[Some(1), Some(2), None]); -//! s.i32()?.apply(|value| value * 20); +//! s.i32()?.apply_values(|value| value * 20); //! //! // count string lengths //! let s = Series::new("foo", &["foo", "bar", "foobar"]); -//! s.utf8()?.apply_cast_numeric::<_, UInt64Type>(|str_val| str_val.len() as u64); +//! s.utf8()?.apply_values_generic(|str_val| str_val.len() as u64); //! //! # Ok(()) //! 
# } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 286f80fe8ae0..32fef5e7a898 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,7 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "git+https://github.com/jorgecarleitao/arrow2?rev=7edf5f9e359e0ed02e9d0c6b9318b06964d805f0#7edf5f9e359e0ed02e9d0c6b9318b06964d805f0" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=2b3e2a9e83725a557d78b90cd39298c5bef0ca4a#2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" dependencies = [ "ahash", "arrow-format", From bccb11af03af615195b28c5e3bd77355965b5006 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 20 Aug 2023 12:53:44 +0200 Subject: [PATCH 15/55] feat(rust): improve binary (arity) generics (#10622) --- .../src/chunked_array/arithmetic/mod.rs | 4 +- .../src/chunked_array/arithmetic/numeric.rs | 2 +- .../polars-core/src/chunked_array/bitwise.rs | 6 +- .../src/chunked_array/ops/apply.rs | 2 +- .../src/chunked_array/ops/arity.rs | 169 ++++++++++++++++-- .../src/chunked_array/ops/filter.rs | 10 +- .../src/chunked_array/ops/repeat_by.rs | 8 +- .../polars-core/src/datatypes/from_values.rs | 69 +++---- .../src/series/arithmetic/borrowed.rs | 24 +-- .../polars-ops/src/chunked_array/list/sets.rs | 2 +- .../polars-ops/src/series/ops/floor_divide.rs | 2 +- .../polars-plan/src/dsl/function_expr/pow.rs | 2 +- .../src/dsl/function_expr/trigonometry.rs | 2 +- 13 files changed, 214 insertions(+), 88 deletions(-) diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index 101eab32120e..ecfad06720d9 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -148,7 +148,7 @@ impl Add for &BinaryChunked { }; } - arity::binary_mut(self, rhs, concat_binary) + arity::binary(self, rhs, concat_binary) } } @@ -197,7 +197,7 @@ impl Add for &BooleanChunked { if self.len() == 1 { return rhs.add(self); } - arity::binary_mut(self, rhs, add_boolean) + arity::binary(self, rhs, add_boolean) } } diff --git a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs index f0e5fa53ac12..6efa9a3ffa13 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/numeric.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/numeric.rs @@ -12,7 +12,7 @@ where F: Fn(T::Native, T::Native) -> T::Native, { let mut ca = match (lhs.len(), rhs.len()) { - (a, b) if a == b => arity::binary_mut(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), + (a, b) if a == b => arity::binary(lhs, rhs, |lhs, rhs| kernel(lhs, rhs)), // broadcast right path (_, 1) => { let opt_rhs = rhs.get(0); diff --git a/crates/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs index ea9372ef3adc..908549a947cc 100644 --- a/crates/polars-core/src/chunked_array/bitwise.rs +++ b/crates/polars-core/src/chunked_array/bitwise.rs @@ -72,7 +72,7 @@ impl BitOr for &BooleanChunked { _ => {}, } - arity::binary_mut(self, rhs, compute::boolean_kleene::or) + arity::binary(self, rhs, compute::boolean_kleene::or) } } @@ -117,7 +117,7 @@ impl BitXor for &BooleanChunked { _ => {}, } - arity::binary_mut(self, rhs, |l_arr, r_arr| { + arity::binary(self, rhs, |l_arr, r_arr| { let validity = combine_validities_and(l_arr.validity(), r_arr.validity()); let values = l_arr.values() ^ r_arr.values(); BooleanArray::from_data_default(values, validity) @@ -158,7 +158,7 @@ impl BitAnd for &BooleanChunked { 
_ => {}, } - arity::binary_mut(self, rhs, compute::boolean_kleene::and) + arity::binary(self, rhs, compute::boolean_kleene::and) } } diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index d2fad1295b85..18a34d5566da 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -92,7 +92,7 @@ where } } -pub(super) fn collect_array>( +fn collect_array>( iter: I, validity: Option, ) -> PrimitiveArray { diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index 287ab18adb89..4214c41deccc 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -1,15 +1,17 @@ -use arrow::array::{Array, PrimitiveArray}; +use std::error::Error; + +use arrow::array::Array; use polars_arrow::utils::combine_validities_and; -use crate::chunked_array::ops::apply::collect_array; use crate::datatypes::{ - HasUnderlyingArray, PolarsNumericType, StaticArray, StaticallyMatchesPolarsType, + ArrayFromElementIter, HasUnderlyingArray, PolarsNumericType, StaticArray, + StaticallyMatchesPolarsType, }; use crate::prelude::{ChunkedArray, PolarsDataType}; use crate::utils::align_chunks_binary; #[inline] -pub fn binary_elementwise( +pub fn binary_elementwise( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -17,30 +19,66 @@ pub fn binary_elementwise( where T: PolarsDataType, U: PolarsDataType, - V: PolarsNumericType, + V: PolarsDataType, ChunkedArray: HasUnderlyingArray, ChunkedArray: HasUnderlyingArray, F: for<'a> FnMut( Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, - ) -> Option, + ) -> Option, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs .downcast_iter() .zip(rhs.downcast_iter()) .map(|(lhs_arr, rhs_arr)| { - lhs_arr + let element_iter = lhs_arr .iter() .zip(rhs_arr.iter()) - .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)) - .collect::>() + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); + K::array_from_iter(element_iter) }); ChunkedArray::from_chunk_iter(lhs.name(), iter) } #[inline] -pub fn binary_elementwise_values( +pub fn try_binary_elementwise( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + Option<< as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>>, + ) -> Result, E>, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let element_iter = lhs_arr + .iter() + .zip(rhs_arr.iter()) + .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); + K::try_array_from_iter(element_iter) + }); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + +#[inline] +pub fn binary_elementwise_values( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -54,7 +92,9 @@ where F: for<'a> FnMut( < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, - ) -> 
V::Native, + ) -> K, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, { let (lhs, rhs) = align_chunks_binary(lhs, rhs); let iter = lhs @@ -63,15 +103,55 @@ where .map(|(lhs_arr, rhs_arr)| { let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); - let iter = lhs_arr + let element_iter = lhs_arr .values_iter() .zip(rhs_arr.values_iter()) .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); - collect_array(iter, validity) + + let array = K::array_from_values_iter(element_iter); + array.with_validity_typed(validity) }); ChunkedArray::from_chunk_iter(lhs.name(), iter) } +#[inline] +pub fn try_binary_elementwise_values( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsNumericType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: for<'a> FnMut( + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>, + ) -> Result, + K: ArrayFromElementIter, + K::ArrayType: StaticallyMatchesPolarsType, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let validity = combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); + + let element_iter = lhs_arr + .values_iter() + .zip(rhs_arr.values_iter()) + .map(|(lhs_val, rhs_val)| op(lhs_val, rhs_val)); + + let array = K::try_array_from_values_iter(element_iter)?; + Ok(array.with_validity_typed(validity)) + }); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + /// Applies a kernel that produces `Array` types. #[inline] pub fn binary_mut_with_options( @@ -101,7 +181,7 @@ where } /// Applies a kernel that produces `Array` types. -pub fn binary_mut( +pub fn binary( lhs: &ChunkedArray, rhs: &ChunkedArray, op: F, @@ -121,12 +201,39 @@ where binary_mut_with_options(lhs, rhs, op, lhs.name()) } +/// Applies a kernel that produces `Array` types. +pub fn try_binary( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + V: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + Arr: Array + StaticallyMatchesPolarsType, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Result, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let iter = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); + ChunkedArray::try_from_chunk_iter(lhs.name(), iter) +} + /// Applies a kernel that produces `ArrayRef` of the same type. /// /// # Safety /// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. #[inline] -pub unsafe fn binary_mut_unchecked_same_type( +pub unsafe fn binary_unchecked_same_type( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, @@ -151,3 +258,35 @@ where .collect(); lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode) } + +/// Applies a kernel that produces `ArrayRef` of the same type. +/// +/// # Safety +/// Caller must ensure that the returned `ArrayRef` belongs to `T: PolarsDataType`. 
+#[inline] +pub unsafe fn try_binary_unchecked_same_type( + lhs: &ChunkedArray, + rhs: &ChunkedArray, + mut op: F, + keep_sorted: bool, + keep_fast_explode: bool, +) -> Result, E> +where + T: PolarsDataType, + U: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + ChunkedArray: HasUnderlyingArray, + F: FnMut( + & as HasUnderlyingArray>::ArrayT, + & as HasUnderlyingArray>::ArrayT, + ) -> Result, E>, + E: Error, +{ + let (lhs, rhs) = align_chunks_binary(lhs, rhs); + let chunks = lhs + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)) + .collect::, E>>()?; + Ok(lhs.copy_with_chunks(chunks, keep_sorted, keep_fast_explode)) +} diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 408902b3258b..7543cff66583 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -30,7 +30,7 @@ where } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -52,7 +52,7 @@ impl ChunkFilter for BooleanChunked { } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -81,7 +81,7 @@ impl ChunkFilter for BinaryChunked { } check_filter_len!(self, filter); Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -105,7 +105,7 @@ impl ChunkFilter for ListChunked { }; } Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), @@ -130,7 +130,7 @@ impl ChunkFilter for ArrayChunked { }; } Ok(unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( self, filter, |left, mask| filter_fn(left, mask).unwrap(), diff --git a/crates/polars-core/src/chunked_array/ops/repeat_by.rs b/crates/polars-core/src/chunked_array/ops/repeat_by.rs index 3932b644ad9f..419065689dff 100644 --- a/crates/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/crates/polars-core/src/chunked_array/ops/repeat_by.rs @@ -31,7 +31,7 @@ where )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v.copied()).take(*by as usize)) }); @@ -56,7 +56,7 @@ impl RepeatBy for BooleanChunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); @@ -80,7 +80,7 @@ impl RepeatBy for Utf8Chunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); @@ -104,7 +104,7 @@ impl RepeatBy for BinaryChunked { )); } - Ok(arity::binary_mut(self, by, |arr, by| { + Ok(arity::binary(self, by, |arr, by| { let iter = arr.into_iter().zip(by).map(|(opt_v, opt_by)| { opt_by.map(|by| std::iter::repeat(opt_v).take(*by as usize)) }); diff --git a/crates/polars-core/src/datatypes/from_values.rs b/crates/polars-core/src/datatypes/from_values.rs index 236495a27305..07341355caa9 100644 --- 
a/crates/polars-core/src/datatypes/from_values.rs +++ b/crates/polars-core/src/datatypes/from_values.rs @@ -10,6 +10,7 @@ use polars_arrow::array::utf8::{BinaryFromIter, Utf8FromIter}; use polars_arrow::prelude::FromData; use polars_arrow::trusted_len::TrustedLen; +use crate::datatypes::NumericNative; use crate::prelude::StaticArray; pub trait ArrayFromElementIter @@ -59,48 +60,34 @@ impl ArrayFromElementIter for bool { } } -macro_rules! impl_primitive { - ($tp:ty) => { - impl ArrayFromElementIter for $tp { - type ArrayType = PrimitiveArray; - - fn array_from_iter>>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } - } - - fn array_from_values_iter>(iter: I) -> Self::ArrayType { - // SAFETY: guarded by `TrustedLen` trait - unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } - } - fn try_array_from_iter, E>>>( - iter: I, - ) -> Result { - // SAFETY: guarded by `TrustedLen` trait - unsafe { - Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) - } - } - fn try_array_from_values_iter>>( - iter: I, - ) -> Result { - let values: Vec<_> = iter.collect::, _>>()?; - Ok(PrimitiveArray::from_vec(values)) - } - } - }; -} +impl ArrayFromElementIter for T +where + T: NumericNative, +{ + type ArrayType = PrimitiveArray; + + fn array_from_iter>>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_iter_unchecked(iter) } + } -impl_primitive!(u8); -impl_primitive!(u16); -impl_primitive!(u32); -impl_primitive!(u64); -impl_primitive!(i8); -impl_primitive!(i16); -impl_primitive!(i32); -impl_primitive!(i64); -impl_primitive!(f32); -impl_primitive!(f64); + fn array_from_values_iter>(iter: I) -> Self::ArrayType { + // SAFETY: guarded by `TrustedLen` trait + unsafe { PrimitiveArray::from_trusted_len_values_iter_unchecked(iter) } + } + fn try_array_from_iter, E>>>( + iter: I, + ) -> Result { + // SAFETY: guarded by `TrustedLen` trait + unsafe { Ok(MutablePrimitiveArray::try_from_trusted_len_iter_unchecked(iter)?.into()) } + } + fn try_array_from_values_iter>>( + iter: I, + ) -> Result { + let values: Vec<_> = iter.collect::, _>>()?; + Ok(PrimitiveArray::from_vec(values)) + } +} impl ArrayFromElementIter for &str { type ArrayType = Utf8Array; diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index d34227aa4198..bb04ddc9c976 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -177,10 +177,10 @@ pub mod checked { // see check_div for chunkedarray let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - Ok( - arity::binary_elementwise::<_, _, Float32Type, _>(lhs, rhs, |opt_l, opt_r| match ( - opt_l, opt_r, - ) { + Ok(arity::binary_elementwise::<_, _, Float32Type, _, _>( + lhs, + rhs, + |opt_l, opt_r| match (opt_l, opt_r) { (Some(l), Some(r)) => { if r.is_zero() { None @@ -189,9 +189,9 @@ pub mod checked { } }, _ => None, - }) - .into_series(), + }, ) + .into_series()) } } @@ -201,10 +201,10 @@ pub mod checked { // see check_div let rhs = unsafe { lhs.unpack_series_matching_physical_type(rhs) }; - Ok( - arity::binary_elementwise::<_, _, Float64Type, _>(lhs, rhs, |opt_l, opt_r| match ( - opt_l, opt_r, - ) { + Ok(arity::binary_elementwise::<_, _, Float64Type, _, _>( + lhs, + rhs, + |opt_l, opt_r| match (opt_l, opt_r) { (Some(l), Some(r)) => { if r.is_zero() { None @@ 
-213,9 +213,9 @@ pub mod checked { } }, _ => None, - }) - .into_series(), + }, ) + .into_series()) } } diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index a442e820a420..fe3fff9a3a78 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -282,7 +282,7 @@ fn array_set_operation( pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { // we use the unsafe variant because we want to keep the nested logical types type. unsafe { - arity::binary_mut_unchecked_same_type( + arity::binary_unchecked_same_type( a, b, |a, b| array_set_operation(a, b, set_op).boxed(), diff --git a/crates/polars-ops/src/series/ops/floor_divide.rs b/crates/polars-ops/src/series/ops/floor_divide.rs index 933519cdd2a4..c8ae34dbb272 100644 --- a/crates/polars-ops/src/series/ops/floor_divide.rs +++ b/crates/polars-ops/src/series/ops/floor_divide.rs @@ -78,7 +78,7 @@ fn floor_div_ca(a: &ChunkedArray, b: &ChunkedArray) ChunkedArray::full_null(a.name(), a.len()) }; } - arity::binary_mut(a, b, floor_div_array) + arity::binary(a, b, floor_div_array) } pub fn floor_div_series(a: &Series, b: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index cfb1fe14dff3..dc88256ad1e0 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -68,7 +68,7 @@ where )) } else { Ok(Some( - polars_core::chunked_array::ops::arity::binary_mut(base, exponent, pow_kernel) + polars_core::chunked_array::ops::arity::binary(base, exponent, pow_kernel) .into_series(), )) } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 32a57a623e59..a24a0ebb94ab 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -129,7 +129,7 @@ where Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_series())) } else { Ok(Some( - polars_core::prelude::arity::binary_mut(y, x, atan2_kernel).into_series(), + polars_core::prelude::arity::binary(y, x, atan2_kernel).into_series(), )) } } From a75495ac1a4c6844ee82ba12f00ba9d6849b14a6 Mon Sep 17 00:00:00 2001 From: Sam Damashek Date: Sun, 20 Aug 2023 19:31:38 +0800 Subject: [PATCH 16/55] feat(python, rust!): Read/write support for IPC streams in DataFrames (#10606) --- crates/polars-io/src/ipc/ipc_stream.rs | 11 +-- crates/polars/tests/it/io/ipc_stream.rs | 7 +- py-polars/Cargo.toml | 2 + py-polars/docs/source/reference/io.rst | 2 + py-polars/polars/__init__.py | 2 + py-polars/polars/dataframe/frame.py | 120 +++++++++++++++++++++++- py-polars/polars/io/__init__.py | 3 +- py-polars/polars/io/ipc/__init__.py | 3 +- py-polars/polars/io/ipc/functions.py | 75 +++++++++++++++ py-polars/src/dataframe.rs | 50 ++++++++++ py-polars/tests/unit/io/test_ipc.py | 72 +++++++++----- 11 files changed, 308 insertions(+), 39 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index 52f07918c68c..3a3b6a399e65 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -237,17 +237,16 @@ fn fix_column_order(df: DataFrame, projection: Option>, row_count: bo #[must_use] pub struct IpcStreamWriter { writer: W, - compression: Option, + compression: Option, } use polars_core::frame::ArrowChunk; 
-pub use write::Compression as IpcCompression; use crate::RowCount; impl IpcStreamWriter { /// Set the compression used. Defaults to None. - pub fn with_compression(mut self, compression: Option) -> Self { + pub fn with_compression(mut self, compression: Option) -> Self { self.compression = compression; self } @@ -268,7 +267,7 @@ where let mut ipc_stream_writer = write::StreamWriter::new( &mut self.writer, WriteOptions { - compression: self.compression, + compression: self.compression.map(|c| c.into()), }, ); @@ -286,7 +285,7 @@ where } pub struct IpcStreamWriterOption { - compression: Option, + compression: Option, extension: PathBuf, } @@ -299,7 +298,7 @@ impl IpcStreamWriterOption { } /// Set the compression used. Defaults to None. - pub fn with_compression(mut self, compression: Option) -> Self { + pub fn with_compression(mut self, compression: Option) -> Self { self.compression = compression; self } diff --git a/crates/polars/tests/it/io/ipc_stream.rs b/crates/polars/tests/it/io/ipc_stream.rs index 1bb070af99a0..eb369b284f40 100644 --- a/crates/polars/tests/it/io/ipc_stream.rs +++ b/crates/polars/tests/it/io/ipc_stream.rs @@ -2,7 +2,6 @@ mod test { use std::io::Cursor; - use polars::export::arrow::io::ipc::write; use polars_core::df; use polars_core::prelude::*; use polars_io::ipc::*; @@ -105,11 +104,7 @@ mod test { fn test_write_with_compression() { let mut df = create_df(); - let compressions = vec![ - None, - Some(write::Compression::LZ4), - Some(write::Compression::ZSTD), - ]; + let compressions = vec![None, Some(IpcCompression::LZ4), Some(IpcCompression::ZSTD)]; for compression in compressions.into_iter() { let mut buf: Cursor> = Cursor::new(Vec::new()); diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 741c3f358f7b..ce25037a4e48 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -101,6 +101,7 @@ dtype-u16 = [] avro = ["polars/avro"] parquet = ["polars/parquet"] ipc = ["polars/ipc"] +ipc_streaming = ["polars/ipc_streaming"] is_in = ["polars/is_in"] json = ["polars/serde", "serde_json", "polars/json"] trigonometry = ["polars/trigonometry"] @@ -145,6 +146,7 @@ all = [ "json", "parquet", "ipc", + "ipc_streaming", "avro", "is_in", "repeat_by", diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index 6c315f70929c..d83afcffd10e 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -19,9 +19,11 @@ Feather/ IPC :toctree: api/ read_ipc + read_ipc_stream scan_ipc read_ipc_schema DataFrame.write_ipc + DataFrame.write_ipc_stream LazyFrame.sink_ipc Parquet diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index fac8e02788ad..12557c8bb62b 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -158,6 +158,7 @@ read_excel, read_ipc, read_ipc_schema, + read_ipc_stream, read_json, read_ndjson, read_parquet, @@ -250,6 +251,7 @@ "read_excel", "read_ipc", "read_ipc_schema", + "read_ipc_stream", "read_json", "read_ndjson", "read_parquet", diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 19c09a790fcd..6e8794d791c6 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -904,9 +904,10 @@ def _read_ipc( memory_map: bool = True, ) -> Self: """ - Read into a DataFrame from Arrow IPC stream format. + Read into a DataFrame from Arrow IPC file format. - Arrow IPC is also know as Feather (v2). 
+ See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. Parameters ---------- @@ -972,6 +973,58 @@ def _read_ipc( ) return self + @classmethod + def _read_ipc_stream( + cls, + source: str | Path | BinaryIO | bytes, + *, + columns: Sequence[int] | Sequence[str] | None = None, + n_rows: int | None = None, + row_count_name: str | None = None, + row_count_offset: int = 0, + rechunk: bool = True, + ) -> Self: + """ + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + """ + if isinstance(source, (str, Path)): + source = normalise_filepath(source) + if isinstance(columns, str): + columns = [columns] + + projection, columns = handle_projection_columns(columns) + self = cls.__new__(cls) + self._df = PyDataFrame.read_ipc_stream( + source, + columns, + projection, + n_rows, + _prepare_row_count_args(row_count_name, row_count_offset), + rechunk, + ) + return self + @classmethod def _read_json( cls, @@ -3085,6 +3138,8 @@ def write_ipc( """ Write to Arrow IPC binary stream or Feather file. + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + Parameters ---------- file @@ -3120,6 +3175,67 @@ def write_ipc( self._df.write_ipc(file, compression) return file if return_bytes else None # type: ignore[return-value] + @overload + def write_ipc_stream( + self, + file: None, + compression: IpcCompression = "uncompressed", + ) -> BytesIO: + ... + + @overload + def write_ipc_stream( + self, + file: BinaryIO | BytesIO | str | Path, + compression: IpcCompression = "uncompressed", + ) -> None: + ... + + def write_ipc_stream( + self, + file: BinaryIO | BytesIO | str | Path | None, + compression: IpcCompression = "uncompressed", + ) -> BytesIO | None: + """ + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {'uncompressed', 'lz4', 'zstd'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + """ + return_bytes = file is None + if return_bytes: + file = BytesIO() + elif isinstance(file, (str, Path)): + file = normalise_filepath(file) + + if compression is None: + compression = "uncompressed" + + self._df.write_ipc_stream(file, compression) + return file if return_bytes else None # type: ignore[return-value] + def write_parquet( self, file: str | Path | BytesIO, diff --git a/py-polars/polars/io/__init__.py b/py-polars/polars/io/__init__.py index 4ed17a139803..7243007e82fc 100644 --- a/py-polars/polars/io/__init__.py +++ b/py-polars/polars/io/__init__.py @@ -5,7 +5,7 @@ from polars.io.database import read_database from polars.io.delta import read_delta, scan_delta from polars.io.excel import read_excel -from polars.io.ipc import read_ipc, read_ipc_schema, scan_ipc +from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc from polars.io.json import read_json from polars.io.ndjson import read_ndjson, scan_ndjson from polars.io.parquet import read_parquet, read_parquet_schema, scan_parquet @@ -19,6 +19,7 @@ "read_delta", "read_excel", "read_ipc", + "read_ipc_stream", "read_ipc_schema", "read_json", "read_ndjson", diff --git a/py-polars/polars/io/ipc/__init__.py b/py-polars/polars/io/ipc/__init__.py index e0f4b0d4be27..9423bbceb829 100644 --- a/py-polars/polars/io/ipc/__init__.py +++ b/py-polars/polars/io/ipc/__init__.py @@ -1,7 +1,8 @@ -from polars.io.ipc.functions import read_ipc, read_ipc_schema, scan_ipc +from polars.io.ipc.functions import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc __all__ = [ "read_ipc", + "read_ipc_stream", "read_ipc_schema", "scan_ipc", ] diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index e2bda1b8a7d4..7f661cc2d33f 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -111,6 +111,81 @@ def read_ipc( ) +def read_ipc_stream( + source: str | BinaryIO | BytesIO | Path | bytes, + *, + columns: list[int] | list[str] | None = None, + n_rows: int | None = None, + use_pyarrow: bool = False, + storage_options: dict[str, Any] | None = None, + row_count_name: str | None = None, + row_count_offset: int = 0, + rechunk: bool = True, +) -> DataFrame: + """ + Read into a DataFrame from Arrow IPC record batch stream. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to objects + that have a ``read()`` method, such as a file handler (e.g. via builtin ``open`` + function) or ``BytesIO``). If ``fsspec`` is installed, it will be used to open + remote files. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a list + of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + Only valid when `use_pyarrow=False`. + use_pyarrow + Use pyarrow or the native Rust reader. + storage_options + Extra options that make sense for ``fsspec.open()`` or a particular storage + connection, e.g. host, port, username, password, etc. + row_count_name + If not None, this will insert a row count column with give name into the + DataFrame + row_count_offset + Offset to start the row_count column (only use if the name is set) + rechunk + Make sure that all data is contiguous. 
+ + Returns + ------- + DataFrame + + """ + storage_options = storage_options or {} + with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: + if use_pyarrow: + if not _PYARROW_AVAILABLE: + raise ImportError( + "'pyarrow' is required when using" + " 'read_ipc_stream(..., use_pyarrow=True)'" + ) + + import pyarrow as pa + + with pa.ipc.RecordBatchStreamReader(data) as reader: + tbl = reader.read_all() + df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk) + if row_count_name is not None: + df = df.with_row_count(row_count_name, row_count_offset) + if n_rows is not None: + df = df.slice(0, n_rows) + return df + + return pl.DataFrame._read_ipc_stream( + data, + columns=columns, + n_rows=n_rows, + row_count_name=row_count_name, + row_count_offset=row_count_offset, + rechunk=rechunk, + ) + + def read_ipc_schema(source: str | BinaryIO | Path | bytes) -> dict[str, PolarsDataType]: """ Get the schema of an IPC file without reading data. diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 2ffc201629ae..b3e77cf07f96 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -304,6 +304,30 @@ impl PyDataFrame { Ok(PyDataFrame::new(df)) } + #[staticmethod] + #[cfg(feature = "ipc_streaming")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_count, rechunk))] + pub fn read_ipc_stream( + py_f: &PyAny, + columns: Option>, + projection: Option>, + n_rows: Option, + row_count: Option<(String, IdxSize)>, + rechunk: bool, + ) -> PyResult { + let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); + let mmap_bytes_r = get_mmap_bytes_reader(py_f)?; + let df = IpcStreamReader::new(mmap_bytes_r) + .with_projection(projection) + .with_columns(columns) + .with_n_rows(n_rows) + .with_row_count(row_count) + .set_rechunk(rechunk) + .finish() + .map_err(PyPolarsErr::from)?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "avro")] #[pyo3(signature = (py_f, columns, projection, n_rows))] @@ -626,6 +650,32 @@ impl PyDataFrame { Ok(()) } + #[cfg(feature = "ipc_streaming")] + pub fn write_ipc_stream( + &mut self, + py: Python, + py_f: PyObject, + compression: Wrap>, + ) -> PyResult<()> { + if let Ok(s) = py_f.extract::<&str>(py) { + py.allow_threads(|| { + let f = std::fs::File::create(s).unwrap(); + IpcStreamWriter::new(f) + .with_compression(compression.0) + .finish(&mut self.df) + .map_err(PyPolarsErr::from) + })?; + } else { + let mut buf = get_file_like(py_f, true)?; + + IpcStreamWriter::new(&mut buf) + .with_compression(compression.0) + .finish(&mut self.df) + .map_err(PyPolarsErr::from)?; + } + Ok(()) + } + #[cfg(feature = "object")] pub fn row_tuple(&self, idx: i64) -> PyResult { let idx = if idx < 0 { diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 2f8af8207e31..6a7161fd9fe2 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pandas as pd import pytest @@ -17,83 +17,108 @@ COMPRESSIONS = ["uncompressed", "lz4", "zstd"] +def read_ipc(is_stream: bool, *args: Any, **kwargs: Any) -> pl.DataFrame: + if is_stream: + return pl.read_ipc_stream(*args, **kwargs) + else: + return pl.read_ipc(*args, **kwargs) + + +def write_ipc(df: pl.DataFrame, is_stream: bool, *args: Any, **kwargs: Any) -> Any: + if is_stream: + return df.write_ipc_stream(*args, **kwargs) + else: + return 
df.write_ipc(*args, **kwargs) + + @pytest.mark.parametrize("compression", COMPRESSIONS) -def test_from_to_buffer(df: pl.DataFrame, compression: IpcCompression) -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_from_to_buffer( + df: pl.DataFrame, compression: IpcCompression, stream: bool +) -> None: # use an ad-hoc buffer (file=None) - buf1 = df.write_ipc(None, compression=compression) - read_df = pl.read_ipc(buf1, use_pyarrow=False) + buf1 = write_ipc(df, stream, None, compression=compression) + read_df = read_ipc(stream, buf1, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) # explicitly supply an existing buffer buf2 = io.BytesIO() - df.write_ipc(buf2, compression=compression) + write_ipc(df, stream, buf2, compression=compression) buf2.seek(0) - read_df = pl.read_ipc(buf2, use_pyarrow=False) + read_df = read_ipc(stream, buf2, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) @pytest.mark.parametrize("compression", COMPRESSIONS) @pytest.mark.parametrize("path_as_string", [True, False]) +@pytest.mark.parametrize("stream", [True, False]) @pytest.mark.write_disk() def test_from_to_file( df: pl.DataFrame, compression: IpcCompression, path_as_string: bool, tmp_path: Path, + stream: bool, ) -> None: tmp_path.mkdir(exist_ok=True) file_path = tmp_path / "small.ipc" if path_as_string: file_path = str(file_path) # type: ignore[assignment] - df.write_ipc(file_path, compression=compression) - df_read = pl.read_ipc(file_path, use_pyarrow=False) + write_ipc(df, stream, file_path, compression=compression) + df_read = read_ipc(stream, file_path, use_pyarrow=False) assert_frame_equal(df, df_read, categorical_as_str=True) +@pytest.mark.parametrize("stream", [True, False]) @pytest.mark.write_disk() -def test_select_columns_from_file(df: pl.DataFrame, tmp_path: Path) -> None: +def test_select_columns_from_file( + df: pl.DataFrame, tmp_path: Path, stream: bool +) -> None: tmp_path.mkdir(exist_ok=True) file_path = tmp_path / "small.ipc" - df.write_ipc(file_path) - df_read = pl.read_ipc(file_path, columns=["bools"]) + write_ipc(df, stream, file_path) + df_read = read_ipc(stream, file_path, columns=["bools"]) assert df_read.columns == ["bools"] -def test_select_columns_from_buffer() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_select_columns_from_buffer(stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) - read_df = pl.read_ipc(f, columns=["b", "c"], use_pyarrow=False) + read_df = read_ipc(stream, f, columns=["b", "c"], use_pyarrow=False) assert_frame_equal(expected, read_df) -def test_select_columns_projection() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_select_columns_projection(stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) expected = pl.DataFrame({"b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) - read_df = pl.read_ipc(f, columns=[1, 2], use_pyarrow=False) + read_df = read_ipc(stream, f, columns=[1, 2], use_pyarrow=False) assert_frame_equal(expected, read_df) @pytest.mark.parametrize("compression", COMPRESSIONS) -def test_compressed_simple(compression: IpcCompression) -> None: +@pytest.mark.parametrize("stream", [True, False]) +def 
test_compressed_simple(compression: IpcCompression, stream: bool) -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [True, False, True], "c": ["a", "b", "c"]}) f = io.BytesIO() - df.write_ipc(f, compression) + write_ipc(df, stream, f, compression) f.seek(0) - df_read = pl.read_ipc(f, use_pyarrow=False) + df_read = read_ipc(stream, f, use_pyarrow=False) assert_frame_equal(df_read, df) @@ -143,7 +168,8 @@ def test_ipc_schema_from_file( assert schema == expected -def test_ipc_column_order() -> None: +@pytest.mark.parametrize("stream", [True, False]) +def test_ipc_column_order(stream: bool) -> None: df = pl.DataFrame( { "cola": ["x", "y", "z"], @@ -152,12 +178,12 @@ def test_ipc_column_order() -> None: } ) f = io.BytesIO() - df.write_ipc(f) + write_ipc(df, stream, f) f.seek(0) columns = ["colc", "colb", "cola"] # read file into polars; the specified column order is no longer respected - assert pl.read_ipc(f, columns=columns).columns == columns + assert read_ipc(stream, f, columns=columns).columns == columns @pytest.mark.write_disk() From c6a301e3141d40c6c6febdebe0b95632ac8da2ed Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 20 Aug 2023 16:40:43 +0200 Subject: [PATCH 17/55] chore(python): Bump ruff and enable new setting (#10626) --- py-polars/polars/convert.py | 9 +-------- py-polars/polars/expr/expr.py | 4 ++-- py-polars/polars/functions/range.py | 4 +--- py-polars/polars/io/_utils.py | 11 ++--------- py-polars/polars/io/csv/batched_reader.py | 5 +---- py-polars/polars/io/csv/functions.py | 10 +--------- py-polars/polars/sql/context.py | 8 +------- py-polars/polars/testing/parametric/__init__.py | 7 +------ py-polars/polars/testing/parametric/primitives.py | 7 +------ py-polars/pyproject.toml | 3 +++ py-polars/requirements-lint.txt | 2 +- py-polars/tests/unit/io/test_database.py | 6 +----- py-polars/tests/unit/test_lazy.py | 1 - 13 files changed, 16 insertions(+), 61 deletions(-) diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 262eb10d28b0..a6fd5f8401c1 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -7,14 +7,7 @@ import polars._reexport as pl from polars import functions as F -from polars.datatypes import ( - N_INFER_DEFAULT, - Categorical, - List, - Object, - Struct, - Utf8, -) +from polars.datatypes import N_INFER_DEFAULT, Categorical, List, Object, Struct, Utf8 from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import NoDataError diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 29735133c9dc..d7bf90aeeb81 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -8827,11 +8827,11 @@ def _remap_key_or_value_series( except OverflowError as exc: if is_keys: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r}: {str(exc)}" + f"remapping keys for map_dict could not be converted to {dtype!r}: {exc!s}" ) from exc else: raise ValueError( - f"choose a more suitable output dtype for map_dict as remapping value could not be converted to {dtype!r}: {str(exc)}" + f"choose a more suitable output dtype for map_dict as remapping value could not be converted to {dtype!r}: {exc!s}" ) from exc if is_keys: diff --git a/py-polars/polars/functions/range.py b/py-polars/polars/functions/range.py index 05a051bb803c..20e6433b9b84 100644 --- a/py-polars/polars/functions/range.py +++ b/py-polars/polars/functions/range.py @@ -8,9 +8,7 @@ from polars.datatypes import Int64 from 
polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.convert import ( - _timedelta_to_pl_duration, -) +from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( deprecate_renamed_parameter, issue_deprecation_warning, diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 65033e28045d..69c21748072f 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -4,14 +4,7 @@ from contextlib import contextmanager from io import BytesIO, StringIO from pathlib import Path -from typing import ( - Any, - BinaryIO, - ContextManager, - Iterator, - TextIO, - overload, -) +from typing import Any, BinaryIO, ContextManager, Iterator, TextIO, overload from polars.dependencies import _FSSPEC_AVAILABLE, fsspec from polars.exceptions import NoDataError @@ -24,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 9bb71d39215c..87b58c055be2 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -4,10 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Sequence -from polars.datatypes import ( - N_INFER_DEFAULT, - py_type_to_dtype, -) +from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype from polars.io.csv._utils import _update_columns from polars.utils._wrap import wrap_df from polars.utils.various import ( diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 13564a3df9c9..45eac753c094 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1,15 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - BinaryIO, - Callable, - Mapping, - Sequence, - TextIO, -) +from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Mapping, Sequence, TextIO import polars._reexport as pl from polars.datatypes import N_INFER_DEFAULT, Utf8 diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index 6ef4ae458512..f4b39bb71c44 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -1,13 +1,7 @@ from __future__ import annotations import contextlib -from typing import ( - TYPE_CHECKING, - Collection, - Generic, - Mapping, - overload, -) +from typing import TYPE_CHECKING, Collection, Generic, Mapping, overload from polars.dataframe import DataFrame from polars.lazyframe import LazyFrame diff --git a/py-polars/polars/testing/parametric/__init__.py b/py-polars/polars/testing/parametric/__init__.py index 3c08421b349c..98272ba93190 100644 --- a/py-polars/polars/testing/parametric/__init__.py +++ b/py-polars/polars/testing/parametric/__init__.py @@ -3,12 +3,7 @@ from polars.dependencies import _HYPOTHESIS_AVAILABLE if _HYPOTHESIS_AVAILABLE: - from polars.testing.parametric.primitives import ( - column, - columns, - dataframes, - series, - ) + from polars.testing.parametric.primitives import column, columns, dataframes, series from polars.testing.parametric.profiles import load_profile, set_profile from polars.testing.parametric.strategies import ( all_strategies, diff --git a/py-polars/polars/testing/parametric/primitives.py 
b/py-polars/polars/testing/parametric/primitives.py index c7b0648f7e80..d91f16ae9358 100644 --- a/py-polars/polars/testing/parametric/primitives.py +++ b/py-polars/polars/testing/parametric/primitives.py @@ -8,12 +8,7 @@ from typing import TYPE_CHECKING, Any, Collection, Sequence, overload from hypothesis.errors import InvalidArgument, NonInteractiveExampleWarning -from hypothesis.strategies import ( - booleans, - composite, - lists, - sampled_from, -) +from hypothesis.strategies import booleans, composite, lists, sampled_from from hypothesis.strategies._internal.utils import defines_strategy from polars.dataframe import DataFrame diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 907694a73abb..f30c9c05f3b8 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -153,6 +153,9 @@ ignore = [ [tool.ruff.pycodestyle] max-doc-length = 88 +[tool.ruff.isort] +split-on-trailing-comma = false + [tool.ruff.flake8-tidy-imports] ban-relative-imports = "all" diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt index ddf13052477e..2adb9a5681c5 100644 --- a/py-polars/requirements-lint.txt +++ b/py-polars/requirements-lint.txt @@ -1,5 +1,5 @@ black==23.7.0 blackdoc==0.3.8 mypy==1.4.1 -ruff==0.0.278 +ruff==0.0.285 typos==1.16.1 diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index 8a3fbcfdc606..d8efec1eb5ec 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -12,11 +12,7 @@ from polars.testing import assert_frame_equal if TYPE_CHECKING: - from polars.type_aliases import ( - DbReadEngine, - DbWriteEngine, - DbWriteMode, - ) + from polars.type_aliases import DbReadEngine, DbWriteEngine, DbWriteMode @pytest.fixture() diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 980ae3caef51..fe2262ba1f80 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -529,7 +529,6 @@ def test_floor() -> None: (123.55, 0, 124.0), (123.55, 1, 123.6), (-1.23456789, 6, -1.234568), - (-1835.665, 2, -1835.67), (1.0e-5, 5, 0.00001), (1.0e-20, 20, 1e-20), (1.0e20, 2, 100000000000000000000.0), From 576b1463c0dba41e167e34946af42c3d9713f7f6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 20 Aug 2023 19:49:17 +0100 Subject: [PATCH 18/55] docs(python): give more relevant example for polars.apply (#10631) --- py-polars/polars/functions/lazy.py | 68 +++++++++++++++++------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 5972aa709cb3..32a930ff1890 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1061,39 +1061,47 @@ def apply( -------- >>> df = pl.DataFrame( ... { - ... "a": [7, 2, 3, 4], - ... "b": [2, 5, 6, 7], + ... "group": [1, 1, 2], + ... "a": [1, 3, 3], + ... "b": [5, 6, 7], ... } ... ) >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 7 ┆ 2 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 4 ┆ 7 │ - └─────┴─────┘ - - Calculate product of ``a``. - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").apply(lambda x: x * x).alias("product_a") - ... 
) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ product_a │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 7 ┆ 2 ┆ 49 │ - │ 2 ┆ 5 ┆ 4 │ - │ 3 ┆ 6 ┆ 9 │ - │ 4 ┆ 7 ┆ 16 │ - └─────┴─────┴───────────┘ + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ group ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 5 │ + │ 1 ┆ 3 ┆ 6 │ + │ 2 ┆ 3 ┆ 7 │ + └───────┴─────┴─────┘ + >>> ( + ... df.groupby("group").agg( + ... pl.apply( + ... exprs=["a", "b"], + ... function=lambda list_of_series: list_of_series[0] + ... / list_of_series[0].sum() + ... + list_of_series[1], + ... ).alias("my_custom_aggregation") + ... ) + ... ).sort("group") + shape: (2, 2) + ┌───────┬───────────────────────┐ + │ group ┆ my_custom_aggregation │ + │ --- ┆ --- │ + │ i64 ┆ list[f64] │ + ╞═══════╪═══════════════════════╡ + │ 1 ┆ [5.25, 6.75] │ + │ 2 ┆ [8.0] │ + └───────┴───────────────────────┘ + + The output for group `1` can be understood as follows: + + - group `1` contains series `'a': [1, 3]` and `'b': [4, 5]` + - applying the function to those lists of Series, one gets the output + `[1 / 4 + 5, 3 / 4 + 6]`, i.e. `[5.25, 6.75]` """ exprs = parse_as_list_of_expressions(exprs) return wrap_expr( From 115fdbaae359afd9eb15ed03ee9530194bdcf960 Mon Sep 17 00:00:00 2001 From: Zverev Konstantin Date: Sun, 20 Aug 2023 23:51:46 +0500 Subject: [PATCH 19/55] feat(python): Add `LazyFrame.collect_async` (#10616) --- py-polars/polars/__init__.py | 2 + py-polars/polars/functions/__init__.py | 2 + py-polars/polars/functions/lazy.py | 85 +++++++++++++++++++ py-polars/polars/lazyframe/frame.py | 111 +++++++++++++++++++++++++ py-polars/polars/utils/_async.py | 45 ++++++++++ py-polars/src/functions/lazy.rs | 32 +++++++ py-polars/src/lazyframe.rs | 26 ++++++ py-polars/src/lib.rs | 2 + 8 files changed, 305 insertions(+) create mode 100644 py-polars/polars/utils/_async.py diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 12557c8bb62b..01f03a68d68e 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -91,6 +91,7 @@ coalesce, col, collect_all, + collect_all_async, concat, concat_list, concat_str, @@ -306,6 +307,7 @@ "coalesce", "col", "collect_all", + "collect_all_async", "concat_list", "concat_str", "corr", diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index 44db012930a6..1c780163cdf8 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -34,6 +34,7 @@ coalesce, col, collect_all, + collect_all_async, corr, count, cov, @@ -113,6 +114,7 @@ "coalesce", "col", "collect_all", + "collect_all_async", "concat_list", "concat_str", "corr", diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 32a930ff1890..427f368d6e2d 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -11,6 +11,7 @@ Int64, is_polars_dtype, ) +from polars.utils._async import _AsyncDataFrameResult from polars.utils._parse_expr_input import ( parse_as_expression, parse_as_list_of_expressions, @@ -27,6 +28,7 @@ if TYPE_CHECKING: + from queue import Queue from typing import Collection, Literal from polars import DataFrame, Expr, LazyFrame, Series @@ -1769,6 +1771,89 @@ def collect_all( return result +def collect_all_async( + lazy_frames: Sequence[LazyFrame], + queue: Queue[list[DataFrame] | Exception], + *, + type_coercion: bool = True, + predicate_pushdown: bool = True, + projection_pushdown: bool = True, 
+ simplify_expression: bool = True, + no_optimization: bool = False, + slice_pushdown: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, + streaming: bool = False, +) -> _AsyncDataFrameResult[list[DataFrame]]: + """ + Collect multiple LazyFrames at the same time asynchronously in thread pool. + + Collects into a list of DataFrame, like :func:`polars.collect_all` + but instead of returning them directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it's not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + LazyFrame.collect_async: To collect single frame. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + """ + if no_optimization: + predicate_pushdown = False + projection_pushdown = False + slice_pushdown = False + comm_subplan_elim = False + comm_subexpr_elim = False + + prepared = [] + + for lf in lazy_frames: + ldf = lf._ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming, + ) + prepared.append(ldf) + + result = _AsyncDataFrameResult(queue) + plr.collect_all_with_callback(prepared, result._callback_all) + return result + + def select(*exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: """ Run polars expressions without a context. 
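A minimal usage sketch of the new `collect_all_async` added above, assuming the standard-library `queue.Queue` as the thread-safe result queue (the gevent/janus variants described in the docstring follow the same put/get pattern); the frames and column names here are illustrative only, not part of the patch:

import queue

import polars as pl

lf1 = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
lf2 = pl.LazyFrame({"b": [4.0, 5.0, 6.0]}).select(pl.col("b").mean())

# Collection starts in the thread pool; this call returns almost immediately.
result = pl.collect_all_async([lf1, lf2], queue.Queue())

# ... do other work while the LazyFrames are being collected ...

# Blocks on queue.get(); returns the list of DataFrames, or re-raises the
# Exception that was put in the queue if collection failed.
dfs = result.get()
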
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 7079e1ea4f56..36a54c86f713 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -50,6 +50,7 @@ from polars.lazyframe.groupby import LazyGroupBy from polars.selectors import _expand_selectors, expand_selector from polars.slice import LazyPolarsSlice +from polars.utils._async import _AsyncDataFrameResult from polars.utils._parse_expr_input import ( parse_as_expression, parse_as_list_of_expressions, @@ -75,6 +76,7 @@ if TYPE_CHECKING: import sys from io import IOBase + from queue import Queue from typing import Literal import pyarrow as pa @@ -1672,6 +1674,115 @@ def collect( ) return wrap_df(ldf.collect()) + def collect_async( + self, + queue: Queue[DataFrame | Exception], + *, + type_coercion: bool = True, + predicate_pushdown: bool = True, + projection_pushdown: bool = True, + simplify_expression: bool = True, + no_optimization: bool = False, + slice_pushdown: bool = True, + comm_subplan_elim: bool = True, + comm_subexpr_elim: bool = True, + streaming: bool = False, + ) -> _AsyncDataFrameResult[DataFrame]: + """ + Collect dataframe asynchronously in thread pool. + + Collects into a DataFrame, like :func:`collect` + but instead of returning dataframe directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it's not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + + Examples + -------- + >>> import queue + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> a = ( + ... lf.groupby("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async(queue.Queue()) + ... 
) + >>> a.get() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + """ + if no_optimization: + predicate_pushdown = False + projection_pushdown = False + slice_pushdown = False + comm_subplan_elim = False + comm_subexpr_elim = False + + if streaming: + comm_subplan_elim = False + + ldf = self._ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming, + ) + + result = _AsyncDataFrameResult(queue) + ldf.collect_with_callback(result._callback) + return result + def sink_parquet( self, path: str | Path, diff --git a/py-polars/polars/utils/_async.py b/py-polars/polars/utils/_async.py new file mode 100644 index 000000000000..d35956156b8c --- /dev/null +++ b/py-polars/polars/utils/_async.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +from polars.utils._wrap import wrap_df + +if TYPE_CHECKING: + from queue import Queue + + from polars.polars import PyDataFrame + + +T = TypeVar("T") + + +class _AsyncDataFrameResult(Generic[T]): + queue: Queue[Exception | T] + _result: Exception | T | None + + __slots__ = ("queue", "_result") + + def __init__(self, queue: Queue[Exception | T]) -> None: + self.queue = queue + self._result = None + + def get(self, **kwargs: Any) -> T: + if self._result is not None: + if isinstance(self._result, Exception): + raise self._result + return self._result + + self._result = self.queue.get(**kwargs) + if isinstance(self._result, Exception): + raise self._result + return self._result + + def _callback(self, obj: PyDataFrame | Exception) -> None: + if not isinstance(obj, Exception): + obj = wrap_df(obj) + self.queue.put_nowait(obj) + + def _callback_all(self, obj: list[PyDataFrame] | Exception) -> None: + if not isinstance(obj, Exception): + obj = [wrap_df(pydf) for pydf in obj] + self.queue.put_nowait(obj) # type: ignore[arg-type] diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index 6d94b2f42fab..fbf78e5ae957 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -104,6 +104,38 @@ pub fn collect_all(lfs: Vec, py: Python) -> PyResult, lambda: PyObject, py: Python) { + use polars_core::utils::rayon::prelude::*; + + py.allow_threads(|| { + polars_core::POOL.install(move || { + polars_core::POOL.spawn(move || { + let result = lfs + .par_iter() + .map(|lf| { + let df = lf.ldf.clone().collect()?; + Ok(PyDataFrame::new(df)) + }) + .collect::>>() + .map_err(PyPolarsErr::from); + + Python::with_gil(|py| match result { + Ok(dfs) => { + lambda.call1(py, (dfs,)).map_err(|err| err.restore(py)).ok(); + }, + Err(err) => { + lambda + .call1(py, (PyErr::from(err).to_object(py),)) + .map_err(|err| err.restore(py)) + .ok(); + }, + }) + }) + }); + }); +} + #[pyfunction] pub fn cols(names: Vec) -> PyExpr { dsl::cols(names).into() diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index 2f288a1bd3bb..6d3b77400a8a 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -444,6 +444,32 @@ impl PyLazyFrame { Ok(df.into()) } + #[pyo3(signature = (lambda,))] + fn collect_with_callback(&self, py: Python, lambda: PyObject) { + py.allow_threads(|| { + let ldf = self.ldf.clone(); + + polars_core::POOL.spawn(move || { + let result = ldf + .collect() + .map(PyDataFrame::new) 
+ .map_err(PyPolarsErr::from); + + Python::with_gil(|py| match result { + Ok(df) => { + lambda.call1(py, (df,)).map_err(|err| err.restore(py)).ok(); + }, + Err(err) => { + lambda + .call1(py, (PyErr::from(err).to_object(py),)) + .map_err(|err| err.restore(py)) + .ok(); + }, + }); + }); + }); + } + #[allow(clippy::too_many_arguments)] #[cfg(all(feature = "streaming", feature = "parquet"))] #[pyo3(signature = (path, compression, compression_level, statistics, row_group_size, data_pagesize_limit, maintain_order))] diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 35c212084317..ebe003438453 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -128,6 +128,8 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::collect_all)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::lazy::collect_all_with_callback)) + .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::cols)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::lazy::concat_lf)) From 55cb641a76af3b0dc4c28fbca07f024fecdf9a95 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 20 Aug 2023 21:53:19 +0100 Subject: [PATCH 20/55] chore(python): fix potential memory leak from usage of `inspect.currentframe` (#10630) --- py-polars/polars/utils/udfs.py | 28 +++++++++----- py-polars/polars/utils/various.py | 61 +++++++++++++++++++------------ 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/py-polars/polars/utils/udfs.py b/py-polars/polars/utils/udfs.py index a21ae86dc2c5..58a85d9a02c3 100644 --- a/py-polars/polars/utils/udfs.py +++ b/py-polars/polars/utils/udfs.py @@ -178,16 +178,26 @@ def _get_all_caller_variables() -> dict[str, Any]: # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(str(pkg_dir)): - frame = frame.f_back - n += 1 + try: + while frame: + fname = inspect.getfile(frame) + if fname.startswith(str(pkg_dir)): + frame = frame.f_back + n += 1 + else: + break + variables: dict[str, Any] + if frame is None: + variables = {} else: - break - if frame is None: - return {} - return {**frame.f_locals, **frame.f_globals} + variables = {**frame.f_locals, **frame.f_globals} + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. + del frame + return variables class BytecodeParser: diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 677defeb55fe..88f0b9c1d42f 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -368,13 +368,20 @@ def find_stacklevel() -> int: # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(str(pkg_dir)): - frame = frame.f_back - n += 1 - else: - break + try: + while frame: + fname = inspect.getfile(frame) + if fname.startswith(str(pkg_dir)): + frame = frame.f_back + n += 1 + else: + break + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. 
+ del frame return n @@ -406,22 +413,30 @@ def _get_stack_locals( examined_frames = 0 if n_frames is None: n_frames = sys.maxsize - stack_frame = getattr(inspect.currentframe(), "f_back", None) - - while stack_frame and examined_frames < n_frames: - local_items = list(stack_frame.f_locals.items()) - for nm, obj in reversed(local_items): - if ( - nm not in objects - and (named is None or (nm in named)) - and (of_type is None or isinstance(obj, of_type)) - ): - objects[nm] = obj - if n_objects is not None and len(objects) >= n_objects: - return objects - - stack_frame = stack_frame.f_back - examined_frames += 1 + stack_frame = inspect.currentframe() + stack_frame = getattr(stack_frame, "f_back", None) + + try: + while stack_frame and examined_frames < n_frames: + local_items = list(stack_frame.f_locals.items()) + for nm, obj in reversed(local_items): + if ( + nm not in objects + and (named is None or (nm in named)) + and (of_type is None or isinstance(obj, of_type)) + ): + objects[nm] = obj + if n_objects is not None and len(objects) >= n_objects: + return objects + + stack_frame = stack_frame.f_back + examined_frames += 1 + finally: + # https://docs.python.org/3/library/inspect.html + # > Though the cycle detector will catch these, destruction of the frames + # > (and local variables) can be made deterministic by removing the cycle + # > in a finally clause. + del stack_frame return objects From 15527ae585a20d70b2680820676b123dfb08523e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 21 Aug 2023 06:56:57 +0200 Subject: [PATCH 21/55] fix(rust, python): fix rename + projection pushdown (#10624) --- .../optimizer/projection_pushdown/rename.rs | 11 +++++++---- py-polars/tests/unit/test_projections.py | 7 +++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs index 7caa3aff226f..1c19036d4ae9 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs @@ -33,13 +33,15 @@ pub(super) fn process_rename( ) -> PolarsResult<()> { let mut processed = BTreeSet::new(); if swapping { + // We clone otherwise we update a data structure whilst we rename it. + let mut new_projected_names = projected_names.clone(); for (existing, new) in existing.iter().zip(new.iter()) { let has_existing = projected_names.contains(existing.as_str()); + // Only if the new column name is projected by the upper node we must update the name. 
let has_new = projected_names.contains(new.as_str()); let has_both = has_existing && has_new; - let has_any = has_existing || has_new; - if has_any { + if has_new { // swapping path // this must leave projected names intact, as we only swap if has_both { @@ -54,9 +56,9 @@ pub(super) fn process_rename( // simple new name path // this must add and remove names else { - projected_names.remove(new.as_str()); + new_projected_names.remove(new.as_str()); let name: Arc = Arc::from(existing.as_str()); - projected_names.insert(name); + new_projected_names.insert(name); iter_and_update_nodes( existing, new, @@ -67,6 +69,7 @@ pub(super) fn process_rename( } } } + *projected_names = new_projected_names; } else { for (existing, new) in existing.iter().zip(new.iter()) { if projected_names.remove(new.as_str()) { diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index b839b986a2c9..4312815eeaa6 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -313,3 +313,10 @@ def test_projection_join_names_9955() -> None: "yearID": pl.Int64, "lgID": pl.Utf8, } + + +def test_projection_rename_10595() -> None: + lf = pl.LazyFrame(schema=["a", "b"]) + assert lf.select("a", "b").rename({"b": "a", "a": "b"}).select( + "a" + ).collect().schema == {"a": pl.Float32} From 6dd34327f8ad83decb6e6af6c77107d765978c4f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 21 Aug 2023 09:31:22 +0200 Subject: [PATCH 22/55] fix(rust, python): respect 'ignore_errors=False' in csv parser (#10641) --- crates/polars-core/src/series/mod.rs | 6 +-- crates/polars-core/src/utils/series.rs | 5 ++ crates/polars-io/src/csv/buffer.rs | 36 +++++++++----- .../src/csv/read_impl/batched_mmap.rs | 2 +- .../src/csv/read_impl/batched_read.rs | 2 +- crates/polars-io/src/csv/read_impl/mod.rs | 48 ++++++++++++------- crates/polars/tests/it/io/csv.rs | 1 + py-polars/tests/unit/io/test_csv.py | 32 +++++++++++++ 8 files changed, 96 insertions(+), 36 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 1b6f114802f6..cdd010643198 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -32,7 +32,7 @@ use crate::chunked_array::Settings; use crate::prelude::unique::rank::rank; #[cfg(feature = "zip_with")] use crate::series::arithmetic::coerce_lhs_rhs; -use crate::utils::{_split_offsets, split_ca, split_series, Wrap}; +use crate::utils::{_split_offsets, get_casting_failures, split_ca, split_series, Wrap}; use crate::POOL; /// # Series @@ -790,14 +790,12 @@ impl Series { } let s = self.0.cast(dtype)?; if null_count != s.null_count() { - let failure_mask = !self.is_null() & s.is_null(); - let failures = self.filter_threaded(&failure_mask, false)?.unique()?; + let failures = get_casting_failures(self, &s)?; polars_bail!( ComputeError: "strict conversion from `{}` to `{}` failed for column: {}, value(s) {}; \ if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`", self.dtype(), dtype, s.name(), failures.fmt_list(), - ); } else { Ok(s) diff --git a/crates/polars-core/src/utils/series.rs b/crates/polars-core/src/utils/series.rs index 2f41790e953c..b6c87b2cff33 100644 --- a/crates/polars-core/src/utils/series.rs +++ b/crates/polars-core/src/utils/series.rs @@ -39,3 +39,8 @@ pub fn ensure_sorted_arg(s: &Series, operation: &str) -> PolarsResult<()> { ", operation); Ok(()) } + +pub fn get_casting_failures(input: &Series, output: &Series) -> PolarsResult { + let 
failure_mask = !input.is_null() & output.is_null(); + input.filter_threaded(&failure_mask, false)?.unique() +} diff --git a/crates/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs index 5d8317be27ad..6eeb98e41e39 100644 --- a/crates/polars-io/src/csv/buffer.rs +++ b/crates/polars-io/src/csv/buffer.rs @@ -422,14 +422,18 @@ where // Safety: // we just checked it is ascii unsafe { std::str::from_utf8_unchecked(bytes) } - } else if ignore_errors { - buf.builder.append_null(); - return Ok(()); - } else if !ignore_errors && std::str::from_utf8(bytes).is_err() { - polars_bail!(ComputeError: "invalid utf-8 sequence"); } else { - buf.builder.append_null(); - return Ok(()); + match std::str::from_utf8(bytes) { + Ok(val) => val, + Err(_) => { + if ignore_errors { + buf.builder.append_null(); + return Ok(()); + } else { + polars_bail!(ComputeError: "invalid utf-8 sequence"); + } + }, + } }; let pattern = match &buf.compiled { @@ -437,8 +441,12 @@ where None => match infer_pattern_single(val) { Some(pattern) => pattern, None => { - buf.builder.append_null(); - return Ok(()); + if ignore_errors { + buf.builder.append_null(); + return Ok(()); + } else { + polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for {}", val) + } }, }, }; @@ -449,9 +457,13 @@ where buf.builder.append_option(parsed); Ok(()) }, - Err(_) => { - buf.builder.append_null(); - Ok(()) + Err(err) => { + if ignore_errors { + buf.builder.append_null(); + Ok(()) + } else { + Err(err) + } }, } } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index b69fb10e4700..20f6f96018fb 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -249,7 +249,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { self.starting_point_offset, )?; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_count { diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 1152cbec2525..2c8a74a23969 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -346,7 +346,7 @@ impl<'a> BatchedCsvReaderRead<'a> { self.starting_point_offset, )?; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; update_string_stats(&self.str_capacities, &self.str_columns, &df)?; if let Some(rc) = &self.row_count { diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index a95bacb6fc3f..62aa3578aabf 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -11,7 +11,7 @@ pub use batched_read::*; use polars_arrow::array::*; use polars_core::config::verbose; use polars_core::prelude::*; -use polars_core::utils::accumulate_dataframes_vertical; +use polars_core::utils::{accumulate_dataframes_vertical, get_casting_failures}; use polars_core::POOL; #[cfg(feature = "polars-time")] use polars_time::prelude::*; @@ -32,21 +32,33 @@ pub(crate) fn cast_columns( df: &mut DataFrame, to_cast: &[Field], parallel: bool, + ignore_errors: bool, ) -> PolarsResult<()> { - let cast_fn = |s: &Series, fld: &Field| match (s.dtype(), fld.data_type()) { - #[cfg(feature = "temporal")] - (DataType::Utf8, DataType::Date) => s - 
.utf8() - .unwrap() - .as_date(None, false) - .map(|ca| ca.into_series()), - #[cfg(feature = "temporal")] - (DataType::Utf8, DataType::Datetime(tu, _)) => s - .utf8() - .unwrap() - .as_datetime(None, *tu, false, false, None, None) - .map(|ca| ca.into_series()), - (_, dt) => s.cast(dt), + let cast_fn = |s: &Series, fld: &Field| { + let out = match (s.dtype(), fld.data_type()) { + #[cfg(feature = "temporal")] + (DataType::Utf8, DataType::Date) => s + .utf8() + .unwrap() + .as_date(None, false) + .map(|ca| ca.into_series()), + #[cfg(feature = "temporal")] + (DataType::Utf8, DataType::Datetime(tu, _)) => s + .utf8() + .unwrap() + .as_datetime(None, *tu, false, false, None, None) + .map(|ca| ca.into_series()), + (_, dt) => s.cast(dt), + }?; + if !ignore_errors && s.null_count() != out.null_count() { + let failures = get_casting_failures(s, &out)?; + polars_bail!( + ComputeError: + "parsing to `{}` failed for column: {}, value(s) {};", + fld.data_type(), s.name(), failures.fmt_list(), + ) + } + Ok(out) }; if parallel { @@ -618,7 +630,7 @@ impl<'a> CoreReader<'a> { local_df.with_row_count_mut(&rc.name, Some(rc.offset)); }; - cast_columns(&mut local_df, &self.to_cast, false)?; + cast_columns(&mut local_df, &self.to_cast, false, self.ignore_errors)?; let s = predicate.evaluate(&local_df)?; let mask = s.bool()?; local_df = local_df.filter(mask)?; @@ -681,7 +693,7 @@ impl<'a> CoreReader<'a> { update_string_stats(&str_capacities, &str_columns, &df)?; } - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_count { df.with_row_count_mut(&rc.name, Some(rc.offset)); } @@ -731,7 +743,7 @@ impl<'a> CoreReader<'a> { ) }; - cast_columns(&mut df, &self.to_cast, false)?; + cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_count { df.with_row_count_mut(&rc.name, Some(rc.offset)); } diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 61abad8a03fe..74a66e320640 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -442,6 +442,7 @@ AUDCAD,1616455921,0.96212,0.95666,1 "b", DataType::Datetime(TimeUnit::Nanoseconds, None), )])))) + .with_ignore_errors(true) .finish()?; assert_eq!( diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 060cb317be99..ca6574a31061 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1440,3 +1440,35 @@ def test_csv_quote_styles() -> None: df.write_csv(quote_style="non_numeric", quote="8") == '8float8,8string8,8int8,8bool8\n1.0,8a8,1,8true8\n2.0,8abc8,2,8false8\n,8"hello8,3,\n' ) + + +def test_ignore_errors_casting_dtypes() -> None: + csv = """inventory + 10 + + 400 + 90 + """ + + assert pl.read_csv( + source=io.StringIO(csv), + dtypes={"inventory": pl.Int8}, + ignore_errors=True, + ).to_dict(False) == {"inventory": [10, None, None, 90]} + + with pytest.raises(pl.ComputeError): + pl.read_csv( + source=io.StringIO(csv), + dtypes={"inventory": pl.Int8}, + ignore_errors=False, + ) + + +def test_ignore_errors_date_parser() -> None: + data_invalid_date = "int,float,date\n3,3.4,X" + with pytest.raises(pl.ComputeError): + pl.read_csv( + source=io.StringIO(data_invalid_date), + dtypes={"date": pl.Date}, + ignore_errors=False, + ) From 6ac24505a8e0a6ce7bdddfde22f921a3c7894112 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Mon, 21 Aug 2023 11:39:41 +0400 Subject: [PATCH 23/55] feat(python): support selector usage 
in `write_excel` arguments (#10589) --- py-polars/polars/dataframe/frame.py | 35 +++++++++++++---------- py-polars/polars/io/excel/_write_utils.py | 33 ++++++++++++++++----- py-polars/polars/selectors.py | 24 ++++++++++++++++ py-polars/polars/type_aliases.py | 10 ++++++- py-polars/tests/unit/io/test_excel.py | 7 +++-- 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 6e8794d791c6..817c49079313 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -66,7 +66,7 @@ _xl_unique_table_name, _XLFormatCache, ) -from polars.selectors import _expand_selectors +from polars.selectors import _expand_selector_dicts, _expand_selectors from polars.slice import PolarsSlice from polars.utils._construction import ( _post_apply_columns, @@ -120,8 +120,10 @@ AsofJoinStrategy, AvroCompression, ClosedInterval, + ColumnFormatDict, ColumnNameOrSelector, ColumnTotalsDefinition, + ColumnWidthsDefinition, ComparisonOperator, ConditionalFormatDict, CsvEncoding, @@ -2626,12 +2628,12 @@ def write_excel( position: tuple[int, int] | str = "A1", table_style: str | dict[str, Any] | None = None, table_name: str | None = None, - column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = None, + column_formats: ColumnFormatDict | None = None, dtype_formats: dict[OneOrMoreDataTypes, str] | None = None, conditional_formats: ConditionalFormatDict | None = None, header_format: dict[str, Any] | None = None, column_totals: ColumnTotalsDefinition | None = None, - column_widths: dict[str | tuple[str, ...], int] | int | None = None, + column_widths: ColumnWidthsDefinition | None = None, row_totals: RowTotalsDefinition | None = None, row_heights: dict[int | tuple[int, ...], int] | int | None = None, sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None, @@ -2674,9 +2676,9 @@ def write_excel( Name of the output table object in the worksheet; can then be referred to in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. column_formats : dict - A ``{colname:str,}`` dictionary for applying an Excel format string to the - given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) - will override any defined in ``dtype_formats`` (below). + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. dtype_formats : dict A ``{dtype:str,}`` dictionary that sets the default Excel format for the given dtype. (This can be overridden on a per-column basis by the @@ -2684,8 +2686,8 @@ def write_excel( ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform integer and float formats. conditional_formats : dict - A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` - dictionary defining conditional format options for the specified columns. + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. * If supplying a string typename, should be one of the valid ``xlsxwriter`` types such as "3_color_scale", "data_bar", etc. @@ -2711,9 +2713,9 @@ def write_excel( Valid total function names are "average", "count_nums", "count", "max", "min", "std_dev", "sum", and "var". 
column_widths : {dict, int} - A ``{colname:int,}`` dict or single integer that sets (or overrides if - autofitting) table column widths in integer pixel units. If given as an - integer the same value is used for all table columns. + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. row_totals : {dict, bool} Add a row-total column to the right-hand side of the exported table. @@ -2754,7 +2756,7 @@ def write_excel( "formula" (mandatory), one of "insert_before" or "insert_after", and optionally "return_dtype". The latter is used to appropriately format the output of the formula and allow it to participate in row/column totals. - float_precision : {dict, int} + float_precision : int Default number of decimals displayed for floating point columns (note that this is purely a formatting directive; the actual values are not rounded). has_header : bool @@ -2764,7 +2766,7 @@ def write_excel( autofit : bool Calculate individual column widths from the data. hidden_columns : list - A list of table columns to hide in the worksheet. + A list or selector representing table columns to hide in the worksheet. hide_gridlines : bool Do not display any gridlines on the output worksheet. sheet_zoom : int @@ -2784,7 +2786,6 @@ def write_excel( scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - Notes ----- * A list of compatible ``xlsxwriter`` format property names can be found here: @@ -3062,9 +3063,13 @@ def write_excel( ) # additional column-level properties - hidden_columns = hidden_columns or () + hidden_columns = _expand_selectors(df, hidden_columns or ()) if isinstance(column_widths, int): column_widths = {column: column_widths for column in df.columns} + else: + column_widths = _expand_selector_dicts( # type: ignore[assignment] + df, column_widths, expand_keys=True, expand_values=False + ) column_widths = _unpack_multi_column_dict(column_widths or {}) # type: ignore[assignment] for column in df.columns: diff --git a/py-polars/polars/io/excel/_write_utils.py b/py-polars/polars/io/excel/_write_utils.py index 4bf5c022b945..89206deecb78 100644 --- a/py-polars/polars/io/excel/_write_utils.py +++ b/py-polars/polars/io/excel/_write_utils.py @@ -19,6 +19,7 @@ ) from polars.dependencies import json from polars.exceptions import DuplicateError +from polars.selectors import _expand_selector_dicts, _expand_selectors if TYPE_CHECKING: from typing import Literal @@ -29,6 +30,7 @@ from polars import DataFrame, Series from polars.type_aliases import ( + ColumnFormatDict, ColumnTotalsDefinition, ConditionalFormatDict, OneOrMoreDataTypes, @@ -113,7 +115,9 @@ def _xl_apply_conditional_formats( """Take all conditional formatting options and apply them to the table/range.""" from xlsxwriter.format import Format - for cols, formats in conditional_formats.items(): + for cols, formats in _expand_selector_dicts( + df, conditional_formats, expand_keys=True, expand_values=False, tuple_keys=True + ).items(): if not isinstance(cols, str) and len(cols) == 1: cols = next(iter(cols)) if isinstance(formats, (str, dict)): @@ -305,7 +309,7 @@ def _xl_setup_table_columns( df: DataFrame, format_cache: _XLFormatCache, column_totals: ColumnTotalsDefinition | None = None, - column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = None, + column_formats: 
ColumnFormatDict | None = None, dtype_formats: dict[OneOrMoreDataTypes, str] | None = None, header_format: dict[str, Any] | None = None, sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None, @@ -327,8 +331,16 @@ def _map_str(s: Series) -> Series: if cast_cols: df = df.with_columns(cast_cols) - column_totals = _unpack_multi_column_dict(column_totals) # type: ignore[assignment] - column_formats = _unpack_multi_column_dict(column_formats) # type: ignore[assignment] + column_totals = _unpack_multi_column_dict( # type: ignore[assignment] + _expand_selector_dicts(df, column_totals, expand_keys=True, expand_values=False) + if isinstance(column_totals, dict) + else _expand_selectors(df, column_totals) + ) + column_formats = _unpack_multi_column_dict( # type: ignore[assignment] + _expand_selector_dicts( + df, column_formats, expand_keys=True, expand_values=False, tuple_keys=True + ) + ) # normalise column totals column_total_funcs = ( @@ -348,12 +360,19 @@ def _map_str(s: Series) -> Series: sum_cols = ( numeric_cols if row_totals is True - else ({row_totals} if isinstance(row_totals, str) else set(row_totals)) + else ( + {row_totals} + if isinstance(row_totals, str) + else set(_expand_selectors(df, row_totals)) + ) ) n_ucase = sum((c[0] if c else "").isupper() for c in df.columns) total = f"{'T' if (n_ucase > len(df.columns) // 2) else 't'}otal" row_total_funcs = {total: _xl_table_formula(df, sum_cols, "sum")} else: + row_totals = _expand_selector_dicts( + df, row_totals, expand_keys=False, expand_values=True + ) row_total_funcs = { name: _xl_table_formula( df, numeric_cols if cols is True else cols, "sum" @@ -368,8 +387,8 @@ def _map_str(s: Series) -> Series: } # normalise formats - column_formats = (column_formats or {}).copy() - dtype_formats = (dtype_formats or {}).copy() + column_formats = dict(column_formats or {}) + dtype_formats = dict(dtype_formats or {}) for tp in list(dtype_formats): if isinstance(tp, (tuple, frozenset)): diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 2c00caeb841b..477f4196e240 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -173,6 +173,30 @@ def _expand_selectors( return expanded +def _expand_selector_dicts( + df: DataFrame, + d: Mapping[Any, Any] | None, + expand_keys: bool, + expand_values: bool, + tuple_keys: bool = False, +) -> dict[str, Any]: + """Expand dict key/value selectors into their underlying column names.""" + expanded = {} + for key, value in (d or {}).items(): + if expand_values and is_selector(value): + expanded[key] = expand_selector(df, selector=value) + value = expanded[key] + if expand_keys and is_selector(key): + cols = expand_selector(df, selector=key) + if tuple_keys: + expanded[cols] = value + else: + expanded.update({c: value for c in cols}) + else: + expanded[key] = value + return expanded + + class _selector_proxy_(Expr): """Base column selector expression/proxy.""" diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index ff907a68ce90..14597c0c6bb7 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -161,9 +161,14 @@ ] # Excel IO +ColumnFormatDict: TypeAlias = Mapping[ + # dict of colname(s) or selector(s) to format string or dict + Union[ColumnNameOrSelector, Tuple[ColumnNameOrSelector, ...]], + Union[str, Mapping[str, str]], +] ConditionalFormatDict: TypeAlias = Mapping[ # dict of colname(s) to str, dict, or sequence of str/dict - Union[str, Collection[str]], + Union[ColumnNameOrSelector, 
Collection[str]], Union[str, Union[Mapping[str, Any], Sequence[Union[str, Mapping[str, Any]]]]], ] ColumnTotalsDefinition: TypeAlias = Union[ @@ -172,6 +177,9 @@ Sequence[str], bool, ] +ColumnWidthsDefinition: TypeAlias = Union[ + Mapping[ColumnNameOrSelector, Union[Tuple[str, ...], int]], int +] RowTotalsDefinition: TypeAlias = Union[ # dict of colname to str(s), a collection of str, or a boolean Mapping[str, Union[str, Collection[str]]], diff --git a/py-polars/tests/unit/io/test_excel.py b/py-polars/tests/unit/io/test_excel.py index 4458c4cf812e..77a654610525 100644 --- a/py-polars/tests/unit/io/test_excel.py +++ b/py-polars/tests/unit/io/test_excel.py @@ -7,6 +7,7 @@ import pytest import polars as pl +import polars.selectors as cs from polars.exceptions import NoDataError from polars.testing import assert_frame_equal @@ -219,7 +220,7 @@ def test_excel_sparklines() -> None: worksheet="frame_data", table_style="Table Style Light 2", dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - column_formats={("h1", "h2"): "#,##0_);(#,##0)"}, + column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"}, sparklines={ "trend": ["q1", "q2", "q3", "q4"], "+/-": { @@ -229,13 +230,13 @@ def test_excel_sparklines() -> None: }, }, conditional_formats={ - ("q1", "q2", "q3", "q4", "h1", "h2"): { + cs.starts_with("q", "h"): { "type": "2_color_scale", "min_color": "#95b3d7", "max_color": "#ffffff", } }, - column_widths={("q1", "q2", "q3", "q4", "h1", "h2"): 40}, + column_widths={cs.starts_with("q", "h"): 40}, row_totals={ "h1": ("q1", "q2"), "h2": ("q3", "q4"), From aa10faab9ef031f3da60b97810487d46abf40abf Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 10:21:01 +0200 Subject: [PATCH 24/55] feat(python)!: Remove deprecated behavior from vertical aggregations (#10602) --- .../polars/functions/aggregation/vertical.py | 323 +++--------------- .../functions/aggregation/test_vertical.py | 34 +- .../tests/unit/functions/test_functions.py | 39 ++- 3 files changed, 68 insertions(+), 328 deletions(-) diff --git a/py-polars/polars/functions/aggregation/vertical.py b/py-polars/polars/functions/aggregation/vertical.py index ef1234b2bc26..952ff10e4352 100644 --- a/py-polars/polars/functions/aggregation/vertical.py +++ b/py-polars/polars/functions/aggregation/vertical.py @@ -1,59 +1,24 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Iterable, overload +from typing import TYPE_CHECKING -import polars._reexport as pl import polars.functions as F -from polars.utils.deprecation import ( - deprecate_renamed_parameter, - issue_deprecation_warning, -) if TYPE_CHECKING: - from polars import Expr, Series - from polars.type_aliases import IntoExpr, PythonLiteral + from polars import Expr -@overload -def all(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] - ... - - -@overload -def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = ..., - *more_exprs: IntoExpr, - ignore_nulls: bool = ..., -) -> Expr: - ... - - -@deprecate_renamed_parameter("columns", "exprs", version="0.18.7") -def all( - exprs: IntoExpr | Iterable[IntoExpr] | None = None, - *more_exprs: IntoExpr, - ignore_nulls: bool = True, -) -> Expr | bool | None: +def all(*names: str, ignore_nulls: bool = True) -> Expr: """ Either return an expression representing all columns, or evaluate a bitwise AND operation. - If no arguments are passed, this is an alias for ``pl.col("*")``. - If a single string is passed, this is an alias for ``pl.col(name).any()``. 
- - If a single Series is passed, this is an alias for ``Series.any()``. - **This functionality is deprecated**. - - Otherwise, this function computes the bitwise AND horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.all_horizontal`` instead. + If no arguments are passed, this function is syntactic sugar for ``col("*")``. + Otherwise, this function is syntactic sugar for ``col(names).all()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. ignore_nulls Ignore null values (default). @@ -87,7 +52,7 @@ def all( │ 2 ┆ 0 │ └─────┴─────┘ - Evaluate bitwise AND for a column: + Evaluate bitwise AND for a column. >>> df.select(pl.all("a")) shape: (1, 1) @@ -100,53 +65,17 @@ def all( └───────┘ """ # noqa: W505 - if not more_exprs: - if exprs is None: - return F.col("*") - elif isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `all` is deprecated. Use `Series.all()` instead.", - version="0.18.7", - ) - return exprs.all(ignore_nulls=ignore_nulls) - elif isinstance(exprs, str): - return F.col(exprs).all(ignore_nulls=ignore_nulls) - - _warn_for_deprecated_horizontal_use("all") - return F.all_horizontal(exprs, *more_exprs) - - -@overload -def any(exprs: Series, *, ignore_nulls: bool = ...) -> bool | None: # type: ignore[misc] - ... - - -@overload -def any( - exprs: IntoExpr | Iterable[IntoExpr], - *more_exprs: IntoExpr, - ignore_nulls: bool = ..., -) -> Expr: - ... - - -@deprecate_renamed_parameter("columns", "exprs", version="0.18.7") -def any( - exprs: IntoExpr | Iterable[IntoExpr], - *more_exprs: IntoExpr, - ignore_nulls: bool = True, -) -> Expr | bool | None: - """ - Evaluate a bitwise OR operation. + if not names: + return F.col("*") - If a single string is passed, this is an alias for ``pl.col(name).any()``. + return F.col(*names).all(ignore_nulls=ignore_nulls) - If a single Series is passed, this is an alias for ``Series.any()``. - **This functionality is deprecated**. - Otherwise, this function computes the bitwise OR horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.any_horizontal`` instead. +def any(*names: str, ignore_nulls: bool = True) -> Expr | bool | None: + """ + Evaluate a bitwise OR operation. + + Syntactic sugar for ``col(names).any()``. See Also -------- @@ -154,11 +83,8 @@ def any( Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. ignore_nulls Ignore null values (default). @@ -187,50 +113,19 @@ def any( └──────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `any` is deprecated. 
Use `Series.any()` instead.", - version="0.18.7", - ) - return exprs.any(ignore_nulls=ignore_nulls) - elif isinstance(exprs, str): - return F.col(exprs).any(ignore_nulls=ignore_nulls) - - _warn_for_deprecated_horizontal_use("any") - return F.any_horizontal(exprs, *more_exprs) - - -@overload -def max(exprs: Series) -> PythonLiteral | None: # type: ignore[misc] - ... - + return F.col(*names).any(ignore_nulls=ignore_nulls) -@overload -def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - -def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | Any: +def max(*names: str) -> Expr: """ Get the maximum value. - If a single string is passed, this is an alias for ``pl.col(name).max()``. - - If a single Series is passed, this is an alias for ``Series.max()``. - **This functionality is deprecated**. - - Otherwise, this function computes the maximum value horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.max_horizontal`` instead. + Syntactic sugar for ``col(names).max()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -238,7 +133,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A Examples -------- - Get the maximum value of a column by passing a single column name. + Get the maximum value of a column. >>> df = pl.DataFrame( ... { @@ -257,8 +152,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A │ 8 │ └─────┘ - Get column-wise maximums for multiple columns by passing a regular expression, - or call ``.max()`` on a multi-column expression instead. + Get the maximum value of multiple columns. >>> df.select(pl.max("^a|b$")) shape: (1, 2) @@ -269,7 +163,7 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A ╞═════╪═════╡ │ 8 ┆ 5 │ └─────┴─────┘ - >>> df.select(pl.col("a", "b").max()) + >>> df.select(pl.max("a", "b")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -280,52 +174,19 @@ def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | A └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `max` is deprecated. Use `Series.max()` instead.", - version="0.18.7", - ) - return exprs.max() - elif isinstance(exprs, str): - return F.col(exprs).max() - - _warn_for_deprecated_horizontal_use("max") - return F.max_horizontal(exprs, *more_exprs) - + return F.col(*names).max() -@overload -def min(exprs: Series) -> PythonLiteral | None: # type: ignore[misc] - ... - -@overload -def min(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - - -def min( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | PythonLiteral | None: +def min(*names: str) -> Expr: """ Get the minimum value. - If a single string is passed, this is an alias for ``pl.col(name).min()``. - - If a single Series is passed, this is an alias for ``Series.min()``. - **This functionality is deprecated**. - - Otherwise, this function computes the minimum value horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.min_horizontal`` instead. + Syntactic sugar for ``col(names).min()``. 
Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -333,7 +194,7 @@ def min( Examples -------- - Get the minimum value of a column by passing a single column name. + Get the minimum value of a column. >>> df = pl.DataFrame( ... { @@ -352,8 +213,7 @@ def min( │ 1 │ └─────┘ - Get column-wise minimums for multiple columns by passing a regular expression, - or call ``.min()`` on a multi-column expression instead. + Get the minimum value of multiple columns. >>> df.select(pl.min("^a|b$")) shape: (1, 2) @@ -364,7 +224,7 @@ def min( ╞═════╪═════╡ │ 1 ┆ 2 │ └─────┴─────┘ - >>> df.select(pl.col("a", "b").min()) + >>> df.select(pl.min("a", "b")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -375,52 +235,19 @@ def min( └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `min` is deprecated. Use `Series.min()` instead.", - version="0.18.7", - ) - return exprs.min() - elif isinstance(exprs, str): - return F.col(exprs).min() - - _warn_for_deprecated_horizontal_use("min") - return F.min_horizontal(exprs, *more_exprs) - - -@overload -def sum(exprs: Series) -> int | float: # type: ignore[misc] - ... - - -@overload -def sum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... + return F.col(*names).min() -@deprecate_renamed_parameter("column", "exprs", version="0.18.7") -def sum( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | int | float: +def sum(*names: str) -> Expr: """ Sum all values. - If a single string is passed, this is an alias for ``pl.col(name).sum()``. - - If a single Series is passed, this is an alias for ``Series.sum()``. - **This functionality is deprecated**. - - Otherwise, this function computes the sum horizontally across multiple columns. - **This functionality is deprecated**, use ``pl.sum_horizontal`` instead. + Syntactic sugar for ``col(name).sum()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -428,7 +255,7 @@ def sum( Examples -------- - Sum a column by name: + Sum a column. >>> df = pl.DataFrame( ... { @@ -447,10 +274,9 @@ def sum( │ 3 │ └─────┘ - To aggregate the sums for more than one column/expression use ``pl.col(list).sum()`` - or a regular expression selector like ``pl.sum(regex)``: + Sum multiple columns. - >>> df.select(pl.col("a", "c").sum()) + >>> df.select(pl.sum("a", "c")) shape: (1, 2) ┌─────┬─────┐ │ a ┆ c │ @@ -459,7 +285,6 @@ def sum( ╞═════╪═════╡ │ 3 ┆ 11 │ └─────┴─────┘ - >>> df.select(pl.sum("^.*[bc]$")) shape: (1, 2) ┌─────┬─────┐ @@ -471,53 +296,19 @@ def sum( └─────┴─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `sum` is deprecated. 
Use `Series.sum()` instead.", - version="0.18.7", - ) - return exprs.sum() - elif isinstance(exprs, str): - return F.col(exprs).sum() - - _warn_for_deprecated_horizontal_use("sum") - return F.sum_horizontal(exprs, *more_exprs) - + return F.col(*names).sum() -@overload -def cumsum(exprs: Series) -> Series: # type: ignore[misc] - ... - -@overload -def cumsum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: - ... - - -@deprecate_renamed_parameter("column", "exprs", version="0.18.7") -def cumsum( - exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr -) -> Expr | Series: +def cumsum(*names: str) -> Expr: """ Cumulatively sum all values. - If a single string is passed, this is an alias for ``pl.col(name).cumsum()``. - - If a single Series is passed, this is an alias for ``Series.cumsum()``. - **This functionality is deprecated**. - - Otherwise, this function computes the cumulative sum horizontally across multiple - columns. - **This functionality is deprecated**, use ``pl.cumsum_horizontal`` instead. + Syntactic sugar for ``col(names).cumsum()``. Parameters ---------- - exprs - Column(s) to use in the aggregation. Accepts expression input. Strings are - parsed as column names, other non-expression inputs are parsed as literals. - *more_exprs - Additional columns to use in the aggregation, specified as positional arguments. + *names + Name(s) of the columns to use in the aggregation. See Also -------- @@ -544,22 +335,4 @@ def cumsum( └─────┘ """ - if not more_exprs: - if isinstance(exprs, pl.Series): - issue_deprecation_warning( - "passing a Series to `cumsum` is deprecated. Use `Series.cumsum()` instead.", - version="0.18.7", - ) - return exprs.cumsum() - elif isinstance(exprs, str): - return F.col(exprs).cumsum() - - _warn_for_deprecated_horizontal_use("cumsum") - return F.cumsum_horizontal(exprs, *more_exprs) - - -def _warn_for_deprecated_horizontal_use(name: str) -> None: - issue_deprecation_warning( - f"using `{name}` for horizontal computation is deprecated. Use `{name}_horizontal` instead.", - version="0.18.7", - ) + return F.col(*names).cumsum() diff --git a/py-polars/tests/unit/functions/aggregation/test_vertical.py b/py-polars/tests/unit/functions/aggregation/test_vertical.py index 57a45fd14a96..1651e3375e72 100644 --- a/py-polars/tests/unit/functions/aggregation/test_vertical.py +++ b/py-polars/tests/unit/functions/aggregation/test_vertical.py @@ -3,7 +3,7 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal, assert_series_equal +from polars.testing import assert_frame_equal def assert_expr_equal( @@ -55,35 +55,3 @@ def test_alias_for_col_agg(function: str, input: str) -> None: expected = getattr(pl.col(input), function)() # e.g. pl.col(input).min() context = pl.DataFrame({"a": [1, 4], "b": [3, 2]}) assert_expr_equal(result, expected, context) - - -@pytest.mark.parametrize("function", ["all", "any"]) -def test_deprecated_alias_for_series_agg_bool(function: str) -> None: - s = pl.Series([True, True, False]) - with pytest.deprecated_call(): - result = getattr(pl, function)(s) # e.g. pl.all(s) - expected = getattr(s, function)() # e.g. s.all() - assert result == expected - - -@pytest.mark.parametrize("function", ["min", "max", "sum"]) -def test_deprecated_alias_for_series_agg_numeric(function: str) -> None: - s = pl.Series([1, 2, 3]) - with pytest.deprecated_call(): - result = getattr(pl, function)(s) # e.g. pl.max(s) - expected = getattr(s, function)() # e.g. 
s.max() - assert result == expected - - -def test_deprecated_alias_for_series_agg_cumsum() -> None: - s = pl.Series([1, 2, 3]) - with pytest.deprecated_call(): - result = pl.cumsum(s) - expected = s.cumsum() - assert_series_equal(result, expected) - - -@pytest.mark.parametrize("function", ["all", "any", "min", "max", "sum", "cumsum"]) -def test_deprecated_horizontal(function: str) -> None: - with pytest.deprecated_call(): - getattr(pl, function)(pl.col("a")) # e.g. pl.all(pl.col("a")) diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py index 01ad444049a1..af79b585a0d4 100644 --- a/py-polars/tests/unit/functions/test_functions.py +++ b/py-polars/tests/unit/functions/test_functions.py @@ -373,44 +373,43 @@ def test_lazy_functions() -> None: ) expected = 1.0 assert np.isclose(out.to_series(0), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.var(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].var(), expected) # type: ignore[arg-type] + expected = 1.0 assert np.isclose(out.to_series(1), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.std(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].std(), expected) # type: ignore[arg-type] + expected = 3 assert np.isclose(out.to_series(2), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.max(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].max(), expected) # type: ignore[arg-type] + expected = 1 assert np.isclose(out.to_series(3), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.min(df["b"]), expected) # type: ignore[arg-type] + assert np.isclose(df["b"].min(), expected) # type: ignore[arg-type] + expected = 6 assert np.isclose(out.to_series(4), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.sum(df["b"]), expected) + assert np.isclose(df["b"].sum(), expected) + expected = 2 assert np.isclose(out.to_series(5), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.mean(df["b"]), expected) + assert np.isclose(df["b"].mean(), expected) # type: ignore[arg-type] + expected = 2 assert np.isclose(out.to_series(6), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.median(df["b"]), expected) + assert np.isclose(df["b"].median(), expected) # type: ignore[arg-type] + expected = 3 assert np.isclose(out.to_series(7), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.n_unique(df["b"]), expected) + assert np.isclose(df["b"].n_unique(), expected) + expected = 1 assert np.isclose(out.to_series(8), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.first(df["b"]), expected) + assert np.isclose(df["b"][0], expected) + expected = 3 assert np.isclose(out.to_series(9), expected) - with pytest.deprecated_call(): - assert np.isclose(pl.last(df["b"]), expected) + assert np.isclose(df["b"][-1], expected) # regex selection out = df.select( From cdf83247ef279ed90b270cb8903e07198646c452 Mon Sep 17 00:00:00 2001 From: Julian Date: Mon, 21 Aug 2023 11:39:20 +0200 Subject: [PATCH 25/55] feat(python, rust): preserve whitespace in notebook output (#10644) --- py-polars/polars/dataframe/_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py-polars/polars/dataframe/_html.py b/py-polars/polars/dataframe/_html.py index f144f9fa2c30..1d432b8b161a 100644 --- a/py-polars/polars/dataframe/_html.py +++ b/py-polars/polars/dataframe/_html.py @@ -161,6 +161,7 @@ def write_style(self) -> None: .dataframe > thead > tr > 
th, .dataframe > tbody > tr > td { text-align: right; + white-space: pre; } """ From 6f50321b68820238dd1e7c2384048f6519c3e0ad Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 11:59:53 +0200 Subject: [PATCH 26/55] feat(python)!: Update a lot of error types (#10637) --- py-polars/polars/convert.py | 4 +- py-polars/polars/dataframe/frame.py | 94 ++++++++++--------- py-polars/polars/datatypes/constructor.py | 2 +- py-polars/polars/datatypes/convert.py | 4 +- py-polars/polars/dependencies.py | 2 +- py-polars/polars/expr/datetime.py | 2 +- py-polars/polars/expr/expr.py | 34 +++---- py-polars/polars/expr/string.py | 2 +- py-polars/polars/expr/struct.py | 2 +- py-polars/polars/functions/eager.py | 8 +- py-polars/polars/functions/lazy.py | 9 +- py-polars/polars/io/_utils.py | 4 +- py-polars/polars/io/csv/batched_reader.py | 2 +- py-polars/polars/io/csv/functions.py | 12 +-- py-polars/polars/io/database.py | 20 ++-- py-polars/polars/io/delta.py | 5 +- py-polars/polars/io/excel/_write_utils.py | 2 +- py-polars/polars/io/excel/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 4 +- py-polars/polars/io/parquet/functions.py | 5 +- py-polars/polars/lazyframe/frame.py | 16 ++-- py-polars/polars/lazyframe/groupby.py | 2 +- py-polars/polars/series/series.py | 40 ++++---- py-polars/polars/series/struct.py | 2 +- .../polars/testing/parametric/strategies.py | 2 +- py-polars/polars/utils/various.py | 5 +- py-polars/tests/unit/dataframe/test_df.py | 18 ++-- py-polars/tests/unit/io/test_database.py | 6 +- py-polars/tests/unit/io/test_excel.py | 2 +- py-polars/tests/unit/operations/test_join.py | 8 +- py-polars/tests/unit/series/test_series.py | 58 +++++++----- py-polars/tests/unit/test_errors.py | 10 +- py-polars/tests/unit/test_exprs.py | 12 +-- py-polars/tests/unit/test_interop.py | 2 +- py-polars/tests/unit/test_lazy.py | 4 +- 35 files changed, 213 insertions(+), 197 deletions(-) diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index a6fd5f8401c1..16ca69601012 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -291,7 +291,7 @@ def _from_dataframe_repr(m: re.Match[str]) -> DataFrame: for dtype in set(schema.values()): if dtype in (List, Struct, Object): raise NotImplementedError( - f"'from_repr' does not support {dtype.base_type()} dtype" + f"`from_repr` does not support data type {dtype.base_type().__name__!r}" ) # construct DataFrame from string series and cast from repr to native dtype @@ -720,6 +720,6 @@ def from_pandas( include_index=include_index, ) else: - raise ValueError( + raise TypeError( f"expected pandas DataFrame or Series, got {type(data).__name__!r}" ) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 817c49079313..180dfc4daebd 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -403,7 +403,7 @@ def __init__( ) else: raise TypeError( - f"DataFrame constructor received unsupported type {type(data).__name__!r}" + f"DataFrame constructor called with unsupported type {type(data).__name__!r}" " for the `data` parameter" ) @@ -711,7 +711,9 @@ def _read_csv( elif isinstance(dtypes, Sequence): dtype_slice = dtypes else: - raise ValueError("dtype arg should be list or dict") + raise TypeError( + f"`dtypes` should be of type list or dict, got {type(dtypes).__name__!r}" + ) processed_null_values = _process_null_values(null_values) @@ -723,8 +725,8 @@ def _read_csv( dtypes_dict = dict(dtype_list) if dtype_slice is not None: raise ValueError( - 
"cannot use glob patterns and unnamed dtypes as `dtypes` argument;" - " Use dtypes: Mapping[str, Type[DataType]" + "cannot use glob patterns and unnamed dtypes as `dtypes` argument" + "\n\nUse `dtypes`: Mapping[str, Type[DataType]" ) from polars import scan_csv @@ -755,8 +757,8 @@ def _read_csv( return scan.select(columns).collect() else: raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) projection, columns = handle_projection_columns(columns) @@ -843,9 +845,9 @@ def _read_parquet( elif is_str_sequence(columns, allow_str=False): return scan.select(columns).collect() else: - raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + raise TypeError( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) projection, columns = handle_projection_columns(columns) @@ -957,9 +959,9 @@ def _read_ipc( elif is_str_sequence(columns, allow_str=False): df = scan.select(columns).collect() else: - raise ValueError( - "cannot use glob patterns and integer based projection as `columns`" - " argument; Use columns: List[str]" + raise TypeError( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" ) return cls._from_pydf(df._df) @@ -1429,7 +1431,7 @@ def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: return self._div(other, floordiv=False) def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a DataFrame is ambiguous" "\n\nHint: to check if a DataFrame contains any values, use `is_empty()`." ) @@ -1724,10 +1726,10 @@ def __setitem__( raise ValueError("can only set multiple columns with 2D matrix") if value.shape[1] != len(key): raise ValueError( - "matrix columns should be equal to list use to determine column names" + "matrix columns should be equal to list used to determine column names" ) - # todo! we can parallelize this by calling from_numpy + # TODO: we can parallelize this by calling from_numpy columns = [] for i, name in enumerate(key): columns.append(pl.Series(name, value[:, i])) @@ -1740,8 +1742,8 @@ def __setitem__( if ( isinstance(row_selection, pl.Series) and row_selection.dtype == Boolean ) or is_bool_sequence(row_selection): - raise ValueError( - "not allowed to set 'DataFrame' by boolean mask in the row position." + raise TypeError( + "not allowed to set DataFrame by boolean mask in the row position" "\n\nConsider using `DataFrame.with_columns`." 
) @@ -1751,7 +1753,7 @@ def __setitem__( elif isinstance(col_selection, int): s = self[:, col_selection] else: - raise ValueError(f"unexpected column selection {col_selection!r}") + raise TypeError(f"unexpected column selection {col_selection!r}") # dispatch to __setitem__ of Series to do modification s[row_selection] = value @@ -1858,7 +1860,7 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: else self._df.column(column) ) if s is None: - raise ValueError(f"column index {column!r} is out of bounds") + raise IndexError(f"column index {column!r} is out of bounds") return s.get_idx(row) def to_arrow(self) -> pa.Table: @@ -2238,8 +2240,8 @@ def to_series(self, index: int = 0) -> Series: """ if not isinstance(index, int): - raise ValueError( - f'Index value "{index}" should be be an int, but is {type(index)}.' + raise TypeError( + f"index value {index!r} should be an int, but is {type(index).__name__!r}" ) if index < 0: @@ -2984,7 +2986,8 @@ def write_excel( from xlsxwriter.utility import xl_cell_to_rowcol except ImportError: raise ImportError( - "Excel export requires xlsxwriter; please run `pip install XlsxWriter`" + "Excel export requires xlsxwriter" + "\n\nPlease run `pip install XlsxWriter`" ) from None # setup workbook/worksheet @@ -3402,15 +3405,17 @@ def write_database( if engine == "adbc": if if_exists == "fail": - raise ValueError("'if_exists' not yet supported with engine ADBC") + raise NotImplementedError( + "`if_exists` not yet supported with engine ADBC" + ) elif if_exists == "replace": mode = "create" elif if_exists == "append": mode = "append" else: raise ValueError( - f"value for 'if_exists'={if_exists} was unexpected." - f" Choose one of: {'fail', 'replace', 'append'}" + f"unexpected value for `if_exists`: {if_exists!r}" + f"\n\nChoose one of: {'fail', 'replace', 'append'}" ) with _open_adbc_connection(connection) as conn, conn.cursor() as cursor: cursor.adbc_ingest(table_name, self.to_arrow(), mode) @@ -3419,13 +3424,13 @@ def write_database( elif engine == "sqlalchemy": if parse_version(pd.__version__) < parse_version("1.5"): raise ModuleNotFoundError( - f"writing with engine 'sqlalchemy' requires Pandas 1.5.x or higher, found Pandas {pd.__version__!r}" + f"writing with engine 'sqlalchemy' requires pandas 1.5.x or higher, found pandas {pd.__version__!r}" ) try: from sqlalchemy import create_engine - except ImportError as exc: - raise ImportError( + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( "'sqlalchemy' not found. 
Install polars with 'pip install polars[sqlalchemy]'" ) from exc from csv import reader as delimited_read @@ -3434,7 +3439,7 @@ def write_database( # both components and pass them through unquoted (sqlalachemy will quote) table_ident = next(delimited_read([table_name], delimiter=".")) if len(table_ident) > 2: - raise ValueError(f"table_name appears to be invalid: {table_name!r}") + raise ValueError(f"`table_name` appears to be invalid: {table_name!r}") elif len(table_ident) > 1: db_schema = table_ident[0] table_name = table_ident[1] @@ -5781,7 +5786,7 @@ def join_asof( """ if not isinstance(other, DataFrame): raise TypeError( - f"expected 'other' join table to be a DataFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) return ( @@ -5935,7 +5940,7 @@ def join( """ if not isinstance(other, DataFrame): raise TypeError( - f"expected 'other' join table to be a DataFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) return ( @@ -6584,8 +6589,8 @@ def get_column(self, name: str) -> Series: """ if not isinstance(name, str): - raise ValueError( - f'column name "{name!r}" should be be a string, but is {type(name).__name__!r}' + raise TypeError( + f"column name {name!r} should be be a string, but is {type(name).__name__!r}" ) return self[name] @@ -7900,8 +7905,8 @@ def n_chunks(self, strategy: str = "first") -> int | list[int]: return [s.n_chunks() for s in self.__iter__()] else: raise ValueError( - f"strategy: '{strategy}' not understood." - f" Choose one of {{'first', 'all'}}" + f"unexpected input for `strategy`: {strategy!r}" + f"\n\nChoose one of {{'first', 'all'}}" ) @overload @@ -7944,7 +7949,7 @@ def max(self, axis: int = 0) -> Self | Series: return self._from_pydf(self._df.max()) if axis == 1: return wrap_s(self._df.hmax()) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def min(self, axis: Literal[0] = ...) 
-> Self: @@ -7986,7 +7991,7 @@ def min(self, axis: int = 0) -> Self | Series: return self._from_pydf(self._df.min()) if axis == 1: return wrap_s(self._df.hmin()) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def sum( @@ -8063,7 +8068,7 @@ def sum( return self._from_pydf(self._df.sum()) if axis == 1: return wrap_s(self._df.hsum(null_strategy)) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") @overload def mean( @@ -8141,7 +8146,7 @@ def mean( return self._from_pydf(self._df.mean()) if axis == 1: return wrap_s(self._df.hmean(null_strategy)) - raise ValueError("axis should be 0 or 1") # pragma: no cover + raise ValueError("axis should be 0 or 1") def std(self, ddof: int = 1) -> Self: """ @@ -8840,7 +8845,9 @@ def row( "cannot set both 'index' and 'by_predicate'; mutually exclusive" ) elif isinstance(index, pl.Expr): - raise TypeError("expressions should be passed to the 'by_predicate' param") + raise TypeError( + "expressions should be passed to the `by_predicate` parameter" + ) if index is not None: row = self._df.row_tuple(index) @@ -8852,8 +8859,7 @@ def row( elif by_predicate is not None: if not isinstance(by_predicate, pl.Expr): raise TypeError( - f"expected 'by_predicate to be an expression;" - f" found {type(by_predicate).__name__!r}" + f"expected `by_predicate` to be an expression, got {type(by_predicate).__name__!r}" ) rows = self.filter(by_predicate).rows() n_rows = len(rows) @@ -8872,7 +8878,7 @@ def row( else: return row else: - raise ValueError("one of 'index' or 'by_predicate' must be set") + raise ValueError("one of `index` or `by_predicate` must be set") @overload def rows(self, *, named: Literal[False] = ...) 
-> list[tuple[Any, ...]]: @@ -9679,7 +9685,7 @@ def _prepare_other_arg(other: Any, length: int | None = None) -> Series: if isinstance(other, str): pass elif isinstance(other, Sequence): - raise ValueError("operation not supported") + raise TypeError("operation not supported") other = pl.Series("", [other]) if length and length > 1: diff --git a/py-polars/polars/datatypes/constructor.py b/py-polars/polars/datatypes/constructor.py index 7066f3cdcc20..103e8a0bcdee 100644 --- a/py-polars/polars/datatypes/constructor.py +++ b/py-polars/polars/datatypes/constructor.py @@ -122,7 +122,7 @@ def numpy_type_to_constructor(dtype: type[np.dtype[Any]]) -> Callable[..., PySer except KeyError: return PySeries.new_object except NameError: # pragma: no cover - raise ImportError( + raise ModuleNotFoundError( f"'numpy' is required to convert numpy dtype {dtype!r}" ) from None diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index e457e543780d..9d6e096f8a67 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -158,7 +158,7 @@ def is_polars_dtype(dtype: Any, include_unknown: bool = False) -> bool: return include_unknown else: return isinstance(dtype, (DataType, DataTypeClass)) - except ValueError: + except TypeError: return False @@ -517,7 +517,7 @@ def maybe_cast(el: Any, dtype: PolarsDataType) -> Any: try: el = py_type(el) # type: ignore[call-arg, misc] except Exception: - raise ValueError( + raise TypeError( f"cannot convert Python type {type(el).__name__!r} to {dtype!r}" ) from None return el diff --git a/py-polars/polars/dependencies.py b/py-polars/polars/dependencies.py index fac71538f887..b042e0c91382 100644 --- a/py-polars/polars/dependencies.py +++ b/py-polars/polars/dependencies.py @@ -94,7 +94,7 @@ def __getattr__(self, attr: Any) -> Any: # all other attribute access raises a helpful exception pfx = self._mod_pfx.get(self._module_name, "") raise ModuleNotFoundError( - f"{pfx}{attr} requires '{self._module_name}' module to be installed" + f"{pfx}{attr} requires {self._module_name!r} module to be installed" ) from None diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 91caaa3ee422..31fc8f90f118 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -395,7 +395,7 @@ def combine(self, time: dt.time | Expr, time_unit: TimeUnit = "us") -> Expr: """ if not isinstance(time, (dt.time, pl.Expr)): raise TypeError( - f"expected 'time' to be a python time or polars expression, found {time!r}" + f"expected 'time' to be a Python time or Polars expression, found {type(time).__name__!r}" ) time = parse_as_expression(time) return wrap_expr(self._pyexpr.dt_combine(time, time_unit)) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index d7bf90aeeb81..68c013feaca8 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -132,8 +132,8 @@ def __str__(self) -> str: return self._pyexpr.to_str() def __bool__(self) -> NoReturn: - raise ValueError( - "since Expr are lazy, the truthiness of an Expr is ambiguous." 
+ raise TypeError( + "the truth value of an Expr is ambiguous" "\n\nHint: use '&' or '|' to logically combine Expr, not 'and'/'or', and" " use 'x.is_in([y,z])' instead of 'x in [y,z]' to check membership" ) @@ -246,7 +246,8 @@ def __array_ufunc__( if num_expr > 1: if num_expr < len(inputs): raise ValueError( - "Numpy ufunc with more than one expression can only be used if all non-expression inputs are provided as keyword arguments only" + "NumPy ufunc with more than one expression can only be used" + " if all non-expression inputs are provided as keyword arguments only" ) exprs = parse_as_list_of_expressions(inputs) @@ -893,8 +894,8 @@ def exclude( exclude_dtypes.append(item) else: raise TypeError( - "invalid input for `exclude`. Expected one or more `str`," - f"`DataType`, or selector; found {type(item).__name__!r} instead" + "invalid input for `exclude`" + f"\n\nExpected one or more `str`, `DataType`, or selector; found {type(item).__name__!r} instead." ) if exclude_cols and exclude_dtypes: @@ -2483,13 +2484,12 @@ def fill_null( """ if value is not None and strategy is not None: - raise ValueError("cannot specify both 'value' and 'strategy'") + raise ValueError("cannot specify both `value` and `strategy`") elif value is None and strategy is None: - raise ValueError("must specify either a fill 'value' or 'strategy'") + raise ValueError("must specify either a fill `value` or `strategy`") elif strategy not in ("forward", "backward") and limit is not None: raise ValueError( - "can only specify 'limit' when strategy is set to " - "'backward' or 'forward'" + "can only specify `limit` when strategy is set to 'backward' or 'forward'" ) if value is not None: @@ -4944,7 +4944,7 @@ def is_between( return (self >= lower_bound) & (self < upper_bound) else: raise ValueError( - "closed must be one of {'left', 'right', 'both', 'none'}," + "`closed` must be one of {'left', 'right', 'both', 'none'}," f" got {closed!r}" ) @@ -8260,7 +8260,7 @@ def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: """ if isinstance(value, Expr): - raise TypeError(f"'value' must be a supported literal; found {value!r}") + raise TypeError(f"`value` must be a supported literal; found {value!r}") return self._from_pyexpr(self._pyexpr.extend_constant(value, n)) @@ -8805,7 +8805,7 @@ def _remap_key_or_value_series( ) if dtype != s.dtype: raise ValueError( - f"remapping values for map_dict could not be converted to {dtype!r}: found {s.dtype!r}" + f"remapping values for `map_dict` could not be converted to {dtype!r}: found {s.dtype!r}" ) else: # dtype was set, which should always be the case when: @@ -8821,13 +8821,13 @@ def _remap_key_or_value_series( ) if dtype != s.dtype: raise ValueError( - f"remapping {'keys' if is_keys else 'values'} for map_dict could not be converted to {dtype!r}: found {s.dtype!r}" + f"remapping {'keys' if is_keys else 'values'} for `map_dict` could not be converted to {dtype!r}: found {s.dtype!r}" ) except OverflowError as exc: if is_keys: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r}: {exc!s}" + f"remapping keys for `map_dict` could not be converted to {dtype!r}: {exc!s}" ) from exc else: raise ValueError( @@ -8842,7 +8842,7 @@ def _remap_key_or_value_series( pass else: raise ValueError( - f"remapping keys for map_dict could not be converted to {dtype!r} without losing values in the conversion" + f"remapping keys for `map_dict` could not be converted to {dtype!r} without losing values in the conversion" ) else: # values = remapping.values() @@ 
-8852,7 +8852,7 @@ def _remap_key_or_value_series( pass else: raise ValueError( - f"remapping values for map_dict could not be converted to {dtype!r} without losing values in the conversion" + f"remapping values for `map_dict` could not be converted to {dtype!r} without losing values in the conversion" ) return s @@ -9178,7 +9178,7 @@ def _prepare_rolling_window_args( ) -> tuple[str, int]: if isinstance(window_size, int): if window_size < 1: - raise ValueError("'window_size' should be positive") + raise ValueError("`window_size` must be positive") if min_periods is None: min_periods = window_size diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 0efcbcab56a1..11338913e987 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -285,7 +285,7 @@ def strptime( elif dtype == Time: return self.to_time(format, strict=strict, cache=cache) else: - raise ValueError("dtype should be of type {Date, Datetime, Time}") + raise ValueError("`dtype` must be of type {Date, Datetime, Time}") def to_decimal( self, diff --git a/py-polars/polars/expr/struct.py b/py-polars/polars/expr/struct.py index 84440293983c..db9382405921 100644 --- a/py-polars/polars/expr/struct.py +++ b/py-polars/polars/expr/struct.py @@ -22,7 +22,7 @@ def __getitem__(self, item: str | int) -> Expr: elif isinstance(item, int): return wrap_expr(self._pyexpr.struct_field_by_index(item)) else: - raise ValueError( + raise TypeError( f"expected type 'int | str', got {type(item).__name__!r} ({item!r})" ) diff --git a/py-polars/polars/functions/eager.py b/py-polars/polars/functions/eager.py index 59f18d0a1711..820c1e67abec 100644 --- a/py-polars/polars/functions/eager.py +++ b/py-polars/polars/functions/eager.py @@ -136,7 +136,7 @@ def concat( if how == "align": if not isinstance(elems[0], (pl.DataFrame, pl.LazyFrame)): - raise RuntimeError( + raise TypeError( f"'align' strategy is not supported for {type(elems[0]).__name__!r}" ) @@ -194,14 +194,12 @@ def concat( if how == "vertical": out = wrap_s(plr.concat_series(elems)) else: - raise ValueError("'Series' only allows {'vertical'} concat strategy") + raise ValueError("Series only allows {'vertical'} concat strategy") elif isinstance(first, pl.Expr): return wrap_expr(plr.concat_expr([e._pyexpr for e in elems], rechunk)) else: - raise ValueError( - f"did not expect type: {type(first).__name__!r} in 'pl.concat'" - ) + raise TypeError(f"did not expect type: {type(first).__name__!r} in `concat`") if rechunk: return out.rechunk() diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 427f368d6e2d..feb994947557 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -185,7 +185,8 @@ def col( return wrap_expr(plr.dtype_cols(dtypes)) else: raise TypeError( - f"invalid input for `col`. Expected `str` or `DataType`, got {type(name).__name__!r}" + "invalid input for `col`" + f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}." ) if isinstance(name, str): @@ -204,12 +205,14 @@ def col( return wrap_expr(plr.dtype_cols(names)) else: raise TypeError( - "invalid input for `col`. Expected iterable of type `str` or `DataType`," + "invalid input for `col`" + "\n\nExpected iterable of type `str` or `DataType`," f" got iterable of type {type(item).__name__!r}" ) else: raise TypeError( - f"invalid input for `col`. 
Expected `str` or `DataType`, got {type(name).__name__!r}" + "invalid input for `col`" + f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}" ) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 69c21748072f..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -168,7 +168,9 @@ def managed_file(file: Any) -> Iterator[Any]: # todo! add azure/ gcp/ ? if file.startswith("s3://"): - raise ImportError("fsspec needs to be installed to read files from s3") + raise ModuleNotFoundError( + "fsspec needs to be installed to read files from S3" + ) if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file): if _FSSPEC_AVAILABLE: diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 87b58c055be2..27d55afb55e4 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -68,7 +68,7 @@ def __init__( elif isinstance(dtypes, Sequence): dtype_slice = dtypes else: - raise ValueError("dtype arg should be list or dict") + raise TypeError("`dtypes` arg should be list or dict") processed_null_values = _process_null_values(null_values) projection, columns = handle_projection_columns(columns) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 45eac753c094..42039f416e8c 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -186,7 +186,7 @@ def read_csv( for column in columns: if not column.startswith("column_"): raise ValueError( - 'specified column names do not start with "column_",' + "specified column names do not start with 'column_'," " but autogenerated header names were requested" ) @@ -558,8 +558,8 @@ def read_csv_batched( for column in columns: if not column.startswith("column_"): raise ValueError( - 'specified column names do not start with "column_",' - " but autogenerated header names were requested." + "specified column names do not start with 'column_'," + " but autogenerated header names were requested" ) if projection and dtypes and isinstance(dtypes, list): @@ -598,8 +598,7 @@ def read_csv_batched( if columns: if len(columns) < len(new_columns): raise ValueError( - "more new column names are specified than there are selected" - " columns" + "more new column names are specified than there are selected columns" ) # Get column names of requested columns. @@ -610,8 +609,7 @@ def read_csv_batched( if projection: if columns and len(columns) < len(new_columns): raise ValueError( - "more new column names are specified than there are selected" - " columns" + "more new column names are specified than there are selected columns" ) # Convert column indices from projection to 'column_1', 'column_2', ... # column names. 
diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index 4c89c3b144d8..e6cd357c56f5 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -113,7 +113,7 @@ def read_database( """ # noqa: W505 if not isinstance(connection, str): raise TypeError( - f"expect connection to be a URI string; found {type(connection).__name__!r}" + f"expected connection to be a URI string; found {type(connection).__name__!r}" ) elif engine is None: engine = "connectorx" @@ -132,7 +132,9 @@ def read_database( raise ValueError("only a single SQL query string is accepted for adbc") return _read_sql_adbc(query, connection) else: - raise ValueError(f"engine {engine!r} not implemented; use connectorx or adbc") + raise ValueError( + f"engine must be one of {{'connectorx', 'adbc'}}, got {engine!r}" + ) def _read_sql_connectorx( @@ -145,9 +147,10 @@ def _read_sql_connectorx( ) -> DataFrame: try: import connectorx as cx - except ImportError: - raise ImportError( - "connectorx is not installed. Please run `pip install connectorx>=0.3.1`" + except ModuleNotFoundError: + raise ModuleNotFoundError( + "connectorx is not installed" + "\n\nPlease run `pip install connectorx>=0.3.1`." ) from None tbl = cx.read_sql( @@ -182,9 +185,10 @@ def _open_adbc_connection(connection_uri: str) -> Any: import_module(module_name) adbc_driver = sys.modules[module_name] except ImportError: - raise ImportError( - f"ADBC {driver_name} driver not detected; if ADBC supports this database," - f" please run `pip install adbc-driver-{driver_name} pyarrow`" + raise ModuleNotFoundError( + f"ADBC {driver_name} driver not detected" + "\n\nIf ADBC supports this database, please run:" + " `pip install adbc-driver-{driver_name} pyarrow`" ) from None # some backends require the driver name to be stripped from the URI diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 211dd6294411..e04d1a01037f 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -315,8 +315,9 @@ def _get_delta_lake_table( def _check_if_delta_available() -> None: if not _DELTALAKE_AVAILABLE: - raise ImportError( - "deltalake is not installed. Please run `pip install deltalake>=0.9.0`" + raise ModuleNotFoundError( + "deltalake is not installed" + "\n\nPlease run: `pip install deltalake>=0.9.0`" ) diff --git a/py-polars/polars/io/excel/_write_utils.py b/py-polars/polars/io/excel/_write_utils.py index 89206deecb78..1903c61afb5d 100644 --- a/py-polars/polars/io/excel/_write_utils.py +++ b/py-polars/polars/io/excel/_write_utils.py @@ -493,7 +493,7 @@ def _xl_setup_table_options( ) for key in table_style: if key not in valid_options: - raise ValueError(f"invalid table style key:{key!r}") + raise ValueError(f"invalid table style key: {key!r}") table_options = table_style.copy() table_style = table_options.pop("style", None) diff --git a/py-polars/polars/io/excel/functions.py b/py-polars/polars/io/excel/functions.py index 7ec1aceec4e8..f46ffc231111 100644 --- a/py-polars/polars/io/excel/functions.py +++ b/py-polars/polars/io/excel/functions.py @@ -165,13 +165,13 @@ def read_excel( try: import xlsx2csv except ImportError: - raise ImportError( - "xlsx2csv is not installed. 
Please run `pip install xlsx2csv`" + raise ModuleNotFoundError( + "xlsx2csv is not installed\n\nPlease run: `pip install xlsx2csv`" ) from None if sheet_id is not None and sheet_name is not None: raise ValueError( - f"Cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" + f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" ) if isinstance(source, (str, Path)): diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 7f661cc2d33f..1decddc41c12 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -84,7 +84,7 @@ def read_ipc( with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( + raise ModuleNotFoundError( "'pyarrow' is required when using" " 'read_ipc(..., use_pyarrow=True)'" ) @@ -160,7 +160,7 @@ def read_ipc_stream( with _prepare_file_arg(source, use_pyarrow=use_pyarrow, **storage_options) as data: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( + raise ModuleNotFoundError( "'pyarrow' is required when using" " 'read_ipc_stream(..., use_pyarrow=True)'" ) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 23c700271d84..26d660c42fe7 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -112,9 +112,8 @@ def read_parquet( ) as source_prep: if use_pyarrow: if not _PYARROW_AVAILABLE: - raise ImportError( - "'pyarrow' is required when using" - " 'read_parquet(..., use_pyarrow=True)'" + raise ModuleNotFoundError( + "'pyarrow' is required when using `read_parquet(..., use_pyarrow=True)`" ) import pyarrow as pa diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 36a54c86f713..90f9fc452938 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -720,9 +720,9 @@ def width(self) -> int: return self._ldf.width() def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a LazyFrame is ambiguous" - "\n\nLazyFrames cannot be used in boolean context with and/or/not operators" + "\n\nLazyFrames cannot be used in boolean context with and/or/not operators." ) def _comparison_error(self, operator: str) -> NoReturn: @@ -760,8 +760,8 @@ def __deepcopy__(self, memo: None = None) -> Self: def __getitem__(self, item: int | range | slice) -> LazyFrame: if not isinstance(item, slice): raise TypeError( - "'LazyFrame' object is not subscriptable (aside from slicing). Use" - " 'select()' or 'filter()' instead" + "'LazyFrame' object is not subscriptable (aside from slicing)" + "\n\nUse `select()` or `filter()` instead." 
) return LazyPolarsSlice(self).apply(item) @@ -1117,7 +1117,7 @@ def show_graph( import matplotlib.image as mpimg import matplotlib.pyplot as plt except ImportError: - raise ImportError( + raise ModuleNotFoundError( "matplotlib should be installed to show graph" ) from None plt.figure(figsize=figsize) @@ -1577,7 +1577,7 @@ def profile( plt.show() except ImportError: - raise ImportError( + raise ModuleNotFoundError( "matplotlib should be installed to show profiling plot" ) from None @@ -3178,7 +3178,7 @@ def join_asof( """ if not isinstance(other, LazyFrame): raise TypeError( - f"expected 'other' join table to be a LazyFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" ) if isinstance(on, (str, pl.Expr)): @@ -3358,7 +3358,7 @@ def join( """ if not isinstance(other, LazyFrame): raise TypeError( - f"expected 'other' join table to be a LazyFrame, not a {type(other).__name__!r}" + f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" ) if how == "cross": diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/groupby.py index cb7ba5d15bd2..85eb9e10eb7a 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/groupby.py @@ -133,7 +133,7 @@ def agg( """ if aggs and isinstance(aggs[0], dict): - raise ValueError( + raise TypeError( "specifying aggregations as a dictionary is not supported" "\n\nTry unpacking the dictionary to take advantage of the keyword syntax" " of the `agg` method." diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index fcd637ca0f35..9c7d9454704e 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -320,8 +320,9 @@ def __init__( dtype_if_empty=dtype_if_empty, ) else: - raise ValueError( - f"Series constructor called with unsupported type; got {type(values).__name__!r}" + raise TypeError( + f"Series constructor called with unsupported type {type(values).__name__!r}" + " for the `values` parameter" ) @classmethod @@ -352,8 +353,7 @@ def _get_ptr(self) -> tuple[int, int, int]: """ Get a pointer to the start of the values buffer of a numeric Series. - This will raise an error if the - ``Series`` contains multiple chunks + This will raise an error if the ``Series`` contains multiple chunks. This will return the offset, length and the pointer itself. @@ -416,7 +416,7 @@ def shape(self) -> tuple[int]: return (self._s.len(),) def __bool__(self) -> NoReturn: - raise ValueError( + raise TypeError( "the truth value of a Series is ambiguous" "\n\nHint: use '&' or '|' to chain Series boolean results together, not and/or." " To check if a Series contains any values, use `is_empty()`." 
@@ -673,7 +673,7 @@ def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: other = maybe_cast(other, self.dtype) f = get_ffi_func(op_ffi, self.dtype, self._s) if f is None: - raise ValueError( + raise TypeError( f"cannot do arithmetic with series of dtype: {self.dtype} and argument" f" of type: {type(other).__name__!r}" ) @@ -725,7 +725,7 @@ def __truediv__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self) / other if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") # this branch is exactly the floordiv function without rounding the floats if self.is_float() or self.dtype == Decimal: @@ -745,7 +745,7 @@ def __floordiv__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self) // other if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") if not isinstance(other, pl.Expr): other = F.lit(other) @@ -772,7 +772,7 @@ def __mul__(self, other: Any) -> Series | DataFrame | Expr: if isinstance(other, pl.Expr): return F.lit(self) * other if self.is_temporal(): - raise ValueError("first cast to integer before multiplying datelike dtypes") + raise TypeError("first cast to integer before multiplying datelike dtypes") elif isinstance(other, pl.DataFrame): return other * self else: @@ -790,14 +790,14 @@ def __mod__(self, other: Any) -> Series | Expr: if isinstance(other, pl.Expr): return F.lit(self).__mod__(other) if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before applying modulo on datelike dtypes" ) return self._arithmetic(other, "rem", "rem_<>") def __rmod__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before applying modulo on datelike dtypes" ) return self._arithmetic(other, "rem", "rem_<>_rhs") @@ -812,7 +812,7 @@ def __rsub__(self, other: Any) -> Series: def __rtruediv__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") if self.is_float(): self.__rfloordiv__(other) @@ -822,12 +822,12 @@ def __rtruediv__(self, other: Any) -> Series: def __rfloordiv__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before dividing datelike dtypes") + raise TypeError("first cast to integer before dividing datelike dtypes") return self._arithmetic(other, "div", "div_<>_rhs") def __rmul__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError("first cast to integer before multiplying datelike dtypes") + raise TypeError("first cast to integer before multiplying datelike dtypes") return self._arithmetic(other, "mul", "mul_<>") def __pow__(self, exponent: int | float | None | Series) -> Series: @@ -835,7 +835,7 @@ def __pow__(self, exponent: int | float | None | Series) -> Series: def __rpow__(self, other: Any) -> Series: if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before raising datelike dtypes to a power" ) return self.to_frame().select(other ** F.col(self.name)).to_series() @@ -1010,7 +1010,7 @@ def __setitem__( if self.is_numeric() or self.is_temporal(): self.set_at_idx(key, value) # type: ignore[arg-type] return None - raise ValueError( + raise 
TypeError( f"cannot set Series of dtype: {self.dtype!r} with list/tuple as value;" " use a scalar value" ) @@ -1036,7 +1036,7 @@ def __setitem__( s = self._from_pyseries(sequence_to_pyseries("", key, dtype=UInt32)) self.__setitem__(s, value) else: - raise ValueError(f'cannot use "{key!r}" for indexing') + raise TypeError(f'cannot use "{key!r}" for indexing') def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]: """ @@ -1075,7 +1075,7 @@ def __array_ufunc__( elif isinstance(arg, Series): args.append(arg.view(ignore_nulls=True)) else: - raise ValueError( + raise TypeError( f"unsupported type {type(arg).__name__!r} for {arg!r}" ) @@ -1551,7 +1551,7 @@ def pow(self, exponent: int | float | None | Series) -> Series: """ if self.is_temporal(): - raise ValueError( + raise TypeError( "first cast to integer before raising datelike dtypes to a power" ) if _check_for_numpy(exponent) and isinstance(exponent, np.ndarray): @@ -3764,7 +3764,7 @@ def is_integer(self, signed: bool | None = None) -> bool: elif signed is False: return self.dtype in UNSIGNED_INTEGER_DTYPES - raise ValueError(f"'signed' must be None, True or False; given {signed!r}") + raise ValueError(f"`signed` must be None, True or False; got {signed!r}") def is_temporal(self, excluding: OneOrMoreDataTypes | None = None) -> bool: """ diff --git a/py-polars/polars/series/struct.py b/py-polars/polars/series/struct.py index 9c6d36177e03..529d0fde2d6a 100644 --- a/py-polars/polars/series/struct.py +++ b/py-polars/polars/series/struct.py @@ -30,7 +30,7 @@ def __getitem__(self, item: int | str) -> Series: elif isinstance(item, str): return self.field(item) else: - raise ValueError(f"expected type 'int | str', got {type(item).__name__!r}") + raise TypeError(f"expected type 'int | str', got {type(item).__name__!r}") def _ipython_key_completions_(self) -> list[str]: return self.fields diff --git a/py-polars/polars/testing/parametric/strategies.py b/py-polars/polars/testing/parametric/strategies.py index 71866ed5db70..0959e50d85aa 100644 --- a/py-polars/polars/testing/parametric/strategies.py +++ b/py-polars/polars/testing/parametric/strategies.py @@ -376,7 +376,7 @@ def create_list_strategy( """ if select_from and inner_dtype is None: - raise ValueError("if specifying 'select_from', must also specify 'inner_dtype'") + raise ValueError("if specifying `select_from`, must also specify `inner_dtype`") if inner_dtype is None: strats = list( diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 88f0b9c1d42f..49c854d18789 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -121,9 +121,8 @@ def handle_projection_columns( elif is_int_sequence(columns): projection = list(columns) elif not is_str_sequence(columns): - raise ValueError( - "'columns' arg should contain a list of all integers or all strings" - " values" + raise TypeError( + "'columns' arg should contain a list of all integers or all strings values" ) else: new_columns = columns diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 0258dd0fb1e3..f3e212444836 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -56,7 +56,7 @@ def test_init_empty() -> None: # note: cannot use df (empty or otherwise) in boolean context empty_df = pl.DataFrame() - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not empty_df @@ -2387,7 +2387,7 @@ def test_arithmetic() -> None: 
assert_frame_equal(out, expected) # cannot do arithmetic with a sequence - with pytest.raises(ValueError, match="operation not supported"): + with pytest.raises(TypeError, match="operation not supported"): _ = df + [1] # type: ignore[operator] @@ -3069,14 +3069,14 @@ def test_set() -> None: df["new"] = np.random.rand(10) with pytest.raises( - ValueError, - match=r"not allowed to set 'DataFrame' by boolean mask in the row position." + TypeError, + match=r"not allowed to set DataFrame by boolean mask in the row position" r"\n\nConsider using `DataFrame.with_columns`.", ): df[df["ham"] > 0.5, "ham"] = "a" with pytest.raises( - ValueError, - match=r"not allowed to set 'DataFrame' by boolean mask in the row position." + TypeError, + match=r"not allowed to set DataFrame by boolean mask in the row position" r"\n\nConsider using `DataFrame.with_columns`.", ): df[[True, False], "ham"] = "a" @@ -3101,9 +3101,9 @@ def test_set() -> None: assert df[0, "b"] == 2 # row and col selection have to be int or str - with pytest.raises(ValueError): + with pytest.raises(TypeError): df[:, [1]] = 1 # type: ignore[index] - with pytest.raises(ValueError): + with pytest.raises(TypeError): df[True, :] = 1 # type: ignore[index] # needs to be a 2 element tuple @@ -3315,7 +3315,7 @@ def test_item() -> None: df = pl.DataFrame({}) with pytest.raises(ValueError, match=r".* frame has shape \(0, 0\)"): df.item() - with pytest.raises(ValueError, match="column index 10 is out of bounds"): + with pytest.raises(IndexError, match="column index 10 is out of bounds"): df.item(0, 10) diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py index d8efec1eb5ec..7e874032f2f5 100644 --- a/py-polars/tests/unit/io/test_database.py +++ b/py-polars/tests/unit/io/test_database.py @@ -118,7 +118,7 @@ def test_read_database( "SELECT * FROM test_data", "sqlite", ValueError, - "engine 'not_engine' not implemented; use connectorx or adbc", + "engine must be one of {'connectorx', 'adbc'}, got 'not_engine'", id="Not an available sql engine", ), pytest.param( @@ -142,7 +142,7 @@ def test_read_database( "SELECT * FROM test_data", sqlite3.connect(":memory:"), TypeError, - "expect connection to be a URI string", + "expected connection to be a URI string", id="Invalid connection URI", ), ], @@ -233,7 +233,7 @@ def test_write_database( {"table_name": "w.x.y.z"}, {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, ): - with pytest.raises(ValueError): + with pytest.raises((ValueError, NotImplementedError)): sample_df.write_database( connection=f"sqlite:///{test_db}", engine=engine, diff --git a/py-polars/tests/unit/io/test_excel.py b/py-polars/tests/unit/io/test_excel.py index 77a654610525..4442ba428d32 100644 --- a/py-polars/tests/unit/io/test_excel.py +++ b/py-polars/tests/unit/io/test_excel.py @@ -45,7 +45,7 @@ def test_read_excel_all_sheets(excel_file_path: Path) -> None: def test_read_excel_all_sheets_with_sheet_name(excel_file_path: Path) -> None: with pytest.raises( ValueError, - match=r"Cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)", + match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)", ): pl.read_excel(excel_file_path, sheet_id=1, sheet_name="Sheet1") diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 5b986f6e2221..a1ffc113d9d5 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -467,13 +467,13 @@ def test_join_frame_consistency() -> None: 
df = pl.DataFrame({"A": [1, 2, 3]}) ldf = pl.DataFrame({"A": [1, 2, 5]}).lazy() - with pytest.raises(TypeError, match="expected 'other'.* LazyFrame"): + with pytest.raises(TypeError, match="expected `other`.* LazyFrame"): _ = ldf.join(df, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* DataFrame"): + with pytest.raises(TypeError, match="expected `other`.* DataFrame"): _ = df.join(ldf, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* LazyFrame"): + with pytest.raises(TypeError, match="expected `other`.* LazyFrame"): _ = ldf.join_asof(df, on="A") # type: ignore[arg-type] - with pytest.raises(TypeError, match="expected 'other'.* DataFrame"): + with pytest.raises(TypeError, match="expected `other`.* DataFrame"): _ = df.join_asof(ldf, on="A") # type: ignore[arg-type] diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 18be8757b499..d8b6336ac9ed 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -138,14 +138,14 @@ def test_init_inputs(monkeypatch: Any) -> None: # Bad inputs with pytest.raises(TypeError): pl.Series([1, 2, 3], [1, 2, 3]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): pl.Series({"a": [1, 2, 3]}) with pytest.raises(OverflowError): pl.Series("bigint", [2**64]) # numpy not available monkeypatch.setattr(pl.series.series, "_check_for_numpy", lambda x: False) - with pytest.raises(ValueError): + with pytest.raises(TypeError): pl.DataFrame(np.array([1, 2, 3]), schema=["a"]) @@ -345,31 +345,37 @@ def test_arithmetic(s: pl.Series) -> None: assert ((1.0 + a) == [2, 3]).sum() == 2 assert ((1.0 % a) == [0, 1]).sum() == 2 + +def test_arithmetic_datetime() -> None: a = pl.Series("a", [datetime(2021, 1, 1)]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): a // 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a / 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a * 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a % 2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): a**2 - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 / a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 // a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 * a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2 % a - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2**a - with pytest.raises(ValueError): + + with pytest.raises(TypeError): +a + + +def test_arithmetic_string() -> None: a = pl.Series("a", [""]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): +a @@ -385,7 +391,7 @@ def test_power() -> None: assert_series_equal(b**b, pl.Series([None, 4.0], dtype=Float64)) assert_series_equal(a**b, pl.Series([None, 4.0], dtype=Float64)) assert_series_equal(a**None, pl.Series([None] * len(a), dtype=Float64)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): c**2 with pytest.raises(pl.ColumnNotFoundError): a ** "hi" # type: ignore[operator] @@ -393,7 +399,7 @@ def test_power() -> None: # rpow assert_series_equal(2.0**a, pl.Series("literal", [2.0, 4.0], dtype=Float64)) assert_series_equal(2**b, pl.Series("literal", [None, 4.0], dtype=Float64)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): 2**c with pytest.raises(pl.ColumnNotFoundError): "hi" ** a @@ -840,18 +846,18 @@ def 
test_set_value_as_list_fail() -> None: # for other types it is not allowed s = pl.Series("a", ["a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[[0, 1]] = ["d", "e"] s = pl.Series("a", [True, False, False]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[[0, 1]] = [True, False] @pytest.mark.parametrize("key", [True, False, 1.0]) def test_set_invalid_key(key: Any) -> None: s = pl.Series("a", [1, 2, 3]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): s[key] = 1 @@ -1145,7 +1151,7 @@ def test_empty() -> None: assert a.name == empty_a.name assert len(empty_a) == n - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not empty_a @@ -1470,10 +1476,10 @@ def test_bitwise() -> None: assert_series_equal(out["xor"], pl.Series("xor", [2, 6, 6])) # ensure mistaken use of logical 'and'/'or' raises an exception - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): a and b - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): a or b @@ -1621,17 +1627,17 @@ def test_comparisons_bool_series_to_int() -> None: r"cannot do arithmetic with series of dtype: Boolean" r" and argument of type: 'bool'" ) - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool - 1 - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool + 1 match = ( r"cannot do arithmetic with series of dtype: Boolean" r" and argument of type: 'bool'" ) - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool % 2 - with pytest.raises(ValueError, match=match): + with pytest.raises(TypeError, match=match): srs_bool * 1 from operator import ge, gt, le, lt diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index a697301353d4..fc6e7b2a96e5 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -117,13 +117,13 @@ def test_join_lazy_on_df() -> None: with pytest.raises( TypeError, - match="expected 'other' .* to be a LazyFrame.* not a 'DataFrame'", + match="expected `other` .* to be a LazyFrame.* not a 'DataFrame'", ): df_left.lazy().join(df_right, on="Id") # type: ignore[arg-type] with pytest.raises( TypeError, - match="expected 'other' .* to be a LazyFrame.* not a 'DataFrame'", + match="expected `other` .* to be a LazyFrame.* not a 'DataFrame'", ): df_left.lazy().join_asof(df_right, on="Id") # type: ignore[arg-type] @@ -298,7 +298,7 @@ def test_series_concat_err(how: ConcatMethod) -> None: s = pl.Series([1, 2, 3]) with pytest.raises( ValueError, - match="'Series' only allows {'vertical'} concat strategy", + match="Series only allows {'vertical'} concat strategy", ): pl.concat([s, s], how=how) @@ -591,7 +591,7 @@ def test_lit_agg_err() -> None: def test_window_size_validation() -> None: df = pl.DataFrame({"x": [1.0]}) - with pytest.raises(ValueError, match=r"'window_size' should be positive"): + with pytest.raises(ValueError, match=r"`window_size` must be positive"): df.with_columns(trailing_min=pl.col("x").rolling_min(window_size=-3)) @@ -605,7 +605,7 @@ def test_invalid_getitem_key_err() -> None: def test_invalid_groupby_arg() -> None: df = pl.DataFrame({"a": [1]}) with pytest.raises( - ValueError, match="specifying aggregations as a dictionary is not supported" + TypeError, match="specifying aggregations as a dictionary 
is not supported" ): df.groupby(1).agg({"a": "sum"}) diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index ed878a857966..5b2e878e82c0 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -425,18 +425,18 @@ def test_abs_expr() -> None: def test_logical_boolean() -> None: # note, cannot use expressions in logical # boolean context (eg: and/or/not operators) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): pl.col("colx") and pl.col("coly") - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): pl.col("colx") or pl.col("coly") df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]}) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): df.select([(pl.col("a") > pl.col("b")) and (pl.col("b") > pl.col("b"))]) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): df.select([(pl.col("a") > pl.col("b")) or (pl.col("b") > pl.col("b"))]) @@ -720,13 +720,13 @@ def test_map_dict() -> None: with pytest.raises( pl.ComputeError, - match="remapping keys for map_dict could not be converted to Utf8 without losing values in the conversion", + match="remapping keys for `map_dict` could not be converted to Utf8 without losing values in the conversion", ): df_int_as_str.with_columns(pl.col("int").map_dict(int_dict)) with pytest.raises( pl.ComputeError, - match="remapping keys for map_dict could not be converted to Utf8 without losing values in the conversion", + match="remapping keys for `map_dict` could not be converted to Utf8 without losing values in the conversion", ): df_int_as_str.with_columns(pl.col("int").map_dict(int_with_none_dict)) diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index 9e50a2d91057..c43fc195ff56 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -477,7 +477,7 @@ def test_from_pandas_dataframe() -> None: assert df.rows() == [(1, 2, 3), (4, 5, 6)] # if not a pandas dataframe, raise a ValueError - with pytest.raises(ValueError): + with pytest.raises(TypeError): _ = pl.from_pandas([1, 2]) # type: ignore[call-overload] diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index fe2262ba1f80..90151c9bebb2 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -63,7 +63,7 @@ def test_lazyframe_membership_operator() -> None: assert "phone" not in ldf # note: cannot use lazyframe in boolean context - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(TypeError, match="ambiguous"): not ldf @@ -675,7 +675,7 @@ def test_fill_null() -> None: df.fill_null() with pytest.raises(ValueError, match="cannot specify both"): df.fill_null(value=3.0, strategy="max") - with pytest.raises(ValueError, match="can only specify 'limit'"): + with pytest.raises(ValueError, match="can only specify `limit`"): df.fill_null(strategy="max", limit=2) From 08154e545a3c22092131fac8af605468881d8788 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 18:01:02 +0800 Subject: [PATCH 27/55] fix(python): fix apply for empty series in threading mode (#10651) --- py-polars/polars/expr/expr.py | 13 ++++++----- py-polars/tests/unit/operations/test_apply.py | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git 
a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 68c013feaca8..258cbc802bf3 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3845,8 +3845,16 @@ def wrap_f(x: Series) -> Series: # pragma: no cover elif strategy == "threading": def wrap_threading(x: Series) -> Series: + def get_lazy_promise(df: DataFrame) -> LazyFrame: + return df.lazy().select( + F.col("x").map(wrap_f, agg_list=True, return_dtype=return_dtype) + ) + df = x.to_frame("x") + if x.len() == 0: + return get_lazy_promise(df).collect().to_series() + n_threads = threadpool_size() chunk_size = x.len() // n_threads remainder = x.len() % n_threads @@ -3858,11 +3866,6 @@ def wrap_threading(x: Series) -> Series: for i in range(n_threads) ] - def get_lazy_promise(df: DataFrame) -> LazyFrame: - return df.lazy().select( - F.col("x").map(wrap_f, agg_list=True, return_dtype=return_dtype) - ) - # create partitions with LazyFrames # these are promises on a computation partitions = [] diff --git a/py-polars/tests/unit/operations/test_apply.py b/py-polars/tests/unit/operations/test_apply.py index d7948408cd2f..af6cb5946633 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -391,3 +391,25 @@ def test_apply_dict_order_10128() -> None: def test_apply_10237() -> None: df = pl.DataFrame({"a": [1, 2, 3]}) assert df.select(pl.all().apply(lambda x: x > 50))["a"].to_list() == [False] * 3 + + +def test_apply_on_empty_col_10639() -> None: + df = pl.DataFrame({"A": [], "B": []}) + res = df.groupby("B").agg( + pl.col("A") + .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } + res = df.groupby("B").agg( + pl.col("A") + .apply(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } From bc166cef669f0e783acae4dd6f703e8605659251 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Mon, 21 Aug 2023 18:01:34 +0800 Subject: [PATCH 28/55] fix(rust): List chunked builder should take care of series name (#10642) --- .../src/chunked_array/builder/list/boolean.rs | 23 ----------- .../src/chunked_array/builder/list/mod.rs | 4 +- .../src/chunked_array/builder/list/null.rs | 38 +++++++++++++++++++ crates/polars/tests/it/core/series.rs | 7 ++++ 4 files changed, 48 insertions(+), 24 deletions(-) create mode 100644 crates/polars-core/src/chunked_array/builder/list/null.rs diff --git a/crates/polars-core/src/chunked_array/builder/list/boolean.rs b/crates/polars-core/src/chunked_array/builder/list/boolean.rs index a6483dfcca31..4d7bc490cb3d 100644 --- a/crates/polars-core/src/chunked_array/builder/list/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/list/boolean.rs @@ -69,26 +69,3 @@ impl ListBuilderTrait for ListBooleanChunkedBuilder { self.fast_explode } } - -impl ListBuilderTrait for LargeListNullBuilder { - #[inline] - fn append_series(&mut self, _s: &Series) -> PolarsResult<()> { - self.push_null(); - Ok(()) - } - - #[inline] - fn append_null(&mut self) { - self.push_null() - } - - fn finish(&mut self) -> ListChunked { - unsafe { - ListChunked::from_chunks_and_dtype_unchecked( - "", - vec![self.as_box()], - DataType::List(Box::new(DataType::Null)), - ) - } - } -} diff --git a/crates/polars-core/src/chunked_array/builder/list/mod.rs b/crates/polars-core/src/chunked_array/builder/list/mod.rs index 4484a4da1cd3..e4938e9fca17 100644 --- 
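A quick sketch of the behaviour the "apply for empty series in threading mode" fix targets; it condenses the new test (`test_apply_on_empty_col_10639`) added in the hunk above, and the column names come from that test:

    import polars as pl

    df = pl.DataFrame({"A": [], "B": []})
    out = df.groupby("B").agg(
        pl.col("A")
        .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading")
        .alias("Foo")
    )
    # With an empty input column, the "threading" strategy now short-circuits
    # to the same lazy path instead of erroring on the empty partitioning.
    assert out.to_dict(False) == {"B": [], "Foo": []}
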
a/crates/polars-core/src/chunked_array/builder/list/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/list/mod.rs @@ -4,6 +4,7 @@ mod boolean; #[cfg(feature = "dtype-categorical")] mod categorical; mod dtypes; +mod null; mod primitive; pub use anonymous::*; @@ -12,6 +13,7 @@ pub use boolean::*; #[cfg(feature = "dtype-categorical")] use categorical::*; use dtypes::*; +use null::*; use polars_arrow::array::list::AnonymousBuilder; use polars_arrow::array::null::MutableNullArray; use polars_arrow::prelude::*; @@ -116,7 +118,7 @@ pub fn get_list_builder( list_capacity, Some(inner_type_logical.clone()), ))), - DataType::Null => Ok(Box::new(LargeListNullBuilder::with_capacity(list_capacity))), + DataType::Null => Ok(Box::new(ListNullChunkedBuilder::new(name, list_capacity))), DataType::List(_) => Ok(Box::new(AnonymousOwnedListBuilder::new( name, list_capacity, diff --git a/crates/polars-core/src/chunked_array/builder/list/null.rs b/crates/polars-core/src/chunked_array/builder/list/null.rs new file mode 100644 index 000000000000..70346ed32071 --- /dev/null +++ b/crates/polars-core/src/chunked_array/builder/list/null.rs @@ -0,0 +1,38 @@ +use super::*; + +pub struct ListNullChunkedBuilder { + builder: LargeListNullBuilder, + name: String, +} + +impl ListNullChunkedBuilder { + pub fn new(name: &str, capacity: usize) -> Self { + ListNullChunkedBuilder { + builder: LargeListNullBuilder::with_capacity(capacity), + name: name.into(), + } + } +} + +impl ListBuilderTrait for ListNullChunkedBuilder { + #[inline] + fn append_series(&mut self, _s: &Series) -> PolarsResult<()> { + self.builder.push_null(); + Ok(()) + } + + #[inline] + fn append_null(&mut self) { + self.builder.push_null(); + } + + fn finish(&mut self) -> ListChunked { + unsafe { + ListChunked::from_chunks_and_dtype_unchecked( + &self.name, + vec![self.builder.as_box()], + DataType::List(Box::new(DataType::Null)), + ) + } + } +} diff --git a/crates/polars/tests/it/core/series.rs b/crates/polars/tests/it/core/series.rs index 6e87defa58b6..42b533c78f1a 100644 --- a/crates/polars/tests/it/core/series.rs +++ b/crates/polars/tests/it/core/series.rs @@ -36,3 +36,10 @@ fn test_min_max_sorted_desc() { assert_eq!(a.max(), Some(4)); assert_eq!(a.min(), Some(1)); } + +#[test] +fn test_construct_list_of_null_series() { + let s = Series::new("a", [Series::new_null("a1", 1), Series::new_null("a1", 1)]); + assert_eq!(s.null_count(), s.len()); + assert_eq!(s.field().name(), "a"); +} From 60efadff8f32b6439e31d57ec2e8f7c0fadc89e6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 21 Aug 2023 20:18:57 +0200 Subject: [PATCH 29/55] feat(rust)!: Rename `groupby` to `group_by` (#10654) --- crates/polars-algo/src/algo.rs | 4 +- crates/polars-arrow/src/array/mod.rs | 8 +- crates/polars-core/Cargo.toml | 6 +- .../logical/categorical/builder.rs | 2 +- .../logical/categorical/ops/unique.rs | 2 +- .../logical/categorical/stringcache.rs | 2 +- .../src/chunked_array/ops/take/traits.rs | 2 +- .../src/chunked_array/ops/unique/mod.rs | 6 +- .../src/doc/changelog/v0_10_0_11.rs | 2 +- crates/polars-core/src/doc/changelog/v0_4.rs | 2 +- crates/polars-core/src/doc/changelog/v0_7.rs | 2 +- .../polars-core/src/frame/asof_join/groups.rs | 2 +- .../aggregations/agg_list.rs | 0 .../aggregations/boolean.rs | 0 .../aggregations/dispatch.rs | 0 .../{groupby => group_by}/aggregations/mod.rs | 6 +- .../aggregations/utf8.rs | 0 .../src/frame/{groupby => group_by}/expr.rs | 0 .../frame/{groupby => group_by}/hashing.rs | 18 +-- .../{groupby => group_by}/into_groups.rs | 32 ++--- 
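For anyone migrating across this breaking rename, a minimal sketch of the eager API after the patch (column names are placeholders; it mirrors the doc examples and feature-flag changes shown further down in this diff):

    use polars_core::prelude::*;

    fn group_by_sum(df: &DataFrame) -> PolarsResult<DataFrame> {
        // Previously `df.groupby([...])?`; after this patch the method is
        // `group_by`, and `groupby_stable` likewise becomes `group_by_stable`.
        df.group_by(["column_name"])?
            .select(["agg_column_name"])
            .sum()
    }

    // Cargo features follow the same pattern:
    //   groupby_list    -> group_by_list
    //   dynamic_groupby -> dynamic_group_by
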
.../src/frame/{groupby => group_by}/mod.rs | 126 +++++++++--------- .../frame/{groupby => group_by}/perfect.rs | 0 .../src/frame/{groupby => group_by}/proxy.rs | 2 +- crates/polars-core/src/frame/hash_join/mod.rs | 2 +- .../src/frame/hash_join/multiple_keys.rs | 2 +- crates/polars-core/src/frame/mod.rs | 20 +-- .../polars-core/src/hashing/vector_hasher.rs | 4 +- crates/polars-core/src/prelude.rs | 4 +- .../src/series/implementations/array.rs | 2 +- .../src/series/implementations/binary.rs | 2 +- .../src/series/implementations/boolean.rs | 2 +- .../src/series/implementations/categorical.rs | 2 +- .../src/series/implementations/dates_time.rs | 2 +- .../src/series/implementations/datetime.rs | 2 +- .../src/series/implementations/duration.rs | 2 +- .../src/series/implementations/floats.rs | 2 +- .../src/series/implementations/list.rs | 8 +- .../src/series/implementations/mod.rs | 2 +- .../src/series/implementations/object.rs | 2 +- .../src/series/implementations/struct_.rs | 2 +- .../src/series/implementations/utf8.rs | 2 +- crates/polars-core/src/series/ops/unique.rs | 2 +- crates/polars-core/src/series/series_trait.rs | 2 +- crates/polars-io/src/partition.rs | 2 +- crates/polars-lazy/Cargo.toml | 2 +- crates/polars-lazy/src/dsl/list.rs | 4 +- crates/polars-lazy/src/dsl/mod.rs | 2 +- crates/polars-lazy/src/frame/mod.rs | 70 +++++----- crates/polars-lazy/src/frame/pivot.rs | 2 +- crates/polars-lazy/src/lib.rs | 6 +- .../executors/{groupby.rs => group_by.rs} | 8 +- ...groupby_dynamic.rs => group_by_dynamic.rs} | 24 ++-- ...partitioned.rs => group_by_partitioned.rs} | 28 ++-- ...groupby_rolling.rs => group_by_rolling.rs} | 24 ++-- .../src/physical_plan/executors/mod.rs | 20 +-- .../executors/projection_utils.rs | 10 +- .../physical_plan/expressions/aggregation.rs | 4 +- .../src/physical_plan/expressions/alias.rs | 2 +- .../src/physical_plan/expressions/apply.rs | 4 +- .../src/physical_plan/expressions/binary.rs | 2 +- .../src/physical_plan/expressions/cast.rs | 2 +- .../src/physical_plan/expressions/column.rs | 4 +- .../src/physical_plan/expressions/filter.rs | 2 +- .../src/physical_plan/expressions/literal.rs | 2 +- .../src/physical_plan/expressions/mod.rs | 4 +- .../src/physical_plan/expressions/slice.rs | 2 +- .../src/physical_plan/expressions/sort.rs | 2 +- .../src/physical_plan/expressions/sortby.rs | 4 +- .../src/physical_plan/expressions/take.rs | 2 +- .../src/physical_plan/expressions/ternary.rs | 4 +- .../src/physical_plan/expressions/window.rs | 32 ++--- .../src/physical_plan/planner/expr.rs | 2 +- .../src/physical_plan/planner/lp.rs | 12 +- crates/polars-lazy/src/physical_plan/state.rs | 2 +- .../physical_plan/streaming/convert_alp.rs | 4 +- crates/polars-lazy/src/prelude.rs | 2 +- crates/polars-lazy/src/tests/aggregations.rs | 42 +++--- crates/polars-lazy/src/tests/arity.rs | 4 +- crates/polars-lazy/src/tests/logical.rs | 4 +- .../src/tests/optimization_checks.rs | 14 +- crates/polars-lazy/src/tests/queries.rs | 102 +++++++------- crates/polars-lazy/src/tests/streaming.rs | 12 +- crates/polars-lazy/src/tests/tpch.rs | 2 +- .../nan_propagating_aggregate.rs | 2 +- crates/polars-ops/src/frame/pivot/mod.rs | 20 +-- .../polars-ops/src/series/ops/to_dummies.rs | 2 +- .../aggregates/convert.rs | 14 +- .../{groupby => group_by}/aggregates/count.rs | 0 .../{groupby => group_by}/aggregates/first.rs | 2 +- .../aggregates/interface.rs | 14 +- .../{groupby => group_by}/aggregates/last.rs | 2 +- .../{groupby => group_by}/aggregates/mean.rs | 0 .../aggregates/min_max.rs | 0 .../{groupby => 
group_by}/aggregates/mod.rs | 0 .../{groupby => group_by}/aggregates/null.rs | 2 +- .../{groupby => group_by}/aggregates/sum.rs | 0 .../{groupby => group_by}/generic/eval.rs | 0 .../{groupby => group_by}/generic/global.rs | 0 .../generic/hash_table.rs | 0 .../{groupby => group_by}/generic/mod.rs | 2 +- .../generic/ooc_state.rs | 4 +- .../{groupby => group_by}/generic/sink.rs | 8 +- .../{groupby => group_by}/generic/source.rs | 6 +- .../generic/thread_local.rs | 0 .../sinks/{groupby => group_by}/mod.rs | 0 .../sinks/{groupby => group_by}/ooc.rs | 14 +- .../sinks/{groupby => group_by}/ooc_state.rs | 4 +- .../{groupby => group_by}/primitive/mod.rs | 16 +-- .../sinks/{groupby => group_by}/string.rs | 16 +-- .../sinks/{groupby => group_by}/utils.rs | 4 +- crates/polars-pipe/src/executors/sinks/mod.rs | 2 +- crates/polars-pipe/src/pipeline/convert.rs | 20 +-- crates/polars-pipe/src/pipeline/dispatcher.rs | 2 +- crates/polars-pipe/src/pipeline/mod.rs | 2 +- crates/polars-plan/Cargo.toml | 2 +- crates/polars-plan/src/dsl/functions/arity.rs | 2 +- crates/polars-plan/src/dsl/mod.rs | 10 +- .../polars-plan/src/logical_plan/aexpr/mod.rs | 2 +- .../polars-plan/src/logical_plan/builder.rs | 12 +- .../src/logical_plan/builder_alp.rs | 4 +- .../src/logical_plan/optimizer/cse_expr.rs | 18 +-- .../optimizer/predicate_pushdown/utils.rs | 4 +- .../{groupby.rs => group_by.rs} | 8 +- .../optimizer/projection_pushdown/mod.rs | 6 +- .../optimizer/type_coercion/binary.rs | 2 +- .../optimizer/type_coercion/mod.rs | 4 +- .../polars-plan/src/logical_plan/options.rs | 8 +- .../src/logical_plan/projection.rs | 2 +- crates/polars-sql/src/context.rs | 32 ++--- crates/polars-sql/tests/iss_7437.rs | 2 +- crates/polars-sql/tests/ops_distinct_on.rs | 2 +- crates/polars-sql/tests/simple_exprs.rs | 8 +- .../src/chunkedarray/rolling_window/mod.rs | 4 +- .../rolling_kernels/no_nulls.rs | 24 ++-- .../src/{groupby => group_by}/dynamic.rs | 80 +++++------ .../src/{groupby => group_by}/mod.rs | 0 crates/polars-time/src/lib.rs | 6 +- crates/polars-time/src/prelude.rs | 2 +- crates/polars-time/src/upsample.rs | 4 +- crates/polars-time/src/windows/bounds.rs | 4 +- .../src/windows/{groupby.rs => group_by.rs} | 38 +++--- crates/polars-time/src/windows/mod.rs | 2 +- crates/polars-time/src/windows/test.rs | 61 +++++---- crates/polars/Cargo.toml | 6 +- crates/polars/src/docs/eager.rs | 12 +- crates/polars/src/docs/lazy.rs | 6 +- crates/polars/src/lib.rs | 16 +-- crates/polars/src/prelude.rs | 2 +- .../tests/it/core/{groupby.rs => group_by.rs} | 2 +- crates/polars/tests/it/core/mod.rs | 2 +- crates/polars/tests/it/joins.rs | 6 +- crates/polars/tests/it/lazy/aggregation.rs | 4 +- .../polars/tests/it/lazy/expressions/apply.rs | 6 +- .../polars/tests/it/lazy/expressions/arity.rs | 18 +-- .../tests/it/lazy/expressions/filter.rs | 6 +- .../polars/tests/it/lazy/expressions/slice.rs | 2 +- .../tests/it/lazy/{groupby.rs => group_by.rs} | 20 +-- ...groupby_dynamic.rs => group_by_dynamic.rs} | 10 +- crates/polars/tests/it/lazy/mod.rs | 4 +- crates/polars/tests/it/lazy/queries.rs | 20 +-- py-polars/Cargo.toml | 4 +- py-polars/polars/dataframe/groupby.py | 2 +- py-polars/polars/lazyframe/frame.py | 6 +- py-polars/src/dataframe.rs | 6 +- py-polars/src/lazyframe.rs | 14 +- .../unit/operations/test_groupby_rolling.py | 4 +- py-polars/tests/unit/test_empty.py | 2 +- py-polars/tests/unit/test_errors.py | 4 +- 168 files changed, 740 insertions(+), 723 deletions(-) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/agg_list.rs (100%) rename 
crates/polars-core/src/frame/{groupby => group_by}/aggregations/boolean.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/dispatch.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/mod.rs (99%) rename crates/polars-core/src/frame/{groupby => group_by}/aggregations/utf8.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/expr.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/hashing.rs (97%) rename crates/polars-core/src/frame/{groupby => group_by}/into_groups.rs (94%) rename crates/polars-core/src/frame/{groupby => group_by}/mod.rs (90%) rename crates/polars-core/src/frame/{groupby => group_by}/perfect.rs (100%) rename crates/polars-core/src/frame/{groupby => group_by}/proxy.rs (99%) rename crates/polars-lazy/src/physical_plan/executors/{groupby.rs => group_by.rs} (95%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_dynamic.rs => group_by_dynamic.rs} (82%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_partitioned.rs => group_by_partitioned.rs} (93%) rename crates/polars-lazy/src/physical_plan/executors/{groupby_rolling.rs => group_by_rolling.rs} (85%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/convert.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/count.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/first.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/interface.rs (89%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/last.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/mean.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/min_max.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/mod.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/null.rs (92%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/aggregates/sum.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/eval.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/global.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/hash_table.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/mod.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/ooc_state.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/sink.rs (96%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/source.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/generic/thread_local.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/mod.rs (100%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/ooc.rs (95%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/ooc_state.rs (93%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/primitive/mod.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/string.rs (97%) rename crates/polars-pipe/src/executors/sinks/{groupby => group_by}/utils.rs (96%) rename crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/{groupby.rs => group_by.rs} (95%) rename crates/polars-time/src/{groupby => 
group_by}/dynamic.rs (94%) rename crates/polars-time/src/{groupby => group_by}/mod.rs (100%) rename crates/polars-time/src/windows/{groupby.rs => group_by.rs} (94%) rename crates/polars/tests/it/core/{groupby.rs => group_by.rs} (98%) rename crates/polars/tests/it/lazy/{groupby.rs => group_by.rs} (90%) rename crates/polars/tests/it/lazy/{groupby_dynamic.rs => group_by_dynamic.rs} (92%) diff --git a/crates/polars-algo/src/algo.rs b/crates/polars-algo/src/algo.rs index 2da0f1787521..92533386ea89 100644 --- a/crates/polars-algo/src/algo.rs +++ b/crates/polars-algo/src/algo.rs @@ -47,7 +47,7 @@ pub fn hist(s: &Series, bins: Option<&Series>, bin_count: Option) -> Resu DataType::UInt16 => (lit(u32::MIN), AnyValue::UInt16(u16::MAX)), _ => polars_bail!( InvalidOperation: - "cannot take histogram of non-numeric types; consider a groupby and count" + "cannot take histogram of non-numeric types; consider a group_by and count" ), }; let mut bins = bins.extend_constant(max_value, 1)?; @@ -92,7 +92,7 @@ pub fn hist(s: &Series, bins: Option<&Series>, bin_count: Option) -> Resu let out = out .select(["category", s.name()])? - .groupby(["category"])? + .group_by(["category"])? .count()?; cuts.left_join(&out, [category_str], [category_str])? diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index 51f51bc63329..51f813440185 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -102,7 +102,7 @@ macro_rules! iter_to_values { pub trait ListFromIter { /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -136,7 +136,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -166,7 +166,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. @@ -212,7 +212,7 @@ pub trait ListFromIter { } /// Create a list-array from an iterator. - /// Used in groupby agg-list + /// Used in group_by agg-list /// /// # Safety /// Will produce incorrect arrays if size hint is incorrect. diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index becd9894074d..d0a2f41c6ef2 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -92,8 +92,8 @@ row_hash = [] reinterpret = [] take_opt_iter = [] mode = [] -# allow groupby operation on list type -groupby_list = [] +# allow group_by operation on list type +group_by_list = [] # cumsum, cummin, etc. 
cum_agg = [] # rolling window functions @@ -114,7 +114,7 @@ semi_anti_join = [] chunked_ids = [] describe = [] timezones = ["chrono-tz", "arrow/chrono-tz", "polars-arrow/timezones"] -dynamic_groupby = ["dtype-datetime", "dtype-date"] +dynamic_group_by = ["dtype-datetime", "dtype-date"] # opt-in datatypes for Series dtype-date = ["temporal"] diff --git a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs index 0f07c5083af2..d8ad21b12f88 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -6,7 +6,7 @@ use hashbrown::hash_map::{Entry, RawEntryMut}; use polars_arrow::trusted_len::TrustedLenPush; use crate::datatypes::PlHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::*; use crate::{using_string_cache, StringCache, POOL}; diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index c7a24a91fe01..9ac7d32ae749 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,5 +1,5 @@ use super::*; -use crate::frame::groupby::IntoGroupsProxy; +use crate::frame::group_by::IntoGroupsProxy; impl CategoricalChunked { pub fn unique(&self) -> PolarsResult { diff --git a/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs b/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs index cf1be4be6525..195579e1392b 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/stringcache.rs @@ -8,7 +8,7 @@ use once_cell::sync::Lazy; use smartstring::{LazyCompact, SmartString}; use crate::datatypes::PlIdHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::InitHashMaps; /// We use atomic reference counting diff --git a/crates/polars-core/src/chunked_array/ops/take/traits.rs b/crates/polars-core/src/chunked_array/ops/take/traits.rs index e54e0a1c8a51..818681f831e6 100644 --- a/crates/polars-core/src/chunked_array/ops/take/traits.rs +++ b/crates/polars-core/src/chunked_array/ops/take/traits.rs @@ -1,5 +1,5 @@ //! Traits that indicate the allowed arguments in a ChunkedArray::take operation. 
-use crate::frame::groupby::GroupsProxyIter; +use crate::frame::group_by::GroupsProxyIter; use crate::prelude::*; // Utility traits diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index 66e90d9f56bd..217e7a5494b0 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -8,10 +8,10 @@ use arrow::bitmap::MutableBitmap; #[cfg(feature = "object")] use crate::datatypes::ObjectType; use crate::datatypes::PlHashSet; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; -use crate::frame::groupby::GroupsProxy; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::GroupsProxy; #[cfg(feature = "mode")] -use crate::frame::groupby::IntoGroupsProxy; +use crate::frame::group_by::IntoGroupsProxy; use crate::prelude::*; use crate::series::IsSorted; diff --git a/crates/polars-core/src/doc/changelog/v0_10_0_11.rs b/crates/polars-core/src/doc/changelog/v0_10_0_11.rs index 502f267c1dc0..8136f24f8f80 100644 --- a/crates/polars-core/src/doc/changelog/v0_10_0_11.rs +++ b/crates/polars-core/src/doc/changelog/v0_10_0_11.rs @@ -6,7 +6,7 @@ //! * Performance increase in take kernel //! * Performance increase in ChunkedArray builders //! * Join operation on multiple columns. -//! * ~3.5 x performance increase in groupby operations (measured on db-benchmark), +//! * ~3.5 x performance increase in group_by operations (measured on db-benchmark), //! due to embarrassingly parallel grouping and better branch prediction (tight loops). //! * Performance increase on join operation due to better branch prediction. //! * Categorical datatype and global string cache (BETA). diff --git a/crates/polars-core/src/doc/changelog/v0_4.rs b/crates/polars-core/src/doc/changelog/v0_4.rs index c4f00cf1b50b..d357526134ef 100644 --- a/crates/polars-core/src/doc/changelog/v0_4.rs +++ b/crates/polars-core/src/doc/changelog/v0_4.rs @@ -1,7 +1,7 @@ //! # Changelog v0.4 //! //! * median aggregation added to `ChunkedArray` -//! * Arrow LargeList datatype support (and groupby aggregation into LargeList). +//! * Arrow LargeList datatype support (and group_by aggregation into LargeList). //! * Shift operation. //! * Fill None operation. //! * Buffered serialization (less memory requirements) diff --git a/crates/polars-core/src/doc/changelog/v0_7.rs b/crates/polars-core/src/doc/changelog/v0_7.rs index 4d13947ac46e..55996f2fcaa5 100644 --- a/crates/polars-core/src/doc/changelog/v0_7.rs +++ b/crates/polars-core/src/doc/changelog/v0_7.rs @@ -20,7 +20,7 @@ //! - Type coercion optimizer //! - Selection (filter, where clause) //! - Projection (select foo from bar) -//! - Aggregation (groupby) +//! - Aggregation (group_by) //! - all eager aggregations supported //! - Joins //! 
- WithColumn operation diff --git a/crates/polars-core/src/frame/asof_join/groups.rs b/crates/polars-core/src/frame/asof_join/groups.rs index ae27b92fb685..9c980c935b3b 100644 --- a/crates/polars-core/src/frame/asof_join/groups.rs +++ b/crates/polars-core/src/frame/asof_join/groups.rs @@ -9,7 +9,7 @@ use rayon::prelude::*; use smartstring::alias::String as SmartString; use super::*; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; #[cfg(feature = "dtype-categorical")] use crate::frame::hash_join::_check_categorical_src; use crate::frame::hash_join::{ diff --git a/crates/polars-core/src/frame/groupby/aggregations/agg_list.rs b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/agg_list.rs rename to crates/polars-core/src/frame/group_by/aggregations/agg_list.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/boolean.rs b/crates/polars-core/src/frame/group_by/aggregations/boolean.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/boolean.rs rename to crates/polars-core/src/frame/group_by/aggregations/boolean.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/dispatch.rs b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/dispatch.rs rename to crates/polars-core/src/frame/group_by/aggregations/dispatch.rs diff --git a/crates/polars-core/src/frame/groupby/aggregations/mod.rs b/crates/polars-core/src/frame/group_by/aggregations/mod.rs similarity index 99% rename from crates/polars-core/src/frame/groupby/aggregations/mod.rs rename to crates/polars-core/src/frame/group_by/aggregations/mod.rs index 1dbfa47d0662..b40f9137c554 100644 --- a/crates/polars-core/src/frame/groupby/aggregations/mod.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/mod.rs @@ -27,9 +27,9 @@ use rayon::prelude::*; #[cfg(feature = "object")] use crate::chunked_array::object::extension::create_extension; -use crate::frame::groupby::GroupsIdx; +use crate::frame::group_by::GroupsIdx; #[cfg(feature = "object")] -use crate::frame::groupby::GroupsIndicator; +use crate::frame::group_by::GroupsIndicator; use crate::prelude::*; use crate::series::implementations::SeriesWrap; use crate::series::IsSorted; @@ -76,7 +76,7 @@ where } // This iterators length can be trusted - // these represent the number of groups in the groupby operation + // these represent the number of groups in the group_by operation let output_len = offsets.size_hint().0; // start with a dummy index, will be overwritten on first iteration. 
// Safety: diff --git a/crates/polars-core/src/frame/groupby/aggregations/utf8.rs b/crates/polars-core/src/frame/group_by/aggregations/utf8.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/aggregations/utf8.rs rename to crates/polars-core/src/frame/group_by/aggregations/utf8.rs diff --git a/crates/polars-core/src/frame/groupby/expr.rs b/crates/polars-core/src/frame/group_by/expr.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/expr.rs rename to crates/polars-core/src/frame/group_by/expr.rs diff --git a/crates/polars-core/src/frame/groupby/hashing.rs b/crates/polars-core/src/frame/group_by/hashing.rs similarity index 97% rename from crates/polars-core/src/frame/groupby/hashing.rs rename to crates/polars-core/src/frame/group_by/hashing.rs index 528156448ec3..3e24df8817b3 100644 --- a/crates/polars-core/src/frame/groupby/hashing.rs +++ b/crates/polars-core/src/frame/group_by/hashing.rs @@ -8,7 +8,7 @@ use rayon::prelude::*; use super::GroupsProxy; use crate::datatypes::PlHashMap; -use crate::frame::groupby::{GroupsIdx, IdxItem}; +use crate::frame::group_by::{GroupsIdx, IdxItem}; use crate::hashing::{ df_rows_to_hashes_threaded_vertical, series_to_hashes, this_partition, AsU64, IdBuildHasher, IdxHash, @@ -83,7 +83,7 @@ fn finish_group_order(mut out: Vec>, sorted: bool) -> GroupsProxy { } // The inner vecs should be sorted by IdxSize -// the groupby multiple keys variants suffice +// the group_by multiple keys variants suffice // this requirements as they use an IdxMap strategy fn finish_group_order_vecs( mut vecs: Vec<(Vec, Vec>)>, @@ -144,7 +144,7 @@ fn finish_group_order_vecs( } } -pub(crate) fn groupby(a: impl Iterator, sorted: bool) -> GroupsProxy +pub(crate) fn group_by(a: impl Iterator, sorted: bool) -> GroupsProxy where T: Hash + Eq, { @@ -183,7 +183,7 @@ where // giving the slice info to the compiler is much // faster than the using an iterator, that's why we // have the code duplication -pub(crate) fn groupby_threaded_slice( +pub(crate) fn group_by_threaded_slice( keys: Vec, n_partitions: u64, sorted: bool, @@ -246,7 +246,7 @@ where finish_group_order(out, sorted) } -pub(crate) fn groupby_threaded_iter( +pub(crate) fn group_by_threaded_iter( keys: &[I], n_partitions: u64, sorted: bool, @@ -373,7 +373,7 @@ pub(crate) fn populate_multiple_key_hashmap( idx_hash.hash == original_h && { let key_idx = idx_hash.idx; // Safety: - // indices in a groupby operation are always in bounds. + // indices in a group_by operation are always in bounds. unsafe { compare_df_rows(keys, key_idx as usize, idx as usize) } } }); @@ -435,7 +435,7 @@ pub(crate) fn populate_multiple_key_hashmap2<'a, V, H, F, G>( original_h == idx_hash.hash && { let key_idx = idx_hash.idx; // Safety: - // indices in a groupby operation are always in bounds. + // indices in a group_by operation are always in bounds. 
unsafe { compare_keys(keys_cmp, key_idx as usize, idx as usize) } } }); @@ -450,7 +450,7 @@ pub(crate) fn populate_multiple_key_hashmap2<'a, V, H, F, G>( } } -pub(crate) fn groupby_threaded_multiple_keys_flat( +pub(crate) fn group_by_threaded_multiple_keys_flat( mut keys: DataFrame, n_partitions: usize, sorted: bool, @@ -540,7 +540,7 @@ pub(crate) fn groupby_threaded_multiple_keys_flat( Ok(finish_group_order_vecs(v, sorted)) } -pub(crate) fn groupby_multiple_keys(keys: DataFrame, sorted: bool) -> PolarsResult { +pub(crate) fn group_by_multiple_keys(keys: DataFrame, sorted: bool) -> PolarsResult { let mut hashes = Vec::with_capacity(keys.height()); let _ = series_to_hashes(keys.get_columns(), None, &mut hashes)?; diff --git a/crates/polars-core/src/frame/groupby/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs similarity index 94% rename from crates/polars-core/src/frame/groupby/into_groups.rs rename to crates/polars-core/src/frame/group_by/into_groups.rs index 5518f1a760d0..5144711ffa58 100644 --- a/crates/polars-core/src/frame/groupby/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::kernels::sort_partition::{create_clean_partitions, partition_to_groups}; use polars_arrow::prelude::*; @@ -8,9 +8,9 @@ use crate::config::verbose; use crate::utils::_split_offsets; use crate::utils::flatten::flatten_par; -/// Used to create the tuples for a groupby operation. +/// Used to create the tuples for a group_by operation. pub trait IntoGroupsProxy { - /// Create the tuples need for a groupby operation. + /// Create the tuples need for a group_by operation. /// * The first value in the tuple is the first index of the group. /// * The second value in the tuple is are the indexes of the groups including the first value. 
fn group_tuples(&self, _multithreaded: bool, _sorted: bool) -> PolarsResult { @@ -38,15 +38,15 @@ where .downcast_iter() .map(|arr| arr.values().as_slice()) .collect::>(); - groupby_threaded_slice(keys, n_partitions, sorted) + group_by_threaded_slice(keys, n_partitions, sorted) } else { let keys = ca.downcast_iter().collect::>(); - groupby_threaded_iter(&keys, n_partitions, sorted) + group_by_threaded_iter(&keys, n_partitions, sorted) } } else if !ca.has_validity() { - groupby(ca.into_no_null_iter(), sorted) + group_by(ca.into_no_null_iter(), sorted) } else { - groupby(ca.into_iter(), sorted) + group_by(ca.into_iter(), sorted) } } @@ -57,7 +57,7 @@ where { fn create_groups_from_sorted(&self, multithreaded: bool) -> GroupsSlice { if verbose() { - eprintln!("groupby keys are sorted; running sorted key fast path"); + eprintln!("group_by keys are sorted; running sorted key fast path"); } let arr = self.downcast_iter().next().unwrap(); if arr.is_empty() { @@ -271,7 +271,7 @@ impl IntoGroupsProxy for BinaryChunked { .collect::>() }); let byte_hashes = byte_hashes.iter().collect::>(); - groupby_threaded_slice(byte_hashes, n_partitions as u64, sorted) + group_by_threaded_slice(byte_hashes, n_partitions as u64, sorted) } else { let byte_hashes = self .into_iter() @@ -283,7 +283,7 @@ impl IntoGroupsProxy for BinaryChunked { BytesHash::new(opt_b, hash) }) .collect_trusted::>(); - groupby(byte_hashes.iter(), sorted) + group_by(byte_hashes.iter(), sorted) }; Ok(out) } @@ -293,7 +293,7 @@ impl IntoGroupsProxy for ListChunked { #[allow(clippy::needless_lifetimes)] #[allow(unused_variables)] fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] { polars_ensure!( self.inner_dtype().to_physical().is_numeric(), @@ -338,7 +338,7 @@ impl IntoGroupsProxy for ListChunked { }) .collect::>>()?; let bytes_hashes = bytes_hashes.iter().collect::>(); - Ok(groupby_threaded_slice( + Ok(group_by_threaded_slice( bytes_hashes, n_partitions as u64, sorted, @@ -347,12 +347,12 @@ impl IntoGroupsProxy for ListChunked { groups } else { let hashes = arr_to_hashes(self)?; - Ok(groupby(hashes.iter(), sorted)) + Ok(group_by(hashes.iter(), sorted)) } } - #[cfg(not(feature = "groupby_list"))] + #[cfg(not(feature = "group_by_list"))] { - panic!("activate 'groupby_list' feature") + panic!("activate 'group_by_list' feature") } } } @@ -376,6 +376,6 @@ where T: PolarsObject, { fn group_tuples(&self, _multithreaded: bool, sorted: bool) -> PolarsResult { - Ok(groupby(self.into_iter(), sorted)) + Ok(group_by(self.into_iter(), sorted)) } } diff --git a/crates/polars-core/src/frame/groupby/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs similarity index 90% rename from crates/polars-core/src/frame/groupby/mod.rs rename to crates/polars-core/src/frame/group_by/mod.rs index c44a7ee3f24f..2ff670ac248f 100644 --- a/crates/polars-core/src/frame/groupby/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -50,7 +50,7 @@ fn prepare_dataframe_unsorted(by: &[Series]) -> DataFrame { } impl DataFrame { - pub fn groupby_with_series( + pub fn group_by_with_series( &self, mut by: Vec, multithreaded: bool, @@ -58,7 +58,7 @@ impl DataFrame { ) -> PolarsResult { polars_ensure!( !by.is_empty(), - ComputeError: "at least one key is required in a groupby operation" + ComputeError: "at least one key is required in a group_by operation" ); let by_len = by[0].len(); @@ -88,9 +88,9 @@ impl DataFrame { } let keys_df = prepare_dataframe_unsorted(&by); if 
multithreaded { - groupby_threaded_multiple_keys_flat(keys_df, n_partitions, sorted) + group_by_threaded_multiple_keys_flat(keys_df, n_partitions, sorted) } else { - groupby_multiple_keys(keys_df, sorted) + group_by_multiple_keys(keys_df, sorted) } }; Ok(GroupBy::new(self, by, groups?, None)) @@ -102,34 +102,34 @@ impl DataFrame { /// /// ``` /// use polars_core::prelude::*; - /// fn groupby_sum(df: &DataFrame) -> PolarsResult { - /// df.groupby(["column_name"])? + /// fn group_by_sum(df: &DataFrame) -> PolarsResult { + /// df.group_by(["column_name"])? /// .select(["agg_column_name"]) /// .sum() /// } /// ``` - pub fn groupby(&self, by: I) -> PolarsResult + pub fn group_by(&self, by: I) -> PolarsResult where I: IntoIterator, S: AsRef, { let selected_keys = self.select_series(by)?; - self.groupby_with_series(selected_keys, true, false) + self.group_by_with_series(selected_keys, true, false) } /// Group DataFrame using a Series column. /// The groups are ordered by their smallest row index. - pub fn groupby_stable(&self, by: I) -> PolarsResult + pub fn group_by_stable(&self, by: I) -> PolarsResult where I: IntoIterator, S: AsRef, { let selected_keys = self.select_series(by)?; - self.groupby_with_series(selected_keys, true, true) + self.group_by_with_series(selected_keys, true, true) } } -/// Returned by a groupby operation on a DataFrame. This struct supports +/// Returned by a group_by operation on a DataFrame. This struct supports /// several aggregations. /// /// Until described otherwise, the examples in this struct are performed on the following DataFrame: @@ -329,7 +329,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(&["temp", "rain"]).mean() + /// df.group_by(["date"])?.select(&["temp", "rain"]).mean() /// } /// ``` /// Returns: @@ -352,7 +352,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Mean); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Mean); let mut agg = unsafe { agg_col.agg_mean(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -367,7 +367,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).sum() + /// df.group_by(["date"])?.select(["temp"]).sum() /// } /// ``` /// Returns: @@ -390,7 +390,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Sum); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Sum); let mut agg = unsafe { agg_col.agg_sum(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -405,7 +405,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).min() + /// df.group_by(["date"])?.select(["temp"]).min() /// } /// ``` /// Returns: @@ -427,7 +427,7 @@ impl<'df> GroupBy<'df> { pub fn min(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Min); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Min); let mut agg = unsafe { agg_col.agg_min(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -442,7 
+442,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).max() + /// df.group_by(["date"])?.select(["temp"]).max() /// } /// ``` /// Returns: @@ -464,7 +464,7 @@ impl<'df> GroupBy<'df> { pub fn max(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Max); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Max); let mut agg = unsafe { agg_col.agg_max(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -479,7 +479,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).first() + /// df.group_by(["date"])?.select(["temp"]).first() /// } /// ``` /// Returns: @@ -501,7 +501,7 @@ impl<'df> GroupBy<'df> { pub fn first(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::First); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::First); let mut agg = unsafe { agg_col.agg_first(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -516,7 +516,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).last() + /// df.group_by(["date"])?.select(["temp"]).last() /// } /// ``` /// Returns: @@ -538,7 +538,7 @@ impl<'df> GroupBy<'df> { pub fn last(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Last); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Last); let mut agg = unsafe { agg_col.agg_last(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -553,7 +553,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).n_unique() + /// df.group_by(["date"])?.select(["temp"]).n_unique() /// } /// ``` /// Returns: @@ -575,7 +575,7 @@ impl<'df> GroupBy<'df> { pub fn n_unique(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::NUnique); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::NUnique); let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -592,7 +592,7 @@ impl<'df> GroupBy<'df> { /// # use polars_arrow::prelude::QuantileInterpolOptions; /// /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).quantile(0.2, QuantileInterpolOptions::default()) + /// df.group_by(["date"])?.select(["temp"]).quantile(0.2, QuantileInterpolOptions::default()) /// } /// ``` #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")] @@ -608,7 +608,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { let new_name = - fmt_groupby_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol)); + fmt_group_by_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol)); let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, interpol) }; agg.rename(&new_name); 
cols.push(agg.into_series()); @@ -623,14 +623,14 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).median() + /// df.group_by(["date"])?.select(["temp"]).median() /// } /// ``` #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")] pub fn median(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Median); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Median); let mut agg = unsafe { agg_col.agg_median(&self.groups) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -643,7 +643,7 @@ impl<'df> GroupBy<'df> { pub fn var(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Var(ddof)); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Var(ddof)); let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -656,7 +656,7 @@ impl<'df> GroupBy<'df> { pub fn std(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Std(ddof)); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Std(ddof)); let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) }; agg.rename(&new_name); cols.push(agg.into_series()); @@ -671,7 +671,7 @@ impl<'df> GroupBy<'df> { /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.select(["temp"]).count() + /// df.group_by(["date"])?.select(["temp"]).count() /// } /// ``` /// Returns: @@ -693,7 +693,7 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Count); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Count); let mut ca = self.groups.group_count(); ca.rename(&new_name); cols.push(ca.into_series()); @@ -701,14 +701,14 @@ impl<'df> GroupBy<'df> { DataFrame::new(cols) } - /// Get the groupby group indexes. + /// Get the group_by group indexes. /// /// # Example /// /// ```rust /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { - /// df.groupby(["date"])?.groups() + /// df.group_by(["date"])?.groups() /// } /// ``` /// Returns: @@ -729,13 +729,13 @@ impl<'df> GroupBy<'df> { pub fn groups(&self) -> PolarsResult { let mut cols = self.keys(); let mut column = self.groups.as_list_chunked(); - let new_name = fmt_groupby_column("", GroupByMethod::Groups); + let new_name = fmt_group_by_column("", GroupByMethod::Groups); column.rename(&new_name); cols.push(column.into_series()); DataFrame::new(cols) } - /// Aggregate the groups of the groupby operation into lists. + /// Aggregate the groups of the group_by operation into lists. 
/// /// # Example /// @@ -743,7 +743,7 @@ impl<'df> GroupBy<'df> { /// # use polars_core::prelude::*; /// fn example(df: DataFrame) -> PolarsResult { /// // GroupBy and aggregate to Lists - /// df.groupby(["date"])?.select(["temp"]).agg_list() + /// df.group_by(["date"])?.select(["temp"]).agg_list() /// } /// ``` /// Returns: @@ -765,7 +765,7 @@ impl<'df> GroupBy<'df> { pub fn agg_list(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Implode); + let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Implode); let mut agg = unsafe { agg_col.agg_list(&self.groups) }; agg.rename(&new_name); cols.push(agg); @@ -774,7 +774,7 @@ impl<'df> GroupBy<'df> { } fn prepare_apply(&self) -> PolarsResult { - polars_ensure!(self.df.height() > 0, ComputeError: "cannot groupby + apply on empty 'DataFrame'"); + polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'"); if let Some(agg) = &self.selected_agg { if agg.is_empty() { Ok(self.df.clone()) @@ -889,7 +889,7 @@ impl Display for GroupByMethod { } // Formatting functions used in eager and lazy code for renaming grouped columns -pub fn fmt_groupby_column(name: &str, method: GroupByMethod) -> String { +pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> String { use GroupByMethod::*; match method { Min => format!("{name}_min"), @@ -935,7 +935,7 @@ mod test { let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01]); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); - let out = df.groupby_stable(["date"])?.select(["temp"]).count()?; + let out = df.group_by_stable(["date"])?.select(["temp"]).count()?; assert_eq!( out.column("temp_count")?, &Series::new("temp_count", [2 as IdxSize, 2, 1]) @@ -945,7 +945,7 @@ mod test { #[allow(deprecated)] // Select multiple let out = df - .groupby_stable(["date"])? + .group_by_stable(["date"])? .select(["temp", "rain"]) .mean()?; assert_eq!( @@ -957,14 +957,14 @@ mod test { #[allow(deprecated)] // Group by multiple let out = df - .groupby_stable(["date", "temp"])? + .group_by_stable(["date", "temp"])? .select(["rain"]) .mean()?; assert!(out.column("rain_mean").is_ok()); // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["date"])?.select(["temp"]).sum()?; + let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?; assert_eq!( out.column("temp_sum")?, &Series::new("temp_sum", [30, 8, 9]) @@ -973,7 +973,7 @@ mod test { // Use of deprecated `n_unique()` for testing purposes #[allow(deprecated)] // implicit select all and only aggregate on methods that support that aggregation - let gb = df.groupby(["date"]).unwrap().n_unique().unwrap(); + let gb = df.group_by(["date"]).unwrap().n_unique().unwrap(); // check the group by column is filtered out. assert_eq!(gb.width(), 3); Ok(()) @@ -981,7 +981,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_static_groupby_by_12_columns() { + fn test_static_group_by_by_12_columns() { // Build GroupBy DataFrame. 
let s0 = Series::new("G1", ["A", "A", "B", "B", "C"].as_ref()); let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref()); @@ -1003,7 +1003,7 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] let adf = df - .groupby([ + .group_by([ "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", ]) .unwrap() @@ -1019,11 +1019,11 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_dynamic_groupby_by_13_columns() { - // The content for every groupby series. + fn test_dynamic_group_by_by_13_columns() { + // The content for every group_by series. let series_content = ["A", "A", "B", "B", "C"]; - // The name of every groupby series. + // The name of every group_by series. let series_names = [ "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13", ]; @@ -1048,7 +1048,7 @@ mod test { #[allow(deprecated)] // Compute the aggregated DataFrame by the 13 columns defined in `series_names`. let adf = df - .groupby(series_names) + .group_by(series_names) .unwrap() .select(["N"]) .sum() @@ -1072,14 +1072,14 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_floats() { + fn test_group_by_floats() { let df = df! {"flt" => [1., 1., 2., 2., 3.], "val" => [1, 1, 1, 1, 1] } .unwrap(); // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let res = df.groupby(["flt"]).unwrap().sum().unwrap(); + let res = df.group_by(["flt"]).unwrap().sum().unwrap(); let res = res.sort(["flt"], false, false).unwrap(); assert_eq!( Vec::from(res.column("val_sum").unwrap().i32().unwrap()), @@ -1090,7 +1090,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] #[cfg(feature = "dtype-categorical")] - fn test_groupby_categorical() { + fn test_group_by_categorical() { let mut df = df! {"foo" => ["a", "a", "b", "b", "c"], "ham" => ["a", "a", "b", "b", "c"], "bar" => [1, 1, 1, 1, 1] @@ -1104,7 +1104,7 @@ mod test { #[allow(deprecated)] // check multiple keys and categorical let res = df - .groupby_stable(["foo", "ham"]) + .group_by_stable(["foo", "ham"]) .unwrap() .select(["bar"]) .sum() @@ -1118,14 +1118,14 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_null_handling() -> PolarsResult<()> { + fn test_group_by_null_handling() -> PolarsResult<()> { let df = df!( "a" => ["a", "a", "a", "b", "b"], "b" => [Some(1), Some(2), None, None, Some(1)] )?; // Use of deprecated `mean()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["a"])?.mean()?; + let out = df.group_by_stable(["a"])?.mean()?; assert_eq!( Vec::from(out.column("b_mean")?.f64()?), @@ -1136,7 +1136,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] - fn test_groupby_var() -> PolarsResult<()> { + fn test_group_by_var() -> PolarsResult<()> { // check variance and proper coercion to f64 let df = df![ "g" => ["foo", "foo", "bar"], @@ -1146,12 +1146,12 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["g"])?.select(["int"]).var(1)?; + let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?; assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5)); // Use of deprecated `std()` for testing purposes #[allow(deprecated)] - let out = df.groupby_stable(["g"])?.select(["int"]).std(1)?; + let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?; let val = out.column("int_agg_std")?.f64()?.get(0).unwrap(); let expected = f64::FRAC_1_SQRT_2(); assert!((val - expected).abs() < 0.000001); @@ -1161,7 +1161,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] #[cfg(feature = 
"dtype-categorical")] - fn test_groupby_null_group() -> PolarsResult<()> { + fn test_group_by_null_group() -> PolarsResult<()> { // check if null is own group let mut df = df![ "g" => [Some("foo"), Some("foo"), Some("bar"), None, None], @@ -1173,7 +1173,7 @@ mod test { // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] - let _ = df.groupby(["g"])?.sum()?; + let _ = df.group_by(["g"])?.sum()?; Ok(()) } } diff --git a/crates/polars-core/src/frame/groupby/perfect.rs b/crates/polars-core/src/frame/group_by/perfect.rs similarity index 100% rename from crates/polars-core/src/frame/groupby/perfect.rs rename to crates/polars-core/src/frame/group_by/perfect.rs diff --git a/crates/polars-core/src/frame/groupby/proxy.rs b/crates/polars-core/src/frame/group_by/proxy.rs similarity index 99% rename from crates/polars-core/src/frame/groupby/proxy.rs rename to crates/polars-core/src/frame/group_by/proxy.rs index 2c0a2b48bd18..ebd33232772d 100644 --- a/crates/polars-core/src/frame/groupby/proxy.rs +++ b/crates/polars-core/src/frame/group_by/proxy.rs @@ -296,7 +296,7 @@ pub enum GroupsProxy { Slice { // the groups slices groups: GroupsSlice, - // indicates if we do a rolling groupby + // indicates if we do a rolling group_by rolling: bool, }, } diff --git a/crates/polars-core/src/frame/hash_join/mod.rs b/crates/polars-core/src/frame/hash_join/mod.rs index d596c916c612..f2f7500447da 100644 --- a/crates/polars-core/src/frame/hash_join/mod.rs +++ b/crates/polars-core/src/frame/hash_join/mod.rs @@ -36,7 +36,7 @@ pub(crate) use zip_outer::*; pub use self::multiple_keys::private_left_join_multiple_keys; use crate::datatypes::PlHashMap; -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; pub use crate::frame::hash_join::multiple_keys::{ _inner_join_multiple_keys, _left_join_multiple_keys, _outer_join_multiple_keys, }; diff --git a/crates/polars-core/src/frame/hash_join/multiple_keys.rs b/crates/polars-core/src/frame/hash_join/multiple_keys.rs index 509ca3f4c223..157584328978 100644 --- a/crates/polars-core/src/frame/hash_join/multiple_keys.rs +++ b/crates/polars-core/src/frame/hash_join/multiple_keys.rs @@ -3,7 +3,7 @@ use hashbrown::HashMap; use rayon::prelude::*; use super::*; -use crate::frame::groupby::hashing::{populate_multiple_key_hashmap, HASHMAP_INIT_SIZE}; +use crate::frame::group_by::hashing::{populate_multiple_key_hashmap, HASHMAP_INIT_SIZE}; use crate::frame::hash_join::{ get_hash_tbl_threaded_join_mut_partitioned, get_hash_tbl_threaded_join_partitioned, }; diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 526b2e9aea82..eb86b38540e5 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -22,7 +22,7 @@ mod chunks; pub(crate) mod cross_join; pub mod explode; mod from; -pub mod groupby; +pub mod group_by; pub mod hash_join; #[cfg(feature = "rows")] pub mod row; @@ -34,7 +34,7 @@ pub use chunks::*; use serde::{Deserialize, Serialize}; use smartstring::alias::String as SmartString; -use crate::frame::groupby::GroupsIndicator; +use crate::frame::group_by::GroupsIndicator; #[cfg(feature = "row_hash")] use crate::hashing::df_rows_to_hashes_threaded_vertical; #[cfg(feature = "zip_with")] @@ -3097,7 +3097,7 @@ impl DataFrame { let columns = match (keep, maintain_order) { (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => { - let gb = df.groupby_stable(names)?; + let gb = df.group_by_stable(names)?; let groups = gb.get_groups(); let 
(offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); @@ -3106,7 +3106,7 @@ impl DataFrame { (UniqueKeepStrategy::Last, true) => { // maintain order by last values, so the sorted groups are not correct as they // are sorted by the first value - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let func = |g: GroupsIndicator| match g { @@ -3126,14 +3126,14 @@ impl DataFrame { return Ok(unsafe { df.take_unchecked(&last_idx) }); }, (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => { - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let (offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); df.apply_columns_par(&|s| unsafe { s.agg_first(&groups) }) }, (UniqueKeepStrategy::Last, false) => { - let gb = df.groupby(names)?; + let gb = df.group_by(names)?; let groups = gb.get_groups(); let (offset, len) = slice.unwrap_or((0, groups.len())); let groups = groups.slice(offset, len); @@ -3166,7 +3166,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn is_unique(&self) -> PolarsResult { - let gb = self.groupby(self.get_column_names())?; + let gb = self.group_by(self.get_column_names())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -3190,7 +3190,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn is_duplicated(&self) -> PolarsResult { - let gb = self.groupby(self.get_column_names())?; + let gb = self.group_by(self.get_column_names())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -3332,9 +3332,9 @@ impl DataFrame { include_key: bool, ) -> PolarsResult> { let groups = if stable { - self.groupby_stable(cols)?.take_groups() + self.group_by_stable(cols)?.take_groups() } else { - self.groupby(cols)?.take_groups() + self.group_by(cols)?.take_groups() }; // drop key columns prior to calculation if requested diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index 537e71ee9e4c..4aa7b08a527c 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,7 +1,7 @@ use arrow::bitmap::utils::get_bit_unchecked; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use polars_arrow::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_arrow::utils::CustomIterTools; use rayon::prelude::*; @@ -328,7 +328,7 @@ impl VecHash for Float64Chunked { } } -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] impl VecHash for ListChunked { fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { polars_ensure!( diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index a6ba458e222b..e80e37899f85 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -38,8 +38,8 @@ pub use crate::error::{ #[cfg(feature = "asof_join")] pub use crate::frame::asof_join::*; pub use crate::frame::explode::MeltArgs; -pub(crate) use crate::frame::groupby::aggregations::*; -pub use crate::frame::groupby::{GroupsIdx, GroupsProxy, GroupsSlice, IntoGroupsProxy}; +pub(crate) use crate::frame::group_by::aggregations::*; +pub use crate::frame::group_by::{GroupsIdx, GroupsProxy, GroupsSlice, IntoGroupsProxy}; pub(crate) use crate::frame::hash_join::*; pub use crate::frame::hash_join::{JoinArgs, JoinType}; pub use 
crate::frame::{DataFrame, UniqueKeepStrategy}; diff --git a/crates/polars-core/src/series/implementations/array.rs b/crates/polars-core/src/series/implementations/array.rs index ef560e50ccbc..44f422e447f0 100644 --- a/crates/polars-core/src/series/implementations/array.rs +++ b/crates/polars-core/src/series/implementations/array.rs @@ -5,7 +5,7 @@ use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::prelude::*; use crate::series::implementations::SeriesWrap; #[cfg(feature = "chunked_ids")] diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 25f9a44c17ab..d2277e2a1f47 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -9,7 +9,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 11baec69aaf0..a1b54ab136c4 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -10,7 +10,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, ChunkIdIter}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 4430a1db84a7..f3bd53c3c012 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -8,7 +8,7 @@ use crate::chunked_array::comparison::*; use crate::chunked_array::ops::compare_inner::{IntoPartialOrdInner, PartialOrdInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; #[cfg(feature = "is_in")] use crate::frame::hash_join::_check_categorical_src; diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index 477af2cd0e7f..a442535efb2b 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -17,7 +17,7 @@ use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::ops::ToBitRepr; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index 
cab324d2fcbe..dce1c7fd9385 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -6,7 +6,7 @@ use ahash::RandomState; use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 5caa0deacaa1..85e8f5b8ed9b 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -7,7 +7,7 @@ use super::{private, IntoSeries, SeriesTrait, SeriesWrap, *}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::*; use crate::prelude::*; diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index dd28d742ca21..b0a8a246b79d 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -11,7 +11,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; #[cfg(feature = "checked_arithmetic")] diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index bb4fc35987ba..7ca4f1371874 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::borrow::Cow; -#[cfg(feature = "groupby_list")] +#[cfg(feature = "group_by_list")] use ahash::RandomState; use super::{private, IntoSeries, SeriesTrait}; @@ -9,7 +9,7 @@ use crate::chunked_array::comparison::*; use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::{AsSinglePtr, Settings}; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::prelude::*; use crate::series::implementations::SeriesWrap; #[cfg(feature = "chunked_ids")] @@ -53,13 +53,13 @@ impl private::PrivateSeries for SeriesWrap { IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) } - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] fn vec_hash(&self, _build_hasher: RandomState, _buf: &mut Vec) -> PolarsResult<()> { self.0.vec_hash(_build_hasher, _buf)?; Ok(()) } - #[cfg(feature = "groupby_list")] + #[cfg(feature = "group_by_list")] fn vec_hash_combine( &self, _build_hasher: RandomState, diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 9ca9bce80b91..0a7b471dc492 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -41,7 +41,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use 
crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; #[cfg(feature = "checked_arithmetic")] diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index 01eb9fdbe0e9..941f1b550183 100644 --- a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -6,7 +6,7 @@ use ahash::RandomState; use crate::chunked_array::object::PolarsObjectSafe; use crate::chunked_array::ops::compare_inner::{IntoPartialEqInner, PartialEqInner}; use crate::chunked_array::Settings; -use crate::frame::groupby::{GroupsProxy, IntoGroupsProxy}; +use crate::frame::group_by::{GroupsProxy, IntoGroupsProxy}; use crate::prelude::*; use crate::series::implementations::SeriesWrap; use crate::series::private::{PrivateSeries, PrivateSeriesNumeric}; diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs index 32222176b54b..21bf3f78a9e9 100644 --- a/crates/polars-core/src/series/implementations/struct_.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -65,7 +65,7 @@ impl private::PrivateSeries for SeriesWrap { fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { let df = DataFrame::new_no_checks(vec![]); let gb = df - .groupby_with_series(self.0.fields().to_vec(), multithreaded, sorted) + .group_by_with_series(self.0.fields().to_vec(), multithreaded, sorted) .unwrap(); Ok(gb.take_groups()) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index a25684c1ffd1..952591fed652 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -9,7 +9,7 @@ use crate::chunked_array::ops::compare_inner::{ }; use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; -use crate::frame::groupby::*; +use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; use crate::prelude::*; use crate::series::implementations::SeriesWrap; diff --git a/crates/polars-core/src/series/ops/unique.rs b/crates/polars-core/src/series/ops/unique.rs index 1c20b2ad8036..cfae77d687e7 100644 --- a/crates/polars-core/src/series/ops/unique.rs +++ b/crates/polars-core/src/series/ops/unique.rs @@ -2,7 +2,7 @@ use std::hash::Hash; #[cfg(feature = "unique_counts")] -use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; +use crate::frame::group_by::hashing::HASHMAP_INIT_SIZE; use crate::prelude::*; #[cfg(feature = "unique_counts")] use crate::utils::NoNull; diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 5e298eb2d169..79033df353a5 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -48,7 +48,7 @@ pub(crate) mod private { use crate::chunked_array::ops::compare_inner::{PartialEqInner, PartialOrdInner}; use crate::chunked_array::Settings; #[cfg(feature = "rows")] - use crate::frame::groupby::GroupsProxy; + use crate::frame::group_by::GroupsProxy; pub trait PrivateSeriesNumeric { fn bit_repr_is_large(&self) -> bool { diff --git a/crates/polars-io/src/partition.rs b/crates/polars-io/src/partition.rs index 48044fe6e486..33d7f19abe70 100644 --- a/crates/polars-io/src/partition.rs +++ b/crates/polars-io/src/partition.rs @@ -91,7 +91,7 @@ 
where } pub fn finish(self, df: &DataFrame) -> PolarsResult<()> { - let groups = df.groupby(self.by.clone())?; + let groups = df.group_by(self.by.clone())?; let groups = groups.get_groups(); // don't parallelize this diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index fb6d1d649bd3..f2317ea7f5dc 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -100,7 +100,7 @@ pct_change = ["polars-plan/pct_change"] moment = ["polars-plan/moment", "polars-ops/moment"] abs = ["polars-plan/abs"] random = ["polars-plan/random"] -dynamic_groupby = ["polars-plan/dynamic_groupby", "polars-time", "temporal"] +dynamic_group_by = ["polars-plan/dynamic_group_by", "polars-time", "temporal"] ewma = ["polars-plan/ewma"] dot_diagram = ["polars-plan/dot_diagram"] diagonal_concat = [] diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index 7d6607868502..5e2b851c0904 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -110,7 +110,7 @@ fn run_per_sublist( } } -fn run_on_groupby_engine( +fn run_on_group_by_engine( name: &str, lst: &ListChunked, expr: &Expr, @@ -194,7 +194,7 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { }; if fits_idx_size && s.null_count() == 0 && !is_user_apply() { - run_on_groupby_engine(s.name(), &lst, &expr) + run_on_group_by_engine(s.name(), &lst, &expr) } else { run_per_sublist(s, &lst, &expr, parallel, output_field) } diff --git a/crates/polars-lazy/src/dsl/mod.rs b/crates/polars-lazy/src/dsl/mod.rs index b9126f044fa5..95d475e61ced 100644 --- a/crates/polars-lazy/src/dsl/mod.rs +++ b/crates/polars-lazy/src/dsl/mod.rs @@ -1,7 +1,7 @@ //! Domain specific language for the Lazy API. //! //! This DSL revolves around the [`Expr`] type, which represents an abstract -//! operation on a DataFrame, such as mapping over a column, filtering, groupby, or aggregation. +//! operation on a DataFrame, such as mapping over a column, filtering, group_by, or aggregation. //! In general, functions on [`LazyFrame`](crate::frame::LazyFrame)s consume the LazyFrame and produce a new LazyFrame representing //! the result of applying the function and passed expressions to the consumed LazyFrame. //! 
At runtime, when [`LazyFrame::collect`](crate::frame::LazyFrame::collect) is called, the expressions that comprise diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index ac62ad8f9cbb..b16a809f990d 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -602,7 +602,7 @@ impl LazyFrame { /// /// fn example(df: DataFrame) -> PolarsResult { /// df.lazy() - /// .groupby([col("foo")]) + /// .group_by([col("foo")]) /// .agg([col("bar").sum(), col("ham").mean().alias("avg_ham")]) /// .collect() /// } @@ -764,7 +764,7 @@ impl LazyFrame { /// /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .groupby([col("date")]) + /// .group_by([col("date")]) /// .agg([ /// col("rain").min().alias("min_rain"), /// col("rain").sum().alias("sum_rain"), @@ -772,7 +772,7 @@ impl LazyFrame { /// ]) /// } /// ``` - pub fn groupby, IE: Into + Clone>(self, by: E) -> LazyGroupBy { + pub fn group_by, IE: Into + Clone>(self, by: E) -> LazyGroupBy { let keys = by .as_ref() .iter() @@ -780,7 +780,7 @@ impl LazyFrame { .collect::>(); let opt_state = self.get_opt_state(); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { LazyGroupBy { logical_plan: self.logical_plan, @@ -792,7 +792,7 @@ impl LazyFrame { } } - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] { LazyGroupBy { logical_plan: self.logical_plan, @@ -807,11 +807,11 @@ impl LazyFrame { /// /// Also works for index values of type Int32 or Int64. /// - /// Different from a [`groupby_dynamic`][`Self::groupby_dynamic`], the windows are now determined by the + /// Different from a [`group_by_dynamic`][`Self::group_by_dynamic`], the windows are now determined by the /// individual values and are not of constant intervals. For constant intervals use - /// *groupby_dynamic* - #[cfg(feature = "dynamic_groupby")] - pub fn groupby_rolling>( + /// *group_by_dynamic* + #[cfg(feature = "dynamic_group_by")] + pub fn group_by_rolling>( self, index_column: Expr, by: E, @@ -821,9 +821,11 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let name = expr_output_name(&index_column).unwrap(); - return self - .with_column(index_column) - .groupby_rolling(Expr::Column(name), by, options); + return self.with_column(index_column).group_by_rolling( + Expr::Column(name), + by, + options, + ); } let opt_state = self.get_opt_state(); LazyGroupBy { @@ -839,7 +841,7 @@ impl LazyFrame { /// Group based on a time value (or index value of type Int32, Int64). /// /// Time windows are calculated and rows are assigned to windows. Different from a - /// normal groupby is that a row can be member of multiple groups. The time/index + /// normal group_by is that a row can be member of multiple groups. The time/index /// window could be seen as a rolling window, with a window size determined by /// dates/times/values instead of slots in the DataFrame. /// @@ -850,9 +852,9 @@ impl LazyFrame { /// - offset: offset of the window /// /// The `by` argument should be empty `[]` if you don't want to combine this - /// with a ordinary groupby on these keys. - #[cfg(feature = "dynamic_groupby")] - pub fn groupby_dynamic>( + /// with a ordinary group_by on these keys. 
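(Editor's note — illustrative only, not part of the patch: the doc comment above describes the renamed `group_by_dynamic` entry point. Below is a minimal usage sketch under the new naming; the `DynamicGroupOptions` fields shown (`every`, `period`, `offset`), its `Default` impl, and `Duration::parse` are assumed from Polars of this era and may differ in detail.)

```rust
// Hedged sketch: assumes the post-rename lazy API with the `dynamic_group_by` feature enabled.
use polars::prelude::*;

fn hourly_mean_per_sensor(df: DataFrame) -> PolarsResult<DataFrame> {
    df.lazy()
        .group_by_dynamic(
            col("time"),      // index column; must be sorted
            [col("sensor")],  // ordinary group_by keys combined with the time windows
            DynamicGroupOptions {
                every: Duration::parse("1h"),
                period: Duration::parse("1h"),
                offset: Duration::parse("0s"),
                ..Default::default()
            },
        )
        .agg([col("value").mean().alias("value_mean")])
        .collect()
}
```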
+ #[cfg(feature = "dynamic_group_by")] + pub fn group_by_dynamic>( self, index_column: Expr, by: E, @@ -862,9 +864,11 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let name = expr_output_name(&index_column).unwrap(); - return self - .with_column(index_column) - .groupby_dynamic(Expr::Column(name), by, options); + return self.with_column(index_column).group_by_dynamic( + Expr::Column(name), + by, + options, + ); } let opt_state = self.get_opt_state(); LazyGroupBy { @@ -877,8 +881,8 @@ impl LazyFrame { } } - /// Similar to [`groupby`][`Self::groupby`], but order of the DataFrame is maintained. - pub fn groupby_stable, IE: Into + Clone>(self, by: E) -> LazyGroupBy { + /// Similar to [`group_by`][`Self::group_by`], but order of the DataFrame is maintained. + pub fn group_by_stable, IE: Into + Clone>(self, by: E) -> LazyGroupBy { let keys = by .as_ref() .iter() @@ -886,7 +890,7 @@ impl LazyFrame { .collect::>(); let opt_state = self.get_opt_state(); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { LazyGroupBy { logical_plan: self.logical_plan, @@ -898,7 +902,7 @@ impl LazyFrame { } } - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] { LazyGroupBy { logical_plan: self.logical_plan, @@ -1364,16 +1368,16 @@ impl LazyFrame { } } -/// Utility struct for lazy groupby operation. +/// Utility struct for lazy group_by operation. #[derive(Clone)] pub struct LazyGroupBy { pub logical_plan: LogicalPlan, opt_state: OptState, keys: Vec, maintain_order: bool, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] dynamic_options: Option, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] rolling_options: Option, } @@ -1392,7 +1396,7 @@ impl LazyGroupBy { /// /// fn example(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .groupby_stable([col("date")]) + /// .group_by_stable([col("date")]) /// .agg([ /// col("rain").min().alias("min_rain"), /// col("rain").sum().alias("sum_rain"), @@ -1401,9 +1405,9 @@ impl LazyGroupBy { /// } /// ``` pub fn agg>(self, aggs: E) -> LazyFrame { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let lp = LogicalPlanBuilder::from(self.logical_plan) - .groupby( + .group_by( self.keys, aggs, None, @@ -1413,9 +1417,9 @@ impl LazyGroupBy { ) .build(); - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let lp = LogicalPlanBuilder::from(self.logical_plan) - .groupby(self.keys, aggs, None, self.maintain_order) + .group_by(self.keys, aggs, None, self.maintain_order) .build(); LazyFrame::from_logical_plan(lp, self.opt_state) } @@ -1450,14 +1454,14 @@ impl LazyGroupBy { where F: 'static + Fn(DataFrame) -> PolarsResult + Send + Sync, { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let options = GroupbyOptions { dynamic: self.dynamic_options, rolling: self.rolling_options, slice: None, }; - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let options = GroupbyOptions { slice: None }; let lp = LogicalPlan::Aggregate { diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index 195d4ad49427..c9e0339593db 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -6,7 +6,7 @@ //! pivot is here, because we want to be able to pass expressions to the pivot operation. //! 
-use polars_core::frame::groupby::expr::PhysicalAggExpr; +use polars_core::frame::group_by::expr::PhysicalAggExpr; use polars_core::prelude::*; use polars_ops::pivot::PivotAgg; diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index d4d0de8f1acd..cbe53c484dbe 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -99,7 +99,7 @@ //! )?; //! //! df.lazy() -//! .groupby([col("date")]) +//! .group_by([col("date")]) //! .agg([ //! col("rain").min().alias("min_rain"), //! col("rain").sum().alias("sum_rain"), @@ -160,7 +160,7 @@ //! .filter( //! col("a").lt(lit(2)) //! ) -//! .groupby([col("b")]) +//! .group_by([col("b")]) //! .agg( //! vec![col("b").first().alias("first_b"), col("c").first().alias("first_c")] //! ) @@ -176,7 +176,7 @@ //! //! fn aggregate_all_columns(df_a: DataFrame) -> LazyFrame { //! df_a.lazy() -//! .groupby([col("b")]) +//! .group_by([col("b")]) //! .agg( //! vec![col("*").first()] //! ) diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby.rs b/crates/polars-lazy/src/physical_plan/executors/group_by.rs similarity index 95% rename from crates/polars-lazy/src/physical_plan/executors/groupby.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by.rs index 55e12d2e8997..24d8bd88eb8d 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by.rs @@ -54,7 +54,7 @@ impl GroupByExec { } #[allow(clippy::too_many_arguments)] -pub(super) fn groupby_helper( +pub(super) fn group_by_helper( mut df: DataFrame, keys: Vec, aggs: &[Arc], @@ -64,7 +64,7 @@ pub(super) fn groupby_helper( slice: Option<(i64, usize)>, ) -> PolarsResult { df.as_single_chunk_par(); - let gb = df.groupby_with_series(keys, true, maintain_order)?; + let gb = df.group_by_with_series(keys, true, maintain_order)?; if let Some(f) = apply { return gb.apply(move |df| f.call_udf(df)); @@ -101,7 +101,7 @@ impl GroupByExec { .iter() .map(|e| e.evaluate(&df, state)) .collect::>()?; - groupby_helper( + group_by_helper( df, keys, &self.aggs, @@ -132,7 +132,7 @@ impl Executor for GroupByExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby".to_string(), &by); + let name = comma_delimited("group_by".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs similarity index 82% rename from crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs index 651ee63716e1..3aa156b062e4 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_dynamic.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_dynamic.rs @@ -1,18 +1,18 @@ -#[cfg(feature = "dynamic_groupby")] -use polars_core::frame::groupby::GroupBy; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] +use polars_core::frame::group_by::GroupBy; +#[cfg(feature = "dynamic_group_by")] use polars_time::DynamicGroupOptions; use super::*; -#[cfg_attr(not(feature = "dynamic_groupby"), allow(dead_code))] +#[cfg_attr(not(feature = "dynamic_group_by"), allow(dead_code))] pub(crate) struct GroupByDynamicExec { pub(crate) input: Box, // we will use this later #[allow(dead_code)] pub(crate) keys: Vec>, pub(crate) aggs: Vec>, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] 
pub(crate) options: DynamicGroupOptions, pub(crate) input_schema: SchemaRef, pub(crate) slice: Option<(i64, usize)>, @@ -20,7 +20,7 @@ pub(crate) struct GroupByDynamicExec { } impl GroupByDynamicExec { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute_impl( &mut self, state: &ExecutionState, @@ -33,7 +33,7 @@ impl GroupByDynamicExec { .map(|e| e.evaluate(&df, state)) .collect::>>()?; - let (mut time_key, mut keys, groups) = df.groupby_dynamic(keys, &self.options)?; + let (mut time_key, mut keys, groups) = df.group_by_dynamic(keys, &self.options)?; if let Some(f) = &self.apply { let gb = GroupBy::new(&df, vec![], groups, None); @@ -57,7 +57,7 @@ impl GroupByDynamicExec { time_key = time_key.slice(offset, len); // todo! optimize this, we can prevent an agg_first aggregation upstream - // the ordering has changed due to the groupby + // the ordering has changed due to the group_by for key in keys.iter_mut() { *key = key.slice(offset, len) } @@ -75,12 +75,12 @@ impl GroupByDynamicExec { } impl Executor for GroupByDynamicExec { - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] fn execute(&mut self, _state: &mut ExecutionState) -> PolarsResult { - panic!("activate feature dynamic_groupby") + panic!("activate feature dynamic_group_by") } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { #[cfg(debug_assertions)] { @@ -96,7 +96,7 @@ impl Executor for GroupByDynamicExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_dynamic".to_string(), &by); + let name = comma_delimited("group_by_dynamic".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs similarity index 93% rename from crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs index a7341f951c00..068c2bf01754 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_partitioned.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_partitioned.rs @@ -70,9 +70,9 @@ fn run_partitions( n_threads: usize, maintain_order: bool, ) -> PolarsResult> { - // We do a partitioned groupby. - // Meaning that we first do the groupby operation arbitrarily - // split on several threads. Than the final result we apply the same groupby again. + // We do a partitioned group_by. + // Meaning that we first do the group_by operation arbitrarily + // split on several threads. Than the final result we apply the same group_by again. 
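(Editor's note — illustrative only, not part of the patch: the comment above spells out the partitioned group_by strategy: aggregate each chunk of the frame independently, then run the same aggregation once more over the partial results. Below is a self-contained sketch of that idea for a plain sum, independent of the Polars internals; all names are made up for illustration.)

```rust
// Partition -> partial aggregate -> re-aggregate: the shape of a partitioned group_by.
use std::collections::HashMap;

fn partitioned_sum(chunks: &[Vec<(String, i64)>]) -> HashMap<String, i64> {
    // Phase 1: each chunk is aggregated on its own (in Polars this runs per thread).
    let partials: Vec<HashMap<String, i64>> = chunks
        .iter()
        .map(|chunk| {
            let mut acc: HashMap<String, i64> = HashMap::new();
            for (key, value) in chunk {
                *acc.entry(key.clone()).or_insert(0) += *value;
            }
            acc
        })
        .collect();

    // Phase 2: merge the partial results by applying the same aggregation again.
    let mut merged: HashMap<String, i64> = HashMap::new();
    for partial in partials {
        for (key, value) in partial {
            *merged.entry(key).or_insert(0) += value;
        }
    }
    merged
}
```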
let dfs = split_df(df, n_threads)?; let phys_aggs = &exec.phys_aggs; @@ -81,7 +81,7 @@ fn run_partitions( dfs.into_par_iter() .map(|df| { let keys = compute_keys(keys, &df, state)?; - let gb = df.groupby_with_series(keys, false, maintain_order)?; + let gb = df.group_by_with_series(keys, false, maintain_order)?; let groups = gb.get_groups(); let mut columns = gb.keys(); @@ -151,7 +151,7 @@ fn estimate_unique_count(keys: &[Series], mut sample_size: usize) -> PolarsResul .collect::>(); let df = DataFrame::new_no_checks(keys); let names = df.get_column_names(); - let gb = df.groupby(names).unwrap(); + let gb = df.group_by(names).unwrap(); Ok(finish(gb.get_groups())) } } @@ -180,7 +180,7 @@ fn can_run_partitioned( } Ok(false) } else { - // below this boundary we assume the partitioned groupby will be faster + // below this boundary we assume the partitioned group_by will be faster let unique_count_boundary = std::env::var("POLARS_PARTITION_UNIQUE_COUNT") .map(|s| s.parse::().unwrap()) .unwrap_or(1000); @@ -230,7 +230,7 @@ impl PartitionGroupByExec { original_df: DataFrame, ) -> Option> { #[allow(clippy::needless_update)] - let groupby_options = GroupbyOptions { + let group_by_options = GroupbyOptions { slice: self.slice, ..Default::default() } @@ -242,7 +242,7 @@ impl PartitionGroupByExec { schema: self.output_schema.clone(), apply: None, maintain_order: false, - options: groupby_options, + options: group_by_options, }; let mut expr_arena = Default::default(); let mut lp_arena = Default::default(); @@ -276,14 +276,14 @@ impl PartitionGroupByExec { mut original_df: DataFrame, ) -> PolarsResult { let dfs = { - // already get the keys. This is the very last minute decision which groupby method we choose. + // already get the keys. This is the very last minute decision which group_by method we choose. // If the column is a categorical, we know the number of groups we have and can decide to continue - // partitioned or go for the standard groupby. The partitioned is likely to be faster on a small number + // partitioned or go for the standard group_by. The partitioned is likely to be faster on a small number // of groups. let keys = self.keys(&original_df, state)?; if !can_run_partitioned(&keys, &original_df, state, self.from_partitioned_ds)? { - return groupby_helper( + return group_by_helper( original_df, keys, &self.phys_aggs, @@ -321,11 +321,11 @@ impl PartitionGroupByExec { // MERGE phase // merge and hash aggregate again let df = accumulate_dataframes_vertical(dfs)?; - // the partitioned groupby has added columns so we must update the schema. + // the partitioned group_by has added columns so we must update the schema. 
let keys = self.keys(&df, state)?; // first get mutable access and optionally sort - let gb = df.groupby_with_series(keys, true, self.maintain_order)?; + let gb = df.group_by_with_series(keys, true, self.maintain_order)?; let mut groups = gb.get_groups(); #[allow(unused_assignments)] @@ -377,7 +377,7 @@ impl Executor for PartitionGroupByExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_partitioned".to_string(), &by); + let name = comma_delimited("group_by_partitioned".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs b/crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs similarity index 85% rename from crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs rename to crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs index 170730fe3258..b6d890cbac0a 100644 --- a/crates/polars-lazy/src/physical_plan/executors/groupby_rolling.rs +++ b/crates/polars-lazy/src/physical_plan/executors/group_by_rolling.rs @@ -1,16 +1,16 @@ -#[cfg(feature = "dynamic_groupby")] -use polars_core::frame::groupby::GroupBy; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] +use polars_core::frame::group_by::GroupBy; +#[cfg(feature = "dynamic_group_by")] use polars_time::RollingGroupOptions; use super::*; -#[cfg_attr(not(feature = "dynamic_groupby"), allow(dead_code))] +#[cfg_attr(not(feature = "dynamic_group_by"), allow(dead_code))] pub(crate) struct GroupByRollingExec { pub(crate) input: Box, pub(crate) keys: Vec>, pub(crate) aggs: Vec>, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub(crate) options: RollingGroupOptions, pub(crate) input_schema: SchemaRef, pub(crate) slice: Option<(i64, usize)>, @@ -18,7 +18,7 @@ pub(crate) struct GroupByRollingExec { } impl GroupByRollingExec { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute_impl( &mut self, state: &ExecutionState, @@ -32,7 +32,7 @@ impl GroupByRollingExec { .map(|e| e.evaluate(&df, state)) .collect::>>()?; - let (mut time_key, mut keys, groups) = df.groupby_rolling(keys, &self.options)?; + let (mut time_key, mut keys, groups) = df.group_by_rolling(keys, &self.options)?; if let Some(f) = &self.apply { let gb = GroupBy::new(&df, vec![], groups, None); @@ -56,7 +56,7 @@ impl GroupByRollingExec { time_key = time_key.slice(offset, len); } - // the ordering has changed due to the groupby + // the ordering has changed due to the group_by if !keys.is_empty() { unsafe { match groups { @@ -92,12 +92,12 @@ impl GroupByRollingExec { } impl Executor for GroupByRollingExec { - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] fn execute(&mut self, _state: &mut ExecutionState) -> PolarsResult { - panic!("activate feature dynamic_groupby") + panic!("activate feature dynamic_group_by") } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { #[cfg(debug_assertions)] { @@ -112,7 +112,7 @@ impl Executor for GroupByRollingExec { .iter() .map(|s| Ok(s.to_field(&self.input_schema)?.name)) .collect::>>()?; - let name = comma_delimited("groupby_rolling".to_string(), &by); + let name = comma_delimited("group_by_rolling".to_string(), &by); Cow::Owned(name) } else { Cow::Borrowed("") diff --git a/crates/polars-lazy/src/physical_plan/executors/mod.rs 
b/crates/polars-lazy/src/physical_plan/executors/mod.rs index bf95a06ed6f7..e43a99c2d53f 100644 --- a/crates/polars-lazy/src/physical_plan/executors/mod.rs +++ b/crates/polars-lazy/src/physical_plan/executors/mod.rs @@ -2,10 +2,10 @@ mod cache; mod executor; mod ext_context; mod filter; -mod groupby; -mod groupby_dynamic; -mod groupby_partitioned; -mod groupby_rolling; +mod group_by; +mod group_by_dynamic; +mod group_by_partitioned; +mod group_by_rolling; mod join; mod projection; mod projection_utils; @@ -31,12 +31,12 @@ use rayon::prelude::*; pub(super) use self::cache::*; pub(super) use self::ext_context::*; pub(super) use self::filter::*; -pub(super) use self::groupby::*; -#[cfg(feature = "dynamic_groupby")] -pub(super) use self::groupby_dynamic::*; -pub(super) use self::groupby_partitioned::*; -#[cfg(feature = "dynamic_groupby")] -pub(super) use self::groupby_rolling::*; +pub(super) use self::group_by::*; +#[cfg(feature = "dynamic_group_by")] +pub(super) use self::group_by_dynamic::*; +pub(super) use self::group_by_partitioned::*; +#[cfg(feature = "dynamic_group_by")] +pub(super) use self::group_by_rolling::*; pub(super) use self::join::*; pub(super) use self::projection::*; #[cfg(feature = "python")] diff --git a/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs b/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs index 00bfa825554c..70e33fb986a0 100644 --- a/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs +++ b/crates/polars-lazy/src/physical_plan/executors/projection_utils.rs @@ -24,7 +24,7 @@ fn execute_projection_cached_window_fns( ) -> PolarsResult> { // We partition by normal expression and window expression // - the normal expressions can run in parallel - // - the window expression take more memory and often use the same groupby keys and join tuples + // - the window expression take more memory and often use the same group_by keys and join tuples // so they are cached and run sequential // the partitioning messes with column order, so we also store the idx @@ -36,7 +36,7 @@ fn execute_projection_cached_window_fns( let mut other = Vec::with_capacity(exprs.len()); // first we partition the window function by the values they group over. - // the groupby values should be cached + // the group_by values should be cached let mut index = 0u32; exprs.iter().for_each(|phys| { index += 1; @@ -45,11 +45,11 @@ fn execute_projection_cached_window_fns( let mut is_window = false; for e in e.into_iter() { if let Expr::Window { partition_by, .. 
} = e { - let groupby = format!("{:?}", partition_by.as_slice()); - if let Some(tpl) = windows.iter_mut().find(|tpl| tpl.0 == groupby) { + let group_by = format!("{:?}", partition_by.as_slice()); + if let Some(tpl) = windows.iter_mut().find(|tpl| tpl.0 == group_by) { tpl.1.push((index, phys.clone())) } else { - windows.push((groupby, vec![(index, phys.clone())])) + windows.push((group_by, vec![(index, phys.clone())])) } is_window = true; break; diff --git a/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs b/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs index 867ea1bd520b..3f7b700dde44 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/aggregation.rs @@ -6,7 +6,7 @@ use polars_arrow::export::arrow::compute::concatenate::concatenate; use polars_arrow::export::arrow::offset::Offsets; use polars_arrow::prelude::QuantileInterpolOptions; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::{GroupByMethod, GroupsProxy}; +use polars_core::frame::group_by::{GroupByMethod, GroupsProxy}; use polars_core::prelude::*; use polars_core::utils::NoNull; #[cfg(feature = "dtype-struct")] @@ -426,7 +426,7 @@ impl PartitionedAggregation for AggregationExpr { for (_, idx) in groups { let ca = unsafe { // Safety - // The indexes of the groupby operation are never out of bounds + // The indexes of the group_by operation are never out of bounds ca.take_unchecked(idx.into()) }; process_group(ca)?; diff --git a/crates/polars-lazy/src/physical_plan/expressions/alias.rs b/crates/polars-lazy/src/physical_plan/expressions/alias.rs index 6dc975be1a8a..d9cc5cb73511 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/alias.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/alias.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; diff --git a/crates/polars-lazy/src/physical_plan/expressions/apply.rs b/crates/polars-lazy/src/physical_plan/expressions/apply.rs index 7a8dea411e83..b24d14c48e60 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/apply.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/apply.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; #[cfg(feature = "parquet")] @@ -285,7 +285,7 @@ impl PhysicalExpr for ApplyExpr { polars_ensure!( self.allow_group_aware, expr = self.expr, - ComputeError: "this expression cannot run in the groupby context", + ComputeError: "this expression cannot run in the group_by context", ); if self.inputs.len() == 1 { let mut ac = self.inputs[0].evaluate_on_groups(df, groups, state)?; diff --git a/crates/polars-lazy/src/physical_plan/expressions/binary.rs b/crates/polars-lazy/src/physical_plan/expressions/binary.rs index 329b32167d9c..74c1b7833b7a 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/binary.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/binary.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; diff --git a/crates/polars-lazy/src/physical_plan/expressions/cast.rs b/crates/polars-lazy/src/physical_plan/expressions/cast.rs index 
d1255efe1072..fc63c6aa9dfc 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/cast.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/cast.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; diff --git a/crates/polars-lazy/src/physical_plan/expressions/column.rs b/crates/polars-lazy/src/physical_plan/expressions/column.rs index 4baf5d2224ff..b9656e3a2c4d 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/column.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/column.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::sync::Arc; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use crate::physical_plan::state::ExecutionState; @@ -133,7 +133,7 @@ impl PhysicalExpr for ColumnExpr { match df.get_columns().get(idx) { Some(out) => self.process_by_idx(out, state, schema, df, true), None => { - // partitioned groupby special case + // partitioned group_by special case if let Some(schema) = state.get_schema() { self.process_from_state_schema(df, state, &schema) } else { diff --git a/crates/polars-lazy/src/physical_plan/expressions/filter.rs b/crates/polars-lazy/src/physical_plan/expressions/filter.rs index 9abaf40aacff..a3408a377a2c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/filter.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/filter.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::is_valid::IsValid; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; diff --git a/crates/polars-lazy/src/physical_plan/expressions/literal.rs b/crates/polars-lazy/src/physical_plan/expressions/literal.rs index 4e3061e3d1e3..12937639c968 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/literal.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/literal.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::ops::Deref; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::utils::NoNull; diff --git a/crates/polars-lazy/src/physical_plan/expressions/mod.rs b/crates/polars-lazy/src/physical_plan/expressions/mod.rs index 0e81e54f1b3a..23dec5117d7c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/mod.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/mod.rs @@ -28,7 +28,7 @@ pub(crate) use count::*; pub(crate) use filter::*; pub(crate) use literal::*; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_io::predicates::PhysicalIoExpr; pub(crate) use slice::*; @@ -400,7 +400,7 @@ impl<'a> AggregationContext<'a> { #[cfg(debug_assertions)] { if self.groups.len() > s.len() { - polars_warn!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic groupby") + polars_warn!("groups may be out of bounds; more groups than elements in a series is only possible in dynamic group_by") } } diff --git a/crates/polars-lazy/src/physical_plan/expressions/slice.rs b/crates/polars-lazy/src/physical_plan/expressions/slice.rs index 03abd46bff2d..7cffda73843f 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/slice.rs +++ 
b/crates/polars-lazy/src/physical_plan/expressions/slice.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use polars_core::frame::groupby::{GroupsProxy, IdxItem}; +use polars_core::frame::group_by::{GroupsProxy, IdxItem}; use polars_core::prelude::*; use polars_core::utils::{slice_offsets, CustomIterTools}; use polars_core::POOL; diff --git a/crates/polars-lazy/src/physical_plan/expressions/sort.rs b/crates/polars-lazy/src/physical_plan/expressions/sort.rs index 515320cac6cb..473c43e5befc 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/sort.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/sort.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; diff --git a/crates/polars-lazy/src/physical_plan/expressions/sortby.rs b/crates/polars-lazy/src/physical_plan/expressions/sortby.rs index 0da6c9e0b865..fac35f8e4009 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/sortby.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/sortby.rs @@ -1,7 +1,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; -use polars_core::frame::groupby::{GroupsIndicator, GroupsProxy}; +use polars_core::frame::group_by::{GroupsIndicator, GroupsProxy}; use polars_core::prelude::*; use polars_core::POOL; use rayon::prelude::*; @@ -298,7 +298,7 @@ impl PhysicalExpr for SortByExpr { ); // if the rhs is already aggregated once, - // it is reordered by the groupby operation + // it is reordered by the group_by operation // we must ensure that we are as well. if ordered_by_group_operation { let s = ac_in.aggregated(); diff --git a/crates/polars-lazy/src/physical_plan/expressions/take.rs b/crates/polars-lazy/src/physical_plan/expressions/take.rs index 76d1d9338c5e..22b14b517dfa 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/take.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/take.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::utils::NoNull; diff --git a/crates/polars-lazy/src/physical_plan/expressions/ternary.rs b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs index f32547743168..1efad2cfdbdf 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/ternary.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/ternary.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_arrow::utils::CustomIterTools; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::POOL; @@ -140,7 +140,7 @@ impl PhysicalExpr for TernaryExpr { if !aggregation_predicate { // unwrap will not fail as it is not an aggregation expression. eprintln!( - "The predicate '{}' in 'when->then->otherwise' is not a valid aggregation and might produce a different number of rows than the groupby operation would. This behavior is experimental and may be subject to change", self.predicate.as_expression().unwrap() + "The predicate '{}' in 'when->then->otherwise' is not a valid aggregation and might produce a different number of rows than the group_by operation would. 
This behavior is experimental and may be subject to change", self.predicate.as_expression().unwrap() ) } let op_mask = || self.predicate.evaluate_on_groups(df, groups, state); diff --git a/crates/polars-lazy/src/physical_plan/expressions/window.rs b/crates/polars-lazy/src/physical_plan/expressions/window.rs index 670ce3854683..fb26b0e6fb5c 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/window.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/window.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use polars_arrow::export::arrow::array::PrimitiveArray; use polars_core::export::arrow::bitmap::Bitmap; -use polars_core::frame::groupby::{GroupBy, GroupsProxy}; +use polars_core::frame::group_by::{GroupBy, GroupsProxy}; use polars_core::frame::hash_join::{ default_join_ids, private_left_join_multiple_keys, ChunkJoinOptIds, JoinValidation, }; @@ -130,7 +130,7 @@ impl WindowExpr { out_column: Series, flattened: Series, mut ac: AggregationContext, - groupby_columns: &[Series], + group_by_columns: &[Series], gb: GroupBy, state: &ExecutionState, cache_key: &str, @@ -175,7 +175,7 @@ impl WindowExpr { if let Some((output, group)) = non_matching_group { let first = group.first(); - let group = groupby_columns + let group = group_by_columns .iter() .map(|s| format_smartstring!("{}", s.get(first as usize).unwrap())) .collect::>(); @@ -371,13 +371,13 @@ impl WindowExpr { impl PhysicalExpr for WindowExpr { // Note: this was first implemented with expression evaluation but this performed really bad. - // Therefore we choose the groupby -> apply -> self join approach + // Therefore we choose the group_by -> apply -> self join approach - // This first cached the groupby and the join tuples, but rayon under a mutex leads to deadlocks: + // This first cached the group_by and the join tuples, but rayon under a mutex leads to deadlocks: // https://github.com/rayon-rs/rayon/issues/592 fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { // This method does the following: - // 1. determine groupby tuples based on the group_column + // 1. determine group_by tuples based on the group_column // 2. apply an aggregation function // 3. 
join the results back to the original dataframe // this stores all group values on the original df size @@ -407,14 +407,14 @@ impl PhysicalExpr for WindowExpr { return Ok(Series::full_null(field.name(), 0, field.data_type())); } - let groupby_columns = self + let group_by_columns = self .group_by .iter() .map(|e| e.evaluate(df, state)) .collect::>>()?; // if the keys are sorted - let sorted_keys = groupby_columns.iter().all(|s| { + let sorted_keys = group_by_columns.iter().all(|s| { matches!( s.is_sorted_flag(), IsSorted::Ascending | IsSorted::Descending @@ -441,16 +441,16 @@ impl PhysicalExpr for WindowExpr { } let create_groups = || { - let gb = df.groupby_with_series(groupby_columns.clone(), true, sort_groups)?; + let gb = df.group_by_with_series(group_by_columns.clone(), true, sort_groups)?; let out: PolarsResult = Ok(gb.take_groups()); out }; // Try to get cached grouptuples let (mut groups, _, cache_key) = if state.cache_window() { - let mut cache_key = String::with_capacity(32 * groupby_columns.len()); + let mut cache_key = String::with_capacity(32 * group_by_columns.len()); write!(&mut cache_key, "{}", state.branch_idx).unwrap(); - for s in &groupby_columns { + for s in &group_by_columns { cache_key.push_str(s.name()); } @@ -488,7 +488,7 @@ impl PhysicalExpr for WindowExpr { if sort_groups || state.cache_window() { groups.sort() } - let gb = GroupBy::new(df, groupby_columns.clone(), groups, Some(apply_columns)); + let gb = GroupBy::new(df, group_by_columns.clone(), groups, Some(apply_columns)); // If the aggregation creates categoricals and `MapStrategy` is `Join`, // the string cache was needed. So we hold it for that case. @@ -531,7 +531,7 @@ impl PhysicalExpr for WindowExpr { out_column, flattened, ac, - &groupby_columns, + &group_by_columns, gb, state, &cache_key, @@ -558,16 +558,16 @@ impl PhysicalExpr for WindowExpr { cache_gb(gb, state, &cache_key); let get_join_tuples = || { - if groupby_columns.len() == 1 { + if group_by_columns.len() == 1 { // group key from right column let right = &keys[0]; - groupby_columns[0] + group_by_columns[0] .hash_join_left(right, JoinValidation::ManyToMany) .unwrap() .1 } else { let df_right = DataFrame::new_no_checks(keys); - let df_left = DataFrame::new_no_checks(groupby_columns); + let df_left = DataFrame::new_no_checks(group_by_columns); private_left_join_multiple_keys(&df_left, &df_right, None, None).1 } }; diff --git a/crates/polars-lazy/src/physical_plan/planner/expr.rs b/crates/polars-lazy/src/physical_plan/planner/expr.rs index a96bb84f1193..2411539969d0 100644 --- a/crates/polars-lazy/src/physical_plan/planner/expr.rs +++ b/crates/polars-lazy/src/physical_plan/planner/expr.rs @@ -1,4 +1,4 @@ -use polars_core::frame::groupby::GroupByMethod; +use polars_core::frame::group_by::GroupByMethod; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::_split_offsets; diff --git a/crates/polars-lazy/src/physical_plan/planner/lp.rs b/crates/polars-lazy/src/physical_plan/planner/lp.rs index df764fd7d2a5..5a08853631dd 100644 --- a/crates/polars-lazy/src/physical_plan/planner/lp.rs +++ b/crates/polars-lazy/src/physical_plan/planner/lp.rs @@ -12,17 +12,17 @@ fn partitionable_gb( expr_arena: &Arena, apply: &Option>, ) -> bool { - // We first check if we can partition the groupby on the latest moment. + // We first check if we can partition the group_by on the latest moment. let mut partitionable = true; // checks: - // 1. complex expressions in the groupby itself are also not partitionable + // 1. 
complex expressions in the group_by itself are also not partitionable // in this case anything more than col("foo") // 2. a custom function cannot be partitioned // 3. we don't bother with more than 2 keys, as the cardinality likely explodes // by the combinations if !keys.is_empty() && keys.len() < 3 && apply.is_none() { - // complex expressions in the groupby itself are also not partitionable + // complex expressions in the group_by itself are also not partitionable // in this case anything more than col("foo") for key in keys { if (expr_arena).iter(*key).count() > 1 { @@ -405,7 +405,7 @@ pub fn create_physical_plan( )?; let _slice = options.slice; - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = options.dynamic { let input = create_physical_plan(input, lp_arena, expr_arena)?; return Ok(Box::new(executors::GroupByDynamicExec { @@ -419,7 +419,7 @@ pub fn create_physical_plan( })); } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = options.rolling { let input = create_physical_plan(input, lp_arena, expr_arena)?; return Ok(Box::new(executors::GroupByRollingExec { @@ -433,7 +433,7 @@ pub fn create_physical_plan( })); } - // We first check if we can partition the groupby on the latest moment. + // We first check if we can partition the group_by on the latest moment. let partitionable = partitionable_gb(&keys, &aggs, &input_schema, expr_arena, &apply); if partitionable { let from_partitioned_ds = (&*lp_arena).iter(input).any(|(_, lp)| { diff --git a/crates/polars-lazy/src/physical_plan/state.rs b/crates/polars-lazy/src/physical_plan/state.rs index 164e25adf8c1..ebf1e501885c 100644 --- a/crates/polars-lazy/src/physical_plan/state.rs +++ b/crates/polars-lazy/src/physical_plan/state.rs @@ -5,7 +5,7 @@ use std::sync::{Mutex, RwLock}; use bitflags::bitflags; use once_cell::sync::OnceCell; use polars_core::config::verbose; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::frame::hash_join::ChunkJoinOptIds; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "csv", feature = "ipc"))] diff --git a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs index 0a78548937ae..20369fafd237 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/convert_alp.rs @@ -365,7 +365,7 @@ pub(crate) fn insert_streaming_nodes( #[allow(unused_mut)] let mut can_stream = true; - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { if options.rolling.is_some() || options.dynamic.is_some() { can_stream = false @@ -387,7 +387,7 @@ pub(crate) fn insert_streaming_nodes( expr_arena .get(*node) .get_type(schema, Context::Default, expr_arena) - // ensure we don't groupby list + // ensure we don't group_by list .map(|dt| !matches!(dt, DataType::List(_))) .unwrap_or(false) }) diff --git a/crates/polars-lazy/src/prelude.rs b/crates/polars-lazy/src/prelude.rs index 798a6e2515bc..0d4a0e3e67ce 100644 --- a/crates/polars-lazy/src/prelude.rs +++ b/crates/polars-lazy/src/prelude.rs @@ -9,7 +9,7 @@ pub use polars_plan::prelude::ParquetWriteOptions; pub(crate) use polars_plan::prelude::*; #[cfg(feature = "rolling_window")] pub use polars_time::{prelude::RollingOptions, Duration}; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] pub use polars_time::{DynamicGroupOptions, 
PolarsTemporalGroupby, RollingGroupOptions}; pub(crate) use polars_utils::arena::{Arena, Node}; diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index e96c5c0e35e4..d0e620056ea1 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -9,7 +9,7 @@ fn test_agg_exprs() -> PolarsResult<()> { // a binary expression followed by a function and an aggregation. See if it runs let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([(lit(1) - col("A")) .map(|s| Ok(Some(&s * 2)), GetOutput::same_type()) .alias("foo")]) @@ -30,7 +30,7 @@ fn test_agg_unique_first() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("g")]) + .group_by_stable([col("g")]) .agg([ col("v").unique().first().alias("v_first"), col("v").unique().sort(false).first().alias("true_first"), @@ -73,7 +73,7 @@ fn test_cumsum_agg_as_key() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("soil") + .group_by([col("soil") .neq(col("soil").shift_and_fill(1, col("soil").first())) .cumsum(false) .alias("key")]) @@ -100,7 +100,7 @@ fn test_auto_skew_kurtosis_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([ col("B").skew(false).alias("bskew"), col("B").kurtosis(false, false).alias("bkurt"), @@ -121,17 +121,17 @@ fn test_auto_list_agg() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("B").shift_and_fill(-1, lit(-1)).alias("foo")]) .collect()?; assert!(matches!(out.column("foo")?.dtype(), DataType::List(_))); - // test if it runs and groupby executor thus implements a list after shift_and_fill + // test if it runs and group_by executor thus implements a list after shift_and_fill let _out = df .clone() .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("B").shift_and_fill(-1, lit(-1))]) .collect()?; @@ -157,7 +157,7 @@ fn test_power_in_agg_list1() -> PolarsResult<()> { // a flat apply on a final aggregation let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([ col("A") .rolling_min(RollingOptions { @@ -199,7 +199,7 @@ fn test_power_in_agg_list2() -> PolarsResult<()> { // a flat apply on evaluate_on_groups let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("A") .rolling_min(RollingOptions { window_size: Duration::new(2), @@ -233,7 +233,7 @@ fn test_binary_agg_context_0() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").first().neq(lit(1))) .then(repeat(lit("a"), count())) .otherwise(repeat(lit("b"), count())) @@ -274,7 +274,7 @@ fn test_binary_agg_context_1() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").eq(lit(1))) .then(col("vals").sum()) .otherwise(lit(90)) @@ -295,7 +295,7 @@ fn test_binary_agg_context_1() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([when(col("vals").eq(lit(1))) .then(lit(90)) .otherwise(col("vals").sum()) @@ -329,7 +329,7 @@ fn test_binary_agg_context_2() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([(col("vals").first() - col("vals")).alias("vals")]) .collect()?; @@ -347,7 +347,7 @@ fn 
test_binary_agg_context_2() -> PolarsResult<()> { // Same, but now we reverse the lhs / rhs. let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([((col("vals")) - col("vals").first()).alias("vals")]) .collect()?; @@ -371,7 +371,7 @@ fn test_binary_agg_context_3() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([(col("A") - col("A").first()).last().alias("last")]) .collect()?; @@ -391,7 +391,7 @@ fn test_shift_elementwise_issue_2509() -> PolarsResult<()> { let out = df .lazy() // Don't use maintain order here! That hides the bug - .groupby([col("x")]) + .group_by([col("x")]) .agg(&[(col("y").shift(-1) + col("x")).alias("sum")]) .sort("x", Default::default()) .collect()?; @@ -419,7 +419,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([col("book").take(col("count").arg_max()).alias("fav_book")]) .sort("user", Default::default()) .collect()?; @@ -432,7 +432,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([ // keep the head as it test slice correctness col("book") @@ -458,7 +458,7 @@ fn take_aggregations() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([col("book").take(lit(0)).alias("take_lit")]) .sort("user", Default::default()) .collect()?; @@ -493,7 +493,7 @@ fn test_take_consistency() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([col("A") .arg_sort(SortOptions { descending: true, @@ -510,7 +510,7 @@ fn test_take_consistency() -> PolarsResult<()> { let out_df = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([ col("A"), col("A") diff --git a/crates/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs index 77f7c4798ef3..12a2eb8ce9c8 100644 --- a/crates/polars-lazy/src/tests/arity.rs +++ b/crates/polars-lazy/src/tests/arity.rs @@ -12,7 +12,7 @@ fn test_pearson_corr() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("uid")]) + .group_by_stable([col("uid")]) // a double aggregation expression. .agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")]) .collect()?; @@ -22,7 +22,7 @@ fn test_pearson_corr() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("uid")]) + .group_by_stable([col("uid")]) // a double aggregation expression. 
.agg([pearson_corr(col("day"), col("cumcases"), 1) .pow(2.0) diff --git a/crates/polars-lazy/src/tests/logical.rs b/crates/polars-lazy/src/tests/logical.rs index 14dff0e1cd91..6fdef02c8b38 100644 --- a/crates/polars-lazy/src/tests/logical.rs +++ b/crates/polars-lazy/src/tests/logical.rs @@ -20,7 +20,7 @@ fn test_duration() -> PolarsResult<()> { .cast(DataType::Datetime(TimeUnit::Milliseconds, None)) .alias("datetime"), ) - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([ (col("date") - col("date").first()).alias("date"), (col("datetime") - col("datetime").first()).alias("datetime"), @@ -104,7 +104,7 @@ fn test_lazy_logical_plan_schema() { let lp = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([col("sepal.width").min()]) .logical_plan; assert!(lp.schema().unwrap().get("sepal.width").is_some()); diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index 4cd8f36737ac..ab6dcb57b177 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -202,16 +202,16 @@ pub fn test_slice_pushdown_join() -> PolarsResult<()> { } #[test] -pub fn test_slice_pushdown_groupby() -> PolarsResult<()> { +pub fn test_slice_pushdown_group_by() -> PolarsResult<()> { let _guard = SINGLE_LOCK.lock().unwrap(); let q = scan_foods_parquet(false).limit(100); let q = q - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("calories").sum()]) .slice(1, 3); - // test if optimization continued beyond the groupby node + // test if optimization continued beyond the group_by node assert!(slice_at_scan(q.clone())); let (mut expr_arena, mut lp_arena) = get_arenas(); @@ -392,7 +392,7 @@ fn test_with_row_count_opts() -> PolarsResult<()> { } #[test] -fn test_groupby_ternary_literal_predicate() -> PolarsResult<()> { +fn test_group_by_ternary_literal_predicate() -> PolarsResult<()> { let df = df![ "a" => [1, 2, 3], "b" => [1, 2, 3] @@ -402,7 +402,7 @@ fn test_groupby_ternary_literal_predicate() -> PolarsResult<()> { let q = df .clone() .lazy() - .groupby(["a"]) + .group_by(["a"]) .agg([when(lit(predicate)) .then(col("b").sum()) .otherwise(NULL.lit())]) @@ -527,14 +527,14 @@ fn test_with_column_prune() -> PolarsResult<()> { } #[test] -fn test_slice_at_scan_groupby() -> PolarsResult<()> { +fn test_slice_at_scan_group_by() -> PolarsResult<()> { let ldf = scan_foods_csv(); // this tests if slice pushdown restarts aggregation nodes (it did not) let q = ldf .slice(0, 5) .filter(col("calories").lt(lit(10))) - .groupby([col("calories")]) + .group_by([col("calories")]) .agg([col("fats_g").first()]) .select([col("fats_g")]); diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 2a6dc72fc4f9..4984f909de27 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -121,7 +121,7 @@ fn test_lazy_is_null() { let new = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([col("sepal.width").min()]) .collect() .unwrap(); @@ -135,7 +135,7 @@ fn test_lazy_pushdown_through_agg() { let df = get_df(); let new = df .lazy() - .groupby([col("variety")]) + .group_by([col("variety")]) .agg([ col("sepal.length").min(), col("petal.length").min().alias("foo"), @@ -234,7 +234,7 @@ fn test_lazy_query_2() { fn test_lazy_query_3() { // query checks if schema of scanning is not changed by aggregation let _ = scan_foods_csv() - .groupby([col("calories")]) + 
.group_by([col("calories")]) .agg([col("fats_g").max()]) .collect() .unwrap(); @@ -253,7 +253,7 @@ fn test_lazy_query_4() { let out = base_df .clone() - .groupby([col("uid")]) + .group_by([col("uid")]) .agg([ col("day").alias("day"), col("cumcases") @@ -290,7 +290,7 @@ fn test_lazy_query_5() { let out = df .lazy() - .groupby([col("uid")]) + .group_by([col("uid")]) .agg([col("day").head(Some(2))]) .collect() .unwrap(); @@ -370,7 +370,7 @@ fn test_lazy_query_9() -> PolarsResult<()> { [col("Cities.City")], JoinType::Inner.into(), ) - .groupby([col("Cities.Country")]) + .group_by([col("Cities.Country")]) .agg([col("Sales.Amount").sum().alias("sum")]) .sort("sum", Default::default()) .collect()?; @@ -561,7 +561,7 @@ fn test_lazy_wildcard() { let new = df .lazy() - .groupby([col("b")]) + .group_by([col("b")]) .agg([col("*").sum().suffix(""), col("*").first().suffix("_first")]) .collect() .unwrap(); @@ -652,7 +652,7 @@ fn test_lazy_partition_agg() { let out = df .lazy() - .groupby([col("foo")]) + .group_by([col("foo")]) .agg([col("bar").mean()]) .sort("foo", Default::default()) .collect() @@ -664,7 +664,7 @@ fn test_lazy_partition_agg() { ); let out = scan_foods_csv() - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("calories")]) .sort("category", Default::default()) .collect() @@ -687,11 +687,11 @@ fn test_lazy_partition_agg() { } #[test] -fn test_lazy_groupby_apply() { +fn test_lazy_group_by_apply() { let df = fruits_cars(); df.lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("cars").apply( |s: Series| Ok(Some(Series::new("", &[s.len() as u32]))), GetOutput::same_type(), @@ -733,7 +733,7 @@ fn test_lazy_shift_and_fill() { } #[test] -fn test_lazy_groupby() { +fn test_lazy_group_by() { let df = df! { "a" => &[Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)], "groups" => &["a", "a", "b", "c", "c"] @@ -742,7 +742,7 @@ fn test_lazy_groupby() { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("a").mean()]) .sort("a", Default::default()) .collect() @@ -763,7 +763,7 @@ fn test_lazy_tail() { } #[test] -fn test_lazy_groupby_sort() { +fn test_lazy_group_by_sort() { let df = df! { "a" => ["a", "b", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] @@ -773,7 +773,7 @@ fn test_lazy_groupby_sort() { let out = df .clone() .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort(false).first()]) .collect() .unwrap() @@ -787,7 +787,7 @@ fn test_lazy_groupby_sort() { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort(false).last()]) .collect() .unwrap() @@ -801,7 +801,7 @@ fn test_lazy_groupby_sort() { } #[test] -fn test_lazy_groupby_sort_by() { +fn test_lazy_group_by_sort_by() { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6], @@ -811,7 +811,7 @@ fn test_lazy_groupby_sort_by() { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").sort_by([col("c")], [true]).first()]) .collect() .unwrap() @@ -826,17 +826,17 @@ fn test_lazy_groupby_sort_by() { #[test] #[cfg(feature = "dtype-datetime")] -fn test_lazy_groupby_cast() { +fn test_lazy_group_by_cast() { let df = df! 
{ "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] } .unwrap(); - // test if it runs in groupby context + // test if it runs in group_by context let _out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b") .mean() .cast(DataType::Datetime(TimeUnit::Nanoseconds, None))]) @@ -845,17 +845,17 @@ fn test_lazy_groupby_cast() { } #[test] -fn test_lazy_groupby_binary_expr() { +fn test_lazy_group_by_binary_expr() { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] } .unwrap(); - // test if it runs in groupby context + // test if it runs in group_by context let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").mean() * lit(2)]) .sort("a", Default::default()) .collect() @@ -867,18 +867,18 @@ fn test_lazy_groupby_binary_expr() { } #[test] -fn test_lazy_groupby_filter() -> PolarsResult<()> { +fn test_lazy_group_by_filter() -> PolarsResult<()> { let df = df! { "a" => ["a", "a", "a", "b", "b", "c"], "b" => [1, 2, 3, 4, 5, 6] }?; - // We test if the filters work in the groupby context + // We test if the filters work in the group_by context // and that the aggregations can deal with empty sets let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([ col("b").filter(col("a").eq(lit("a"))).sum().alias("b_sum"), col("b") @@ -926,7 +926,7 @@ fn test_lazy_groupby_filter() -> PolarsResult<()> { } #[test] -fn test_groupby_projection_pd_same_column() -> PolarsResult<()> { +fn test_group_by_projection_pd_same_column() -> PolarsResult<()> { // this query failed when projection pushdown was enabled let a = || { @@ -952,7 +952,7 @@ fn test_groupby_projection_pd_same_column() -> PolarsResult<()> { } #[test] -fn test_groupby_sort_slice() -> PolarsResult<()> { +fn test_group_by_sort_slice() -> PolarsResult<()> { let df = df![ "groups" => [1, 2, 2, 3, 3, 3], "vals" => [1, 5, 6, 3, 9, 8] @@ -975,14 +975,14 @@ fn test_groupby_sort_slice() -> PolarsResult<()> { ..Default::default() }, ) - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").head(Some(2)).alias("foo")]) .sort("groups", SortOptions::default()) .collect()?; let out2 = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").sort(true).head(Some(2)).alias("foo")]) .sort("groups", SortOptions::default()) .collect()?; @@ -992,7 +992,7 @@ fn test_groupby_sort_slice() -> PolarsResult<()> { } #[test] -fn test_groupby_cumsum() -> PolarsResult<()> { +fn test_group_by_cumsum() -> PolarsResult<()> { let df = df![ "groups" => [1, 2, 2, 3, 3, 3], "vals" => [1, 5, 6, 3, 9, 8] @@ -1000,7 +1000,7 @@ fn test_groupby_cumsum() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("vals").cumsum(false)]) .sort("groups", Default::default()) .collect()?; @@ -1059,7 +1059,7 @@ fn test_multiple_explode() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("b").alias("b_list"), col("c").alias("c_list")]) .explode([col("c_list"), col("b_list")]) .collect()?; @@ -1261,7 +1261,7 @@ fn test_sort_by() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("b")]) + .group_by_stable([col("b")]) .agg([col("a").sort_by([col("b"), col("c")], [false])]) .collect()?; let a = out.column("a")?.explode()?; @@ -1273,7 +1273,7 @@ fn test_sort_by() -> PolarsResult<()> { // evaluate_on_groups let out = df .lazy() - .groupby_stable([col("b")]) + .group_by_stable([col("b")]) .agg([col("a").sort_by([col("b"), col("c")], 
[false])]) .collect()?; @@ -1394,7 +1394,7 @@ fn test_filter_count() -> PolarsResult<()> { #[test] #[cfg(feature = "dtype-i16")] -fn test_groupby_small_ints() -> PolarsResult<()> { +fn test_group_by_small_ints() -> PolarsResult<()> { let df = df![ "id_32" => [1i32, 2], "id_16" => [1i16, 2] @@ -1403,7 +1403,7 @@ fn test_groupby_small_ints() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1255 let out = df .lazy() - .groupby([col("id_16"), col("id_32")]) + .group_by([col("id_16"), col("id_32")]) .agg([col("id_16").sum().alias("foo")]) .sort( "foo", @@ -1468,7 +1468,7 @@ fn test_round_after_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("fruits")]) + .group_by([col("fruits")]) .agg([col("A") .cast(DataType::Float32) .mean() @@ -1504,7 +1504,7 @@ fn test_round_after_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([((col("b") * col("c")).sum() / col("b").sum()) .round(2) .alias("foo")]) @@ -1549,11 +1549,11 @@ fn test_exclude_regex() -> PolarsResult<()> { #[test] #[cfg(feature = "rank")] -fn test_groupby_rank() -> PolarsResult<()> { +fn test_group_by_rank() -> PolarsResult<()> { let df = fruits_cars(); let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([col("B").rank( RankOptions { method: RankMethod::Dense, @@ -1703,7 +1703,7 @@ fn test_apply_flatten() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("B")]) + .group_by_stable([col("B")]) .agg([col("A").abs().sum().alias("A_sum")]) .collect()?; @@ -1723,7 +1723,7 @@ fn test_is_in() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby_stable([col("fruits")]) + .group_by_stable([col("fruits")]) .agg([col("cars").is_in(col("cars").filter(col("cars").eq(lit("beetle"))))]) .collect()?; let out = out.column("cars").unwrap(); @@ -1737,7 +1737,7 @@ fn test_is_in() -> PolarsResult<()> { // this will be executed by map let out = df .lazy() - .groupby_stable([col("fruits")]) + .group_by_stable([col("fruits")]) .agg([col("cars").is_in(lit(Series::new("a", ["beetle", "vw"])))]) .collect()?; @@ -1761,7 +1761,7 @@ fn test_partitioned_gb_1() -> PolarsResult<()> { "vals" => ["a", "b", "c", "a", "a"] ]? .lazy() - .groupby([col("keys")]) + .group_by([col("keys")]) .agg([ (col("vals").eq(lit("a"))).sum().alias("eq_a"), (col("vals").eq(lit("b"))).sum().alias("eq_b"), @@ -1785,7 +1785,7 @@ fn test_partitioned_gb_count() -> PolarsResult<()> { "col" => (0..100).map(|_| Some(0)).collect::().into_series(), ]? .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([ // we make sure to alias with a different name count().alias("counted"), @@ -1810,7 +1810,7 @@ fn test_partitioned_gb_mean() -> PolarsResult<()> { ]? 
.lazy() .with_columns([lit("a").alias("str"), lit(1).alias("int")]) - .groupby([col("key")]) + .group_by([col("key")]) .agg([ col("str").mean().alias("mean_str"), col("int").mean().alias("mean_int"), @@ -1836,7 +1836,7 @@ fn test_partitioned_gb_binary() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([(col("col") + lit(10)).sum().alias("sum")]) .collect()?; @@ -1847,7 +1847,7 @@ fn test_partitioned_gb_binary() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([(col("col").cast(DataType::Float32) + lit(10.0)) .sum() .alias("sum")]) @@ -1871,7 +1871,7 @@ fn test_partitioned_gb_ternary() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("col")]) + .group_by([col("col")]) .agg([when(col("val").gt(lit(10))) .then(lit(1)) .otherwise(lit(0)) diff --git a/crates/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs index a64c965a45bf..927beac8370a 100644 --- a/crates/polars-lazy/src/tests/streaming.rs +++ b/crates/polars-lazy/src/tests/streaming.rs @@ -37,7 +37,7 @@ fn test_streaming_parquet() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .sort("sugars_g", Default::default()); @@ -51,7 +51,7 @@ fn test_streaming_csv() -> PolarsResult<()> { let q = q .select([col("sugars_g"), col("calories")]) - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([col("calories").sum()]) .sort("sugars_g", Default::default()); @@ -95,7 +95,7 @@ fn test_streaming_multiple_keys_aggregate() -> PolarsResult<()> { let q = q .filter(col("sugars_g").gt(lit(10))) - .groupby([col("sugars_g"), col("calories")]) + .group_by([col("sugars_g"), col("calories")]) .agg([ (col("fats_g") * lit(10)).sum(), col("calories").mean().alias("cal_mean"), @@ -117,7 +117,7 @@ fn test_streaming_first_sum() -> PolarsResult<()> { let q = q .select([col("sugars_g"), col("calories")]) - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([ col("calories").sum(), col("calories").first().alias("calories_first"), @@ -146,7 +146,7 @@ fn test_streaming_aggregate_slice() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .slice(3, 3); @@ -311,7 +311,7 @@ fn test_streaming_aggregate_join() -> PolarsResult<()> { let q = get_parquet_file(); let q = q - .groupby([col("sugars_g")]) + .group_by([col("sugars_g")]) .agg([((lit(1) - col("fats_g")) + col("calories")).sum()]) .slice(0, 3); diff --git a/crates/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/tpch.rs index c5f876477f5c..929711c751c7 100644 --- a/crates/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/tpch.rs @@ -58,7 +58,7 @@ fn test_q2() -> PolarsResult<()> { .filter(col("p_type").str().ends_with(lit("BRASS".to_string()))); let q = q1 .clone() - .groupby([col("p_partkey")]) + .group_by([col("p_partkey")]) .agg([col("ps_supplycost").min()]) .join( q1, diff --git a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs index 93cf9afd8550..ebf7f0d1545d 100644 --- a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs +++ b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs @@ -9,7 +9,7 @@ use polars_arrow::kernels::take_agg::{ }; use 
polars_arrow::utils::CustomIterTools; use polars_core::export::num::Bounded; -use polars_core::frame::groupby::aggregations::{ +use polars_core::frame::group_by::aggregations::{ _agg_helper_idx, _agg_helper_slice, _rolling_apply_agg_window_no_nulls, _rolling_apply_agg_window_nulls, _slice_from_offsets, _use_rolling_kernels, }; diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 9867dede28d9..df020f010548 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -3,7 +3,7 @@ mod positioning; use std::borrow::Cow; use polars_core::export::rayon::prelude::*; -use polars_core::frame::groupby::expr::PhysicalAggExpr; +use polars_core::frame::group_by::expr::PhysicalAggExpr; use polars_core::prelude::*; use polars_core::utils::_split_offsets; use polars_core::{downcast_as_macro_arg_physical, POOL}; @@ -75,7 +75,7 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { /// /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. -/// If you have a relatively large table, consider using a groupby over a pivot. +/// If you have a relatively large table, consider using a group_by over a pivot. pub fn pivot( pivot_df: &DataFrame, values: I0, @@ -121,7 +121,7 @@ where /// /// # Note /// Polars'/arrow memory is not ideal for transposing operations like pivots. -/// If you have a relatively large table, consider using a groupby over a pivot. +/// If you have a relatively large table, consider using a group_by over a pivot. pub fn pivot_stable( pivot_df: &DataFrame, values: I0, @@ -167,12 +167,12 @@ where #[allow(clippy::too_many_arguments)] fn pivot_impl( pivot_df: &DataFrame, - // these columns will be aggregated in the nested groupby + // these columns will be aggregated in the nested group_by values: &[String], - // keys of the first groupby operation + // keys of the first group_by operation index: &[String], - // these columns will be used for a nested groupby - // the rows of this nested groupby will be pivoted as header column values + // these columns will be used for a nested group_by + // the rows of this nested group_by will be pivoted as header column values columns: &[String], // aggregation function agg_fn: Option, @@ -189,10 +189,10 @@ fn pivot_impl( let mut count = 0; let out: PolarsResult<()> = POOL.install(|| { for column_column_name in columns { - let mut groupby = index.to_vec(); - groupby.push(column_column_name.clone()); + let mut group_by = index.to_vec(); + group_by.push(column_column_name.clone()); - let groups = pivot_df.groupby_stable(groupby)?.take_groups(); + let groups = pivot_df.group_by_stable(group_by)?.take_groups(); // these are the row locations if !stable { diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index a69d4a0d29cd..3d68161a304e 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -1,4 +1,4 @@ -use polars_core::frame::groupby::GroupsIndicator; +use polars_core::frame::group_by::GroupsIndicator; use super::*; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs index d36085dea640..9e3276b9e0a6 100644 --- 
a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/convert.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs @@ -11,13 +11,13 @@ use polars_plan::prelude::{AAggExpr, AExpr}; use polars_utils::arena::{Arena, Node}; use polars_utils::IdxSize; -use crate::executors::sinks::groupby::aggregates::count::CountAgg; -use crate::executors::sinks::groupby::aggregates::first::FirstAgg; -use crate::executors::sinks::groupby::aggregates::last::LastAgg; -use crate::executors::sinks::groupby::aggregates::mean::MeanAgg; -use crate::executors::sinks::groupby::aggregates::min_max::{new_max, new_min}; -use crate::executors::sinks::groupby::aggregates::null::NullAgg; -use crate::executors::sinks::groupby::aggregates::{AggregateFunction, SumAgg}; +use crate::executors::sinks::group_by::aggregates::count::CountAgg; +use crate::executors::sinks::group_by::aggregates::first::FirstAgg; +use crate::executors::sinks::group_by::aggregates::last::LastAgg; +use crate::executors::sinks::group_by::aggregates::mean::MeanAgg; +use crate::executors::sinks::group_by::aggregates::min_max::{new_max, new_min}; +use crate::executors::sinks::group_by::aggregates::null::NullAgg; +use crate::executors::sinks::group_by::aggregates::{AggregateFunction, SumAgg}; use crate::expressions::PhysicalPipedExpr; use crate::operators::DataChunk; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/count.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/count.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/count.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs index 6de591c44a4d..604502902d53 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/first.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs @@ -4,7 +4,7 @@ use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; use polars_utils::unwrap::UnwrapUncheckedRelease; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; use crate::operators::IdxSize; pub(crate) struct FirstAgg { diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs similarity index 89% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs index b56a8b9aee34..982506ecaabd 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/interface.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/interface.rs @@ -5,13 +5,13 @@ use enum_dispatch::enum_dispatch; use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; -use crate::executors::sinks::groupby::aggregates::count::CountAgg; -use crate::executors::sinks::groupby::aggregates::first::FirstAgg; -use crate::executors::sinks::groupby::aggregates::last::LastAgg; -use crate::executors::sinks::groupby::aggregates::mean::MeanAgg; -use crate::executors::sinks::groupby::aggregates::min_max::MinMaxAgg; -use 
crate::executors::sinks::groupby::aggregates::null::NullAgg; -use crate::executors::sinks::groupby::aggregates::SumAgg; +use crate::executors::sinks::group_by::aggregates::count::CountAgg; +use crate::executors::sinks::group_by::aggregates::first::FirstAgg; +use crate::executors::sinks::group_by::aggregates::last::LastAgg; +use crate::executors::sinks::group_by::aggregates::mean::MeanAgg; +use crate::executors::sinks::group_by::aggregates::min_max::MinMaxAgg; +use crate::executors::sinks::group_by::aggregates::null::NullAgg; +use crate::executors::sinks::group_by::aggregates::SumAgg; use crate::operators::IdxSize; #[enum_dispatch(AggregateFunction)] diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs index 27ddaf790c0b..08f211359064 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/last.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs @@ -4,7 +4,7 @@ use polars_core::datatypes::DataType; use polars_core::prelude::{AnyValue, Series}; use polars_utils::unwrap::UnwrapUncheckedRelease; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; use crate::operators::IdxSize; pub(crate) struct LastAgg { diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/mean.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/min_max.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/min_max.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/min_max.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mod.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/mod.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs similarity index 92% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs rename to crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs index 4846af4faf0b..768bcde96947 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/null.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/null.rs @@ -2,7 +2,7 @@ use std::any::Any; use polars_core::prelude::*; -use crate::executors::sinks::groupby::aggregates::AggregateFn; +use crate::executors::sinks::group_by::aggregates::AggregateFn; #[derive(Clone)] pub struct NullAgg(DataType); diff --git a/crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/aggregates/sum.rs rename to 
crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/eval.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/global.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/hash_table.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs index 73e7c762bf58..88a06239ac9a 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs @@ -24,7 +24,7 @@ pub(crate) use sink::GenericGroupby2; use thread_local::ThreadLocalTable; use super::*; -use crate::executors::sinks::groupby::aggregates::{AggregateFn, AggregateFunction}; +use crate::executors::sinks::group_by::aggregates::{AggregateFn, AggregateFunction}; use crate::executors::sinks::io::IOThread; use crate::operators::{DataChunk, FinalizedSink, PExecutionContext, Sink, SinkResult}; diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs index 745b5dd99d95..1b8610c54251 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/ooc_state.rs @@ -53,14 +53,14 @@ pub(super) enum SpillAction { impl OocState { fn init_ooc(&mut self, spill_schema: Schema) -> PolarsResult<()> { if verbose() { - eprintln!("OOC groupby started"); + eprintln!("OOC group_by started"); } self.ooc = true; // start IO thread let mut iot = self.io_thread.lock().unwrap(); if iot.is_none() { - *iot = Some(IOThread::try_new(Arc::new(spill_schema), "groupby").unwrap()); + *iot = Some(IOThread::try_new(Arc::new(spill_schema), "group_by").unwrap()); } Ok(()) } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs index 93c0e0e24066..eb94bccc3839 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/sink.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/sink.rs @@ -3,9 +3,9 @@ use std::cell::UnsafeCell; use polars_core::utils::accumulate_dataframes_vertical_unchecked; use 
super::*; -use crate::executors::sinks::groupby::generic::global::GlobalTable; -use crate::executors::sinks::groupby::generic::ooc_state::{OocState, SpillAction}; -use crate::executors::sinks::groupby::generic::source::GroupBySource; +use crate::executors::sinks::group_by::generic::global::GlobalTable; +use crate::executors::sinks::group_by::generic::ooc_state::{OocState, SpillAction}; +use crate::executors::sinks::group_by::generic::source::GroupBySource; use crate::executors::sources::DataFrameSource; use crate::expressions::PhysicalPipedExpr; @@ -174,7 +174,7 @@ impl Sink for GenericGroupby2 { } fn fmt(&self) -> &str { - "generic-groupby" + "generic-group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs index d174e70998d9..bdb52235b3b7 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/generic/source.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/source.rs @@ -3,7 +3,7 @@ use polars_io::ipc::IpcReader; use polars_io::SerReader; use super::*; -use crate::executors::sinks::groupby::generic::global::GlobalTable; +use crate::executors::sinks::group_by::generic::global::GlobalTable; use crate::executors::sinks::io::{block_thread_until_io_thread_done, IOThread}; use crate::operators::{Source, SourceResult}; use crate::pipeline::PARTITION_SIZE; @@ -26,7 +26,7 @@ impl GroupBySource { let io_thread = io_thread.take().unwrap(); if let Some(slice) = slice { - polars_ensure!(slice.0 >= 0, ComputeError: "negative slice not supported with out-of-core groupby") + polars_ensure!(slice.0 >= 0, ComputeError: "negative slice not supported with out-of-core group_by") } block_thread_until_io_thread_done(&io_thread); @@ -89,6 +89,6 @@ impl Source for GroupBySource { )])) } fn fmt(&self) -> &str { - "generic-groupby-source" + "generic-group_by-source" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/generic/thread_local.rs rename to crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs similarity index 100% rename from crates/polars-pipe/src/executors/sinks/groupby/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/mod.rs diff --git a/crates/polars-pipe/src/executors/sinks/groupby/ooc.rs b/crates/polars-pipe/src/executors/sinks/group_by/ooc.rs similarity index 95% rename from crates/polars-pipe/src/executors/sinks/groupby/ooc.rs rename to crates/polars-pipe/src/executors/sinks/group_by/ooc.rs index 36c8b2b4ba14..1c57aad65cbd 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/ooc.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/ooc.rs @@ -12,7 +12,7 @@ pub(super) struct GroupBySource { _io_thread: IOThread, already_finished: Option, partitions: std::fs::ReadDir, - groupby_sink: Box, + group_by_sink: Box, chunk_idx: IdxSize, morsels_per_sink: usize, slice: Option<(usize, usize)>, @@ -22,14 +22,14 @@ impl GroupBySource { pub(super) fn new( io_thread: IOThread, already_finished: DataFrame, - groupby_sink: Box, + group_by_sink: Box, slice: Option<(i64, 
usize)>, ) -> PolarsResult { let partitions = std::fs::read_dir(&io_thread.dir)?; if let Some(slice) = slice { if slice.0 < 0 { - polars_bail!(ComputeError: "negative slice not supported with out-of-core groupby") + polars_bail!(ComputeError: "negative slice not supported with out-of-core group_by") } } @@ -37,7 +37,7 @@ impl GroupBySource { _io_thread: io_thread, already_finished: Some(already_finished), partitions, - groupby_sink, + group_by_sink, chunk_idx: 0, morsels_per_sink: morsels_per_sink(), slice: slice.map(|slice| (slice.0 as usize, slice.1)), @@ -81,9 +81,9 @@ impl Source for GroupBySource { }) .collect::>>()?; - // create a pipeline with a the files as sources and the groupby as sink + // create a pipeline with a the files as sources and the group_by as sink let mut pipe = - PipeLine::new_simple(sources, vec![], self.groupby_sink.split(0), verbose()); + PipeLine::new_simple(sources, vec![], self.group_by_sink.split(0), verbose()); match pipe.run_pipeline(context, Default::default())?.unwrap() { FinalizedSink::Finished(mut df) => { @@ -124,6 +124,6 @@ impl Source for GroupBySource { } fn fmt(&self) -> &str { - "ooc-groupby-source" + "ooc-group_by-source" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs similarity index 93% rename from crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs rename to crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs index c43f18b757be..c51af7e81b3e 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/ooc_state.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/ooc_state.rs @@ -33,14 +33,14 @@ impl OocState { pub(super) fn init_ooc(&mut self, input_schema: SchemaRef) -> PolarsResult<()> { if verbose() { - eprintln!("OOC groupby started"); + eprintln!("OOC group_by started"); } self.ooc = true; // start IO thread let mut iot = self.io_thread.lock().unwrap(); if iot.is_none() { - *iot = Some(IOThread::try_new(input_schema, "groupby")?) + *iot = Some(IOThread::try_new(input_schema, "group_by")?) 
} Ok(()) } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs rename to crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs index d5e103062cfa..3711969eda20 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/primitive/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs @@ -21,11 +21,11 @@ use polars_utils::unwrap::UnwrapUncheckedRelease; use rayon::prelude::*; use super::aggregates::AggregateFn; -use crate::executors::sinks::groupby::aggregates::AggregateFunction; -use crate::executors::sinks::groupby::ooc_state::OocState; -use crate::executors::sinks::groupby::physical_agg_to_logical; -use crate::executors::sinks::groupby::string::{apply_aggregate, write_agg_idx}; -use crate::executors::sinks::groupby::utils::{compute_slices, finalize_groupby}; +use crate::executors::sinks::group_by::aggregates::AggregateFunction; +use crate::executors::sinks::group_by::ooc_state::OocState; +use crate::executors::sinks::group_by::physical_agg_to_logical; +use crate::executors::sinks::group_by::string::{apply_aggregate, write_agg_idx}; +use crate::executors::sinks::group_by::utils::{compute_slices, finalize_group_by}; use crate::executors::sinks::io::IOThread; use crate::executors::sinks::utils::load_vec; use crate::executors::sinks::HASHMAP_INIT_SIZE; @@ -455,7 +455,7 @@ where let payload = if self.ooc_state.ooc { let mut iot = self.ooc_state.io_thread.lock().unwrap(); // make sure that we reset the shared states - // the OOC groupby will call split as well and it should + // the OOC group_by will call split as well and it should // not send continue spilling to disk let iot = iot.take().unwrap(); self.ooc_state.ooc = false; @@ -464,7 +464,7 @@ where } else { None }; - finalize_groupby(dfs, &self.output_schema, self.slice, payload) + finalize_group_by(dfs, &self.output_schema, self.slice, payload) } fn split(&self, thread_no: usize) -> Box { @@ -487,7 +487,7 @@ where self } fn fmt(&self) -> &str { - "primitive_groupby" + "primitive_group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs similarity index 97% rename from crates/polars-pipe/src/executors/sinks/groupby/string.rs rename to crates/polars-pipe/src/executors/sinks/group_by/string.rs index 7c58a040895d..d8d0ae293bb2 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -15,11 +15,11 @@ use polars_utils::unwrap::UnwrapUncheckedRelease; use rayon::prelude::*; use super::aggregates::AggregateFn; -use crate::executors::sinks::groupby::aggregates::AggregateFunction; -use crate::executors::sinks::groupby::ooc_state::OocState; -use crate::executors::sinks::groupby::physical_agg_to_logical; -use crate::executors::sinks::groupby::primitive::apply_aggregation; -use crate::executors::sinks::groupby::utils::{compute_slices, finalize_groupby}; +use crate::executors::sinks::group_by::aggregates::AggregateFunction; +use crate::executors::sinks::group_by::ooc_state::OocState; +use crate::executors::sinks::group_by::physical_agg_to_logical; +use crate::executors::sinks::group_by::primitive::apply_aggregation; +use crate::executors::sinks::group_by::utils::{compute_slices, finalize_group_by}; use crate::executors::sinks::io::IOThread; use 
crate::executors::sinks::utils::load_vec; use crate::executors::sinks::HASHMAP_INIT_SIZE; @@ -497,7 +497,7 @@ impl Sink for Utf8GroupbySink { let payload = if self.ooc_state.ooc { let mut iot = self.ooc_state.io_thread.lock().unwrap(); // make sure that we reset the shared states - // the OOC groupby will call split as well and it should + // the OOC group_by will call split as well and it should // not send continue spilling to disk let iot = iot.take().unwrap(); self.ooc_state.ooc = false; @@ -506,14 +506,14 @@ impl Sink for Utf8GroupbySink { } else { None }; - finalize_groupby(dfs, &self.output_schema, self.slice, payload) + finalize_group_by(dfs, &self.output_schema, self.slice, payload) } fn as_any(&mut self) -> &mut dyn Any { self } fn fmt(&self) -> &str { - "utf8_groupby" + "utf8_group_by" } } diff --git a/crates/polars-pipe/src/executors/sinks/groupby/utils.rs b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs similarity index 96% rename from crates/polars-pipe/src/executors/sinks/groupby/utils.rs rename to crates/polars-pipe/src/executors/sinks/group_by/utils.rs index e669fd4a8407..6120bcedcb3f 100644 --- a/crates/polars-pipe/src/executors/sinks/groupby/utils.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs @@ -2,7 +2,7 @@ use hashbrown::HashMap; use polars_core::prelude::*; use polars_core::utils::{accumulate_dataframes_vertical_unchecked, slice_offsets}; -use crate::executors::sinks::groupby::ooc::GroupBySource; +use crate::executors::sinks::group_by::ooc::GroupBySource; use crate::executors::sinks::io::{block_thread_until_io_thread_done, IOThread}; use crate::operators::{FinalizedSink, Sink}; @@ -50,7 +50,7 @@ pub(super) fn compute_slices( } } -pub(super) fn finalize_groupby( +pub(super) fn finalize_group_by( dfs: Vec, output_schema: &Schema, slice: Option<(i64, usize)>, diff --git a/crates/polars-pipe/src/executors/sinks/mod.rs b/crates/polars-pipe/src/executors/sinks/mod.rs index fb142566a6b9..328ab178a9e6 100644 --- a/crates/polars-pipe/src/executors/sinks/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/mod.rs @@ -1,6 +1,6 @@ #[cfg(any(feature = "parquet", feature = "ipc"))] mod file_sink; -pub(crate) mod groupby; +pub(crate) mod group_by; mod io; mod joins; mod memory; diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 7ee9f7620875..ff1ce7550c0d 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -8,8 +8,8 @@ use polars_core::with_match_physical_integer_polars_type; use polars_plan::prelude::*; use crate::executors::operators::HstackOperator; -use crate::executors::sinks::groupby::aggregates::convert_to_hash_agg; -use crate::executors::sinks::groupby::GenericGroupby2; +use crate::executors::sinks::group_by::aggregates::convert_to_hash_agg; +use crate::executors::sinks::group_by::GenericGroupby2; use crate::executors::sinks::*; use crate::executors::{operators, sources}; use crate::expressions::PhysicalPipedExpr; @@ -241,13 +241,13 @@ where (keys, aggs, input_schema.clone()) }, Some(keys) => { - let mut groupby_out_schema = Schema::with_capacity(input_schema.len()); + let mut group_by_out_schema = Schema::with_capacity(input_schema.len()); let key_names = PlHashSet::from_iter(keys.iter().map(|s| s.as_ref())); let keys = keys .iter() .map(|key| { let (_, name, dtype) = input_schema.get_full(key.as_str()).unwrap(); - groupby_out_schema.with_column(name.clone(), dtype.clone()); + group_by_out_schema.with_column(name.clone(), 
dtype.clone()); expr_arena.add(AExpr::Column(Arc::from(key.as_str()))) }) .collect(); @@ -260,7 +260,7 @@ where } else { let (_, name, dtype) = input_schema.get_full(name.as_str()).unwrap(); - groupby_out_schema.with_column(name.clone(), dtype.clone()); + group_by_out_schema.with_column(name.clone(), dtype.clone()); let col = expr_arena.add(AExpr::Column(Arc::from(name.as_str()))); Some(match options.keep_strategy { UniqueKeepStrategy::First | UniqueKeepStrategy::Any => { @@ -276,7 +276,7 @@ where } }) .collect(); - (keys, aggs, groupby_out_schema.into()) + (keys, aggs, group_by_out_schema.into()) }, }; @@ -300,7 +300,7 @@ where } let aggregation_columns = Arc::new(aggregation_columns); - let groupby_sink = Box::new(GenericGroupby2::new( + let group_by_sink = Box::new(GenericGroupby2::new( key_columns, aggregation_columns, Arc::from(agg_fns), @@ -309,7 +309,7 @@ where options.slice, )); - Box::new(ReProjectSink::new(input_schema, groupby_sink)) + Box::new(ReProjectSink::new(input_schema, group_by_sink)) }, Aggregate { input, @@ -356,7 +356,7 @@ where ) { (dt, 1) if dt.is_integer() => { with_match_physical_integer_polars_type!(dt, |$T| { - Box::new(groupby::PrimitiveGroupbySink::<$T>::new( + Box::new(group_by::PrimitiveGroupbySink::<$T>::new( key_columns[0].clone(), aggregation_columns, agg_fns, @@ -366,7 +366,7 @@ where )) as Box }) }, - (DataType::Utf8, 1) => Box::new(groupby::Utf8GroupbySink::new( + (DataType::Utf8, 1) => Box::new(group_by::Utf8GroupbySink::new( key_columns[0].clone(), aggregation_columns, agg_fns, diff --git a/crates/polars-pipe/src/pipeline/dispatcher.rs b/crates/polars-pipe/src/pipeline/dispatcher.rs index c9c5b75827f8..ce4b6b4bd4d8 100644 --- a/crates/polars-pipe/src/pipeline/dispatcher.rs +++ b/crates/polars-pipe/src/pipeline/dispatcher.rs @@ -36,7 +36,7 @@ use crate::pipeline::morsels_per_sink; /// /// - 3. One or more sinks /// A sink needs all data in scope to finalize a pipeline branch. -/// Think of sorts, preparing a build phase of a join, groupby + aggregations. +/// Think of sorts, preparing a build phase of a join, group_by + aggregations. /// /// This struct will have the SOS (source, operators, sinks) of its own pipeline branch, but also /// the SOS of other branches. 
The SOS are stored data oriented and the sinks have an offset that diff --git a/crates/polars-pipe/src/pipeline/mod.rs b/crates/polars-pipe/src/pipeline/mod.rs index 5e85ba469225..eced14cd6ece 100644 --- a/crates/polars-pipe/src/pipeline/mod.rs +++ b/crates/polars-pipe/src/pipeline/mod.rs @@ -7,7 +7,7 @@ pub use dispatcher::PipeLine; use polars_core::prelude::*; use polars_core::POOL; -pub use crate::executors::sinks::groupby::aggregates::can_convert_to_hash_agg; +pub use crate::executors::sinks::group_by::aggregates::can_convert_to_hash_agg; pub(crate) fn morsels_per_sink() -> usize { POOL.current_num_threads() diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index e385f9e89a23..9225ec0906fd 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -112,7 +112,7 @@ pct_change = ["polars-core/pct_change"] moment = ["polars-core/moment", "polars-ops/moment"] abs = ["polars-core/abs"] random = ["polars-core/random"] -dynamic_groupby = ["polars-core/dynamic_groupby"] +dynamic_group_by = ["polars-core/dynamic_group_by"] ewma = ["polars-core/ewma"] dot_diagram = [] unique_counts = ["polars-core/unique_counts"] diff --git a/crates/polars-plan/src/dsl/functions/arity.rs b/crates/polars-plan/src/dsl/functions/arity.rs index 1a865efed104..e37b158d19ee 100644 --- a/crates/polars-plan/src/dsl/functions/arity.rs +++ b/crates/polars-plan/src/dsl/functions/arity.rs @@ -22,7 +22,7 @@ where a.map_many(function, &[b], output_type) } -/// Like [`map_binary`], but used in a groupby-aggregation context. +/// Like [`map_binary`], but used in a group_by-aggregation context. /// /// See [`Expr::apply`] for the difference between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn apply_binary(a: Expr, b: Expr, f: F, output_type: GetOutput) -> Expr diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 4e3c8520851a..1d49cbe62a72 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -599,7 +599,7 @@ impl Expr { } } - /// Apply a function/closure over the groups. This should only be used in a groupby aggregation. + /// Apply a function/closure over the groups. This should only be used in a group_by aggregation. /// /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. @@ -637,7 +637,7 @@ impl Expr { } } - /// Apply a function/closure over the groups with many arguments. This should only be used in a groupby aggregation. + /// Apply a function/closure over the groups with many arguments. This should only be used in a group_by aggregation. /// /// See the [`Expr::apply`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn apply_many(self, function: F, arguments: &[Expr], output_type: GetOutput) -> Self @@ -867,7 +867,7 @@ impl Expr { } /// Apply window function over a subgroup. - /// This is similar to a groupby + aggregation + self join. + /// This is similar to a group_by + aggregation + self join. /// Or similar to [window functions in Postgres](https://www.postgresql.org/docs/9.1/tutorial-window.html). /// /// # Example @@ -1057,7 +1057,7 @@ impl Expr { } /// Sort this column by the ordering of another column. - /// Can also be used in a groupby context to sort the groups. + /// Can also be used in a group_by context to sort the groups. 
pub fn sort_by, IE: Into + Clone, R: AsRef<[bool]>>( self, by: E, @@ -1878,7 +1878,7 @@ where } } -/// Apply a function/closure over the groups of multiple columns. This should only be used in a groupby aggregation. +/// Apply a function/closure over the groups of multiple columns. This should only be used in a group_by aggregation. /// /// It is the responsibility of the caller that the schema is correct by giving /// the correct output_type. If None given the output type of the input expr is used. diff --git a/crates/polars-plan/src/logical_plan/aexpr/mod.rs b/crates/polars-plan/src/logical_plan/aexpr/mod.rs index 9fbc72ba71aa..09227fc972e9 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/mod.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/mod.rs @@ -4,7 +4,7 @@ mod schema; use std::sync::Arc; use polars_arrow::prelude::QuantileInterpolOptions; -use polars_core::frame::groupby::GroupByMethod; +use polars_core::frame::group_by::GroupByMethod; use polars_core::prelude::*; use polars_core::utils::{get_time_units, try_get_supertype}; use polars_utils::arena::{Arena, Node}; diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index faa1371b9f30..902ef9b6b91d 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -546,14 +546,14 @@ impl LogicalPlanBuilder { .into() } - pub fn groupby>( + pub fn group_by>( self, keys: Vec, aggs: E, apply: Option>, maintain_order: bool, - #[cfg(feature = "dynamic_groupby")] dynamic_options: Option, - #[cfg(feature = "dynamic_groupby")] rolling_options: Option, + #[cfg(feature = "dynamic_group_by")] dynamic_options: Option, + #[cfg(feature = "dynamic_group_by")] rolling_options: Option, ) -> Self { let current_schema = try_delayed!(self.0.schema(), &self.0, into); let current_schema = current_schema.as_ref(); @@ -594,7 +594,7 @@ impl LogicalPlanBuilder { try_delayed!(check_names(), &self.0, into) } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { let index_columns = &[ rolling_options @@ -616,14 +616,14 @@ impl LogicalPlanBuilder { } } - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] let options = GroupbyOptions { dynamic: dynamic_options, rolling: rolling_options, slice: None, }; - #[cfg(not(feature = "dynamic_groupby"))] + #[cfg(not(feature = "dynamic_group_by"))] let options = GroupbyOptions { slice: None }; LogicalPlan::Aggregate { diff --git a/crates/polars-plan/src/logical_plan/builder_alp.rs b/crates/polars-plan/src/logical_plan/builder_alp.rs index 199dfcd921d2..b5ce0d863b73 100644 --- a/crates/polars-plan/src/logical_plan/builder_alp.rs +++ b/crates/polars-plan/src/logical_plan/builder_alp.rs @@ -121,7 +121,7 @@ impl<'a> ALogicalPlanBuilder<'a> { self.add_alp(lp) } - pub fn groupby( + pub fn group_by( self, keys: Vec, aggs: Vec, @@ -143,7 +143,7 @@ impl<'a> ALogicalPlanBuilder<'a> { ); schema.merge(other); - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] { let index_columns = &[ options diff --git a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs index aa0e44670d6b..4ea6c3de5327 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/cse_expr.rs @@ -136,7 +136,7 @@ enum VisitRecord { // The `bool` indicates if this expression is valid. 
// This can be `AND` accumulated by the lineage of the expression to determine // of the whole expression can be added. - // For instance a in a groupby we only want to use elementwise operation in cse: + // For instance a in a group_by we only want to use elementwise operation in cse: // - `(col("a") * 2).sum(), (col("a") * 2)` -> we want to do `col("a") * 2` on a `with_columns` // - `col("a").sum() * col("a").sum()` -> we don't want `sum` to run on `with_columns` // as that doesn't have groups context. If we encounter a `sum` it should be flagged as `false` @@ -210,7 +210,7 @@ struct ExprIdentifierVisitor<'a> { // whether the expression replaced a subexpression has_sub_expr: bool, // During aggregation we only identify element-wise operations - is_groupby: bool, + is_group_by: bool, } impl ExprIdentifierVisitor<'_> { @@ -218,7 +218,7 @@ impl ExprIdentifierVisitor<'_> { se_count: &'a mut SubExprCount, identifier_array: &'a mut IdentifierArray, visit_stack: &'a mut Vec, - is_groupby: bool, + is_group_by: bool, ) -> ExprIdentifierVisitor<'a> { let id_array_offset = identifier_array.len(); ExprIdentifierVisitor { @@ -229,7 +229,7 @@ impl ExprIdentifierVisitor<'_> { visit_stack, id_array_offset, has_sub_expr: false, - is_groupby, + is_group_by, } } @@ -274,7 +274,7 @@ impl ExprIdentifierVisitor<'_> { // during aggregation we only store elementwise operation in the state // other operations we cannot add to the state as they have the output size of the // groups, not the original dataframe - if self.is_groupby { + if self.is_group_by { match ae { AExpr::Agg(_) | AExpr::AnonymousFunction { .. } => { Some((VisitRecursion::Continue, false)) @@ -528,13 +528,13 @@ impl<'a> CommonSubExprOptimizer<'a> { fn visit_expression( &mut self, ae_node: AexprNode, - is_groupby: bool, + is_group_by: bool, ) -> PolarsResult<(usize, bool)> { let mut visitor = ExprIdentifierVisitor::new( &mut self.se_count, &mut self.id_array, &mut self.visit_stack, - is_groupby, + is_group_by, ); ae_node.visit(&mut visitor).map(|_| ())?; Ok((visitor.id_array_offset, visitor.has_sub_expr)) @@ -563,7 +563,7 @@ impl<'a> CommonSubExprOptimizer<'a> { expr: &[Node], expr_arena: &mut Arena, id_array_offsets: &mut Vec, - is_groupby: bool, + is_group_by: bool, schema: &Schema, ) -> PolarsResult> { let mut has_sub_expr = false; @@ -577,7 +577,7 @@ impl<'a> CommonSubExprOptimizer<'a> { // visit expressions and collect sub-expression counts let (id_array_offset, this_expr_has_se) = AexprNode::with_context(*node, expr_arena, |ae_node| { - self.visit_expression(ae_node, is_groupby) + self.visit_expression(ae_node, is_group_by) })?; id_array_offsets.push(id_array_offset as u32); has_sub_expr |= this_expr_has_se; diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 22ad1e3e2366..58179b8aae8b 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -123,7 +123,7 @@ pub(super) fn predicate_is_pushdown_boundary(node: Node, expr_arena: &Arena true, // The series might be used in a comparison with exactly the right length diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs similarity index 95% rename from crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs rename to 
crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs index 09586b88490c..aaeca3ae7f79 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/groupby.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/group_by.rs @@ -1,7 +1,7 @@ use super::*; #[allow(clippy::too_many_arguments)] -pub(super) fn process_groupby( +pub(super) fn process_group_by( proj_pd: &mut ProjectionPushDown, input: Node, keys: Vec, @@ -66,13 +66,13 @@ pub(super) fn process_groupby( } // make sure that the dynamic key is projected - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.dynamic { let node = expr_arena.add(AExpr::Column(Arc::from(options.index_column.as_str()))); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); } // make sure that the rolling key is projected - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.rolling { let node = expr_arena.add(AExpr::Column(Arc::from(options.index_column.as_str()))); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); @@ -87,7 +87,7 @@ pub(super) fn process_groupby( expr_arena, )?; - let builder = ALogicalPlanBuilder::new(input, expr_arena, lp_arena).groupby( + let builder = ALogicalPlanBuilder::new(input, expr_arena, lp_arena).group_by( keys, projected_aggs, apply, diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs index edf8b6edbdb7..127e9ac14c95 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs @@ -1,6 +1,6 @@ mod functions; mod generic; -mod groupby; +mod group_by; mod hstack; mod joins; mod projection; @@ -17,7 +17,7 @@ use semi_anti_join::process_semi_anti_join; use crate::logical_plan::Context; use crate::prelude::iterator::ArenaExprIter; use crate::prelude::optimizer::projection_pushdown::generic::process_generic; -use crate::prelude::optimizer::projection_pushdown::groupby::process_groupby; +use crate::prelude::optimizer::projection_pushdown::group_by::process_group_by; use crate::prelude::optimizer::projection_pushdown::hstack::process_hstack; use crate::prelude::optimizer::projection_pushdown::joins::process_join; use crate::prelude::optimizer::projection_pushdown::projection::process_projection; @@ -580,7 +580,7 @@ impl ProjectionPushDown { schema, maintain_order, options, - } => process_groupby( + } => process_group_by( self, input, keys, diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs index 89db0560bcc6..423b699ef240 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/binary.rs @@ -282,7 +282,7 @@ pub(super) fn process_binary( // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. 
To be able to cast the groups need to be // flattened let new_node_left = if type_left != st { expr_arena.add(AExpr::Cast { diff --git a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs index 5ae5f8c9eda1..ba92141239fe 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/type_coercion/mod.rs @@ -298,7 +298,7 @@ impl OptimizationRule for TypeCoercionRule { // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. To be able to cast the groups need to be // flattened let new_node_truthy = if type_true != st { expr_arena.add(AExpr::Cast { @@ -462,7 +462,7 @@ impl OptimizationRule for TypeCoercionRule { } // only cast if the type is not already the super type. // this can prevent an expensive flattening and subsequent aggregation - // in a groupby context. To be able to cast the groups need to be + // in a group_by context. To be able to cast the groups need to be // flattened let new_node_self = if type_self != super_type { expr_arena.add(AExpr::Cast { diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index fd653f3764c1..9aef73892951 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -8,7 +8,7 @@ use polars_io::ipc::IpcCompression; #[cfg(feature = "parquet")] use polars_io::parquet::ParquetCompression; use polars_io::RowCount; -#[cfg(feature = "dynamic_groupby")] +#[cfg(feature = "dynamic_group_by")] use polars_time::{DynamicGroupOptions, RollingGroupOptions}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -105,9 +105,9 @@ pub struct UnionOptions { #[derive(Clone, Debug, PartialEq, Eq, Default)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct GroupbyOptions { - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub dynamic: Option, - #[cfg(feature = "dynamic_groupby")] + #[cfg(feature = "dynamic_group_by")] pub rolling: Option, /// Take only a slice of the result pub slice: Option<(i64, usize)>, @@ -195,7 +195,7 @@ pub struct FunctionOptions { // If set to `false` the physical engine will ensure the left input // expression is the output name. pub allow_rename: bool, - // if set, then the `Series` passed to the function in the groupby operation + // if set, then the `Series` passed to the function in the group_by operation // will ensure the name is set. This is an extra heap allocation per group. 
pub pass_name_to_apply: bool, // For example a `unique` or a `slice` diff --git a/crates/polars-plan/src/logical_plan/projection.rs b/crates/polars-plan/src/logical_plan/projection.rs index aa6b5fef739c..fd467fe56496 100644 --- a/crates/polars-plan/src/logical_plan/projection.rs +++ b/crates/polars-plan/src/logical_plan/projection.rs @@ -353,7 +353,7 @@ fn prepare_excluded( } } - // exclude groupby keys + // exclude group_by keys for mut expr in keys.iter() { // Allow a number of aliases of a column expression, still exclude column from aggregation loop { diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 39102198e72c..acba55bd4c62 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -317,7 +317,7 @@ impl SQLContext { // Check for group by // After projection since there might be number. - let groupby_keys: Vec = select_stmt + let group_by_keys: Vec = select_stmt .group_by .iter() .map(|e| match e { @@ -325,7 +325,7 @@ impl SQLContext { let idx = match idx.parse::() { Ok(0) | Err(_) => Err(polars_err!( ComputeError: - "groupby error: a positive number or an expression expected, got {}", + "group_by error: a positive number or an expression expected, got {}", idx )), Ok(idx) => Ok(idx), @@ -334,16 +334,16 @@ impl SQLContext { }, SqlExpr::Value(_) => Err(polars_err!( ComputeError: - "groupby error: a positive number or an expression expected", + "group_by error: a positive number or an expression expected", )), _ => parse_sql_expr(e, self), }) .collect::>()?; - if groupby_keys.is_empty() { + if group_by_keys.is_empty() { lf = lf.select(projections) } else { - lf = self.process_groupby(lf, contains_wildcard, &groupby_keys, &projections)?; + lf = self.process_group_by(lf, contains_wildcard, &group_by_keys, &projections)?; // Apply optional 'having' clause, post-aggregation lf = match select_stmt.having.as_ref() { @@ -481,31 +481,31 @@ impl SQLContext { Ok(lf.sort_by_exprs(&by, descending, false, false)) } - fn process_groupby( + fn process_group_by( &mut self, lf: LazyFrame, contains_wildcard: bool, - groupby_keys: &[Expr], + group_by_keys: &[Expr], projections: &[Expr], ) -> PolarsResult { - // check groupby and projection due to difference between SQL and polars + // check group_by and projection due to difference between SQL and polars // Return error on wild card, shouldn't process this polars_ensure!( !contains_wildcard, - ComputeError: "groupby error: can't process wildcard in groupby" + ComputeError: "group_by error: can't process wildcard in group_by" ); let schema_before = lf.schema()?; - let groupby_keys_schema = - expressions_to_schema(groupby_keys, &schema_before, Context::Default)?; + let group_by_keys_schema = + expressions_to_schema(group_by_keys, &schema_before, Context::Default)?; - // remove the groupby keys as polars adds those implicitly + // remove the group_by keys as polars adds those implicitly let mut aggregation_projection = Vec::with_capacity(projections.len()); let mut aliases: BTreeSet<&str> = BTreeSet::new(); for mut e in projections { // if it is a simple expression & has alias, - // we must defer the aliasing until after the groupby + // we must defer the aliasing until after the group_by if e.clone().meta().is_simple_projection() { if let Expr::Alias(expr, name) = e { aliases.insert(name); @@ -514,12 +514,12 @@ impl SQLContext { } let field = e.to_field(&schema_before, Context::Default)?; - if groupby_keys_schema.get(&field.name).is_none() { + if 
group_by_keys_schema.get(&field.name).is_none() { aggregation_projection.push(e.clone()) } } - let aggregated = lf.groupby(groupby_keys).agg(&aggregation_projection); + let aggregated = lf.group_by(group_by_keys).agg(&aggregation_projection); let projection_schema = expressions_to_schema(projections, &schema_before, Context::Default)?; // a final projection to get the proper order @@ -527,7 +527,7 @@ impl SQLContext { .iter_names() .zip(projections) .map(|(name, projection_expr)| { - if groupby_keys_schema.get(name).is_some() || aliases.contains(name.as_str()) { + if group_by_keys_schema.get(name).is_some() || aliases.contains(name.as_str()) { projection_expr.clone() } else { col(name) diff --git a/crates/polars-sql/tests/iss_7437.rs b/crates/polars-sql/tests/iss_7437.rs index a8479d92691e..29229ba5c4c6 100644 --- a/crates/polars-sql/tests/iss_7437.rs +++ b/crates/polars-sql/tests/iss_7437.rs @@ -25,7 +25,7 @@ fn iss_7437() -> PolarsResult<()> { let expected = LazyCsvReader::new("../../examples/datasets/foods1.csv") .finish()? - .groupby(vec![col("category").alias("category")]) + .group_by(vec![col("category").alias("category")]) .agg(vec![]) .collect()? .sort(["category"], vec![false], false)?; diff --git a/crates/polars-sql/tests/ops_distinct_on.rs b/crates/polars-sql/tests/ops_distinct_on.rs index 9497cf63c530..4adc2ab75b45 100644 --- a/crates/polars-sql/tests/ops_distinct_on.rs +++ b/crates/polars-sql/tests/ops_distinct_on.rs @@ -34,7 +34,7 @@ fn test_distinct_on() { true, false, ) - .groupby_stable(vec![col("Name")]) + .group_by_stable(vec![col("Name")]) .agg(vec![col("*").first()]); let expected = expected.collect().unwrap(); assert!(actual.frame_equal(&expected)) diff --git a/crates/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs index 68653d8def23..5ea4937f6995 100644 --- a/crates/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -54,7 +54,7 @@ fn test_nested_expr() -> PolarsResult<()> { Ok(()) } #[test] -fn test_groupby_simple() -> PolarsResult<()> { +fn test_group_by_simple() -> PolarsResult<()> { let df = create_sample_df()?; let mut context = SQLContext::new(); context.register("df", df.clone().lazy()); @@ -78,7 +78,7 @@ fn test_groupby_simple() -> PolarsResult<()> { .collect()?; let df_pl = df .lazy() - .groupby(&[col("a")]) + .group_by(&[col("a")]) .agg(&[ col("b").sum().alias("b"), (col("a") + col("b")).sum().alias("c"), @@ -463,7 +463,7 @@ fn test_ctes() -> PolarsResult<()> { #[test] #[cfg(feature = "ipc")] -fn test_groupby_2() -> PolarsResult<()> { +fn test_group_by_2() -> PolarsResult<()> { let mut context = SQLContext::new(); let sql = r#" CREATE TABLE foods AS @@ -486,7 +486,7 @@ fn test_groupby_2() -> PolarsResult<()> { let df_sql = df_sql.collect()?; let expected = LazyFrame::scan_ipc("../../examples/datasets/foods1.ipc", Default::default())? 
.select(&[col("*")]) - .groupby(vec![col("category")]) + .group_by(vec![col("category")]) .agg(vec![ col("category").count().alias("count"), col("calories").max(), diff --git a/crates/polars-time/src/chunkedarray/rolling_window/mod.rs b/crates/polars-time/src/chunkedarray/rolling_window/mod.rs index ead6d9012eb9..dbb3e07d18e6 100644 --- a/crates/polars-time/src/chunkedarray/rolling_window/mod.rs +++ b/crates/polars-time/src/chunkedarray/rolling_window/mod.rs @@ -282,7 +282,7 @@ where }) } else { if arr.null_count() > 0 { - panic!("'rolling by' not yet supported for series with null values, consider using 'groupby_rolling'") + panic!("'rolling by' not yet supported for series with null values, consider using 'group_by_rolling'") } let values = arr.values().as_slice(); let duration = options.window_size; @@ -291,7 +291,7 @@ where let by = options.by.unwrap(); let closed_window = options.closed_window.expect("closed window must be set"); let func = rolling_agg_fn_dynamic.expect( - "'rolling by' not yet supported for this expression, consider using 'groupby_rolling'", + "'rolling by' not yet supported for this expression, consider using 'group_by_rolling'", ); func( diff --git a/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs index 79a349009ead..3894a69dc3f1 100644 --- a/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs +++ b/crates/polars-time/src/chunkedarray/rolling_window/rolling_kernels/no_nulls.rs @@ -61,8 +61,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -82,8 +82,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -103,8 +103,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -124,8 +124,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, None) } @@ -145,8 +145,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, 
closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, params) } @@ -166,8 +166,8 @@ where { let offset_iter = match tz { #[cfg(feature = "timezones")] - Some(tz) => groupby_values_iter(period, time, closed_window, tu, tz.parse::().ok()), - _ => groupby_values_iter(period, time, closed_window, tu, None), + Some(tz) => group_by_values_iter(period, time, closed_window, tu, tz.parse::().ok()), + _ => group_by_values_iter(period, time, closed_window, tu, None), }; rolling_apply_agg_window::, _, _>(values, offset_iter, params) } diff --git a/crates/polars-time/src/groupby/dynamic.rs b/crates/polars-time/src/group_by/dynamic.rs similarity index 94% rename from crates/polars-time/src/groupby/dynamic.rs rename to crates/polars-time/src/group_by/dynamic.rs index c08248c3ae77..56e2bb969f34 100644 --- a/crates/polars-time/src/groupby/dynamic.rs +++ b/crates/polars-time/src/group_by/dynamic.rs @@ -1,7 +1,7 @@ use polars_arrow::time_zone::Tz; use polars_arrow::utils::CustomIterTools; use polars_core::export::rayon::prelude::*; -use polars_core::frame::groupby::GroupsProxy; +use polars_core::frame::group_by::GroupsProxy; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::ensure_sorted_arg; @@ -92,13 +92,13 @@ const LB_NAME: &str = "_lower_boundary"; const UP_NAME: &str = "_upper_boundary"; pub trait PolarsTemporalGroupby { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)>; - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, @@ -106,25 +106,25 @@ pub trait PolarsTemporalGroupby { } impl PolarsTemporalGroupby for DataFrame { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)> { - Wrap(self).groupby_rolling(by, options) + Wrap(self).group_by_rolling(by, options) } - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, ) -> PolarsResult<(Series, Vec, GroupsProxy)> { - Wrap(self).groupby_dynamic(by, options) + Wrap(self).group_by_dynamic(by, options) } } impl Wrap<&DataFrame> { - fn groupby_rolling( + fn group_by_rolling( &self, by: Vec, options: &RollingGroupOptions, @@ -138,11 +138,11 @@ impl Wrap<&DataFrame> { if by.is_empty() { // if by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized - ensure_sorted_arg(&time, "groupby_rolling")?; + ensure_sorted_arg(&time, "group_by_rolling")?; } let time_type = time.dtype(); - polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic groupby not supported, fill nulls."); + polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic group_by not supported, fill nulls."); use DataType::*; let (dt, tu, tz): (Series, TimeUnit, Option) = match time_type { @@ -155,7 +155,7 @@ impl Wrap<&DataFrame> { Int32 => { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&Int64).unwrap().cast(&time_type).unwrap(); - let (out, by, gt) = self.impl_groupby_rolling( + let (out, by, gt) = self.impl_group_by_rolling( dt, by, options, @@ -169,7 +169,7 @@ impl Wrap<&DataFrame> { Int64 => { let time_type = 
Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&time_type).unwrap(); - let (out, by, gt) = self.impl_groupby_rolling( + let (out, by, gt) = self.impl_group_by_rolling( dt, by, options, @@ -189,14 +189,14 @@ impl Wrap<&DataFrame> { match tz { #[cfg(feature = "timezones")] Some(tz) => { - self.impl_groupby_rolling(dt, by, options, tu, tz.parse::().ok(), time_type) + self.impl_group_by_rolling(dt, by, options, tu, tz.parse::().ok(), time_type) }, - _ => self.impl_groupby_rolling(dt, by, options, tu, None, time_type), + _ => self.impl_group_by_rolling(dt, by, options, tu, None, time_type), } } /// Returns: time_keys, keys, groupsproxy - fn groupby_dynamic( + fn group_by_dynamic( &self, by: Vec, options: &DynamicGroupOptions, @@ -214,11 +214,11 @@ impl Wrap<&DataFrame> { if by.is_empty() { // if by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized - ensure_sorted_arg(&time, "groupby_dynamic")?; + ensure_sorted_arg(&time, "group_by_dynamic")?; } let time_type = time.dtype(); - polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic groupby not supported, fill nulls."); + polars_ensure!(time.null_count() == 0, ComputeError: "null values in dynamic group_by not supported, fill nulls."); use DataType::*; let (dt, tu) = match time_type { @@ -231,7 +231,7 @@ impl Wrap<&DataFrame> { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&Int64).unwrap().cast(&time_type).unwrap(); let (out, mut keys, gt) = - self.impl_groupby_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; + self.impl_group_by_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; let out = out.cast(&Int64).unwrap().cast(&Int32).unwrap(); for k in &mut keys { if k.name() == UP_NAME || k.name() == LB_NAME { @@ -244,7 +244,7 @@ impl Wrap<&DataFrame> { let time_type = Datetime(TimeUnit::Nanoseconds, None); let dt = time.cast(&time_type).unwrap(); let (out, mut keys, gt) = - self.impl_groupby_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; + self.impl_group_by_dynamic(dt, by, options, TimeUnit::Nanoseconds, &time_type)?; let out = out.cast(&Int64).unwrap(); for k in &mut keys { if k.name() == UP_NAME || k.name() == LB_NAME { @@ -259,10 +259,10 @@ impl Wrap<&DataFrame> { dt ), }; - self.impl_groupby_dynamic(dt, by, options, tu, time_type) + self.impl_group_by_dynamic(dt, by, options, tu, time_type) } - fn impl_groupby_dynamic( + fn impl_group_by_dynamic( &self, mut dt: Series, mut by: Vec, @@ -313,7 +313,7 @@ impl Wrap<&DataFrame> { let groups = if by.is_empty() { let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (groups, lower, upper) = groupby_windows( + let (groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -331,7 +331,7 @@ impl Wrap<&DataFrame> { } else { let groups = self .0 - .groupby_with_series(by.clone(), true, true)? + .group_by_with_series(by.clone(), true, true)? .take_groups(); // include boundaries cannot be parallel (easily) @@ -349,7 +349,7 @@ impl Wrap<&DataFrame> { { check_sortedness_slice(ts)? 
} - let (sub_groups, lower, upper) = groupby_windows( + let (sub_groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -383,7 +383,7 @@ impl Wrap<&DataFrame> { let dt = dt.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (sub_groups, lower, upper) = groupby_windows( + let (sub_groups, lower, upper) = group_by_windows( w, ts, options.closed_window, @@ -428,7 +428,7 @@ impl Wrap<&DataFrame> { { check_sortedness_slice(ts)? } - let (sub_groups, _, _) = groupby_windows( + let (sub_groups, _, _) = group_by_windows( w, ts, options.closed_window, @@ -450,7 +450,7 @@ impl Wrap<&DataFrame> { let dt = dt.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let (sub_groups, _, _) = groupby_windows( + let (sub_groups, _, _) = group_by_windows( w, ts, options.closed_window, @@ -516,7 +516,7 @@ impl Wrap<&DataFrame> { } /// Returns: time_keys, keys, groupsproxy - fn impl_groupby_rolling( + fn impl_group_by_rolling( &self, dt: Series, by: Vec, @@ -535,7 +535,7 @@ impl Wrap<&DataFrame> { let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); PolarsResult::Ok(GroupsProxy::Slice { - groups: groupby_values( + groups: group_by_values( options.period, options.offset, ts, @@ -548,7 +548,7 @@ impl Wrap<&DataFrame> { } else { let groups = self .0 - .groupby_with_series(by.clone(), true, true)? + .group_by_with_series(by.clone(), true, true)? .take_groups(); // we keep a local copy, as we are reordering on next operation. @@ -573,7 +573,7 @@ impl Wrap<&DataFrame> { check_sortedness_slice(ts)? } - let sub_groups = groupby_values( + let sub_groups = group_by_values( options.period, options.offset, ts, @@ -594,7 +594,7 @@ impl Wrap<&DataFrame> { let dt = dt_local.slice(base_g[0] as i64, base_g[1] as usize); let vals = dt.downcast_iter().next().unwrap(); let ts = vals.values().as_slice(); - let sub_groups = groupby_values( + let sub_groups = group_by_values( options.period, options.offset, ts, @@ -641,7 +641,7 @@ fn update_subgroups_idx( let new_first = if len == 0 { // in case the group is empty // keep the original first so that the - // groupby keys still point to the original group + // group_by keys still point to the original group base_g.0 } else { unsafe { *base_g.1.get_unchecked_release(first as usize) } @@ -664,7 +664,7 @@ mod test { use super::*; #[test] - fn test_rolling_groupby_tu() -> PolarsResult<()> { + fn test_rolling_group_by_tu() -> PolarsResult<()> { // test multiple time units for tu in [ TimeUnit::Nanoseconds, @@ -689,7 +689,7 @@ mod test { let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df - .groupby_rolling( + .group_by_rolling( vec![], &RollingGroupOptions { index_column: "dt".into(), @@ -710,7 +710,7 @@ mod test { } #[test] - fn test_rolling_groupby_aggs() -> PolarsResult<()> { + fn test_rolling_group_by_aggs() -> PolarsResult<()> { let mut date = Utf8Chunked::new( "dt", [ @@ -730,7 +730,7 @@ mod test { let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df - .groupby_rolling( + .group_by_rolling( vec![], &RollingGroupOptions { index_column: "dt".into(), @@ -782,7 +782,7 @@ mod test { } #[test] - fn test_dynamic_groupby_window() -> PolarsResult<()> { + fn test_dynamic_group_by_window() -> PolarsResult<()> { let start = NaiveDate::from_ymd_opt(2021, 12, 16) .unwrap() .and_hms_opt(0, 0, 0) @@ -808,7 +808,7 @@ mod test { let df = 
DataFrame::new(vec![range, groups.clone()]).unwrap(); let (time_key, mut keys, groups) = df - .groupby_dynamic( + .group_by_dynamic( vec![groups], &DynamicGroupOptions { index_column: "date".into(), @@ -923,7 +923,7 @@ mod test { let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (mut time_key, keys, _groups) = df - .groupby_dynamic( + .group_by_dynamic( vec![groups], &DynamicGroupOptions { index_column: "date".into(), diff --git a/crates/polars-time/src/groupby/mod.rs b/crates/polars-time/src/group_by/mod.rs similarity index 100% rename from crates/polars-time/src/groupby/mod.rs rename to crates/polars-time/src/group_by/mod.rs diff --git a/crates/polars-time/src/lib.rs b/crates/polars-time/src/lib.rs index 25b2e0704be5..b2162e26a740 100644 --- a/crates/polars-time/src/lib.rs +++ b/crates/polars-time/src/lib.rs @@ -3,7 +3,7 @@ mod base_utc_offset; pub mod chunkedarray; mod date_range; mod dst_offset; -mod groupby; +mod group_by; mod month_end; mod month_start; pub mod prelude; @@ -20,7 +20,7 @@ pub use date_range::*; #[cfg(feature = "timezones")] pub use dst_offset::*; #[cfg(any(feature = "dtype-date", feature = "dtype-datetime"))] -pub use groupby::dynamic::*; +pub use group_by::dynamic::*; pub use month_end::*; pub use month_start::*; pub use round::*; @@ -28,5 +28,5 @@ pub use truncate::*; pub use upsample::*; pub use windows::calendar::temporal_range as temporal_range_vec; pub use windows::duration::Duration; -pub use windows::groupby::ClosedWindow; +pub use windows::group_by::ClosedWindow; pub use windows::window::Window; diff --git a/crates/polars-time/src/prelude.rs b/crates/polars-time/src/prelude.rs index b59367b09453..aa9a2a1d5b21 100644 --- a/crates/polars-time/src/prelude.rs +++ b/crates/polars-time/src/prelude.rs @@ -4,6 +4,6 @@ pub use crate::chunkedarray::*; pub use crate::series::{SeriesOpsTime, TemporalMethods}; pub use crate::windows::bounds::*; pub use crate::windows::duration::*; -pub use crate::windows::groupby::*; +pub use crate::windows::group_by::*; pub use crate::windows::window::*; pub use crate::*; diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index 65c7b69750b5..d6cde94ee98d 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -140,9 +140,9 @@ fn upsample_impl( upsample_single_impl(source, index_column, every, offset) } else { let gb = if stable { - source.groupby_stable(by) + source.group_by_stable(by) } else { - source.groupby(by) + source.group_by(by) }; // don't parallelize this, this may SO on large data. gb?.apply(|df| { diff --git a/crates/polars-time/src/windows/bounds.rs b/crates/polars-time/src/windows/bounds.rs index 64af87b61f8a..c3699be2b278 100644 --- a/crates/polars-time/src/windows/bounds.rs +++ b/crates/polars-time/src/windows/bounds.rs @@ -1,4 +1,4 @@ -use super::groupby::ClosedWindow; +use super::group_by::ClosedWindow; #[derive(Copy, Clone, Debug)] pub struct Bounds { @@ -12,7 +12,7 @@ impl Bounds { assert!( start <= stop, "boundary start must be smaller than stop; is your time column sorted in ascending order?\ - \nIf you did a groupby, note that null values are a separate group." + \nIf you did a group_by, note that null values are a separate group." 
); Self::new(start, stop) } diff --git a/crates/polars-time/src/windows/groupby.rs b/crates/polars-time/src/windows/group_by.rs similarity index 94% rename from crates/polars-time/src/windows/groupby.rs rename to crates/polars-time/src/windows/group_by.rs index 87e285d9847c..40d6c0e7bea4 100644 --- a/crates/polars-time/src/windows/groupby.rs +++ b/crates/polars-time/src/windows/group_by.rs @@ -143,7 +143,7 @@ fn update_groups_and_bounds( /// /// If `include_boundaries` is `false` those `lower` and `upper` vectors will be empty. #[allow(clippy::too_many_arguments)] -pub fn groupby_windows( +pub fn group_by_windows( window: Window, time: &[i64], closed_window: ClosedWindow, @@ -224,7 +224,7 @@ pub fn groupby_windows( } // this assumes that the given time point is the right endpoint of the window -pub(crate) fn groupby_values_iter_lookbehind( +pub(crate) fn group_by_values_iter_lookbehind( period: Duration, offset: Duration, time: &[i64], @@ -281,7 +281,7 @@ pub(crate) fn groupby_values_iter_lookbehind( } // this one is correct for all lookbehind/lookaheads, but is slower -pub(crate) fn groupby_values_iter_window_behind_t( +pub(crate) fn group_by_values_iter_window_behind_t( period: Duration, offset: Duration, time: &[i64], @@ -331,7 +331,7 @@ pub(crate) fn groupby_values_iter_window_behind_t( } // this one is correct for all lookbehind/lookaheads, but is slower -pub(crate) fn groupby_values_iter_partial_lookbehind( +pub(crate) fn group_by_values_iter_partial_lookbehind( period: Duration, offset: Duration, time: &[i64], @@ -369,7 +369,7 @@ pub(crate) fn groupby_values_iter_partial_lookbehind( } #[allow(clippy::too_many_arguments)] -pub(crate) fn groupby_values_iter_partial_lookahead( +pub(crate) fn group_by_values_iter_partial_lookahead( period: Duration, offset: Duration, time: &[i64], @@ -406,7 +406,7 @@ pub(crate) fn groupby_values_iter_partial_lookahead( }) } #[allow(clippy::too_many_arguments)] -pub(crate) fn groupby_values_iter_full_lookahead( +pub(crate) fn group_by_values_iter_full_lookahead( period: Duration, offset: Duration, time: &[i64], @@ -454,7 +454,7 @@ pub(crate) fn groupby_values_iter_full_lookahead( } #[cfg(feature = "rolling_window")] -pub(crate) fn groupby_values_iter<'a>( +pub(crate) fn group_by_values_iter<'a>( period: Duration, time: &'a [i64], closed_window: ClosedWindow, @@ -464,16 +464,16 @@ pub(crate) fn groupby_values_iter<'a>( let mut offset = period; offset.negative = true; // t is at the right endpoint of the window - let iter = groupby_values_iter_lookbehind(period, offset, time, closed_window, tu, tz, 0); + let iter = group_by_values_iter_lookbehind(period, offset, time, closed_window, tu, tz, 0); Box::new(iter) } -/// Different from `groupby_windows`, where define window buckets and search which values fit that +/// Different from `group_by_windows`, where define window buckets and search which values fit that /// pre-defined bucket, this function defines every window based on the: /// - timestamp (lower bound) /// - timestamp + period (upper bound) /// where timestamps are the individual values in the array `time` -pub fn groupby_values( +pub fn group_by_values( period: Duration, offset: Duration, time: &[i64], @@ -496,7 +496,7 @@ pub fn groupby_values( .copied() .map(|(base_offset, len)| { let upper_bound = base_offset + len; - let iter = groupby_values_iter_lookbehind( + let iter = group_by_values_iter_lookbehind( period, offset, &time[..upper_bound], @@ -520,7 +520,7 @@ pub fn groupby_values( // ---------------t--- // [---] let iter = - 
groupby_values_iter_window_behind_t(period, offset, time, closed_window, tu, tz); + group_by_values_iter_window_behind_t(period, offset, time, closed_window, tu, tz); iter.map(|result| result.map(|(offset, len)| [offset, len])) .collect::>() } @@ -531,8 +531,14 @@ pub fn groupby_values( // ----t--- // [---] else { - let iter = - groupby_values_iter_partial_lookbehind(period, offset, time, closed_window, tu, tz); + let iter = group_by_values_iter_partial_lookbehind( + period, + offset, + time, + closed_window, + tu, + tz, + ); iter.map(|result| result.map(|(offset, len)| [offset, len])) .collect::>() } @@ -550,7 +556,7 @@ pub fn groupby_values( .map(|(base_offset, len)| { let lower_bound = base_offset; let upper_bound = base_offset + len; - let iter = groupby_values_iter_full_lookahead( + let iter = group_by_values_iter_full_lookahead( period, offset, time, @@ -578,7 +584,7 @@ pub fn groupby_values( .map(|(base_offset, len)| { let lower_bound = base_offset; let upper_bound = base_offset + len; - let iter = groupby_values_iter_partial_lookahead( + let iter = group_by_values_iter_partial_lookahead( period, offset, time, diff --git a/crates/polars-time/src/windows/mod.rs b/crates/polars-time/src/windows/mod.rs index 120707752d6f..65a8cee9318e 100644 --- a/crates/polars-time/src/windows/mod.rs +++ b/crates/polars-time/src/windows/mod.rs @@ -7,7 +7,7 @@ pub(crate) mod bounds; pub(crate) mod calendar; pub(crate) mod duration; -pub(crate) mod groupby; +pub(crate) mod group_by; #[cfg(test)] mod test; pub(crate) mod window; diff --git a/crates/polars-time/src/windows/test.rs b/crates/polars-time/src/windows/test.rs index adb820837230..652746fbfa93 100644 --- a/crates/polars-time/src/windows/test.rs +++ b/crates/polars-time/src/windows/test.rs @@ -93,7 +93,7 @@ fn test_groups_large_interval() { let dur = Duration::parse("2d"); let w = Window::new(Duration::parse("2d"), dur, Duration::from_nsecs(0)); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -108,7 +108,7 @@ fn test_groups_large_interval() { assert_eq!(groups[1], [1, 1]); assert_eq!(groups[2], [1, 3]); assert_eq!(groups[3], [3, 1]); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -120,7 +120,7 @@ fn test_groups_large_interval() { ); assert_eq!(groups.len(), 3); assert_eq!(groups[2], [3, 1]); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -191,7 +191,7 @@ fn test_boundaries() { assert_eq!(b.start, start.timestamp_nanos()); // test closed: "both" (includes both ends of the interval) - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -287,7 +287,7 @@ fn test_boundaries() { assert_eq!(groups[2], [4, 3]); // test closed: "left" (should not include right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -302,7 +302,7 @@ fn test_boundaries() { assert_eq!(groups[2], [4, 2]); // 02:00:00 -> 02:30:00 // test closed: "right" (should not include left end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -317,7 +317,7 @@ fn test_boundaries() { assert_eq!(groups[2], [5, 2]); // 02:00:00 -> 02:30:00 // test closed: "none" (should not include left or right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = 
group_by_windows( w, &ts, ClosedWindow::None, @@ -367,7 +367,7 @@ fn test_boundaries_2() { assert_eq!(b.start, start.timestamp_nanos() + offset.duration_ns()); - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -475,7 +475,7 @@ fn test_boundaries_ms() { assert_eq!(b.start, start.timestamp_millis()); // test closed: "both" (includes both ends of the interval) - let (groups, lower, higher) = groupby_windows( + let (groups, lower, higher) = group_by_windows( w, &ts, ClosedWindow::Both, @@ -571,7 +571,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [4, 3]); // test closed: "left" (should not include right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Left, @@ -586,7 +586,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [4, 2]); // 02:00:00 -> 02:30:00 // test closed: "right" (should not include left end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::Right, @@ -601,7 +601,7 @@ fn test_boundaries_ms() { assert_eq!(groups[2], [5, 2]); // 02:00:00 -> 02:30:00 // test closed: "none" (should not include left or right end of interval) - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( w, &ts, ClosedWindow::None, @@ -638,7 +638,7 @@ fn test_rolling_lookback() { .unwrap(); // unwrapping as we pass None as the time zone // full lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("-2h"), &dates, @@ -659,7 +659,7 @@ fn test_rolling_lookback() { assert_eq!(groups[8], [5, 4]); // bound: 02:00 -> 04:00 time: 04:00 // partial lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("-1h"), &dates, @@ -680,7 +680,7 @@ fn test_rolling_lookback() { assert_eq!(groups[8], [7, 2]); // no lookbehind - let groups = groupby_values( + let groups = group_by_values( Duration::parse("2h"), Duration::parse("0h"), &dates, @@ -709,13 +709,20 @@ fn test_rolling_lookback() { ClosedWindow::None, ] { let offset = Duration::parse("-2h"); - let g0 = groupby_values_iter_lookbehind(period, offset, &dates, closed_window, tu, None, 0) - .collect::>>() - .unwrap(); - let g1 = - groupby_values_iter_partial_lookbehind(period, offset, &dates, closed_window, tu, None) + let g0 = + group_by_values_iter_lookbehind(period, offset, &dates, closed_window, tu, None, 0) .collect::>>() .unwrap(); + let g1 = group_by_values_iter_partial_lookbehind( + period, + offset, + &dates, + closed_window, + tu, + None, + ) + .collect::>>() + .unwrap(); assert_eq!(g0, g1); } } @@ -746,7 +753,7 @@ fn test_end_membership() { // 2021-03-01 -> 2021-05-01 members: None // 2021-04-01 -> 2021-06-01 members: [1] // 2021-05-01 -> 2021-07-01 members: [1] - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &time, ClosedWindow::Left, @@ -763,14 +770,14 @@ fn test_end_membership() { } #[test] -fn test_groupby_windows_membership_2791() { +fn test_group_by_windows_membership_2791() { let dates = [0, 0, 2, 2]; let window = Window::new( Duration::parse("1ms"), Duration::parse("1ms"), Duration::parse("0ns"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &dates, ClosedWindow::Left, @@ -785,7 +792,7 @@ fn test_groupby_windows_membership_2791() { } #[test] -fn test_groupby_windows_duplicates_2931() { +fn 
test_group_by_windows_duplicates_2931() { let dates = [0, 3, 3, 5, 5]; let window = Window::new( Duration::parse("1ms"), @@ -793,7 +800,7 @@ fn test_groupby_windows_duplicates_2931() { Duration::parse("0ns"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &dates, ClosedWindow::Left, @@ -807,7 +814,7 @@ fn test_groupby_windows_duplicates_2931() { } #[test] -fn test_groupby_windows_offsets_3776() { +fn test_group_by_windows_offsets_3776() { let dates = &[ NaiveDate::from_ymd_opt(2020, 12, 1).unwrap(), NaiveDate::from_ymd_opt(2021, 2, 1).unwrap(), @@ -823,7 +830,7 @@ fn test_groupby_windows_offsets_3776() { Duration::parse("2d"), Duration::parse("-2d"), ); - let (groups, _, _) = groupby_windows( + let (groups, _, _) = group_by_windows( window, &ts, ClosedWindow::Right, diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 3e66aaee16b6..7b6d0f3d3097 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -130,7 +130,7 @@ extract_jsonpath = [ ] string_encoding = ["polars-ops/string_encoding", "polars-core/strings"] binary_encoding = ["polars-ops/binary_encoding"] -groupby_list = ["polars-core/groupby_list"] +group_by_list = ["polars-core/group_by_list"] lazy_regex = ["polars-lazy/regex"] cum_agg = ["polars-core/cum_agg", "polars-core/cum_agg"] rolling_window = ["polars-core/rolling_window", "polars-lazy/rolling_window", "polars-time/rolling_window"] @@ -144,7 +144,7 @@ true_div = ["polars-lazy/true_div"] diagonal_concat = ["polars-core/diagonal_concat", "polars-lazy/diagonal_concat"] horizontal_concat = ["polars-core/horizontal_concat"] abs = ["polars-core/abs", "polars-lazy/abs"] -dynamic_groupby = ["polars-core/dynamic_groupby", "polars-lazy/dynamic_groupby"] +dynamic_group_by = ["polars-core/dynamic_group_by", "polars-lazy/dynamic_group_by"] ewma = ["polars-core/ewma", "polars-lazy/ewma"] dot_diagram = ["polars-lazy/dot_diagram"] dataframe_arithmetic = ["polars-core/dataframe_arithmetic"] @@ -327,7 +327,7 @@ docs-selection = [ "arg_where", "propagate_nans", "coalesce", - "dynamic_groupby", + "dynamic_group_by", "extract_groups", ] diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index f8030818d94d..e31a5ada79e7 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -18,7 +18,7 @@ //! * [Filter](#filter) //! * [Sort](#sort) //! * [Joins](#joins) -//! * [GroupBy](#groupby) +//! * [GroupBy](#group_by) //! - [pivot](#pivot) //! * [Melt](#melt) //! * [Explode](#explode) @@ -400,20 +400,20 @@ //! //! ## Groupby //! -//! Note that Polars lazy is a lot more powerful in and more performant in groupby operations. +//! Note that Polars lazy is a lot more powerful in and more performant in group_by operations. //! In lazy a myriad of aggregations can be combined from expressions. //! //! See more in: //! -//! * [Groupby](crate::frame::groupby::GroupBy) +//! * [Groupby](crate::frame::group_by::GroupBy) //! //! ### GroupBy //! ``` //! use polars::prelude::*; //! //! # fn example(df: &DataFrame) -> PolarsResult<()> { -//! // groupby "groups" | sum "foo" -//! let out = df.groupby(["groups"])? +//! // group_by "groups" | sum "foo" +//! let out = df.group_by(["groups"])? //! .select(["foo"]) //! .sum(); //! @@ -434,7 +434,7 @@ //! "bar" => ["k", "l", "m", "n", "0"] //! )?; //! -//! // groupby "foo" | pivot "bar" column | aggregate "N" +//! // group_by "foo" | pivot "bar" column | aggregate "N" //! 
let pivoted = pivot::pivot(&df, ["foo"], ["bar"], ["N"], false, Some(first()), None); //! //! // pivoted: diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index 0e72c5a540c4..0d2404b6a753 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -9,7 +9,7 @@ //! * [Start a lazy computation](#start-a-lazy-computation) //! * [Filter](#filter) //! * [Sort](#sort) -//! * [GroupBy](#groupby) +//! * [GroupBy](#group_by) //! * [Joins](#joins) //! * [Conditionally apply](#conditionally-apply) //! * [Black box function](#black-box-function) @@ -106,7 +106,7 @@ //! //! ## Groupby //! -//! This example is from the polars [user guide](https://pola-rs.github.io/polars-book/user-guide/concepts/contexts/#groupby-aggregation). +//! This example is from the polars [user guide](https://pola-rs.github.io/polars-book/user-guide/concepts/contexts/#group_by-aggregation). //! //! ``` //! use polars::prelude::*; @@ -116,7 +116,7 @@ //! .has_header(true) //! .with_delimiter(b',') //! .finish()? -//! .groupby([col("comment_karma")]) +//! .group_by([col("comment_karma")]) //! .agg([col("name").n_unique().alias("unique_names"), col("link_karma").max()]) //! // take only 100 rows. //! .fetch(100)?; diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index b16dc3917509..530bb4db3da6 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -14,7 +14,7 @@ //! # fn example() -> PolarsResult<()> { //! //! let lf1 = LazyFrame::scan_parquet("myfile_1.parquet", Default::default())? -//! .groupby([col("ham")]) +//! .group_by([col("ham")]) //! .agg([ //! // expressions can be combined into powerful aggregations //! col("foo") @@ -201,7 +201,7 @@ //! * gzip //! //! * `DataFrame` operations: -//! - `dynamic_groupby` - Groupby based on a time window instead of predefined keys. +//! - `dynamic_group_by` - Groupby based on a time window instead of predefined keys. //! Also activates rolling window group by operations. //! - `sort_multiple` - Allow sorting a `DataFrame` on multiple columns //! - `rows` - Create `DataFrame` from rows and extract rows from `DataFrames`. @@ -209,7 +209,7 @@ //! - `asof_join` - Join ASOF, to join on nearest keys instead of exact equality match. //! - `cross_join` - Create the cartesian product of two DataFrames. //! - `semi_anti_join` - SEMI and ANTI joins. -//! - `groupby_list` - Allow groupby operation on keys of type List. +//! - `group_by_list` - Allow group_by operation on keys of type List. //! - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked //! - `diagonal_concat` - Concat diagonally thereby combining different schemas. //! - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match @@ -358,11 +358,11 @@ //! * `POLARS_TABLE_WIDTH` -> width of the tables used during DataFrame formatting. //! * `POLARS_MAX_THREADS` -> maximum number of threads used to initialize thread pool (on startup). //! * `POLARS_VERBOSE` -> print logging info to stderr. -//! * `POLARS_NO_PARTITION` -> polars may choose to partition the groupby operation, based on data -//! cardinality. Setting this env var will turn partitioned groupby's off. -//! * `POLARS_PARTITION_UNIQUE_COUNT` -> at which (estimated) key count a partitioned groupby should run. -//! defaults to `1000`, any higher cardinality will run default groupby. -//! * `POLARS_FORCE_PARTITION` -> force partitioned groupby if the keys and aggregations allow it. +//! 
* `POLARS_NO_PARTITION` -> polars may choose to partition the group_by operation, based on data +//! cardinality. Setting this env var will turn partitioned group_by's off. +//! * `POLARS_PARTITION_UNIQUE_COUNT` -> at which (estimated) key count a partitioned group_by should run. +//! defaults to `1000`, any higher cardinality will run default group_by. +//! * `POLARS_FORCE_PARTITION` -> force partitioned group_by if the keys and aggregations allow it. //! * `POLARS_ALLOW_EXTENSION` -> allows for `[ObjectChunked]` to be used in arrow, opening up possibilities like using //! `T` in complex lazy expressions. However this does require `unsafe` code allow this. //! * `POLARS_NO_PARQUET_STATISTICS` -> if set, statistics in parquet files are ignored. diff --git a/crates/polars/src/prelude.rs b/crates/polars/src/prelude.rs index 044cd574c028..f8901ec52fbb 100644 --- a/crates/polars/src/prelude.rs +++ b/crates/polars/src/prelude.rs @@ -1,6 +1,6 @@ #[cfg(feature = "polars-algo")] pub use polars_algo::prelude::*; -pub use polars_core::frame::groupby::*; +pub use polars_core::frame::group_by::*; pub use polars_core::prelude::*; pub use polars_core::utils::NoNull; #[cfg(feature = "polars-io")] diff --git a/crates/polars/tests/it/core/groupby.rs b/crates/polars/tests/it/core/group_by.rs similarity index 98% rename from crates/polars/tests/it/core/groupby.rs rename to crates/polars/tests/it/core/group_by.rs index 9a5696e5aa35..f14caad753dd 100644 --- a/crates/polars/tests/it/core/groupby.rs +++ b/crates/polars/tests/it/core/group_by.rs @@ -3,7 +3,7 @@ use polars_core::series::IsSorted; use super::*; #[test] -fn test_sorted_groupby() -> PolarsResult<()> { +fn test_sorted_group_by() -> PolarsResult<()> { // nulls last let mut s = Series::new("a", &[Some(1), Some(1), Some(1), Some(6), Some(6), None]); s.set_sorted_flag(IsSorted::Ascending); diff --git a/crates/polars/tests/it/core/mod.rs b/crates/polars/tests/it/core/mod.rs index a2017782af9d..76adb01d5677 100644 --- a/crates/polars/tests/it/core/mod.rs +++ b/crates/polars/tests/it/core/mod.rs @@ -1,5 +1,5 @@ mod date_like; -mod groupby; +mod group_by; mod joins; mod list; mod ops; diff --git a/crates/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs index ea286750123f..fa224b01f4ee 100644 --- a/crates/polars/tests/it/joins.rs +++ b/crates/polars/tests/it/joins.rs @@ -12,10 +12,10 @@ fn join_nans_outer() -> PolarsResult<()> { .lazy(); let a1 = df1 .clone() - .groupby(vec![col("w").alias("w"), col("t")]) + .group_by(vec![col("w").alias("w"), col("t")]) .agg(vec![col("c").sum().alias("c_sum")]); let a2 = df1 - .groupby(vec![col("w").alias("w"), col("t")]) + .group_by(vec![col("w").alias("w"), col("t")]) .agg(vec![col("c").max().alias("c_max")]); let res = a1 @@ -42,7 +42,7 @@ fn join_empty_datasets() -> PolarsResult<()> { .unwrap(); a.lazy() - .groupby([col("foo")]) + .group_by([col("foo")]) .agg([all().last()]) .inner_join(b.lazy(), "foo", "foo") .collect() diff --git a/crates/polars/tests/it/lazy/aggregation.rs b/crates/polars/tests/it/lazy/aggregation.rs index 180db6a5b85e..21e19152303e 100644 --- a/crates/polars/tests/it/lazy/aggregation.rs +++ b/crates/polars/tests/it/lazy/aggregation.rs @@ -51,7 +51,7 @@ fn test_lazy_agg() { let lf = df .lazy() - .groupby([col("date")]) + .group_by([col("date")]) .agg([ col("rain").min().alias("min"), col("rain").sum().alias("sum"), @@ -90,7 +90,7 @@ fn test_apply_multiple_error() { let _res = df .lazy() .with_streaming(false) - .groupby_stable([col("rf")]) + .group_by_stable([col("rf")]) .agg([issue()]) 
.collect() .unwrap(); diff --git a/crates/polars/tests/it/lazy/expressions/apply.rs b/crates/polars/tests/it/lazy/expressions/apply.rs index 09ed578e9ab2..1c61aca40be3 100644 --- a/crates/polars/tests/it/lazy/expressions/apply.rs +++ b/crates/polars/tests/it/lazy/expressions/apply.rs @@ -28,7 +28,7 @@ fn test_groups_update() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("group")]) + .group_by_stable([col("group")]) .agg([col("id").unique_counts().log(2.0)]) .explode([col("id")]) .collect()?; @@ -50,7 +50,7 @@ fn test_groups_update_binary_shift_log() -> PolarsResult<()> { "b" => [1, 2, 1, 2], ]? .lazy() - .groupby([col("b")]) + .group_by([col("b")]) .agg([col("a") - col("a").shift(1).log(2.0)]) .sort("b", Default::default()) .explode([col("a")]) @@ -93,7 +93,7 @@ fn test_apply_groups_empty() -> PolarsResult<()> { let out = df .lazy() .filter(col("id").eq(lit(2))) - .groupby([col("id")]) + .group_by([col("id")]) .agg([col("hi").drop_nulls().unique()]) .collect()?; diff --git a/crates/polars/tests/it/lazy/expressions/arity.rs b/crates/polars/tests/it/lazy/expressions/arity.rs index 07e7c6700ff5..290bd9f3efca 100644 --- a/crates/polars/tests/it/lazy/expressions/arity.rs +++ b/crates/polars/tests/it/lazy/expressions/arity.rs @@ -10,7 +10,7 @@ fn test_list_broadcast() { ] .unwrap() .lazy() - .groupby([col("g")]) + .group_by([col("g")]) .agg([col("a").unique_counts() * count()]) .collect() .unwrap(); @@ -161,7 +161,7 @@ fn test_when_then_otherwise_single_bool() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("key")]) + .group_by_stable([col("key")]) .agg([when(col("val").null_count().gt(lit(0))) .then(Null {}.lit()) .otherwise(col("val").sum()) @@ -191,7 +191,7 @@ fn test_update_groups_in_cast() -> PolarsResult<()> { // in aggregation that cast coerces a list and the cast may forget to update groups let out = df .lazy() - .groupby_stable([col("group")]) + .group_by_stable([col("group")]) .agg([col("id").unique_counts() * lit(-1)]) .collect()?; @@ -214,7 +214,7 @@ fn test_when_then_otherwise_sum_in_agg() -> PolarsResult<()> { let q = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([when(all().exclude(["groups"]).sum().eq(lit(1))) .then(all().exclude(["groups"]).sum()) .otherwise(lit(NULL))]) @@ -292,7 +292,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(col("value").rank(Default::default(), None)) .otherwise(lit(Series::new("", &[10 as IdxSize])))]) @@ -312,7 +312,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(lit(Series::new("", &[10 as IdxSize])).alias("value")) .otherwise(col("value").rank(Default::default(), None))]) @@ -332,7 +332,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(col("value").rank(Default::default(), None)) .otherwise(Null {}.lit())]) @@ -346,7 +346,7 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { // swapped branch let out = df .lazy() - .groupby([col("name")]) + .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(Null {}.lit().alias("value")) .otherwise(col("value").rank(Default::default(), None))]) @@ -370,7 +370,7 @@ fn 
test_binary_group_consistency() -> PolarsResult<()> { .lazy(); let out = lf - .groupby([col("category")]) + .group_by([col("category")]) .agg([col("name").filter(col("score").eq(col("score").max()))]) .sort("category", Default::default()) .collect()?; diff --git a/crates/polars/tests/it/lazy/expressions/filter.rs b/crates/polars/tests/it/lazy/expressions/filter.rs index 706f764e5a37..2d60525c3d1a 100644 --- a/crates/polars/tests/it/lazy/expressions/filter.rs +++ b/crates/polars/tests/it/lazy/expressions/filter.rs @@ -1,7 +1,7 @@ use super::*; #[test] -fn test_filter_in_groupby_agg() -> PolarsResult<()> { +fn test_filter_in_group_by_agg() -> PolarsResult<()> { // This tests if the filter is correctly handled by the binary expression. // This could lead to UB if it were not the case. The filter creates an empty column. // but the group tuples could still be untouched leading to out of bounds aggregation. @@ -13,7 +13,7 @@ fn test_filter_in_groupby_agg() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([(col("b").filter(col("b").eq(lit(100))) * lit(2)) .mean() .alias("b_mean")]) @@ -23,7 +23,7 @@ fn test_filter_in_groupby_agg() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("a")]) + .group_by([col("a")]) .agg([(col("b") .filter(col("b").eq(lit(100))) .map(|v| Ok(Some(v)), GetOutput::same_type())) diff --git a/crates/polars/tests/it/lazy/expressions/slice.rs b/crates/polars/tests/it/lazy/expressions/slice.rs index 0d996d9ccc83..c57e8f83b7ad 100644 --- a/crates/polars/tests/it/lazy/expressions/slice.rs +++ b/crates/polars/tests/it/lazy/expressions/slice.rs @@ -14,7 +14,7 @@ fn test_slice_args() -> PolarsResult<()> { "vals" => 0i32..30 ]? .lazy() - .groupby_stable([col("groups")]) + .group_by_stable([col("groups")]) .agg([col("vals").slice(lit(0i64), count() * lit(0.2))]) .collect()?; diff --git a/crates/polars/tests/it/lazy/groupby.rs b/crates/polars/tests/it/lazy/group_by.rs similarity index 90% rename from crates/polars/tests/it/lazy/groupby.rs rename to crates/polars/tests/it/lazy/group_by.rs index 6e3ed7e09666..9ed05bc25bd0 100644 --- a/crates/polars/tests/it/lazy/groupby.rs +++ b/crates/polars/tests/it/lazy/group_by.rs @@ -18,7 +18,7 @@ fn test_filter_sort_diff_2984() -> PolarsResult<()> { let out = df .lazy() // don't use stable in this test, it hides wrong state - .groupby([col("group")]) + .group_by([col("group")]) .agg([col("id") .filter(col("id").lt(lit(3))) .sort(false) @@ -40,7 +40,7 @@ fn test_filter_after_tail() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("a")]) + .group_by_stable([col("a")]) .tail(Some(1)) .filter(col("b").eq(lit(3))) .with_predicate_pushdown(false) @@ -66,7 +66,7 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("user")]) + .group_by([col("user")]) .agg([(col("value") .filter(col("group").eq(lit(1))) .diff(1, Default::default()) @@ -83,14 +83,14 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { } #[test] -fn test_groupby_lit_agg() -> PolarsResult<()> { +fn test_group_by_lit_agg() -> PolarsResult<()> { let df = df![ "group" => [1, 2, 1, 1, 2], ]?; let out = df .lazy() - .groupby([col("group")]) + .group_by([col("group")]) .agg([lit("foo").alias("foo")]) .collect()?; @@ -101,7 +101,7 @@ fn test_groupby_lit_agg() -> PolarsResult<()> { #[test] #[cfg(feature = "diff")] -fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { +fn test_group_by_agg_list_with_not_aggregated() -> PolarsResult<()> { let df = df![ 
"group" => ["a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"], "value" => [0, 2, 3, 6, 2, 4, 7, 9, 3, 4, 6, 7, ], @@ -109,7 +109,7 @@ fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("group")]) + .group_by([col("group")]) .agg([when(col("value").diff(1, NullBehavior::Ignore).gt_eq(0)) .then(col("value").diff(1, NullBehavior::Ignore)) .otherwise(col("value"))]) @@ -127,7 +127,7 @@ fn test_groupby_agg_list_with_not_aggregated() -> PolarsResult<()> { #[test] #[cfg(all(feature = "dtype-duration", feature = "dtype-struct"))] -fn test_logical_mean_partitioned_groupby_block() -> PolarsResult<()> { +fn test_logical_mean_partitioned_group_by_block() -> PolarsResult<()> { let _guard = SINGLE_LOCK.lock(); let df = df![ "a" => [1, 1, 2], @@ -137,7 +137,7 @@ fn test_logical_mean_partitioned_groupby_block() -> PolarsResult<()> { let out = df .lazy() .with_column(col("duration").cast(DataType::Duration(TimeUnit::Microseconds))) - .groupby([col("a")]) + .group_by([col("a")]) .agg([col("duration").mean()]) .sort("duration", Default::default()) .collect()?; @@ -164,7 +164,7 @@ fn test_filter_aggregated_expression() -> PolarsResult<()> { let df = df .lazy() - .groupby([col("day")]) + .group_by([col("day")]) .agg([(col("x") - col("x").first()).filter(f)]) .sort("day", Default::default()) .collect() diff --git a/crates/polars/tests/it/lazy/groupby_dynamic.rs b/crates/polars/tests/it/lazy/group_by_dynamic.rs similarity index 92% rename from crates/polars/tests/it/lazy/groupby_dynamic.rs rename to crates/polars/tests/it/lazy/group_by_dynamic.rs index 44be5a7bcf19..1fa5ec6a396f 100644 --- a/crates/polars/tests/it/lazy/groupby_dynamic.rs +++ b/crates/polars/tests/it/lazy/group_by_dynamic.rs @@ -1,8 +1,8 @@ -// used only if feature="temporal", "dtype-date", "dynamic_groupby" +// used only if feature="temporal", "dtype-date", "dynamic_group_by" #[allow(unused_imports)] use polars::export::chrono::prelude::*; -// used only if feature="temporal", "dtype-date", "dynamic_groupby" +// used only if feature="temporal", "dtype-date", "dynamic_group_by" #[allow(unused_imports)] use super::*; @@ -10,9 +10,9 @@ use super::*; #[cfg(all( feature = "temporal", feature = "dtype-date", - feature = "dynamic_groupby" + feature = "dynamic_group_by" ))] -fn test_groupby_dynamic_week_bounds() -> PolarsResult<()> { +fn test_group_by_dynamic_week_bounds() -> PolarsResult<()> { let start = NaiveDate::from_ymd_opt(2022, 2, 1) .unwrap() .and_hms_opt(0, 0, 0) @@ -40,7 +40,7 @@ fn test_groupby_dynamic_week_bounds() -> PolarsResult<()> { let out = df .lazy() - .groupby_dynamic( + .group_by_dynamic( col("dt"), [], DynamicGroupOptions { diff --git a/crates/polars/tests/it/lazy/mod.rs b/crates/polars/tests/it/lazy/mod.rs index 7dc6c62bb775..a5808478f150 100644 --- a/crates/polars/tests/it/lazy/mod.rs +++ b/crates/polars/tests/it/lazy/mod.rs @@ -5,8 +5,8 @@ mod explodes; mod expressions; mod folds; mod functions; -mod groupby; -mod groupby_dynamic; +mod group_by; +mod group_by_dynamic; mod predicate_queries; mod projection_queries; mod queries; diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index c83752f88100..d0af51efaab3 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -33,8 +33,8 @@ fn test_drop() -> PolarsResult<()> { } #[test] -#[cfg(feature = "dynamic_groupby")] -fn test_special_groupby_schemas() -> PolarsResult<()> { +#[cfg(feature = "dynamic_group_by")] +fn 
test_special_group_by_schemas() -> PolarsResult<()> { let df = df![ "a" => [1, 2, 3, 4, 5], "b" => [1, 2, 3, 4, 5], @@ -44,7 +44,7 @@ fn test_special_groupby_schemas() -> PolarsResult<()> { .clone() .lazy() .with_column(col("a").set_sorted_flag(IsSorted::Ascending)) - .groupby_rolling( + .group_by_rolling( col("a"), [], RollingGroupOptions { @@ -69,7 +69,7 @@ fn test_special_groupby_schemas() -> PolarsResult<()> { let out = df .lazy() .with_column(col("a").set_sorted_flag(IsSorted::Ascending)) - .groupby_dynamic( + .group_by_dynamic( col("a"), [], DynamicGroupOptions { @@ -108,7 +108,7 @@ fn max_on_empty_df_3027() -> PolarsResult<()> { let out = df .lazy() - .groupby(&[col("id"), col("name")]) + .group_by(&[col("id"), col("name")]) .agg(&[col("numb").max()]) .collect()?; assert_eq!(out.shape(), (0, 3)); @@ -144,7 +144,7 @@ fn test_sorted_path() -> PolarsResult<()> { .lazy() .with_row_count("row_nr", None) .explode(["a"]) - .groupby(["row_nr"]) + .group_by(["row_nr"]) .agg([col("a").count().alias("count")]) .collect()?; @@ -215,7 +215,7 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { let out = df .lazy() - .groupby_stable([col("cars")]) + .group_by_stable([col("cars")]) .agg([apply_multiple( multiply, [col("A"), col("B")], @@ -233,7 +233,7 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { } #[test] -fn test_groupby_on_lists() -> PolarsResult<()> { +fn test_group_by_on_lists() -> PolarsResult<()> { let s0 = Series::new("", [1i32, 2, 3]); let s1 = Series::new("groups", [4i32, 5]); @@ -247,7 +247,7 @@ fn test_groupby_on_lists() -> PolarsResult<()> { let out = df .clone() .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("arrays").first()]) .collect()?; @@ -258,7 +258,7 @@ fn test_groupby_on_lists() -> PolarsResult<()> { let out = df .lazy() - .groupby([col("groups")]) + .group_by([col("groups")]) .agg([col("arrays").implode()]) .collect()?; diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index ce25037a4e48..608895f110bc 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -45,7 +45,7 @@ features = [ "dot_product", "dtype-categorical", "dtype-full", - "dynamic_groupby", + "dynamic_group_by", "ewma", "fmt", "horizontal_concat", @@ -169,7 +169,7 @@ all = [ "build_info", "cse", "propagate_nans", - "polars/groupby_list", + "polars/group_by_list", "polars/fused", "sql", "binary_encoding", diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/groupby.py index 33a4dbea9a44..4f6259d6bcd3 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/groupby.py @@ -333,7 +333,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: raise TypeError("cannot call `apply` when grouping by an expression") return self.df.__class__._from_pydf( - self.df._df.groupby_apply(by, function, self.maintain_order) + self.df._df.group_by_apply(by, function, self.maintain_order) ) def head(self, n: int = 5) -> DataFrame: diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 90f9fc452938..c40b6c72d15a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2533,7 +2533,7 @@ def groupby( """ exprs = parse_as_list_of_expressions(by, *more_by) - lgb = self._ldf.groupby(exprs, maintain_order) + lgb = self._ldf.group_by(exprs, maintain_order) return LazyGroupBy(lgb) def groupby_rolling( @@ -2681,7 +2681,7 @@ def groupby_rolling( period = _timedelta_to_pl_duration(period) offset = _timedelta_to_pl_duration(offset) - lgb = 
self._ldf.groupby_rolling( + lgb = self._ldf.group_by_rolling( index_column, period, offset, closed, pyexprs_by, check_sorted ) return LazyGroupBy(lgb) @@ -3026,7 +3026,7 @@ def groupby_dynamic( every = _timedelta_to_pl_duration(every) pyexprs_by = parse_as_list_of_expressions(by) if by is not None else [] - lgb = self._ldf.groupby_dynamic( + lgb = self._ldf.group_by_dynamic( index_column, every, period, diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index b3e77cf07f96..355b24d5bec8 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1130,16 +1130,16 @@ impl PyDataFrame { Ok(df.into()) } - pub fn groupby_apply( + pub fn group_by_apply( &self, by: Vec<&str>, lambda: PyObject, maintain_order: bool, ) -> PyResult { let gb = if maintain_order { - self.df.groupby_stable(&by) + self.df.group_by_stable(&by) } else { - self.df.groupby(&by) + self.df.group_by(&by) } .map_err(PyPolarsErr::from)?; diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index 6d3b77400a8a..ae7df9579ff0 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -550,19 +550,19 @@ impl PyLazyFrame { ldf.select_seq(exprs).into() } - fn groupby(&mut self, by: Vec, maintain_order: bool) -> PyLazyGroupBy { + fn group_by(&mut self, by: Vec, maintain_order: bool) -> PyLazyGroupBy { let ldf = self.ldf.clone(); let by = by.to_exprs(); let lazy_gb = if maintain_order { - ldf.groupby_stable(by) + ldf.group_by_stable(by) } else { - ldf.groupby(by) + ldf.group_by(by) }; PyLazyGroupBy { lgb: Some(lazy_gb) } } - fn groupby_rolling( + fn group_by_rolling( &mut self, index_column: PyExpr, period: &str, @@ -577,7 +577,7 @@ impl PyLazyFrame { .into_iter() .map(|pyexpr| pyexpr.inner) .collect::>(); - let lazy_gb = ldf.groupby_rolling( + let lazy_gb = ldf.group_by_rolling( index_column.inner, by, RollingGroupOptions { @@ -593,7 +593,7 @@ impl PyLazyFrame { } #[allow(clippy::too_many_arguments)] - fn groupby_dynamic( + fn group_by_dynamic( &mut self, index_column: PyExpr, every: &str, @@ -612,7 +612,7 @@ impl PyLazyFrame { .map(|pyexpr| pyexpr.inner) .collect::>(); let ldf = self.ldf.clone(); - let lazy_gb = ldf.groupby_dynamic( + let lazy_gb = ldf.group_by_dynamic( index_column.inner, by, DynamicGroupOptions { diff --git a/py-polars/tests/unit/operations/test_groupby_rolling.py b/py-polars/tests/unit/operations/test_groupby_rolling.py index 193693178165..36be0b12bf1e 100644 --- a/py-polars/tests/unit/operations/test_groupby_rolling.py +++ b/py-polars/tests/unit/operations/test_groupby_rolling.py @@ -248,14 +248,14 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: # no `by` argument with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_dynamic' is not explicitly sorted", + match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): df.groupby_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) # no `by` argument with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_rolling' is not explicitly sorted", + match=r"argument in operation 'group_by_rolling' is not explicitly sorted", ): df.groupby_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index ba07619b104c..12cd4f84680c 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -75,7 +75,7 @@ def test_empty_9137() -> None: def test_empty_groupby_apply_err() -> None: df = 
pl.DataFrame(schema={"x": pl.Int64}) with pytest.raises( - pl.ComputeError, match=r"cannot groupby \+ apply on empty 'DataFrame'" + pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" ): df.groupby("x").apply(lambda x: x) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index fc6e7b2a96e5..f79c7752e0b8 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -17,7 +17,7 @@ def test_error_on_empty_groupby() -> None: with pytest.raises( - pl.ComputeError, match="at least one key is required in a groupby operation" + pl.ComputeError, match="at least one key is required in a group_by operation" ): pl.DataFrame({"x": [0, 0, 1, 1]}).groupby([]).agg(pl.count()) @@ -618,7 +618,7 @@ def test_no_sorted_err() -> None: ) with pytest.raises( pl.InvalidOperationError, - match=r"argument in operation 'groupby_dynamic' is not explicitly sorted", + match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): df.groupby_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) From 4b388e3c0c0369a05c690f9a5c6e77827802d7c5 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 22 Aug 2023 03:01:09 +0800 Subject: [PATCH 30/55] feat(rust, python): support cast to list (#10623) --- Cargo.toml | 2 +- py-polars/tests/unit/datatypes/test_list.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index da68e35af692..0ec7f425cce6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ xxhash-rust = { version = "0.8.6", features = ["xxh3"] } [workspace.dependencies.arrow] package = "arrow2" git = "https://github.com/jorgecarleitao/arrow2" -rev = "2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" +rev = "ba6a882bc1542b0b899774b696ebea77482b5c31" # branch = "" # version = "0.17.4" default-features = false diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 70f2117caad4..53901bab76a1 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -5,6 +5,7 @@ import pandas as pd import polars as pl +from polars.testing import assert_series_equal def test_dtype() -> None: @@ -439,6 +440,15 @@ def test_list_recursive_categorical_cast() -> None: assert s.to_list() == values +def test_non_nested_cast_to_list() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + + df = df.with_columns([pl.col("a").cast(pl.List(pl.Int64))]) + + expected = pl.Series("a", [[1], [2], [3]]) + assert_series_equal(df.to_series(), expected) + + def test_list_new_from_index_logical() -> None: s = ( pl.select(pl.struct(pl.Series("a", [date(2001, 1, 1)])).implode()) From 29fff1772606b40ebea30279e3b1bda8665526de Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 05:49:08 +0200 Subject: [PATCH 31/55] fix(rust): `AllHorizontal` format string (#10658) --- .github/workflows/lint-py-polars.yml | 2 ++ crates/polars-plan/src/dsl/function_expr/boolean.rs | 2 +- py-polars/Cargo.lock | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint-py-polars.yml b/.github/workflows/lint-py-polars.yml index 47e05ff3bbfa..2af80ca0b9e6 100644 --- a/.github/workflows/lint-py-polars.yml +++ b/.github/workflows/lint-py-polars.yml @@ -4,6 +4,7 @@ on: pull_request: paths: - crates/** + - Cargo.toml - py-polars/src/** - py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml @@ -12,6 +13,7 @@ on: - main paths: - crates/** + - Cargo.toml - py-polars/src/** - 
py-polars/Cargo.toml - .github/workflows/lint-py-polars.yml diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index fc2eb6307c19..41d9ceebcaeb 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -67,7 +67,7 @@ impl Display for BooleanFunction { #[cfg(feature = "is_in")] IsIn => "is_in", AnyHorizontal => "any_horizontal", - AllHorizontal => "any_horizontal", + AllHorizontal => "all_horizontal", }; write!(f, "{s}") } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 32fef5e7a898..740d3614dc21 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -99,7 +99,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.17.4" -source = "git+https://github.com/jorgecarleitao/arrow2?rev=2b3e2a9e83725a557d78b90cd39298c5bef0ca4a#2b3e2a9e83725a557d78b90cd39298c5bef0ca4a" +source = "git+https://github.com/jorgecarleitao/arrow2?rev=ba6a882bc1542b0b899774b696ebea77482b5c31#ba6a882bc1542b0b899774b696ebea77482b5c31" dependencies = [ "ahash", "arrow-format", From 5fbb7197e050a10b3b3f8af750508c0ec229a310 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 07:24:49 +0200 Subject: [PATCH 32/55] refactor(rust): Clean up schema calculation for `date_range` (#10653) --- .../src/dsl/function_expr/schema.rs | 115 ++++++++++-------- py-polars/tests/unit/functions/test_range.py | 10 +- 2 files changed, 69 insertions(+), 56 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index 3f777fa25510..91c272a0f082 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -68,7 +68,11 @@ impl FunctionExpr { time_zone, } => { // output dtype may change based on `every`, `time_unit`, and `time_zone` - let dtype = mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + let dtype = mapper.map_to_date_range_dtype( + every, + time_unit.as_ref(), + time_zone.as_deref(), + )?; return Ok(Field::new("date", dtype)); }, DateRanges { @@ -78,8 +82,11 @@ impl FunctionExpr { time_zone, } => { // output dtype may change based on `every`, `time_unit`, and `time_zone` - let inner_dtype = - mapper.map_to_date_range_dtype(every, time_unit, time_zone)?; + let inner_dtype = mapper.map_to_date_range_dtype( + every, + time_unit.as_ref(), + time_zone.as_deref(), + )?; return Ok(Field::new( "date_range", DataType::List(Box::new(inner_dtype)), @@ -379,59 +386,65 @@ impl<'a> FieldsMapper<'a> { pub(super) fn map_to_date_range_dtype( &self, every: &Duration, - time_unit: &Option, - tz: &Option, + time_unit: Option<&TimeUnit>, + time_zone: Option<&str>, ) -> PolarsResult { - let inner_dtype = match (&self.map_to_supertype()?.dtype, time_unit, tz, every) { - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, Some(field_tz)), time_unit, Some(tz), _) => { - if field_tz != tz { - polars_bail!(ComputeError: format!("Given time_zone is different from that of timezone aware datetimes. 
\ - Given: '{}', got: '{}'.", tz, field_tz)) - } - if let Some(time_unit) = time_unit { - DataType::Datetime(*time_unit, Some(tz.to_string())) - } else { - DataType::Datetime(*tu, Some(tz.to_string())) - } - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(_, Some(tz)), Some(time_unit), _, _) => { - DataType::Datetime(*time_unit, Some(tz.to_string())) - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, Some(tz)), None, _, _) => { - DataType::Datetime(*tu, Some(tz.to_string())) - }, - #[cfg(feature = "timezones")] - (DataType::Datetime(_, _), Some(time_unit), Some(tz), _) => { - DataType::Datetime(*time_unit, Some(tz.to_string())) + let data_dtype = self.map_to_supertype()?.dtype; + match data_dtype { + DataType::Datetime(tu, tz) => { + self.map_datetime_to_date_range_dtype(tu, tz, time_unit, time_zone) }, - #[cfg(feature = "timezones")] - (DataType::Datetime(tu, _), None, Some(tz), _) => { - DataType::Datetime(*tu, Some(tz.to_string())) + DataType::Date => { + let schema_dtype = self.map_date_to_date_range_dtype(every, time_unit, time_zone); + Ok(schema_dtype) }, - (DataType::Datetime(_, _), Some(time_unit), _, _) => { - DataType::Datetime(*time_unit, None) - }, - (DataType::Datetime(tu, _), None, _, _) => DataType::Datetime(*tu, None), - (DataType::Date, time_unit, time_zone, every) => { - let nsecs = every.nanoseconds(); - if nsecs == 0 { - DataType::Date - } else if let Some(tu) = time_unit { - DataType::Datetime(*tu, time_zone.clone()) - } else if nsecs % 1000 != 0 { - DataType::Datetime(TimeUnit::Nanoseconds, time_zone.clone()) - } else { - DataType::Datetime(TimeUnit::Microseconds, time_zone.clone()) - } - }, - (dtype, _, _, _) => { - polars_bail!(ComputeError: "expected Date or Datetime, got {}", dtype) + _ => polars_bail!(ComputeError: "expected Date or Datetime, got {}", data_dtype), + } + } + #[cfg(feature = "temporal")] + fn map_datetime_to_date_range_dtype( + &self, + data_time_unit: TimeUnit, + data_time_zone: Option, + given_time_unit: Option<&TimeUnit>, + given_time_zone: Option<&str>, + ) -> PolarsResult { + let schema_time_zone = match (data_time_zone, given_time_zone) { + (Some(data_tz), Some(given_tz)) => { + polars_ensure!( + data_tz == given_tz, + ComputeError: format!( + "`time_zone` does not match the data\ + \n\nData has time zone '{}', got '{}'.", data_tz, given_tz) + ); + Some(data_tz) }, + (_, Some(given_tz)) => Some(given_tz.to_string()), + (Some(data_tz), None) => Some(data_tz), + (_, _) => None, }; - Ok(inner_dtype) + let schema_time_unit = given_time_unit.unwrap_or(&data_time_unit); + + let schema_dtype = DataType::Datetime(*schema_time_unit, schema_time_zone); + Ok(schema_dtype) + } + #[cfg(feature = "temporal")] + fn map_date_to_date_range_dtype( + &self, + every: &Duration, + time_unit: Option<&TimeUnit>, + time_zone: Option<&str>, + ) -> DataType { + let nsecs = every.nanoseconds(); + if nsecs == 0 { + DataType::Date + } else if let Some(tu) = time_unit { + DataType::Datetime(*tu, time_zone.map(String::from)) + } else if nsecs % 1000 != 0 { + DataType::Datetime(TimeUnit::Nanoseconds, time_zone.map(String::from)) + } else { + DataType::Datetime(TimeUnit::Microseconds, time_zone.map(String::from)) + } } /// Map the dtypes to the "supertype" of a list of lists. 
diff --git a/py-polars/tests/unit/functions/test_range.py b/py-polars/tests/unit/functions/test_range.py index abeb42bb2786..436dd2add813 100644 --- a/py-polars/tests/unit/functions/test_range.py +++ b/py-polars/tests/unit/functions/test_range.py @@ -230,7 +230,10 @@ def test_date_range_lazy_time_zones_invalid() -> None: stop = datetime(2020, 1, 2, tzinfo=ZoneInfo("Asia/Kathmandu")) with pytest.raises( ComputeError, - match="Given time_zone is different from that of timezone aware datetimes. Given: 'Pacific/Tarawa', got: 'Asia/Kathmandu", + match=( + "`time_zone` does not match the data" + "\n\nData has time zone 'Asia/Kathmandu', got 'Pacific/Tarawa'." + ), ), pytest.warns(TimeZoneAwareConstructorWarning, match="Series with UTC"): ( pl.DataFrame({"start": [start], "stop": [stop]}) @@ -380,10 +383,7 @@ def test_timezone_aware_date_range() -> None: eager=True, ) - with pytest.raises( - ComputeError, - match=r"Given time_zone is different from that of timezone aware datetimes", - ): + with pytest.raises(ComputeError, match="`time_zone` does not match the data"): pl.date_range( low, high, interval=timedelta(days=5), time_zone="UTC", eager=True ) From c33d7055f1350a362dda2b14b3f7bb8e1c0f5f8a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 07:27:37 +0200 Subject: [PATCH 33/55] refactor(rust): move 'is_in' to polars-ops (#10645) --- crates/polars-core/Cargo.toml | 2 - .../src/chunked_array/ops/is_in.rs | 403 ------------------ .../polars-core/src/chunked_array/ops/mod.rs | 11 - .../src/series/implementations/binary.rs | 4 - .../src/series/implementations/boolean.rs | 4 - .../src/series/implementations/categorical.rs | 7 - .../src/series/implementations/dates_time.rs | 4 - .../src/series/implementations/datetime.rs | 4 - .../src/series/implementations/duration.rs | 4 - .../src/series/implementations/floats.rs | 4 - .../src/series/implementations/mod.rs | 4 - .../src/series/implementations/struct_.rs | 5 - .../src/series/implementations/utf8.rs | 4 - crates/polars-core/src/series/series_trait.rs | 5 - crates/polars-lazy/Cargo.toml | 2 +- crates/polars-ops/Cargo.toml | 1 + crates/polars-ops/src/series/ops/is_in.rs | 400 +++++++++++++++++ crates/polars-ops/src/series/ops/mod.rs | 4 + crates/polars-plan/Cargo.toml | 2 +- .../src/dsl/function_expr/boolean.rs | 2 +- .../polars-plan/src/dsl/function_expr/list.rs | 2 +- crates/polars/Cargo.toml | 2 +- crates/polars/src/lib.rs | 4 +- py-polars/polars/io/_utils.py | 2 +- 24 files changed, 413 insertions(+), 473 deletions(-) delete mode 100644 crates/polars-core/src/chunked_array/ops/is_in.rs create mode 100644 crates/polars-ops/src/series/ops/is_in.rs diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index d0a2f41c6ef2..adb530002559 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -77,7 +77,6 @@ sort_multiple = [] rows = [] # operations -is_in = ["reinterpret"] zip_with = [] round_series = [] checked_arithmetic = [] @@ -141,7 +140,6 @@ serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde", "smartstring/serd docs-selection = [ "ndarray", - "is_in", "rows", "docs", "strings", diff --git a/crates/polars-core/src/chunked_array/ops/is_in.rs b/crates/polars-core/src/chunked_array/ops/is_in.rs deleted file mode 100644 index cf4becb1cb21..000000000000 --- a/crates/polars-core/src/chunked_array/ops/is_in.rs +++ /dev/null @@ -1,403 +0,0 @@ -use std::hash::Hash; - -use crate::prelude::*; -use crate::utils::{try_get_supertype, CustomIterTools}; - -fn is_in_helper<'a, T>(ca: &'a 
ChunkedArray, other: &Series) -> PolarsResult -where - T: PolarsDataType, - ChunkedArray: HasUnderlyingArray, - < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, -{ - let mut set = PlHashSet::with_capacity(other.len()); - - let other = ca.unpack_series_matching_type(other)?; - other.downcast_iter().for_each(|iter| { - iter.iter().for_each(|opt_val| { - if let Some(v) = opt_val { - set.insert(v); - } - }) - }); - Ok(ca.apply_values_generic(|val| set.contains(&val))) -} - -impl IsIn for ChunkedArray -where - T: PolarsIntegerType, - T::Native: Hash + Eq, -{ - fn is_in(&self, other: &Series) -> PolarsResult { - // We check implicitly cast to supertype here - match other.dtype() { - DataType::List(dt) => { - let st = try_get_supertype(self.dtype(), dt)?; - if &st != self.dtype() || **dt != st { - let left = self.cast(&st)?; - let right = other.cast(&DataType::List(Box::new(st)))?; - return left.is_in(&right); - } - - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - - other - .list()? - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect_trusted() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - _ => { - // first make sure that the types are equal - if self.dtype() != other.dtype() { - let st = try_get_supertype(self.dtype(), other.dtype())?; - let left = self.cast(&st)?; - let right = other.cast(&st)?; - return left.is_in(&right); - } - is_in_helper(self, other) - } - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -impl IsIn for Float32Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - let other = other.cast(&DataType::Float32)?; - let other = other.f32().unwrap(); - let other = other.reinterpret_unsigned(); - let ca = self.reinterpret_unsigned(); - ca.is_in(&other) - } -} -impl IsIn for Float64Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - let other = other.cast(&DataType::Float64)?; - let other = other.f64().unwrap(); - let other = other.reinterpret_unsigned(); - let ca = self.reinterpret_unsigned(); - ca.is_in(&other) - } -} - -impl IsIn for Utf8Chunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - #[cfg(feature = "dtype-categorical")] - DataType::List(dt) if matches!(&**dt, DataType::Categorical(_)) => { - if let DataType::Categorical(Some(rev_map)) = &**dt { - let opt_val = self.get(0); - - let other = other.list()?; - match opt_val { - None => { - let mut ca: BooleanChunked = other - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| s.as_ref().null_count() > 0) == Some(true) - }) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) - }, - Some(value) => { - match rev_map.find(value) { - // all false - None => Ok(BooleanChunked::full(self.name(), false, other.len())), - Some(idx) => { - let mut ca: BooleanChunked = other - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let s = s.as_ref().to_physical_repr(); - let ca = s.as_ref().u32().unwrap(); - if ca.null_count() == 0 { - 
ca.into_no_null_iter().any(|a| a == idx) - } else { - ca.into_iter().any(|a| a == Some(idx)) - } - }) == Some(true) - }) - .collect_trusted(); - ca.rename(self.name()); - Ok(ca) - }, - } - }, - } - } else { - unreachable!() - } - }, - DataType::List(dt) if DataType::Utf8 == **dt => self.as_binary().is_in( - &other - .cast(&DataType::List(Box::new(DataType::Binary))) - .unwrap(), - ), - DataType::Utf8 => self - .as_binary() - .is_in(&other.cast(&DataType::Binary).unwrap()), - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - } -} - -impl IsIn for BinaryChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(dt) if DataType::Binary == **dt => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - other - .list()? - .amortized_iter() - .map(|opt_b| { - opt_b.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect_trusted() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - DataType::Binary => { - is_in_helper(self, other) - } - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -impl IsIn for BooleanChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(dt) if self.dtype() == &**dt => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let value = self.get(0); - // safety: we know the iterators len - unsafe { - other - .list()? 
- .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .trust_my_length(other.len()) - .collect_trusted() - } - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().unpack::().unwrap(); - ca.into_iter().any(|a| a == val) - } - _ => false, - }) - .collect_trusted() - }; - ca.rename(self.name()); - Ok(ca) - } - DataType::Boolean => { - let other = other.bool().unwrap(); - let has_true = other.any(); - let nc = other.null_count(); - - let has_false = if nc == 0 { - !other.all() - } else { - !(other.sum().unwrap() as usize + nc) == other.len() - }; - Ok(self.apply_values(|v| if v { has_true } else { has_false })) - } - _ => polars_bail!(opq = is_in, self.dtype(), other.dtype()), - } - .map(|mut ca| { - ca.rename(self.name()); - ca - }) - } -} - -#[cfg(feature = "dtype-struct")] -impl IsIn for StructChunked { - fn is_in(&self, other: &Series) -> PolarsResult { - match other.dtype() { - DataType::List(_) => { - let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { - let mut value = vec![]; - let left = self.clone().into_series(); - let av = left.get(0).unwrap(); - if let AnyValue::Struct(_, _, _) = av { - av._materialize_struct_av(&mut value); - } - other - .list()? - .amortized_iter() - .map(|opt_s| { - opt_s.map(|s| { - let ca = s.as_ref().struct_().unwrap(); - ca.into_iter().any(|a| a == value) - }) == Some(true) - }) - .collect() - } else { - polars_ensure!(self.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", self.len(), other.len()); - self.into_iter() - .zip(other.list()?.amortized_iter()) - .map(|(value, series)| match (value, series) { - (val, Some(series)) => { - let ca = series.as_ref().struct_().unwrap(); - ca.into_iter().any(|a| a == val) - }, - _ => false, - }) - .collect() - }; - ca.rename(self.name()); - Ok(ca) - }, - _ => { - let other = other.cast(&other.dtype().to_physical()).unwrap(); - let other = other.struct_()?; - - polars_ensure!( - self.fields().len() == other.fields().len(), - ComputeError: "`is_in`: mismatch in the number of struct fields: {} and {}", - self.fields().len(), other.fields().len() - ); - - // first make sure that the types are equal - let self_dtypes: Vec<_> = self.fields().iter().map(|f| f.dtype()).collect(); - let other_dtypes: Vec<_> = other.fields().iter().map(|f| f.dtype()).collect(); - if self_dtypes != other_dtypes { - let self_names = self.fields().iter().map(|f| f.name()); - let other_names = other.fields().iter().map(|f| f.name()); - let supertypes = self_dtypes - .iter() - .zip(other_dtypes.iter()) - .map(|(dt1, dt2)| try_get_supertype(dt1, dt2)) - .collect::, _>>()?; - let self_supertype_fields = self_names - .zip(supertypes.iter()) - .map(|(name, st)| Field::new(name, st.clone())) - .collect(); - let self_super = self.cast(&DataType::Struct(self_supertype_fields))?; - let other_supertype_fields = other_names - .zip(supertypes.iter()) - .map(|(name, st)| Field::new(name, st.clone())) - .collect(); - let other_super = other.cast(&DataType::Struct(other_supertype_fields))?; - return self_super.is_in(&other_super); - } - - let mut anyvalues = Vec::with_capacity(other.len() * 
other.fields().len()); - // SAFETY: - // the iterator is unsafe as the lifetime is tied to the iterator - // so we copy to an owned buffer first - other.into_iter().for_each(|vals| { - anyvalues.extend_from_slice(vals); - }); - - // then we fill the set - let mut set = PlHashSet::with_capacity(other.len()); - for key in anyvalues.chunks_exact(other.fields().len()) { - set.insert(key); - } - // physical self - let self_ca = self.cast(&self.dtype().to_physical()).unwrap(); - let self_ca = self_ca.struct_().unwrap(); - - // and then we check for membership - let mut ca: BooleanChunked = self_ca - .into_iter() - .map(|vals| { - // If all rows are null we see the struct row as missing. - if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { - Some(set.contains(&vals)) - } else { - None - } - }) - .collect(); - ca.rename(self.name()); - Ok(ca) - }, - } - } -} - -#[cfg(test)] -mod test { - use crate::prelude::*; - - #[test] - fn test_is_in() -> PolarsResult<()> { - let a = Int32Chunked::new("a", &[1, 2, 3, 4]); - let b = Int64Chunked::new("b", &[4, 5, 1]); - - let out = a.is_in(&b.into_series())?; - assert_eq!( - Vec::from(&out), - [Some(true), Some(false), Some(false), Some(true)] - ); - - let a = Utf8Chunked::new("a", &["a", "b", "c", "d"]); - let b = Utf8Chunked::new("b", &["d", "e", "c"]); - - let out = a.is_in(&b.into_series())?; - assert_eq!( - Vec::from(&out), - [Some(false), Some(false), Some(true), Some(true)] - ); - Ok(()) - } -} diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 6e0cc146a6b8..280ecc63a3a7 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -32,8 +32,6 @@ mod filter; pub mod full; #[cfg(feature = "interpolate")] mod interpolate; -#[cfg(feature = "is_in")] -mod is_in; mod len; #[cfg(feature = "zip_with")] pub(crate) mod min_max_binary; @@ -714,15 +712,6 @@ pub trait ChunkPeaks { } } -/// Check if element is member of list array -#[cfg(feature = "is_in")] -pub trait IsIn { - /// Check if elements of this array are in the right Series, or List values of the right Series. - fn is_in(&self, _other: &Series) -> PolarsResult { - unimplemented!() - } -} - /// Repeat the values `n` times. 
#[cfg(feature = "repeat_by")] pub trait RepeatBy { diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index d2277e2a1f47..d03e48836358 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -274,10 +274,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index a1b54ab136c4..bc29640ff17d 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -328,10 +328,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index f3bd53c3c012..dccea0498d0b 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -10,8 +10,6 @@ use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; use crate::frame::group_by::*; use crate::frame::hash_join::ZipOuterJoinColumn; -#[cfg(feature = "is_in")] -use crate::frame::hash_join::_check_categorical_src; use crate::prelude::*; use crate::series::implementations::SeriesWrap; @@ -358,11 +356,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - _check_categorical_src(self.dtype(), other.dtype())?; - self.0.logical().is_in(&other.to_physical_repr()) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { let out = self.0.logical().repeat_by(by)?; diff --git a/crates/polars-core/src/series/implementations/dates_time.rs b/crates/polars-core/src/series/implementations/dates_time.rs index a442535efb2b..d2de3c250129 100644 --- a/crates/polars-core/src/series/implementations/dates_time.rs +++ b/crates/polars-core/src/series/implementations/dates_time.rs @@ -433,10 +433,6 @@ macro_rules! 
impl_dyn_series { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { match self.0.dtype() { diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index dce1c7fd9385..eeae18499da2 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -440,10 +440,6 @@ impl SeriesTrait for SeriesWrap { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { Ok(self diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 85e8f5b8ed9b..4d7a97c16bdc 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -444,10 +444,6 @@ impl SeriesTrait for SeriesWrap { fn peak_min(&self) -> BooleanChunked { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { Ok(self diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index b0a8a246b79d..26704508a154 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -365,10 +365,6 @@ macro_rules! impl_dyn_series { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 0a7b471dc492..f3ab17eb7da3 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -459,10 +459,6 @@ macro_rules! 
impl_dyn_series { self.0.peak_min() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs index 21bf3f78a9e9..db79be60ae12 100644 --- a/crates/polars-core/src/series/implementations/struct_.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -346,11 +346,6 @@ impl SeriesTrait for SeriesWrap { self.0.apply_fields(|s| s.shift(periods)).into_series() } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - self.0.is_in(other) - } - fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/implementations/utf8.rs b/crates/polars-core/src/series/implementations/utf8.rs index 952591fed652..3bcf1d3fdefa 100644 --- a/crates/polars-core/src/series/implementations/utf8.rs +++ b/crates/polars-core/src/series/implementations/utf8.rs @@ -289,10 +289,6 @@ impl SeriesTrait for SeriesWrap { Arc::new(SeriesWrap(Clone::clone(&self.0))) } - #[cfg(feature = "is_in")] - fn is_in(&self, other: &Series) -> PolarsResult { - IsIn::is_in(&self.0, other) - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, by: &IdxCa) -> PolarsResult { RepeatBy::repeat_by(&self.0, by) diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 79033df353a5..fc212efdc2fc 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -501,11 +501,6 @@ pub trait SeriesTrait: invalid_operation_panic!(peak_min, self) } - /// Check if elements of this Series are in the right Series, or List values of the right Series. 
- #[cfg(feature = "is_in")] - fn is_in(&self, _other: &Series) -> PolarsResult { - polars_bail!(opq = is_in, self._dtype()); - } #[cfg(feature = "repeat_by")] fn repeat_by(&self, _by: &IdxCa) -> PolarsResult { polars_bail!(opq = repeat_by, self._dtype()); diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index f2317ea7f5dc..eb432e860d98 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -78,7 +78,7 @@ extract_jsonpath = ["polars-plan/extract_jsonpath", "polars-ops/extract_jsonpath # operations approx_unique = ["polars-plan/approx_unique"] -is_in = ["polars-plan/is_in"] +is_in = ["polars-plan/is_in", "polars-ops/is_in"] repeat_by = ["polars-plan/repeat_by"] round_series = ["polars-plan/round_series", "polars-ops/round_series"] is_first = ["polars-plan/is_first"] diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 305476bec432..7819ffaa7f35 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -89,3 +89,4 @@ list_take = [] list_sets = [] list_any_all = [] extract_groups = ["dtype-struct", "polars-core/regex"] +is_in = ["polars-core/reinterpret"] diff --git a/crates/polars-ops/src/series/ops/is_in.rs b/crates/polars-ops/src/series/ops/is_in.rs new file mode 100644 index 000000000000..6574d78d070f --- /dev/null +++ b/crates/polars-ops/src/series/ops/is_in.rs @@ -0,0 +1,400 @@ +use std::hash::Hash; + +use polars_core::prelude::*; +use polars_core::utils::{try_get_supertype, CustomIterTools}; +use polars_core::with_match_physical_integer_polars_type; + +fn is_in_helper<'a, T>(ca: &'a ChunkedArray, other: &Series) -> PolarsResult +where + T: PolarsDataType, + ChunkedArray: HasUnderlyingArray, + < as HasUnderlyingArray>::ArrayT as StaticArray>::ValueT<'a>: Hash + Eq + Copy, +{ + let mut set = PlHashSet::with_capacity(other.len()); + + let other = ca.unpack_series_matching_type(other)?; + other.downcast_iter().for_each(|iter| { + iter.iter().for_each(|opt_val| { + if let Some(v) = opt_val { + set.insert(v); + } + }) + }); + Ok(ca.apply_values_generic(|val| set.contains(&val))) +} + +fn is_in_numeric(ca_in: &ChunkedArray, other: &Series) -> PolarsResult +where + T: PolarsIntegerType, + T::Native: Hash + Eq, +{ + // We check implicitly cast to supertype here + match other.dtype() { + DataType::List(dt) => { + let st = try_get_supertype(ca_in.dtype(), dt)?; + if &st != ca_in.dtype() || **dt != st { + let left = ca_in.cast(&st)?; + let right = other.cast(&DataType::List(Box::new(st)))?; + return is_in(&left, &right); + } + + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + + other + .list()? 
+ .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + _ => { + // first make sure that the types are equal + if ca_in.dtype() != other.dtype() { + let st = try_get_supertype(ca_in.dtype(), other.dtype())?; + let left = ca_in.cast(&st)?; + let right = other.cast(&st)?; + return is_in(&left, &right); + } + is_in_helper(ca_in, other) + } + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +fn is_in_utf8(ca_in: &Utf8Chunked, other: &Series) -> PolarsResult { + match other.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::List(dt) if matches!(&**dt, DataType::Categorical(_)) => { + if let DataType::Categorical(Some(rev_map)) = &**dt { + let opt_val = ca_in.get(0); + + let other = other.list()?; + match opt_val { + None => { + let mut ca: BooleanChunked = other + .amortized_iter() + .map(|opt_s| opt_s.map(|s| s.as_ref().null_count() > 0) == Some(true)) + .collect_trusted(); + ca.rename(ca_in.name()); + Ok(ca) + }, + Some(value) => { + match rev_map.find(value) { + // all false + None => Ok(BooleanChunked::full(ca_in.name(), false, other.len())), + Some(idx) => { + let mut ca: BooleanChunked = other + .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let s = s.as_ref().to_physical_repr(); + let ca = s.as_ref().u32().unwrap(); + if ca.null_count() == 0 { + ca.into_no_null_iter().any(|a| a == idx) + } else { + ca.into_iter().any(|a| a == Some(idx)) + } + }) == Some(true) + }) + .collect_trusted(); + ca.rename(ca_in.name()); + Ok(ca) + }, + } + }, + } + } else { + unreachable!() + } + }, + DataType::List(dt) if DataType::Utf8 == **dt => is_in_binary( + &ca_in.as_binary(), + &other + .cast(&DataType::List(Box::new(DataType::Binary))) + .unwrap(), + ), + DataType::Utf8 => is_in_binary(&ca_in.as_binary(), &other.cast(&DataType::Binary).unwrap()), + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } +} + +fn is_in_binary(ca_in: &BinaryChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if DataType::Binary == **dt => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + other + .list()? 
+ .amortized_iter() + .map(|opt_b| { + opt_b.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + DataType::Binary => { + is_in_helper(ca_in, other) + } + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +fn is_in_boolean(ca_in: &BooleanChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if ca_in.dtype() == &**dt => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let value = ca_in.get(0); + // safety: we know the iterators len + unsafe { + other + .list()? + .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .trust_my_length(other.len()) + .collect_trusted() + } + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(ca_in.name()); + Ok(ca) + } + DataType::Boolean => { + let other = other.bool().unwrap(); + let has_true = other.any(); + let nc = other.null_count(); + + let has_false = if nc == 0 { + !other.all() + } else { + !(other.sum().unwrap() as usize + nc) == other.len() + }; + Ok(ca_in.apply_values(|v| if v { has_true } else { has_false })) + } + _ => polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), + } + .map(|mut ca| { + ca.rename(ca_in.name()); + ca + }) +} + +#[cfg(feature = "dtype-struct")] +fn is_in_struct(ca_in: &StructChunked, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(_) => { + let mut ca: BooleanChunked = if ca_in.len() == 1 && other.len() != 1 { + let mut value = vec![]; + let left = ca_in.clone().into_series(); + let av = left.get(0).unwrap(); + if let AnyValue::Struct(_, _, _) = av { + av._materialize_struct_av(&mut value); + } + other + .list()? 
+ .amortized_iter() + .map(|opt_s| { + opt_s.map(|s| { + let ca = s.as_ref().struct_().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect() + } else { + polars_ensure!(ca_in.len() == other.len(), ComputeError: "shapes don't match: expected {} elements in 'is_in' comparison, got {}", ca_in.len(), other.len()); + ca_in + .into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().struct_().unwrap(); + ca.into_iter().any(|a| a == val) + }, + _ => false, + }) + .collect() + }; + ca.rename(ca_in.name()); + Ok(ca) + }, + _ => { + let other = other.cast(&other.dtype().to_physical()).unwrap(); + let other = other.struct_()?; + + polars_ensure!( + ca_in.fields().len() == other.fields().len(), + ComputeError: "`is_in`: mismatch in the number of struct fields: {} and {}", + ca_in.fields().len(), other.fields().len() + ); + + // first make sure that the types are equal + let ca_in_dtypes: Vec<_> = ca_in.fields().iter().map(|f| f.dtype()).collect(); + let other_dtypes: Vec<_> = other.fields().iter().map(|f| f.dtype()).collect(); + if ca_in_dtypes != other_dtypes { + let ca_in_names = ca_in.fields().iter().map(|f| f.name()); + let other_names = other.fields().iter().map(|f| f.name()); + let supertypes = ca_in_dtypes + .iter() + .zip(other_dtypes.iter()) + .map(|(dt1, dt2)| try_get_supertype(dt1, dt2)) + .collect::, _>>()?; + let ca_in_supertype_fields = ca_in_names + .zip(supertypes.iter()) + .map(|(name, st)| Field::new(name, st.clone())) + .collect(); + let ca_in_super = ca_in.cast(&DataType::Struct(ca_in_supertype_fields))?; + let other_supertype_fields = other_names + .zip(supertypes.iter()) + .map(|(name, st)| Field::new(name, st.clone())) + .collect(); + let other_super = other.cast(&DataType::Struct(other_supertype_fields))?; + return is_in(&ca_in_super, &other_super); + } + + let mut anyvalues = Vec::with_capacity(other.len() * other.fields().len()); + // SAFETY: + // the iterator is unsafe as the lifetime is tied to the iterator + // so we copy to an owned buffer first + other.into_iter().for_each(|vals| { + anyvalues.extend_from_slice(vals); + }); + + // then we fill the set + let mut set = PlHashSet::with_capacity(other.len()); + for key in anyvalues.chunks_exact(other.fields().len()) { + set.insert(key); + } + // physical ca_in + let ca_in_ca = ca_in.cast(&ca_in.dtype().to_physical()).unwrap(); + let ca_in_ca = ca_in_ca.struct_().unwrap(); + + // and then we check for membership + let mut ca: BooleanChunked = ca_in_ca + .into_iter() + .map(|vals| { + // If all rows are null we see the struct row as missing. 
+ if !vals.iter().all(|val| matches!(val, AnyValue::Null)) { + Some(set.contains(&vals)) + } else { + None + } + }) + .collect(); + ca.rename(ca_in.name()); + Ok(ca) + }, + } +} + +pub fn is_in(s: &Series, other: &Series) -> PolarsResult { + match s.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_) => { + use polars_core::frame::hash_join::_check_categorical_src; + _check_categorical_src(s.dtype(), other.dtype())?; + let ca = s.categorical().unwrap(); + let ca = ca.logical(); + is_in_numeric(ca, &other.to_physical_repr()) + }, + #[cfg(feature = "dtype-struct")] + DataType::Struct(_) => { + let ca = s.struct_().unwrap(); + is_in_struct(ca, other) + }, + DataType::Utf8 => { + let ca = s.utf8().unwrap(); + is_in_utf8(ca, other) + }, + DataType::Binary => { + let ca = s.binary().unwrap(); + is_in_binary(ca, other) + }, + DataType::Boolean => { + let ca = s.bool().unwrap(); + is_in_boolean(ca, other) + }, + DataType::Float32 => { + let other = other.cast(&DataType::Float32)?; + let other = other.f32().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = s.f32().unwrap(); + let s = ca.reinterpret_unsigned(); + is_in(&s, &other) + }, + DataType::Float64 => { + let other = other.cast(&DataType::Float64)?; + let other = other.f64().unwrap(); + let other = other.reinterpret_unsigned(); + let ca = s.f64().unwrap(); + let s = ca.reinterpret_unsigned(); + is_in(&s, &other) + }, + dt if dt.to_physical().is_integer() => { + let s = s.to_physical_repr(); + with_match_physical_integer_polars_type!(s.dtype(), |$T| { + let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); + is_in_numeric(ca, other) + }) + }, + dt => polars_bail!(opq = is_int, dt), + } +} diff --git a/crates/polars-ops/src/series/ops/mod.rs b/crates/polars-ops/src/series/ops/mod.rs index 0781a8655e9d..6abde8b44886 100644 --- a/crates/polars-ops/src/series/ops/mod.rs +++ b/crates/polars-ops/src/series/ops/mod.rs @@ -10,6 +10,8 @@ mod floor_divide; mod fused; #[cfg(feature = "is_first")] mod is_first; +#[cfg(feature = "is_in")] +mod is_in; #[cfg(feature = "is_unique")] mod is_unique; #[cfg(feature = "log")] @@ -36,6 +38,8 @@ pub use floor_divide::*; pub use fused::*; #[cfg(feature = "is_first")] pub use is_first::*; +#[cfg(feature = "is_in")] +pub use is_in::*; #[cfg(feature = "is_unique")] pub use is_unique::*; #[cfg(feature = "log")] diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 9225ec0906fd..aefc78e45f4f 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -88,7 +88,7 @@ extract_jsonpath = ["polars-ops/extract_jsonpath"] # operations approx_unique = ["polars-ops/approx_unique"] -is_in = ["polars-core/is_in"] +is_in = ["polars-ops/is_in"] repeat_by = ["polars-core/repeat_by"] round_series = ["polars-core/round_series"] is_first = ["polars-core/is_first", "polars-ops/is_first"] diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index 41d9ceebcaeb..2f534f95de93 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -171,7 +171,7 @@ fn is_duplicated(s: &Series) -> PolarsResult { fn is_in(s: &mut [Series]) -> PolarsResult> { let left = &s[0]; let other = &s[1]; - left.is_in(other).map(|ca| Some(ca.into_series())) + polars_ops::prelude::is_in(left, other).map(|ca| Some(ca.into_series())) } fn any_horizontal(s: &mut [Series]) -> PolarsResult> { diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs 
b/crates/polars-plan/src/dsl/function_expr/list.rs index 3d4cceaa8680..1aed68b37cd9 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -55,7 +55,7 @@ pub(super) fn contains(args: &mut [Series]) -> PolarsResult> { let list = &args[0]; let is_in = &args[1]; - is_in.is_in(list).map(|mut ca| { + polars_ops::prelude::is_in(is_in, list).map(|mut ca| { ca.rename(list.name()); Some(ca.into_series()) }) diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 7b6d0f3d3097..7763c91d807e 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -104,7 +104,7 @@ sort_multiple = ["polars-core/sort_multiple"] # extra operations approx_unique = ["polars-lazy/approx_unique", "polars-ops/approx_unique"] -is_in = ["polars-core/is_in", "polars-lazy/is_in"] +is_in = ["polars-lazy/is_in"] zip_with = ["polars-core/zip_with"] round_series = ["polars-core/round_series", "polars-lazy/round_series", "polars-ops/round_series"] checked_arithmetic = ["polars-core/checked_arithmetic"] diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 530bb4db3da6..01516feff4bf 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -216,8 +216,8 @@ //! - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) //! - `partition_by` - Split into multiple DataFrames partitioned by groups. //! * `Series`/`Expression` operations: -//! - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn) -//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip) +//! - `is_in` - Check for membership in `Series`. +//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip). //! - `round_series` - round underlying float types of `Series`. //! - `repeat_by` - [Repeat element in an Array N times, where N is given by another array. //! - `is_first` - Check if element is first unique value. 
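For reference, a minimal Python-level sketch of the membership semantics these `is_in` kernels back (illustrative only, not taken from the patch; assumes a py-polars build with the `is_in` feature enabled):

>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3], "lst": [[1, 9], [4, 5], [3, 6]]})
>>> df.select(pl.col("a").is_in([1, 3]))         # membership against a literal collection
>>> df.select(pl.col("a").is_in(pl.col("lst")))  # row-wise membership against a list column
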
diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index ec3301bbd930..4a59dd65353c 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) # noqa: PTH207 + next(glob.iglob(file, recursive=True)) return True except StopIteration: return False From c92260db3cc81601d94eb396bf52b623ec0e96b6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 11:12:03 +0200 Subject: [PATCH 34/55] depr(python): Rename `groupby` to `group_by` (#10656) --- .github/deploy_manylinux.sh | 2 +- .github/workflows/release-python.yml | 2 +- README.md | 2 +- .../dataframe/{groupby.rst => group_by.rst} | 4 +- .../docs/source/reference/dataframe/index.rst | 2 +- .../reference/dataframe/modify_select.rst | 3 + .../lazyframe/{groupby.rst => group_by.rst} | 4 +- .../docs/source/reference/lazyframe/index.rst | 2 +- .../reference/lazyframe/modify_select.rst | 3 + py-polars/docs/source/reference/selectors.rst | 2 +- py-polars/polars/dataframe/frame.py | 238 ++++++++++++++-- .../dataframe/{groupby.py => group_by.py} | 90 +++--- py-polars/polars/expr/expr.py | 48 ++-- py-polars/polars/expr/list.py | 2 +- py-polars/polars/functions/lazy.py | 4 +- py-polars/polars/lazyframe/frame.py | 263 +++++++++++++++--- .../lazyframe/{groupby.py => group_by.py} | 48 ++-- py-polars/polars/selectors.py | 6 +- py-polars/polars/series/list.py | 2 +- py-polars/polars/series/series.py | 2 +- .../tests/benchmark/run_h2oai_benchmark.py | 42 +-- py-polars/tests/benchmark/test_release.py | 10 +- .../tests/parametric/test_groupby_rolling.py | 4 +- py-polars/tests/unit/dataframe/test_df.py | 42 +-- py-polars/tests/unit/datatypes/test_array.py | 8 +- .../tests/unit/datatypes/test_categorical.py | 10 +- .../tests/unit/datatypes/test_decimal.py | 2 +- py-polars/tests/unit/datatypes/test_float.py | 8 +- py-polars/tests/unit/datatypes/test_list.py | 28 +- py-polars/tests/unit/datatypes/test_struct.py | 12 +- .../tests/unit/datatypes/test_temporal.py | 68 ++--- .../tests/unit/functions/test_as_datatype.py | 4 +- .../tests/unit/functions/test_whenthen.py | 4 +- py-polars/tests/unit/io/test_lazy_parquet.py | 2 +- py-polars/tests/unit/io/test_parquet.py | 4 +- py-polars/tests/unit/namespaces/test_list.py | 4 +- .../tests/unit/namespaces/test_string.py | 2 +- .../unit/operations/test_aggregations.py | 30 +- py-polars/tests/unit/operations/test_apply.py | 18 +- .../tests/unit/operations/test_explode.py | 14 +- .../tests/unit/operations/test_filter.py | 8 +- .../{test_groupby.py => test_group_by.py} | 204 +++++++++----- ...by_rolling.py => test_group_by_rolling.py} | 46 +-- py-polars/tests/unit/operations/test_join.py | 4 +- .../tests/unit/operations/test_join_asof.py | 2 +- .../tests/unit/operations/test_profile.py | 4 +- .../tests/unit/operations/test_random.py | 6 +- .../tests/unit/operations/test_rolling.py | 60 ++-- py-polars/tests/unit/operations/test_sort.py | 18 +- .../tests/unit/streaming/test_streaming.py | 20 +- .../unit/streaming/test_streaming_cse.py | 4 +- ..._groupby.py => test_streaming_group_by.py} | 70 ++--- .../unit/streaming/test_streaming_unique.py | 2 +- py-polars/tests/unit/test_context.py | 2 +- py-polars/tests/unit/test_cse.py | 14 +- py-polars/tests/unit/test_datatypes.py | 2 +- py-polars/tests/unit/test_empty.py | 6 +- py-polars/tests/unit/test_errors.py | 26 +- py-polars/tests/unit/test_expr_multi_cols.py | 2 +- 
py-polars/tests/unit/test_exprs.py | 16 +- py-polars/tests/unit/test_fmt.py | 2 +- py-polars/tests/unit/test_interop.py | 2 +- py-polars/tests/unit/test_lazy.py | 40 +-- py-polars/tests/unit/test_projections.py | 14 +- py-polars/tests/unit/test_queries.py | 28 +- py-polars/tests/unit/test_rows.py | 4 +- py-polars/tests/unit/test_schema.py | 12 +- py-polars/tests/unit/test_selectors.py | 4 +- py-polars/tests/unit/test_show_graph.py | 2 +- py-polars/tests/unit/test_sql.py | 2 +- 70 files changed, 1051 insertions(+), 620 deletions(-) rename py-polars/docs/source/reference/dataframe/{groupby.rst => group_by.rst} (87%) rename py-polars/docs/source/reference/lazyframe/{groupby.rst => group_by.rst} (77%) rename py-polars/polars/dataframe/{groupby.py => group_by.py} (94%) rename py-polars/polars/lazyframe/{groupby.py => group_by.py} (94%) rename py-polars/tests/unit/operations/{test_groupby.py => test_group_by.py} (76%) rename py-polars/tests/unit/operations/{test_groupby_rolling.py => test_group_by_rolling.py} (84%) rename py-polars/tests/unit/streaming/{test_streaming_groupby.py => test_streaming_group_by.py} (88%) diff --git a/.github/deploy_manylinux.sh b/.github/deploy_manylinux.sh index 993f4b39f2f5..2d27619e2a53 100644 --- a/.github/deploy_manylinux.sh +++ b/.github/deploy_manylinux.sh @@ -19,7 +19,7 @@ maturin publish \ # now compile polars with bigidx feature sed -i 's/name = "polars"/name = "polars-u64-idx"/' pyproject.toml # a brittle hack to insert the 'bigidx' feature -sed -i 's/"dynamic_groupby",/"dynamic_groupby",\n"bigidx",/' Cargo.toml +sed -i 's/"dynamic_group_by",/"dynamic_group_by",\n"bigidx",/' Cargo.toml maturin publish \ --skip-existing \ diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index f8819dd9a923..2e9a611f52c1 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: run: | sed -i 's/name = "polars"/name = "polars-u64-idx"/' py-polars/pyproject.toml # A brittle hack to insert the 'bigidx' feature - sed -i 's/"dynamic_groupby",/"dynamic_groupby",\n"bigidx",/' py-polars/Cargo.toml + sed -i 's/"dynamic_group_by",/"dynamic_group_by",\n"bigidx",/' py-polars/Cargo.toml - name: Publish wheel uses: PyO3/maturin-action@v1 diff --git a/README.md b/README.md index 1fee568eb25c..6a45f6a1f28f 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ shape: (5, 8) >>> # and continue in python >>> lf = context.execute(query) >>> (lf.join(other_table) -... .groupby("foo") +... .group_by("foo") ... .agg( ... pl.col("sum_v1").count() ... ).collect()) diff --git a/py-polars/docs/source/reference/dataframe/groupby.rst b/py-polars/docs/source/reference/dataframe/group_by.rst similarity index 87% rename from py-polars/docs/source/reference/dataframe/groupby.rst rename to py-polars/docs/source/reference/dataframe/group_by.rst index bd25b45699e9..5855d518f492 100644 --- a/py-polars/docs/source/reference/dataframe/groupby.rst +++ b/py-polars/docs/source/reference/dataframe/group_by.rst @@ -2,9 +2,9 @@ GroupBy ======= -This namespace is available after calling :code:`DataFrame.groupby(...)`. +This namespace is available after calling :code:`DataFrame.group_by(...)`. -.. currentmodule:: polars.dataframe.groupby +.. currentmodule:: polars.dataframe.group_by .. 
autosummary:: :toctree: api/ diff --git a/py-polars/docs/source/reference/dataframe/index.rst b/py-polars/docs/source/reference/dataframe/index.rst index 5fdaebbf9e27..ffcc810cc829 100644 --- a/py-polars/docs/source/reference/dataframe/index.rst +++ b/py-polars/docs/source/reference/dataframe/index.rst @@ -13,7 +13,7 @@ This page gives an overview of all public DataFrame methods. computation descriptive export - groupby + group_by modify_select miscellaneous diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index fad9e70b34b0..7feb84ffbc6c 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -20,6 +20,9 @@ Manipulation/selection DataFrame.find_idx_by_name DataFrame.get_column DataFrame.get_columns + DataFrame.group_by + DataFrame.group_by_dynamic + DataFrame.group_by_rolling DataFrame.groupby DataFrame.groupby_dynamic DataFrame.groupby_rolling diff --git a/py-polars/docs/source/reference/lazyframe/groupby.rst b/py-polars/docs/source/reference/lazyframe/group_by.rst similarity index 77% rename from py-polars/docs/source/reference/lazyframe/groupby.rst rename to py-polars/docs/source/reference/lazyframe/group_by.rst index 9745656e0bc7..05e786726e3a 100644 --- a/py-polars/docs/source/reference/lazyframe/groupby.rst +++ b/py-polars/docs/source/reference/lazyframe/group_by.rst @@ -2,9 +2,9 @@ GroupBy ======= -This namespace comes available by calling `LazyFrame.groupby(..)`. +This namespace comes available by calling `LazyFrame.group_by(..)`. -.. currentmodule:: polars.lazyframe.groupby +.. currentmodule:: polars.lazyframe.group_by .. autosummary:: :toctree: api/ diff --git a/py-polars/docs/source/reference/lazyframe/index.rst b/py-polars/docs/source/reference/lazyframe/index.rst index 702cb4bf2cca..70f2b5434a7a 100644 --- a/py-polars/docs/source/reference/lazyframe/index.rst +++ b/py-polars/docs/source/reference/lazyframe/index.rst @@ -11,7 +11,7 @@ This page gives an overview of all public LazyFrame methods. 
aggregation attributes descriptive - groupby + group_by modify_select miscellaneous diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index f52bab24e662..2257467fb127 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -16,6 +16,9 @@ Manipulation/selection LazyFrame.fill_null LazyFrame.filter LazyFrame.first + LazyFrame.group_by + LazyFrame.group_by_dynamic + LazyFrame.group_by_rolling LazyFrame.groupby LazyFrame.groupby_dynamic LazyFrame.groupby_rolling diff --git a/py-polars/docs/source/reference/selectors.rst b/py-polars/docs/source/reference/selectors.rst index 1a0a71a45541..064cd530f968 100644 --- a/py-polars/docs/source/reference/selectors.rst +++ b/py-polars/docs/source/reference/selectors.rst @@ -28,7 +28,7 @@ Importing "z": ["a", "b", "a", "b", "b"], }, ) - df.groupby(by=cs.string()).agg(cs.numeric().sum()) + df.group_by(by=cs.string()).agg(cs.numeric().sum()) Set operations -------------- diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 180dfc4daebd..9c4f3656f7a9 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -30,7 +30,7 @@ import polars._reexport as pl from polars import functions as F from polars.dataframe._html import NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy, GroupBy, RollingGroupBy +from polars.dataframe.group_by import DynamicGroupBy, GroupBy, RollingGroupBy from polars.datatypes import ( FLOAT_DTYPES, INTEGER_DTYPES, @@ -84,6 +84,7 @@ from polars.utils.convert import _timedelta_to_pl_duration from polars.utils.deprecation import ( deprecate_function, + deprecate_renamed_function, deprecate_renamed_methods, deprecate_renamed_parameter, ) @@ -4923,14 +4924,14 @@ def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ return self._from_pydf(self._df.with_row_count(name, offset)) - def groupby( + def group_by( self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = False, ) -> GroupBy: """ - Start a groupby operation. + Start a group by operation. Parameters ---------- @@ -4941,7 +4942,7 @@ def groupby( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. Settings this to ``True`` blocks the possibility to run on the streaming engine. @@ -4966,7 +4967,7 @@ def groupby( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -4981,7 +4982,7 @@ def groupby( Set ``maintain_order=True`` to ensure the order of the groups is consistent with the input. - >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) shape: (3, 2) ┌─────┬───────────┐ │ a ┆ c │ @@ -4995,7 +4996,7 @@ def groupby( Group by multiple columns by passing a list of column names. - >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5011,7 +5012,7 @@ def groupby( Or use positional arguments to group by multiple columns in the same way. 
Expressions are also accepted. - >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -5026,7 +5027,7 @@ def groupby( The ``GroupBy`` object returned by this method is iterable, returning the name and data of each group. - >>> for name, data in df.groupby("a"): # doctest: +SKIP + >>> for name, data in df.group_by("a"): # doctest: +SKIP ... print(name) ... print(data) ... @@ -5063,7 +5064,7 @@ def groupby( """ return GroupBy(self, by, *more_by, maintain_order=maintain_order) - def groupby_rolling( + def group_by_rolling( self, index_column: IntoExpr, *, @@ -5076,9 +5077,9 @@ def groupby_rolling( """ Create rolling groups based on a time, Int32, or Int64 column. - Different from a ``dynamic_groupby`` the windows are now determined by the + Different from a ``group_by_dynamic`` the windows are now determined by the individual values and are not of constant intervals. For constant intervals use - *groupby_dynamic*. + :func:`DataFrame.group_by_dynamic`. If you have a time series ````, then by default the windows created will be @@ -5115,7 +5116,7 @@ def groupby_rolling( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_rolling on an integer column, the windows are defined by: + In case of a group_by_rolling on an integer column, the windows are defined by: - **"1i" # length 1** - **"10i" # length 10** @@ -5128,7 +5129,7 @@ def groupby_rolling( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a rolling groupby on indices, dtype needs to be one of + In case of a rolling group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. period @@ -5155,7 +5156,7 @@ def groupby_rolling( See Also -------- - groupby_dynamic + group_by_dynamic Examples -------- @@ -5170,7 +5171,7 @@ def groupby_rolling( >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() ... ) - >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( ... [ ... pl.sum("a").alias("sum_a"), ... pl.min("a").alias("min_a"), @@ -5200,7 +5201,7 @@ def groupby_rolling( self, index_column, period, offset, closed, by, check_sorted ) - def groupby_dynamic( + def group_by_dynamic( self, index_column: IntoExpr, *, @@ -5218,7 +5219,7 @@ def groupby_dynamic( Group based on a time value (or index value of type Int32, Int64). Time windows are calculated and rows are assigned to windows. Different from a - normal groupby is that a row can be member of multiple groups. The time/index + normal group by is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame. @@ -5255,7 +5256,7 @@ def groupby_dynamic( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". 
- In case of a groupby_dynamic on an integer column, the windows are defined by: + In case of a group_by_dynamic on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -5272,7 +5273,7 @@ def groupby_dynamic( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a dynamic groupby on indices, dtype needs to be one of + In case of a dynamic group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. every @@ -5325,7 +5326,7 @@ def groupby_dynamic( .. code-block:: python # polars - df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) is equivalent to @@ -5371,7 +5372,7 @@ def groupby_dynamic( Group by windows of 1 hour starting at 2021-12-16 00:00:00. - >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( ... [ ... pl.col("time").min().alias("time_min"), ... pl.col("time").max().alias("time_max"), @@ -5391,7 +5392,7 @@ def groupby_dynamic( The window boundaries can also be added to the aggregation result - >>> df.groupby_dynamic( + >>> df.group_by_dynamic( ... "time", every="1h", include_boundaries=True, closed="right" ... ).agg([pl.col("time").count().alias("time_count")]) shape: (4, 4) @@ -5409,7 +5410,7 @@ def groupby_dynamic( When closed="left", should not include right end of interval [lower_bound, upper_bound) - >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( ... [ ... pl.col("time").count().alias("time_count"), ... pl.col("time").alias("time_agg_list"), @@ -5429,7 +5430,7 @@ def groupby_dynamic( When closed="both" the time values at the window boundaries belong to 2 groups. - >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( ... [pl.col("time").count().alias("time_count")] ... ) shape: (5, 2) @@ -5445,7 +5446,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ 1 │ └─────────────────────┴────────────┘ - Dynamic groupbys can also be combined with grouping on normal keys + Dynamic group bys can also be combined with grouping on normal keys >>> df = pl.DataFrame( ... { @@ -5473,7 +5474,7 @@ def groupby_dynamic( │ 2021-12-16 02:30:00 ┆ a │ │ 2021-12-16 03:00:00 ┆ a │ └─────────────────────┴────────┘ - >>> df.groupby_dynamic( + >>> df.group_by_dynamic( ... "time", ... every="1h", ... closed="both", @@ -5495,7 +5496,7 @@ def groupby_dynamic( │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ - Dynamic groupby on an index column + Dynamic group by on an index column >>> df = pl.DataFrame( ... { @@ -5504,7 +5505,7 @@ def groupby_dynamic( ... } ... ) >>> ( - ... df.groupby_dynamic( + ... df.group_by_dynamic( ... "idx", ... every="2i", ... 
period="3i", @@ -8512,7 +8513,7 @@ def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> i In aggregate context there is also an equivalent method for returning the unique values per-group: - >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() Examples -------- @@ -9677,6 +9678,183 @@ def update( """ return self.lazy().update(other.lazy(), on, how).collect(no_optimization=True) + @deprecate_renamed_function("group_by", version="0.19.0") + def groupby( + self, + by: IntoExpr | Iterable[IntoExpr], + *more_by: IntoExpr, + maintain_order: bool = False, + ) -> GroupBy: + """ + Start a group by operation. + + Alias for :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + return self.group_by(by, *more_by, maintain_order=maintain_order) + + @deprecate_renamed_function("group_by_rolling", version="0.19.0") + def groupby_rolling( + self, + index_column: IntoExpr, + *, + period: str | timedelta, + offset: str | timedelta | None = None, + closed: ClosedInterval = "right", + by: IntoExpr | Iterable[IntoExpr] | None = None, + check_sorted: bool = True, + ) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + """ + return self.group_by_rolling( + index_column, + period=period, + offset=offset, + closed=closed, + by=by, + check_sorted=check_sorted, + ) + + @deprecate_renamed_function("group_by_dynamic", version="0.19.0") + def groupby_dynamic( + self, + index_column: IntoExpr, + *, + every: str | timedelta, + period: str | timedelta | None = None, + offset: str | timedelta | None = None, + truncate: bool = True, + include_boundaries: bool = False, + closed: ClosedInterval = "left", + by: IntoExpr | Iterable[IntoExpr] | None = None, + start_by: StartBy = "window", + check_sorted: bool = True, + ) -> DynamicGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to 'every' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it's harder to + parallelize + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'} + The strategy to determine the start of the first window by. + + * 'window': Truncate the start of the window with the 'every' argument. + Note that weekly windows start on Monday. + * 'datapoint': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``'w'``): + + * 'monday': Start the window on the Monday before the first data point. + * 'tuesday': Start the window on the Tuesday before the first data point. + * ... + * 'sunday': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). 
+ + """ # noqa: W505 + return self.group_by_dynamic( + index_column, + every=every, + period=period, + offset=offset, + truncate=truncate, + include_boundaries=include_boundaries, + closed=closed, + by=by, + start_by=start_by, + check_sorted=check_sorted, + ) + def _prepare_other_arg(other: Any, length: int | None = None) -> Series: # if not a series create singleton series such that it will broadcast diff --git a/py-polars/polars/dataframe/groupby.py b/py-polars/polars/dataframe/group_by.py similarity index 94% rename from py-polars/polars/dataframe/groupby.py rename to py-polars/polars/dataframe/group_by.py index 4f6259d6bcd3..306b2e575ea7 100644 --- a/py-polars/polars/dataframe/groupby.py +++ b/py-polars/polars/dataframe/group_by.py @@ -36,14 +36,14 @@ def __init__( maintain_order: bool, ): """ - Utility class for performing a groupby operation over the given dataframe. + Utility class for performing a group by operation over the given dataframe. - Generated by calling ``df.groupby(...)``. + Generated by calling ``df.group_by(...)``. Parameters ---------- df - DataFrame to perform the groupby operation over. + DataFrame to perform the group by operation over. by Column or columns to group by. Accepts expression input. Strings are parsed as column names. @@ -51,7 +51,7 @@ def __init__( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. """ self.df = df @@ -61,14 +61,14 @@ def __init__( def __iter__(self) -> Self: """ - Allows iteration over the groups of the groupby operation. + Allows iteration over the groups of the group by operation. Each group is represented by a tuple of (name, data). Examples -------- >>> df = pl.DataFrame({"foo": ["a", "a", "b"], "bar": [1, 2, 3]}) - >>> for name, data in df.groupby("foo"): # doctest: +SKIP + >>> for name, data in df.group_by("foo"): # doctest: +SKIP ... print(name) ... print(data) ... @@ -97,7 +97,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .agg(F.col(temp_col)) .collect(no_optimization=True) ) @@ -135,12 +135,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -158,7 +158,7 @@ def agg( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.groupby("a").agg([pl.col("b"), pl.col("c")]) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg([pl.col("b"), pl.col("c")]) # doctest: +IGNORE_RESULT shape: (3, 3) ┌─────┬───────────┬───────────┐ │ a ┆ b ┆ c │ @@ -174,7 +174,7 @@ def agg( Compute the sum of a column for each group. - >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -188,7 +188,7 @@ def agg( Compute multiple aggregates at once by passing a list of expressions. 
- >>> df.groupby("a").agg([pl.sum("b"), pl.mean("c")]) # doctest: +IGNORE_RESULT + >>> df.group_by("a").agg([pl.sum("b"), pl.mean("c")]) # doctest: +IGNORE_RESULT shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -202,7 +202,7 @@ def agg( Or use positional arguments to compute multiple aggregations in the same way. - >>> df.groupby("a").agg( + >>> df.group_by("a").agg( ... pl.sum("b").suffix("_sum"), ... (pl.col("c") ** 2).mean().suffix("_mean_squared"), ... ) # doctest: +IGNORE_RESULT @@ -219,7 +219,7 @@ def agg( Use keyword arguments to easily name your expression inputs. - >>> df.groupby("a").agg( + >>> df.group_by("a").agg( ... b_sum=pl.sum("b"), ... c_mean_squared=(pl.col("c") ** 2).mean(), ... ) # doctest: +IGNORE_RESULT @@ -237,7 +237,7 @@ def agg( """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .agg(*aggs, **named_aggs) .collect(no_optimization=True) ) @@ -296,7 +296,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: For each color group sample two rows: - >>> df.groupby("color").apply( + >>> df.group_by("color").apply( ... lambda group_df: group_df.sample(2) ... ) # doctest: +IGNORE_RESULT shape: (4, 3) @@ -367,7 +367,7 @@ def head(self, n: int = 5) -> DataFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").head(2).sort("letters") + >>> df.group_by("letters").head(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -384,7 +384,7 @@ def head(self, n: int = 5) -> DataFrame: """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .head(n) .collect(no_optimization=True) ) @@ -420,7 +420,7 @@ def tail(self, n: int = 5) -> DataFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").tail(2).sort("letters") + >>> df.group_by("letters").tail(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -437,7 +437,7 @@ def tail(self, n: int = 5) -> DataFrame: """ return ( self.df.lazy() - .groupby(self.by, *self.more_by, maintain_order=self.maintain_order) + .group_by(self.by, *self.more_by, maintain_order=self.maintain_order) .tail(n) .collect(no_optimization=True) ) @@ -449,7 +449,7 @@ def all(self) -> DataFrame: Examples -------- >>> df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]}) - >>> df.groupby("a", maintain_order=True).all() + >>> df.group_by("a", maintain_order=True).all() shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ @@ -480,7 +480,7 @@ def count(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).count() + >>> df.group_by("d", maintain_order=True).count() shape: (3, 2) ┌────────┬───────┐ │ d ┆ count │ @@ -509,7 +509,7 @@ def first(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).first() + >>> df.group_by("d", maintain_order=True).first() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -538,7 +538,7 @@ def last(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).last() + >>> df.group_by("d", maintain_order=True).last() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -567,7 +567,7 @@ def max(self) -> DataFrame: ... 
"d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).max() + >>> df.group_by("d", maintain_order=True).max() shape: (3, 4) ┌────────┬─────┬──────┬──────┐ │ d ┆ a ┆ b ┆ c │ @@ -596,7 +596,7 @@ def mean(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).mean() + >>> df.group_by("d", maintain_order=True).mean() shape: (3, 4) ┌────────┬─────┬──────────┬──────────┐ │ d ┆ a ┆ b ┆ c │ @@ -624,7 +624,7 @@ def median(self) -> DataFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).median() + >>> df.group_by("d", maintain_order=True).median() shape: (2, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -652,7 +652,7 @@ def min(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).min() + >>> df.group_by("d", maintain_order=True).min() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -680,7 +680,7 @@ def n_unique(self) -> DataFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).n_unique() + >>> df.group_by("d", maintain_order=True).n_unique() shape: (2, 3) ┌────────┬─────┬─────┐ │ d ┆ a ┆ b │ @@ -716,7 +716,7 @@ def quantile( ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).quantile(1) + >>> df.group_by("d", maintain_order=True).quantile(1) shape: (3, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -745,7 +745,7 @@ def sum(self) -> DataFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) - >>> df.groupby("d", maintain_order=True).sum() + >>> df.group_by("d", maintain_order=True).sum() shape: (3, 4) ┌────────┬─────┬──────┬─────┐ │ d ┆ a ┆ b ┆ c │ @@ -766,7 +766,7 @@ class RollingGroupBy: A rolling grouper. This has an `.agg` method which will allow you to run all polars expressions in a - groupby context. + group by context. """ def __init__( @@ -795,7 +795,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -840,12 +840,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -854,7 +854,7 @@ def agg( """ return ( self.df.lazy() - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -922,7 +922,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -949,7 +949,7 @@ def apply( """ return ( self.df.lazy() - .groupby_rolling( + .group_by_rolling( index_column=self.time_column, period=self.period, offset=self.offset, @@ -967,7 +967,7 @@ class DynamicGroupBy: A dynamic grouper. 
This has an `.agg` method which allows you to run all polars expressions in a - groupby context. + group by context. """ def __init__( @@ -1005,7 +1005,7 @@ def __iter__(self) -> Self: groups_df = ( self.df.lazy() .with_row_count(name=temp_col) - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, @@ -1054,12 +1054,12 @@ def agg( **named_aggs: IntoExpr, ) -> DataFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -1068,7 +1068,7 @@ def agg( """ return ( self.df.lazy() - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, @@ -1140,7 +1140,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -1167,7 +1167,7 @@ def apply( """ return ( self.df.lazy() - .groupby_dynamic( + .group_by_dynamic( index_column=self.time_column, every=self.every, period=self.period, diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 258cbc802bf3..2999118b0bb9 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -1205,7 +1205,7 @@ def agg_groups(self) -> Self: ... "value": [94, 95, 96, 97, 97, 99], ... } ... ) - >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) shape: (2, 2) ┌───────┬───────────┐ │ group ┆ value │ @@ -1851,7 +1851,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: Sort this column. When used in a projection/selection context, the whole column is sorted. - When used in a groupby context, the groups are sorted. + When used in a group by context, the groups are sorted. Parameters ---------- @@ -1904,7 +1904,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ null │ └──────┘ - When sorting in a groupby context, the groups are sorted. + When sorting in a group by context, the groups are sorted. >>> df = pl.DataFrame( ... { @@ -1912,7 +1912,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) - >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT shape: (2, 2) ┌───────┬────────────┐ │ group ┆ value │ @@ -2158,7 +2158,7 @@ def sort_by( Sort this column by the ordering of other columns. When used in a projection/selection context, the whole column is sorted. - When used in a groupby context, the groups are sorted. + When used in a group by context, the groups are sorted. Parameters ---------- @@ -2240,9 +2240,9 @@ def sort_by( │ b │ └───────┘ - When sorting in a groupby context, the groups are sorted. + When sorting in a group by context, the groups are sorted. - >>> df.groupby("group").agg( + >>> df.group_by("group").agg( ... pl.col("value1").sort_by("value2") ... 
) # doctest: +IGNORE_RESULT shape: (2, 2) @@ -2258,7 +2258,7 @@ def sort_by( Take a single row from each group where a column attains its minimal value within that group. - >>> df.groupby("group").agg( + >>> df.group_by("group").agg( ... pl.all().sort_by("value2").first() ... ) # doctest: +IGNORE_RESULT shape: (2, 3) @@ -2312,7 +2312,7 @@ def take( ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) - >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) shape: (2, 2) ┌───────┬───────┐ │ group ┆ value │ @@ -3056,7 +3056,7 @@ def over( """ Compute expressions over the given groups. - This expression is similar to performing a groupby aggregation and joining the + This expression is similar to performing a group by aggregation and joining the result back into the original dataframe. The outcome is similar to how `window functions @@ -3576,7 +3576,7 @@ def filter(self, predicate: Expr) -> Self: ... "b": [1, 2, 3], ... } ... ) - >>> df.groupby("group_col").agg( + >>> df.group_by("group_col").agg( ... [ ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), @@ -3614,7 +3614,7 @@ def where(self, predicate: Expr) -> Self: ... "b": [1, 2, 3], ... } ... ) - >>> df.groupby("group_col").agg( + >>> df.group_by("group_col").agg( ... [ ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), @@ -3791,7 +3791,7 @@ def apply( In a GroupBy context the function is applied by group: - >>> df.lazy().groupby("b", maintain_order=True).agg( + >>> df.lazy().group_by("b", maintain_order=True).agg( ... pl.col("a").apply(lambda x: x.sum()) ... ).collect() shape: (3, 2) @@ -3807,7 +3807,7 @@ def apply( It is better to implement this with an expression: - >>> df.groupby("b", maintain_order=True).agg( + >>> df.group_by("b", maintain_order=True).agg( ... pl.col("a").sum(), ... ) # doctest: +IGNORE_RESULT @@ -3897,7 +3897,7 @@ def flatten(self) -> Self: ... "values": [[1, 2], [2, 3], [4]], ... } ... ) - >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP shape: (2, 2) ┌───────┬───────────┐ │ group ┆ values │ @@ -5234,7 +5234,7 @@ def rolling_min( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -5440,7 +5440,7 @@ def rolling_max( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -5673,7 +5673,7 @@ def rolling_mean( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. 
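        A minimal sketch of that pattern, assuming a DataFrame ``df`` with a sorted
        datetime column "dt" and a numeric column "a" (illustrative only):

        >>> (
        ...     df.group_by_rolling(index_column="dt", period="2d").agg(
        ...         pl.col("a").mean().alias("mean_a"),
        ...         pl.col("a").sum().alias("sum_a"),
        ...     )
        ... )  # doctest: +SKIP

        This computes both aggregations over the same rolling window in a single
        pass, rather than issuing separate ``rolling_mean`` and ``rolling_sum`` calls.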
Examples @@ -5906,7 +5906,7 @@ def rolling_sum( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6138,7 +6138,7 @@ def rolling_std( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6370,7 +6370,7 @@ def rolling_var( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6605,7 +6605,7 @@ def rolling_median( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -6766,7 +6766,7 @@ def rolling_quantile( Notes ----- If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `groupby_rolling` this method can cache the window size + window, consider using `group_by_rolling` this method can cache the window size computation. Examples @@ -8446,7 +8446,7 @@ def cumulative_eval( Number of valid values there should be in the window before the expression is evaluated. valid values = `length - null_count` parallel - Run in parallel. Don't do this in a groupby or another operation that + Run in parallel. Don't do this in a group by or another operation that already has much parallelization. Warnings diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 1577353b722e..269366c413e8 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -870,7 +870,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr: Run all expression parallel. Don't activate this blindly. Parallelism is worth it if there is enough work to do per thread. - This likely should not be use in the groupby context, because we already + This likely should not be use in the group by context, because we already parallel execution per group Examples diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index feb994947557..78862b54abd5 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -303,7 +303,7 @@ def count(column: str | Series | None = None) -> Expr | int: ╞═══════╡ │ 3 │ └───────┘ - >>> df.groupby("c", maintain_order=True).agg(pl.count()) + >>> df.group_by("c", maintain_order=True).agg(pl.count()) shape: (2, 2) ┌─────┬───────┐ │ c ┆ count │ @@ -1083,7 +1083,7 @@ def apply( │ 2 ┆ 3 ┆ 7 │ └───────┴─────┴─────┘ >>> ( - ... df.groupby("group").agg( + ... df.group_by("group").agg( ... pl.apply( ... exprs=["a", "b"], ... 
function=lambda list_of_series: list_of_series[0] diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index c40b6c72d15a..75703adbd228 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -47,7 +47,7 @@ from polars.io._utils import _is_local_file from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy +from polars.lazyframe.group_by import LazyGroupBy from polars.selectors import _expand_selectors, expand_selector from polars.slice import LazyPolarsSlice from polars.utils._async import _AsyncDataFrameResult @@ -992,7 +992,7 @@ def explain( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).explain() # doctest: +SKIP """ @@ -1071,7 +1071,7 @@ def show_graph( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).show_graph() # doctest: +SKIP @@ -1496,7 +1496,7 @@ def profile( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( ... "a" ... ).profile() # doctest: +SKIP (shape: (3, 3) @@ -1510,15 +1510,15 @@ def profile( │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘, shape: (3, 3) - ┌────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ groupby_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └────────────────────────┴───────┴──────┘) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) """ if no_optimization: @@ -1639,7 +1639,7 @@ def collect( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -1741,7 +1741,7 @@ def collect_async( ... } ... ) >>> a = ( - ... lf.groupby("a", maintain_order=True) + ... lf.group_by("a", maintain_order=True) ... .agg(pl.all().sum()) ... .collect_async(queue.Queue()) ... ) @@ -2020,7 +2020,7 @@ def fetch( ... "c": [6, 5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) shape: (2, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2438,14 +2438,14 @@ def select_seq( ) return self._from_pyldf(self._ldf.select_seq(pyexprs)) - def groupby( + def group_by( self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = False, ) -> LazyGroupBy: """ - Start a groupby operation. + Start a group by operation. Parameters ---------- @@ -2456,7 +2456,7 @@ def groupby( Additional columns to group by, specified as positional arguments. maintain_order Ensure that the order of the groups is consistent with the input data. - This is slower than a default groupby. + This is slower than a default group by. 
Settings this to ``True`` blocks the possibility to run on the streaming engine. @@ -2472,7 +2472,7 @@ def groupby( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2487,7 +2487,7 @@ def groupby( Set ``maintain_order=True`` to ensure the order of the groups is consistent with the input. - >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() shape: (3, 2) ┌─────┬───────────┐ │ a ┆ c │ @@ -2501,7 +2501,7 @@ def groupby( Group by multiple columns by passing a list of column names. - >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2517,7 +2517,7 @@ def groupby( Or use positional arguments to group by multiple columns in the same way. Expressions are also accepted. - >>> lf.groupby("a", pl.col("b") // 2).agg( + >>> lf.group_by("a", pl.col("b") // 2).agg( ... pl.col("c").mean() ... ).collect() # doctest: +SKIP shape: (3, 3) @@ -2536,7 +2536,7 @@ def groupby( lgb = self._ldf.group_by(exprs, maintain_order) return LazyGroupBy(lgb) - def groupby_rolling( + def group_by_rolling( self, index_column: IntoExpr, *, @@ -2549,9 +2549,9 @@ def groupby_rolling( """ Create rolling groups based on a time, Int32, or Int64 column. - Different from a ``dynamic_groupby`` the windows are now determined by the + Different from a ``dynamic_group_by`` the windows are now determined by the individual values and are not of constant intervals. For constant intervals - use *groupby_dynamic*. + use :func:`LazyFrame.group_by_dynamic`. If you have a time series ````, then by default the windows created will be @@ -2588,7 +2588,7 @@ def groupby_rolling( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_rolling on an integer column, the windows are defined by: + In case of a group_by_rolling on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -2601,7 +2601,7 @@ def groupby_rolling( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a rolling groupby on indices, dtype needs to be one of + In case of a rolling group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. period @@ -2628,7 +2628,7 @@ def groupby_rolling( See Also -------- - groupby_dynamic + group_by_dynamic Examples -------- @@ -2644,7 +2644,7 @@ def groupby_rolling( ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() ... ) >>> out = ( - ... df.groupby_rolling(index_column="dt", period="2d") + ... df.group_by_rolling(index_column="dt", period="2d") ... .agg( ... [ ... pl.sum("a").alias("sum_a"), @@ -2686,7 +2686,7 @@ def groupby_rolling( ) return LazyGroupBy(lgb) - def groupby_dynamic( + def group_by_dynamic( self, index_column: IntoExpr, *, @@ -2704,7 +2704,7 @@ def groupby_dynamic( Group based on a time value (or index value of type Int32, Int64). Time windows are calculated and rows are assigned to windows. Different from a - normal groupby is that a row can be member of multiple groups. 
The time/index + normal group by is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame. @@ -2741,7 +2741,7 @@ def groupby_dynamic( not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". - In case of a groupby_dynamic on an integer column, the windows are defined by: + In case of a group_by_dynamic on an integer column, the windows are defined by: - "1i" # length 1 - "10i" # length 10 @@ -2758,7 +2758,7 @@ def groupby_dynamic( This column must be sorted in ascending order (or, if `by` is specified, then it must be sorted in ascending order within each group). - In case of a dynamic groupby on indices, dtype needs to be one of + In case of a dynamic group by on indices, dtype needs to be one of {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column. every @@ -2806,7 +2806,7 @@ def groupby_dynamic( See Also -------- - groupby_rolling + group_by_rolling Notes ----- @@ -2815,7 +2815,7 @@ def groupby_dynamic( .. code-block:: python # polars - df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) is equivalent to @@ -2861,7 +2861,7 @@ def groupby_dynamic( Group by windows of 1 hour starting at 2021-12-16 00:00:00. - >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( ... [ ... pl.col("time").min().alias("time_min"), ... pl.col("time").max().alias("time_max"), @@ -2881,7 +2881,7 @@ def groupby_dynamic( The window boundaries can also be added to the aggregation result - >>> lf.groupby_dynamic( + >>> lf.group_by_dynamic( ... "time", every="1h", include_boundaries=True, closed="right" ... ).agg([pl.col("time").count().alias("time_count")]).collect() shape: (4, 4) @@ -2899,7 +2899,7 @@ def groupby_dynamic( When closed="left", should not include right end of interval [lower_bound, upper_bound) - >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( ... [ ... pl.col("time").count().alias("time_count"), ... pl.col("time").alias("time_agg_list"), @@ -2919,7 +2919,7 @@ def groupby_dynamic( When closed="both" the time values at the window boundaries belong to 2 groups. - >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( ... pl.col("time").count().alias("time_count") ... ).collect() shape: (5, 2) @@ -2935,7 +2935,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ 1 │ └─────────────────────┴────────────┘ - Dynamic groupbys can also be combined with grouping on normal keys + Dynamic group bys can also be combined with grouping on normal keys >>> lf = pl.LazyFrame( ... { @@ -2964,7 +2964,7 @@ def groupby_dynamic( │ 2021-12-16 03:00:00 ┆ a │ └─────────────────────┴────────┘ >>> ( - ... lf.groupby_dynamic( + ... lf.group_by_dynamic( ... "time", ... every="1h", ... closed="both", @@ -2987,7 +2987,7 @@ def groupby_dynamic( │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ - Dynamic groupby on an index column + Dynamic group by on an index column >>> lf = pl.LazyFrame( ... { @@ -2995,7 +2995,7 @@ def groupby_dynamic( ... 
"A": ["A", "A", "B", "B", "B", "C"], ... } ... ) - >>> lf.groupby_dynamic( + >>> lf.group_by_dynamic( ... "idx", ... every="2i", ... period="3i", @@ -5333,3 +5333,178 @@ def update( result = result.drop(row_count_name) return self._from_pyldf(result._ldf) + + @deprecate_renamed_function("group_by", version="0.19.0") + def groupby( + self, + by: IntoExpr | Iterable[IntoExpr], + *more_by: IntoExpr, + maintain_order: bool = False, + ) -> LazyGroupBy: + """ + Start a group by operation. + + Alias for :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + return self.group_by(by, *more_by, maintain_order=maintain_order) + + @deprecate_renamed_function("group_by_rolling", version="0.19.0") + def groupby_rolling( + self, + index_column: IntoExpr, + *, + period: str | timedelta, + offset: str | timedelta | None = None, + closed: ClosedInterval = "right", + by: IntoExpr | Iterable[IntoExpr] | None = None, + check_sorted: bool = True, + ) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + Alias for :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + return self.group_by_rolling( + index_column, + period=period, + offset=offset, + closed=closed, + by=by, + check_sorted=check_sorted, + ) + + @deprecate_renamed_function("group_by_dynamic", version="0.19.0") + def groupby_dynamic( + self, + index_column: IntoExpr, + *, + every: str | timedelta, + period: str | timedelta | None = None, + offset: str | timedelta | None = None, + truncate: bool = True, + include_boundaries: bool = False, + closed: ClosedInterval = "left", + by: IntoExpr | Iterable[IntoExpr] | None = None, + start_by: StartBy = "window", + check_sorted: bool = True, + ) -> LazyGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`LazyFrame.group_by_rolling`. 
+ + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to 'every' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it's harder to + parallelize + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'} + The strategy to determine the start of the first window by. + + * 'window': Truncate the start of the window with the 'every' argument. + Note that weekly windows start on Monday. + * 'datapoint': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``'w'``): + + * 'monday': Start the window on the Monday before the first data point. + * 'tuesday': Start the window on the Tuesday before the first data point. + * ... + * 'sunday': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ # noqa: W505 + return self.group_by_dynamic( + index_column, + every=every, + period=period, + offset=offset, + truncate=truncate, + include_boundaries=include_boundaries, + closed=closed, + by=by, + start_by=start_by, + check_sorted=check_sorted, + ) diff --git a/py-polars/polars/lazyframe/groupby.py b/py-polars/polars/lazyframe/group_by.py similarity index 94% rename from py-polars/polars/lazyframe/groupby.py rename to py-polars/polars/lazyframe/group_by.py index 85eb9e10eb7a..06d700d86315 100644 --- a/py-polars/polars/lazyframe/groupby.py +++ b/py-polars/polars/lazyframe/group_by.py @@ -14,9 +14,9 @@ class LazyGroupBy: """ - Utility class for performing a groupby operation over a lazy dataframe. + Utility class for performing a group by operation over a lazy dataframe. - Generated by calling ``df.lazy().groupby(...)``. + Generated by calling ``df.lazy().group_by(...)``. """ def __init__(self, lgb: PyLazyGroupBy) -> None: @@ -28,12 +28,12 @@ def agg( **named_aggs: IntoExpr, ) -> LazyFrame: """ - Compute aggregations for each group of a groupby operation. + Compute aggregations for each group of a group by operation. 
Parameters ---------- *aggs - Aggregations to compute for each group of the groupby operation, + Aggregations to compute for each group of the group by operation, specified as positional arguments. Accepts expression input. Strings are parsed as column names. **named_aggs @@ -51,7 +51,7 @@ def agg( ... "c": [5, 4, 3, 2, 1], ... } ... ).lazy() - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... [pl.col("b"), pl.col("c")] ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 3) @@ -69,7 +69,9 @@ def agg( Compute the sum of a column for each group. - >>> ldf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + >>> ldf.group_by("a").agg( + ... pl.col("b").sum() + ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -83,7 +85,7 @@ def agg( Compute multiple aggregates at once by passing a list of expressions. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... [pl.sum("b"), pl.mean("c")] ... ).collect() # doctest: +IGNORE_RESULT shape: (3, 3) @@ -99,7 +101,7 @@ def agg( Or use positional arguments to compute multiple aggregations in the same way. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... pl.sum("b").suffix("_sum"), ... (pl.col("c") ** 2).mean().suffix("_mean_squared"), ... ).collect() # doctest: +IGNORE_RESULT @@ -116,7 +118,7 @@ def agg( Use keyword arguments to easily name your expression inputs. - >>> ldf.groupby("a").agg( + >>> ldf.group_by("a").agg( ... b_sum=pl.sum("b"), ... c_mean_squared=(pl.col("c") ** 2).mean(), ... ).collect() # doctest: +IGNORE_RESULT @@ -202,7 +204,7 @@ def apply( >>> ( ... df.lazy() - ... .groupby("color") + ... .group_by("color") ... .apply(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT @@ -260,7 +262,7 @@ def head(self, n: int = 5) -> LazyFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").head(2).sort("letters") + >>> df.group_by("letters").head(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -308,7 +310,7 @@ def tail(self, n: int = 5) -> LazyFrame: │ a ┆ 5 │ │ b ┆ 6 │ └─────────┴─────┘ - >>> df.groupby("letters").tail(2).sort("letters") + >>> df.group_by("letters").tail(2).sort("letters") shape: (5, 2) ┌─────────┬─────┐ │ letters ┆ nrs │ @@ -337,7 +339,7 @@ def all(self) -> LazyFrame: ... "b": [1, 2, 3, 4], ... } ... ).lazy() - >>> ldf.groupby("a", maintain_order=True).all().collect() + >>> ldf.group_by("a", maintain_order=True).all().collect() shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ @@ -368,7 +370,7 @@ def count(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).count().collect() + >>> ldf.group_by("d", maintain_order=True).count().collect() shape: (3, 2) ┌────────┬───────┐ │ d ┆ count │ @@ -397,7 +399,7 @@ def first(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).first().collect() + >>> ldf.group_by("d", maintain_order=True).first().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -426,7 +428,7 @@ def last(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).last().collect() + >>> ldf.group_by("d", maintain_order=True).last().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -455,7 +457,7 @@ def max(self) -> LazyFrame: ... 
"d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).max().collect() + >>> ldf.group_by("d", maintain_order=True).max().collect() shape: (3, 4) ┌────────┬─────┬──────┬──────┐ │ d ┆ a ┆ b ┆ c │ @@ -484,7 +486,7 @@ def mean(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).mean().collect() + >>> ldf.group_by("d", maintain_order=True).mean().collect() shape: (3, 4) ┌────────┬─────┬──────────┬──────────┐ │ d ┆ a ┆ b ┆ c │ @@ -512,7 +514,7 @@ def median(self) -> LazyFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).median().collect() + >>> ldf.group_by("d", maintain_order=True).median().collect() shape: (2, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -540,7 +542,7 @@ def min(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).min().collect() + >>> ldf.group_by("d", maintain_order=True).min().collect() shape: (3, 4) ┌────────┬─────┬──────┬───────┐ │ d ┆ a ┆ b ┆ c │ @@ -568,7 +570,7 @@ def n_unique(self) -> LazyFrame: ... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).n_unique().collect() + >>> ldf.group_by("d", maintain_order=True).n_unique().collect() shape: (2, 3) ┌────────┬─────┬─────┐ │ d ┆ a ┆ b │ @@ -604,7 +606,7 @@ def quantile( ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ).lazy() - >>> ldf.groupby("d", maintain_order=True).quantile(1).collect() + >>> ldf.group_by("d", maintain_order=True).quantile(1).collect() shape: (3, 3) ┌────────┬─────┬──────┐ │ d ┆ a ┆ b │ @@ -633,7 +635,7 @@ def sum(self) -> LazyFrame: ... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... 
).lazy() - >>> ldf.groupby("d", maintain_order=True).sum().collect() + >>> ldf.group_by("d", maintain_order=True).sum().collect() shape: (3, 4) ┌────────┬─────┬──────┬─────┐ │ d ┆ a ┆ b ┆ c │ diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 477f4196e240..c792735f9b64 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -420,7 +420,7 @@ def by_dtype( Group by string columns and sum the numeric columns: - >>> df.groupby(cs.string()).agg(cs.numeric().sum()).sort(by="other") + >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by="other") shape: (2, 2) ┌───────┬──────────┐ │ other ┆ value │ @@ -1519,7 +1519,7 @@ def string(include_categorical: bool = False) -> SelectorType: Group by all string columns, sum the numeric columns, then sort by the string cols: - >>> df.groupby(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string()) + >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string()) shape: (2, 3) ┌─────┬─────┬─────┐ │ w ┆ x ┆ y │ @@ -1532,7 +1532,7 @@ def string(include_categorical: bool = False) -> SelectorType: Group by all string *and* categorical columns: - >>> df.groupby(cs.string(True)).agg(cs.numeric().sum()).sort(by=cs.string(True)) + >>> df.group_by(cs.string(True)).agg(cs.numeric().sum()).sort(by=cs.string(True)) shape: (3, 4) ┌─────┬─────┬─────┬──────┐ │ w ┆ z ┆ x ┆ y │ diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 7afd49f03a81..76170741c761 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -547,7 +547,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Series: Run all expression parallel. Don't activate this blindly. Parallelism is worth it if there is enough work to do per thread. - This likely should not be use in the groupby context, because we already + This likely should not be use in the group by context, because we already parallel execution per group Examples diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 9c7d9454704e..e3b1025f9f95 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2327,7 +2327,7 @@ def cumulative_eval( Number of valid values there should be in the window before the expression is evaluated. valid values = `length - null_count` parallel - Run in parallel. Don't do this in a groupby or another operation that + Run in parallel. Don't do this in a group by or another operation that already has much parallelization. 
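
For context, a small usage sketch of ``cumulative_eval`` on toy data (``parallel`` is not enabled, per the note above about group by contexts):

    import polars as pl

    s = pl.Series("values", [1, 2, 3, 4, 5])

    # Evaluate the expression over the expanding window [0, i] for each row i:
    # first() - last() ** 2 gives 0, -3, -8, -15, -24 here.
    out = s.cumulative_eval(pl.element().first() - pl.element().last() ** 2)
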
Warnings diff --git a/py-polars/tests/benchmark/run_h2oai_benchmark.py b/py-polars/tests/benchmark/run_h2oai_benchmark.py index 133389cfa428..961f8a3a5f9f 100644 --- a/py-polars/tests/benchmark/run_h2oai_benchmark.py +++ b/py-polars/tests/benchmark/run_h2oai_benchmark.py @@ -39,7 +39,7 @@ t00 = time.time() t0 = time.time() print("q1") -out = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by("id1").agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) @@ -47,7 +47,7 @@ t0easy = time.time() t0 = time.time() print("q2") -out = x.groupby(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) @@ -55,7 +55,7 @@ t0 = time.time() print("q3") out = ( - x.groupby("id3") + x.group_by("id3") .agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]) .collect() ) @@ -67,7 +67,7 @@ t0 = time.time() print("q4") out = ( - x.groupby("id4") + x.group_by("id4") .agg( [ pl.mean("v1").alias("v1_mean"), @@ -86,7 +86,7 @@ t0 = time.time() print("q5") out = ( - x.groupby("id6") + x.group_by("id6") .agg( [ pl.sum("v1").alias("v1_sum"), @@ -106,7 +106,7 @@ t0 = time.time() print("q6") out = ( - x.groupby(["id4", "id5"]) + x.group_by(["id4", "id5"]) .agg([pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]) .collect() ) @@ -118,7 +118,9 @@ t0 = time.time() print("q7") out = ( - x.groupby("id3").agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]).collect() + x.group_by("id3") + .agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]) + .collect() ) print(time.time() - t0) print("out.shape", out.shape) @@ -128,7 +130,7 @@ print("q8") out = ( x.drop_nulls("v3") - .groupby("id6") + .group_by("id6") .agg(pl.col("v3").top_k(2).alias("largest2_v3")) .explode("largest2_v3") .collect() @@ -139,7 +141,7 @@ t0 = time.time() print("q9") -out = x.groupby(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() +out = x.group_by(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["r2"].sum()', out["r2"].sum()) @@ -147,7 +149,7 @@ t0 = time.time() print("q10") out = ( - x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) + x.group_by(["id1", "id2", "id3", "id4", "id5", "id6"]) .agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]) .collect() ) @@ -160,7 +162,7 @@ t00 = time.time() t0 = time.time() print("q1") -out = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by("id1").agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) assert out.shape == (96, 2) assert out["v1_sum"].sum() == 28501451 @@ -168,7 +170,7 @@ t0easy = time.time() t0 = time.time() print("q2") -out = x.groupby(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() +out = x.group_by(["id1", "id2"]).agg(pl.sum("v1").alias("v1_sum")).collect() print(time.time() - t0) assert out.shape == (9216, 3) assert out["v1_sum"].sum() == 28501451 @@ -176,7 +178,7 @@ t0 = time.time() print("q3") out = ( - x.groupby("id3") + x.group_by("id3") .agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]) .collect() ) @@ -188,7 +190,7 @@ t0 = time.time() print("q4") out = ( - x.groupby("id4") + x.group_by("id4") .agg( [ pl.mean("v1").alias("v1_mean"), @@ -207,7 +209,7 @@ t0 = time.time() 
print("q5") out = ( - x.groupby("id6") + x.group_by("id6") .agg( [ pl.sum("v1").alias("v1_sum"), @@ -227,7 +229,7 @@ t0 = time.time() print("q6") out = ( - x.groupby(["id4", "id5"]) + x.group_by(["id4", "id5"]) .agg([pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]) .collect() ) @@ -239,7 +241,7 @@ t0 = time.time() print("q7") out = ( - x.groupby("id3") + x.group_by("id3") .agg( [ (pl.max("v1").alias("v1_max") - pl.min("v2").alias("v2_mean")).alias( @@ -258,7 +260,7 @@ out = ( x.drop_nulls("v3") .sort("v3", descending=True) - .groupby("id6") + .group_by("id6") .agg(pl.col("v3").head(2).alias("largest2_v3")) .explode("largest2_v3") .collect() @@ -269,7 +271,7 @@ t0 = time.time() print("q9") -out = x.groupby(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() +out = x.group_by(["id2", "id4"]).agg((pl.corr("v1", "v2") ** 2).alias("r2")).collect() print(time.time() - t0) assert out.shape == (9216, 3) assert np.isclose(out["r2"].sum(), 9.902706276948825) @@ -277,7 +279,7 @@ t0 = time.time() print("q10") out = ( - x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) + x.group_by(["id1", "id2", "id3", "id4", "id5", "id6"]) .agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]) .collect() ) diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py index 7b8adbad0c22..607f19ed0184 100644 --- a/py-polars/tests/benchmark/test_release.py +++ b/py-polars/tests/benchmark/test_release.py @@ -170,7 +170,7 @@ def test_boolean_min_max_agg() -> None: df = pl.DataFrame({"idx": idx, "c": c}) aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] - assert df.groupby("idx").agg(aggs).sum().to_dict(False) == { + assert df.group_by("idx").agg(aggs).sum().to_dict(False) == { "idx": [107583], "c_min": [120], "c_max": [321], @@ -179,14 +179,14 @@ def test_boolean_min_max_agg() -> None: nulls = np.random.randint(0, 500, 1000) < 100 assert df.with_columns( c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c")) - ).groupby("idx").agg(aggs).sum().to_dict(False) == { + ).group_by("idx").agg(aggs).sum().to_dict(False) == { "idx": [107583], "c_min": [133], "c_max": [276], } -def test_categorical_vs_str_groupby() -> None: +def test_categorical_vs_str_group_by() -> None: # this triggers the perfect hash table s = pl.Series("a", np.random.randint(0, 50, 100)) s_with_nulls = pl.select( @@ -198,11 +198,11 @@ def test_categorical_vs_str_groupby() -> None: cat_out = ( s_.cast(pl.Categorical) .to_frame("a") - .groupby("a") + .group_by("a") .agg(pl.first().alias("first")) ) - str_out = s_.to_frame("a").groupby("a").agg(pl.first().alias("first")) + str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) cat_out.with_columns(pl.col("a").cast(str)) assert_frame_equal( cat_out.with_columns( diff --git a/py-polars/tests/parametric/test_groupby_rolling.py b/py-polars/tests/parametric/test_groupby_rolling.py index cb55825f951b..c4c62b36a250 100644 --- a/py-polars/tests/parametric/test_groupby_rolling.py +++ b/py-polars/tests/parametric/test_groupby_rolling.py @@ -25,7 +25,7 @@ data=st.data(), time_unit=strategy_time_unit, ) -def test_groupby_rolling( +def test_group_by_rolling( period: str, offset: str, closed: ClosedInterval, @@ -43,7 +43,7 @@ def test_groupby_rolling( ) df = dataframe.sort("ts").unique("ts") try: - result = df.groupby_rolling( + result = df.group_by_rolling( "ts", period=period, offset=offset, closed=closed ).agg(pl.col("value")) except pl.exceptions.PolarsPanicError as exc: diff --git 
a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index f3e212444836..e3abcab1c271 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -744,9 +744,9 @@ def test_shift() -> None: assert_frame_equal(a, b) -def test_custom_groupby() -> None: +def test_custom_group_by() -> None: df = pl.DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]}) - out = df.groupby("b", maintain_order=True).agg( + out = df.group_by("b", maintain_order=True).agg( [pl.col("a").apply(lambda x: x.sum(), return_dtype=pl.Int64)] ) assert out.rows() == [("a", 1), ("b", 2), ("c", 2)] @@ -981,7 +981,7 @@ def test_init_series_edge_cases() -> None: assert df3.columns == ["column_0", "column_1"] -def test_head_groupby() -> None: +def test_head_group_by() -> None: commodity_prices = { "commodity": [ "Wheat", @@ -1024,7 +1024,7 @@ def test_head_groupby() -> None: keys = ["commodity", "location"] out = ( df.sort(by="price", descending=True) - .groupby(keys, maintain_order=True) + .group_by(keys, maintain_order=True) .agg([pl.col("*").exclude(keys).head(2).keep_name()]) .explode(pl.col("*").exclude(keys)) ) @@ -1041,12 +1041,12 @@ def test_head_groupby() -> None: df = pl.DataFrame( {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]} ) - out = df.groupby("letters").tail(2).sort("letters") + out = df.group_by("letters").tail(2).sort("letters") assert_frame_equal( out, pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}), ) - out = df.groupby("letters").head(2).sort("letters") + out = df.group_by("letters").head(2).sort("letters") assert_frame_equal( out, pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}), @@ -1854,7 +1854,7 @@ def __repr__(self) -> str: def test_hashing_on_python_objects() -> None: - # see if we can do a groupby, drop_duplicates on a DataFrame with objects. + # see if we can do a group_by, drop_duplicates on a DataFrame with objects. 
# this requires that the hashing and aggregations are done on python objects df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]}) @@ -1867,7 +1867,7 @@ def __eq__(self, other: Any) -> bool: return True df = df.with_columns(pl.col("a").apply(lambda x: Foo()).alias("foo")) - assert df.groupby(["foo"]).first().shape == (1, 3) + assert df.group_by(["foo"]).first().shape == (1, 3) assert df.unique().shape == (3, 3) @@ -1943,7 +1943,7 @@ def test_apply_dataframe_return() -> None: assert_frame_equal(out, expected) -def test_groupby_cat_list() -> None: +def test_group_by_cat_list() -> None: grouped = ( pl.DataFrame( [ @@ -1952,7 +1952,7 @@ def test_groupby_cat_list() -> None: ] ) .with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column")) - .groupby("int_column", maintain_order=True) + .group_by("int_column", maintain_order=True) .agg([pl.col("cat_column")])["cat_column"] ) @@ -1961,12 +1961,12 @@ def test_groupby_cat_list() -> None: assert out[0] == "a" -def test_groupby_agg_n_unique_floats() -> None: +def test_group_by_agg_n_unique_floats() -> None: # tests proper dispatch df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) for dtype in [pl.Float32, pl.Float64]: - out = df.groupby("a", maintain_order=True).agg( + out = df.group_by("a", maintain_order=True).agg( [pl.col("b").cast(dtype).n_unique()] ) assert out["b"].to_list() == [2, 1] @@ -2033,7 +2033,7 @@ def __repr__(self) -> str: df = pl.DataFrame({"groups": [1, 1, 2], "a": foos}) assert sys.getrefcount(foos[0]) == base_count + 1 - out = df.groupby("groups", maintain_order=True).agg(pl.col("a").alias("a")) + out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a")) assert sys.getrefcount(foos[0]) == base_count + 2 s = out["a"].list.explode() assert sys.getrefcount(foos[0]) == base_count + 3 @@ -2048,25 +2048,25 @@ def __repr__(self) -> str: assert sys.getrefcount(foos[0]) == base_count -def test_groupby_order_dispatch() -> None: +def test_group_by_order_dispatch() -> None: df = pl.DataFrame({"x": list("bab"), "y": range(3)}) - result = df.groupby("x", maintain_order=True).count() + result = df.group_by("x", maintain_order=True).count() expected = pl.DataFrame( {"x": ["b", "a"], "count": [2, 1]}, schema_overrides={"count": pl.UInt32} ) assert_frame_equal(result, expected) - result = df.groupby("x", maintain_order=True).all() + result = df.group_by("x", maintain_order=True).all() expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]}) assert_frame_equal(result, expected) -def test_partitioned_groupby_order() -> None: +def test_partitioned_group_by_order() -> None: # check if group ordering is maintained. 
# we only have 30 groups, so this triggers a partitioned group by df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)}) - out = df.groupby("x", maintain_order=True).agg(pl.all().implode()) + out = df.group_by("x", maintain_order=True).agg(pl.all().implode()) assert_series_equal(out["x"], df["x"]) @@ -2721,11 +2721,11 @@ def test_empty_is_in() -> None: assert df_empty_isin.schema == {"foo": pl.Utf8} -def test_groupby_slice_expression_args() -> None: +def test_group_by_slice_expression_args() -> None: df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)}) out = ( - df.groupby("groups", maintain_order=True) + df.group_by("groups", maintain_order=True) .agg([pl.col("vals").slice(pl.count() * 0.1, (pl.count() // 5))]) .explode("vals") ) @@ -2751,7 +2751,7 @@ def test_join_suffixes() -> None: def test_explode_empty() -> None: df = ( pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]}) - .groupby("x", maintain_order=True) + .group_by("x", maintain_order=True) .agg(pl.col("y").take([])) ) assert df.explode("y").to_dict(False) == {"x": ["a", "b"], "y": [None, None]} diff --git a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index d5888b5f9070..2eb2fedde73c 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -51,7 +51,7 @@ def test_array_construction() -> None: assert df.rows() == [] -def test_array_in_groupby() -> None: +def test_array_in_group_by() -> None: df = pl.DataFrame( [ pl.Series("id", [1, 2]), @@ -59,7 +59,7 @@ def test_array_in_groupby() -> None: ] ) - assert next(iter(df.groupby("id", maintain_order=True)))[1]["list"].to_list() == [ + assert next(iter(df.group_by("id", maintain_order=True)))[1]["list"].to_list() == [ [1, 2] ] @@ -68,8 +68,8 @@ def test_array_in_groupby() -> None: schema={"a": pl.Array(inner=pl.Int64, width=2), "g": pl.Int64}, ) - out0 = df.groupby("g").agg(pl.col("a")).sort("g") - out1 = df.set_sorted("g").groupby("g").agg(pl.col("a")) + out0 = df.group_by("g").agg(pl.col("a")).sort("g") + out1 = df.set_sorted("g").group_by("g").agg(pl.col("a")) for out in [out0, out1]: assert out.schema == { diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 9db461143d58..2275b5211a2c 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -87,8 +87,8 @@ def test_categorical_describe_3487() -> None: @StringCache() def test_categorical_is_in_list() -> None: # this requires type coercion to cast. 
- # we should not cast within the function as this would be expensive within a groupby - # context that would be a cast per group + # we should not cast within the function as this would be expensive within a + # group by context that would be a cast per group df = pl.DataFrame( {"a": [1, 2, 3, 1, 2], "b": ["a", "b", "c", "d", "e"]} ).with_columns(pl.col("b").cast(pl.Categorical)) @@ -115,7 +115,7 @@ def test_unset_sorted_on_append() -> None: ] ).sort("key") df = pl.concat([df1, df2], rechunk=False) - assert df.groupby("key").count()["count"].to_list() == [4, 4] + assert df.group_by("key").count()["count"].to_list() == [4, 4] def test_categorical_error_on_local_cmp() -> None: @@ -307,11 +307,11 @@ def test_nested_categorical_aggregation_7848() -> None: "group": [1, 1, 2, 2, 2, 3, 3], "letter": ["a", "b", "c", "d", "e", "f", "g"], } - ).with_columns([pl.col("letter").cast(pl.Categorical)]).groupby( + ).with_columns([pl.col("letter").cast(pl.Categorical)]).group_by( maintain_order=True, by=["group"] ).all().with_columns( [pl.col("letter").list.lengths().alias("c_group")] - ).groupby( + ).group_by( by=["c_group"], maintain_order=True ).agg( pl.col("letter") diff --git a/py-polars/tests/unit/datatypes/test_decimal.py b/py-polars/tests/unit/datatypes/test_decimal.py index e6f8029dd0b8..aa89b8eb9dad 100644 --- a/py-polars/tests/unit/datatypes/test_decimal.py +++ b/py-polars/tests/unit/datatypes/test_decimal.py @@ -180,7 +180,7 @@ def test_decimal_aggregations() -> None: } ) - assert df.groupby("g", maintain_order=True).agg( + assert df.group_by("g", maintain_order=True).agg( sum=pl.sum("a"), min=pl.min("a"), max=pl.max("a"), diff --git a/py-polars/tests/unit/datatypes/test_float.py b/py-polars/tests/unit/datatypes/test_float.py index 6ed39b35e6c7..16dd1df2022c 100644 --- a/py-polars/tests/unit/datatypes/test_float.py +++ b/py-polars/tests/unit/datatypes/test_float.py @@ -1,7 +1,7 @@ import polars as pl -def test_nan_in_groupby_agg() -> None: +def test_nan_in_group_by_agg() -> None: df = pl.DataFrame( { "key": ["a", "a", "a", "a"], @@ -10,8 +10,8 @@ def test_nan_in_groupby_agg() -> None: } ) - assert df.groupby("bar", "key").agg(pl.col("value").max())["value"].item() == 18.78 - assert df.groupby("bar", "key").agg(pl.col("value").min())["value"].item() == 18.58 + assert df.group_by("bar", "key").agg(pl.col("value").max())["value"].item() == 18.78 + assert df.group_by("bar", "key").agg(pl.col("value").min())["value"].item() == 18.58 def test_nan_aggregations() -> None: @@ -29,6 +29,6 @@ def test_nan_aggregations() -> None: == "{'max': [3.0], 'min': [1.0], 'nan_max': [nan], 'nan_min': [nan]}" ) assert ( - str(df.groupby("b").agg(aggs).to_dict(False)) + str(df.group_by("b").agg(aggs).to_dict(False)) == "{'b': [1], 'max': [3.0], 'min': [1.0], 'nan_max': [nan], 'nan_min': [nan]}" ) diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 53901bab76a1..c6b0cbc68fb1 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -61,7 +61,7 @@ def test_categorical() -> None: ] ) out = ( - df.groupby(["a", "b"]) + df.group_by(["a", "b"]) .agg( [ pl.col("c").count().alias("num_different_c"), @@ -90,11 +90,11 @@ def test_cast_inner() -> None: ) -def test_list_empty_groupby_result_3521() -> None: +def test_list_empty_group_by_result_3521() -> None: # Create a left relation where the join column contains a null value left = pl.DataFrame().with_columns( [ - pl.lit(1).alias("groupby_column"), + 
pl.lit(1).alias("group_by_column"), pl.lit(None).cast(pl.Int32).alias("join_column"), ] ) @@ -111,9 +111,9 @@ def test_list_empty_groupby_result_3521() -> None: # This will panic on polars version 0.13.38 and 0.13.39 assert ( left.join(right, on="join_column", how="left") - .groupby("groupby_column") + .group_by("group_by_column") .agg(pl.col("n_unique_column").drop_nulls()) - ).to_dict(False) == {"groupby_column": [1], "n_unique_column": [[]]} + ).to_dict(False) == {"group_by_column": [1], "n_unique_column": [[]]} def test_list_fill_null() -> None: @@ -177,21 +177,21 @@ def test_inner_type_categorical_on_rechunk() -> None: assert pl.concat([df, df], rechunk=True).dtypes == [pl.List(pl.Categorical)] -def test_groupby_list_column() -> None: +def test_group_by_list_column() -> None: df = ( pl.DataFrame({"a": ["a", "b", "a"]}) .with_columns(pl.col("a").cast(pl.Categorical)) - .groupby("a", maintain_order=True) + .group_by("a", maintain_order=True) .agg(pl.col("a").alias("a_list")) ) - assert df.groupby("a_list", maintain_order=True).first().to_dict(False) == { + assert df.group_by("a_list", maintain_order=True).first().to_dict(False) == { "a_list": [["a", "a"], ["b"]], "a": ["a", "b"], } -def test_groupby_multiple_keys_contains_list_column() -> None: +def test_group_by_multiple_keys_contains_list_column() -> None: df = ( pl.DataFrame( { @@ -200,7 +200,7 @@ def test_groupby_multiple_keys_contains_list_column() -> None: "c": [3, 2, 1, 0], } ) - .groupby(["a", "b"], maintain_order=True) + .group_by(["a", "b"], maintain_order=True) .agg(pl.all()) ) assert df.to_dict(False) == { @@ -263,7 +263,7 @@ def test_fast_explode_on_list_struct_6208() -> None: def test_flat_aggregation_to_list_conversion_6918() -> None: df = pl.DataFrame({"a": [1, 2, 2], "b": [[0, 1], [2, 3], [4, 5]]}) - assert df.groupby("a", maintain_order=True).agg( + assert df.group_by("a", maintain_order=True).agg( pl.concat_list([pl.col("b").list.get(i).mean().implode() for i in range(2)]) ).to_dict(False) == {"a": [1, 2], "b": [[[0.0, 1.0]], [[3.0, 4.0]]]} @@ -398,7 +398,7 @@ def test_logical_type_struct_agg_list() -> None: {"cats": ["Value1", "Value2", "Value1"]}, schema_overrides={"cats": pl.Categorical}, ) - out = df.groupby(1).agg(pl.struct("cats")) + out = df.group_by(1).agg(pl.struct("cats")) assert out.dtypes == [ pl.Int32, pl.List(pl.Struct([pl.Field("cats", pl.Categorical)])), @@ -418,7 +418,7 @@ def test_logical_parallel_list_collect() -> None: }, schema_overrides={"Values": pl.Categorical}, ) - .groupby("Group") + .group_by("Group") .agg(pl.col("Values").value_counts(sort=True)) .explode("Values") .unnest("Values") @@ -498,7 +498,7 @@ def test_list_amortized_iter_clear_settings_10126() -> None: out = ( pl.DataFrame({"a": [[1], [1], [2]], "b": [[1, 2], [1, 3], [4]]}) .explode("a") - .groupby("a") + .group_by("a") .agg(pl.col("b").flatten()) .with_columns(pl.col("b").list.unique()) .sort("a") diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 50a4f21315aa..cf694f984387 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -188,7 +188,7 @@ def test_value_counts_expr() -> None: df = pl.DataFrame({"session": [1, 1, 1], "id": [2, 2, 3]}) - assert df.groupby("session").agg( + assert df.group_by("session").agg( [pl.col("id").value_counts(sort=True).first()] ).to_dict(False) == {"session": [1], "id": [{"id": 2, "counts": 2}]} @@ -375,7 +375,7 @@ def test_struct_agg_all() -> None: } ) - assert df.groupby("group", 
maintain_order=True).all().to_dict(False) == { + assert df.group_by("group", maintain_order=True).all().to_dict(False) == { "group": ["a", "b"], "col1": [ [{"x": 1, "y": 100}, {"x": 2, "y": 200}], @@ -607,9 +607,9 @@ def test_nested_struct_sliced_append() -> None: ] -def test_struct_groupby_field_agg_4216() -> None: +def test_struct_group_by_field_agg_4216() -> None: df = pl.DataFrame([{"a": {"b": 1}, "c": 0}]) - assert df.groupby("c").agg(pl.col("a").struct.field("b").count()).to_dict( + assert df.group_by("c").agg(pl.col("a").struct.field("b").count()).to_dict( False ) == {"c": [0], "b": [1]} @@ -816,7 +816,7 @@ def test_struct_name_passed_in_agg_apply() -> None: ] ).alias("index") - assert pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 2]}).groupby( + assert pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 2]}).group_by( "C" ).agg(struct_expr).sort("C", descending=True).to_dict(False) == { "C": [2, 1], @@ -828,7 +828,7 @@ def test_struct_name_passed_in_agg_apply() -> None: df = pl.DataFrame({"val": [-3, -2, -1, 0, 1, 2, 3], "k": [0] * 7}) - assert df.groupby("k").agg( + assert df.group_by("k").agg( pl.struct( [ pl.col("val").value_counts(sort=True).struct.field("val").alias("val"), diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index f3a2a1862b35..1b6767dc07bc 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -529,7 +529,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: } ).set_sorted("event_date") df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic( + out = df.group_by_dynamic( index_column="event_date", every="1mo", period="2mo", @@ -562,7 +562,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: ).set_sorted("event_date") df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic( + out = df.group_by_dynamic( index_column="event_date", every="1mo", by=["admin", "five_type", "actor"], @@ -586,7 +586,7 @@ def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None: .set_sorted("idx") ) - out = df.groupby_dynamic( + out = df.group_by_dynamic( "idx", every="2i", period="3i", include_boundaries=True ).agg(pl.col("A")) @@ -624,7 +624,7 @@ def test_explode_date() -> None: } ) out = ( - df.groupby("b", maintain_order=True) + df.group_by("b", maintain_order=True) .agg([pl.col("a"), pl.col("c").pct_change()]) .explode(["a", "c"]) ) @@ -637,7 +637,7 @@ def test_explode_date() -> None: ] -def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: +def test_group_by_dynamic_when_conversion_crosses_dates_7274() -> None: df = ( pl.DataFrame( data={ @@ -658,7 +658,7 @@ def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: .set_sorted() ) ) - result = df.groupby_dynamic( + result = df.group_by_dynamic( index_column="timestamp", every="1d", closed="left" ).agg(pl.col("value").count()) expected = pl.DataFrame({"timestamp": [datetime(1970, 1, 1)], "value": [2]}) @@ -667,7 +667,7 @@ def test_groupby_dynamic_when_conversion_crosses_dates_7274() -> None: pl.col("value").cast(pl.UInt32), ) assert_frame_equal(result, expected) - result = df.groupby_dynamic( + result = df.group_by_dynamic( index_column="timestamp_utc", every="1d", closed="left" ).agg(pl.col("value").count()) expected = pl.DataFrame( @@ -701,7 +701,7 @@ def test_rolling() -> None: period: str | timedelta for period in ("2d", 
timedelta(days=2)): # type: ignore[assignment] - out = df.groupby_rolling(index_column="dt", period=period).agg( + out = df.group_by_rolling(index_column="dt", period=period).agg( [ pl.sum("a").alias("sum_a"), pl.min("a").alias("min_a"), @@ -882,7 +882,7 @@ def test_read_utc_times_parquet() -> None: @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> None: +def test_default_negative_every_offset_dynamic_group_by(time_zone: str | None) -> None: # 2791 dts = [ datetime(2020, 1, 1), @@ -892,7 +892,7 @@ def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> ] df = pl.DataFrame({"dt": dts, "idx": range(len(dts))}).set_sorted("dt") df = df.with_columns(pl.col("dt").dt.replace_time_zone(time_zone)) - out = df.groupby_dynamic(index_column="dt", every="1mo", closed="right").agg( + out = df.group_by_dynamic(index_column="dt", every="1mo", closed="right").agg( pl.col("idx") ) @@ -918,14 +918,14 @@ def test_default_negative_every_offset_dynamic_groupby(time_zone: str | None) -> ("1w", timedelta(weeks=2)), ], ) -def test_groupby_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: +def test_group_by_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: start_dt = datetime(2021, 11, 7) end_dt = start_dt + offset date_range = pl.date_range( start_dt, end_dt, rule, time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every=rule, start_by="datapoint").agg( + result = df.group_by_dynamic("time", every=rule, start_by="datapoint").agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -996,7 +996,7 @@ def test_groupby_dynamic_crossing_dst(rule: str, offset: timedelta) -> None: ), ], ) -def test_groupby_dynamic_startby_monday_crossing_dst( +def test_group_by_dynamic_startby_monday_crossing_dst( start_by: StartBy, expected_time: list[datetime], expected_value: list[float] ) -> None: start_dt = datetime(2021, 11, 7) @@ -1005,7 +1005,7 @@ def test_groupby_dynamic_startby_monday_crossing_dst( start_dt, end_dt, "1d", time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1w", start_by=start_by).agg( + result = df.group_by_dynamic("time", every="1w", start_by=start_by).agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -1015,14 +1015,14 @@ def test_groupby_dynamic_startby_monday_crossing_dst( assert_frame_equal(result, expected) -def test_groupby_dynamic_startby_monday_dst_8737() -> None: +def test_group_by_dynamic_startby_monday_dst_8737() -> None: start_dt = datetime(2021, 11, 6, 20) stop_dt = datetime(2021, 11, 7, 20) date_range = pl.date_range( start_dt, stop_dt, "1d", time_zone="US/Central", eager=True ) df = pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1w", start_by="monday").agg( + result = df.group_by_dynamic("time", every="1w", start_by="monday").agg( pl.col("value").mean() ) expected = pl.DataFrame( @@ -1037,14 +1037,14 @@ def test_groupby_dynamic_startby_monday_dst_8737() -> None: assert_frame_equal(result, expected) -def test_groupby_dynamic_monthly_crossing_dst() -> None: +def test_group_by_dynamic_monthly_crossing_dst() -> None: start_dt = datetime(2021, 11, 1) end_dt = datetime(2021, 12, 1) date_range = pl.date_range( start_dt, end_dt, "1mo", time_zone="US/Central", eager=True ) df = 
pl.DataFrame({"time": date_range, "value": range(len(date_range))}) - result = df.groupby_dynamic("time", every="1mo").agg(pl.col("value").mean()) + result = df.group_by_dynamic("time", every="1mo").agg(pl.col("value").mean()) expected = pl.DataFrame( {"time": date_range, "value": range(len(date_range))}, schema_overrides={"value": pl.Float64}, @@ -1052,10 +1052,10 @@ def test_groupby_dynamic_monthly_crossing_dst() -> None: assert_frame_equal(result, expected) -def test_groupby_dynamic_2d_9333() -> None: +def test_group_by_dynamic_2d_9333() -> None: df = pl.DataFrame({"ts": [datetime(2000, 1, 1, 3)], "values": [10.0]}) df = df.with_columns(pl.col("ts").set_sorted()) - result = df.groupby_dynamic("ts", every="2d").agg(pl.col("values")) + result = df.group_by_dynamic("ts", every="2d").agg(pl.col("values")) expected = pl.DataFrame({"ts": [datetime(1999, 12, 31, 0)], "values": [[10.0]]}) assert_frame_equal(result, expected) @@ -1190,10 +1190,10 @@ def test_add_duration_3786() -> None: } -def test_rolling_groupby_by_argument() -> None: +def test_rolling_group_by_by_argument() -> None: df = pl.DataFrame({"times": range(10), "groups": [1] * 4 + [2] * 6}) - out = df.groupby_rolling("times", period="5i", by=["groups"]).agg( + out = df.group_by_rolling("times", period="5i", by=["groups"]).agg( pl.col("times").alias("agg_list") ) @@ -1219,7 +1219,7 @@ def test_rolling_groupby_by_argument() -> None: assert_frame_equal(out, expected) -def test_groupby_rolling_mean_3020() -> None: +def test_group_by_rolling_mean_3020() -> None: df = pl.DataFrame( { "Date": [ @@ -1237,7 +1237,7 @@ def test_groupby_rolling_mean_3020() -> None: period: str | timedelta for period in ("1w", timedelta(days=7)): # type: ignore[assignment] - result = df.groupby_rolling(index_column="Date", period=period).agg( + result = df.group_by_rolling(index_column="Date", period=period).agg( pl.col("val").mean().alias("val_mean") ) expected = pl.DataFrame( @@ -1539,7 +1539,7 @@ def test_duration_aggregations() -> None: } ) df = df.with_columns((pl.col("end") - pl.col("start")).alias("duration")) - assert df.groupby("group", maintain_order=True).agg( + assert df.group_by("group", maintain_order=True).agg( [ pl.col("duration").mean().alias("mean"), pl.col("duration").sum().alias("sum"), @@ -1648,7 +1648,7 @@ def test_unique_counts_on_dates() -> None: } -def test_groupby_rolling_by_ordering() -> None: +def test_group_by_rolling_by_ordering() -> None: # we must check that the keys still match the time labels after the rolling window # with a `by` argument. 
df = pl.DataFrame( @@ -1667,7 +1667,7 @@ def test_groupby_rolling_by_ordering() -> None: } ).set_sorted("dt") - assert df.groupby_rolling( + assert df.group_by_rolling( index_column="dt", period="2m", closed="both", @@ -1694,7 +1694,7 @@ def test_groupby_rolling_by_ordering() -> None: } -def test_groupby_rolling_by_() -> None: +def test_group_by_rolling_by_() -> None: df = pl.DataFrame({"group": pl.arange(0, 3, eager=True)}).join( pl.DataFrame( { @@ -1707,13 +1707,13 @@ def test_groupby_rolling_by_() -> None: ) out = ( df.sort("datetime") - .groupby_rolling(index_column="datetime", by="group", period=timedelta(days=3)) + .group_by_rolling(index_column="datetime", by="group", period=timedelta(days=3)) .agg([pl.count().alias("count")]) ) expected = ( df.sort(["group", "datetime"]) - .groupby_rolling(index_column="datetime", by="group", period="3d") + .group_by_rolling(index_column="datetime", by="group", period="3d") .agg([pl.count().alias("count")]) ) assert_frame_equal(out.sort(["group", "datetime"]), expected) @@ -2571,7 +2571,7 @@ def test_datetime_cum_agg_schema() -> None: } -def test_rolling_groupby_empty_groups_by_take_6330() -> None: +def test_rolling_group_by_empty_groups_by_take_6330() -> None: df = ( pl.DataFrame({"Event": ["Rain", "Sun"]}) .join( @@ -2585,7 +2585,7 @@ def test_rolling_groupby_empty_groups_by_take_6330() -> None: .set_sorted("Date") ) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="Date", period="2i", offset="-2i", @@ -2777,12 +2777,12 @@ def test_pytime_conversion(tm: time) -> None: ) ], ) -def test_groupby_dynamic( +def test_group_by_dynamic( input_df: pl.DataFrame, expected_grouped_df: pl.DataFrame ) -> None: result = ( input_df.sort("dt") - .groupby_dynamic("dt", every="1q") + .group_by_dynamic("dt", every="1q") .agg(pl.col("dt").count().alias("num_points")) .sort("dt") ) diff --git a/py-polars/tests/unit/functions/test_as_datatype.py b/py-polars/tests/unit/functions/test_as_datatype.py index ebffc580d0de..9442e4a5ff13 100644 --- a/py-polars/tests/unit/functions/test_as_datatype.py +++ b/py-polars/tests/unit/functions/test_as_datatype.py @@ -156,7 +156,7 @@ def test_concat_list_in_agg_6397() -> None: df = pl.DataFrame({"group": [1, 2, 2, 3], "value": ["a", "b", "c", "d"]}) # single list - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ # this casts every element to a list pl.concat_list(pl.col("value")), @@ -167,7 +167,7 @@ def test_concat_list_in_agg_6397() -> None: } # nested list - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ pl.concat_list(pl.col("value").implode()).alias("result"), ] diff --git a/py-polars/tests/unit/functions/test_whenthen.py b/py-polars/tests/unit/functions/test_whenthen.py index b55192975f7e..b3ed26991615 100644 --- a/py-polars/tests/unit/functions/test_whenthen.py +++ b/py-polars/tests/unit/functions/test_whenthen.py @@ -187,7 +187,7 @@ def test_when_then_edge_cases_3994() -> None: # this tests if lazy correctly assigns the list schema to the column aggregation assert ( df.lazy() - .groupby(["id"]) + .group_by(["id"]) .agg(pl.col("type")) .with_columns( pl.when(pl.col("type").list.lengths() == 0) @@ -201,7 +201,7 @@ def test_when_then_edge_cases_3994() -> None: # this tests ternary with an empty argument assert ( df.filter(pl.col("id") == 42) - .groupby(["id"]) + .group_by(["id"]) .agg(pl.col("type")) .with_columns( pl.when(pl.col("type").list.lengths() == 0) diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 
1c7848ff3c6f..02e51bc93765 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -331,7 +331,7 @@ def test_streaming_categorical(tmp_path: Path) -> None: with pl.StringCache(): result = ( pl.scan_parquet(file_path) - .groupby("name") + .group_by("name") .agg(pl.col("amount").sum()) .collect() .sort("name") diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 1d225decb0f8..be34b546ac92 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -250,7 +250,7 @@ def test_recursive_logical_type() -> None: df = pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]}) df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("cat_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("cat_list")]) f = io.BytesIO() df_groups.write_parquet(f, use_pyarrow=True) f.seek(0) @@ -264,7 +264,7 @@ def test_nested_dictionary() -> None: df = ( pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]}) .with_columns(pl.col("str").cast(pl.Categorical)) - .groupby("group") + .group_by("group") .agg([pl.col("str").alias("cat_list")]) ) f = io.BytesIO() diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 3ba208b9a49c..7a9cbd7505b8 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ -211,7 +211,7 @@ def test_arr_contains_categorical() -> None: {"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]} ).lazy() df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("str_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("str_list")]) assert df_groups.filter(pl.col("str_list").list.contains("C")).collect().to_dict( False ) == {"group": [2], "str_list": [["A", "C"]]} @@ -364,7 +364,7 @@ def test_list_function_group_awareness() -> None: } ) - assert df.groupby("group").agg( + assert df.group_by("group").agg( [ pl.col("a").implode().list.get(0).alias("get"), pl.col("a").implode().list.take([0]).alias("take"), diff --git a/py-polars/tests/unit/namespaces/test_string.py b/py-polars/tests/unit/namespaces/test_string.py index 6595f9e4365a..fbdaa0010de9 100644 --- a/py-polars/tests/unit/namespaces/test_string.py +++ b/py-polars/tests/unit/namespaces/test_string.py @@ -316,7 +316,7 @@ def test_auto_explode() -> None: ) pl.col("val").str.concat(delimiter=",") grouped = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("val").str.concat(delimiter=",").alias("grouped")) .get_column("grouped") ) diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/test_aggregations.py index b0e731c3c536..2629dfc6fa20 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/test_aggregations.py @@ -35,7 +35,7 @@ def test_boolean_aggs() -> None: "var": [0.3333333432674408], } - assert df.groupby(pl.lit(1)).agg(aggs).to_dict(False) == { + assert df.group_by(pl.lit(1)).agg(aggs).to_dict(False) == { "literal": [1], "mean": [0.6666666666666666], "std": [0.5773502691896258], @@ -66,7 +66,7 @@ def test_duration_aggs() -> None: assert df.select("time_difference").mean().to_dict(False) == { "time_difference": [timedelta(days=31)] } - assert df.groupby(pl.lit(1)).agg(pl.mean("time_difference")).to_dict(False) == { 
+ assert df.group_by(pl.lit(1)).agg(pl.mean("time_difference")).to_dict(False) == { "literal": [1], "time_difference": [timedelta(days=31)], } @@ -80,8 +80,8 @@ def test_hmean_with_str_column() -> None: def test_list_aggregation_that_filters_all_data_6017() -> None: out = ( - pl.DataFrame({"col_to_groupby": [2], "flt": [1672740910.967138], "col3": [1]}) - .groupby("col_to_groupby") + pl.DataFrame({"col_to_group_by": [2], "flt": [1672740910.967138], "col3": [1]}) + .group_by("col_to_group_by") .agg( (pl.col("flt").filter(pl.col("col3") == 0).diff() * 1000) .diff() @@ -89,8 +89,8 @@ def test_list_aggregation_that_filters_all_data_6017() -> None: ) ) - assert out.schema == {"col_to_groupby": pl.Int64, "calc": pl.List(pl.Float64)} - assert out.to_dict(False) == {"col_to_groupby": [2], "calc": [[]]} + assert out.schema == {"col_to_group_by": pl.Int64, "calc": pl.List(pl.Float64)} + assert out.to_dict(False) == {"col_to_group_by": [2], "calc": [[]]} def test_median() -> None: @@ -167,7 +167,7 @@ def test_literal_group_agg_chunked_7968() -> None: ser = pl.concat([pl.Series([3]), pl.Series([4, 5])], rechunk=False) assert_frame_equal( - df.groupby("A").agg(pl.col("B").search_sorted(ser)), + df.group_by("A").agg(pl.col("B").search_sorted(ser)), pl.DataFrame( [ pl.Series("A", [1], dtype=pl.Int64), @@ -191,7 +191,7 @@ def test_duration_function_literal() -> None: ) # this checks if the `pl.duration` is flagged as AggState::Literal - assert df.groupby("A", maintain_order=True).agg( + assert df.group_by("A", maintain_order=True).agg( [((pl.col("T").max() + pl.duration(seconds=1)) - pl.col("T"))] ).to_dict(False) == { "A": ["x", "y"], @@ -214,7 +214,7 @@ def test_string_par_materialize_8207() -> None: } ) - assert df.groupby(["a"]).agg(pl.min("b")).sort("a").collect().to_dict(False) == { + assert df.group_by(["a"]).agg(pl.min("b")).sort("a").collect().to_dict(False) == { "a": ["a", "b", "c", "d", "e"], "b": ["P", "L", "T", "R", "a long string"], } @@ -230,7 +230,7 @@ def test_online_variance() -> None: ) assert_frame_equal( - df.groupby("id") + df.group_by("id") .agg(pl.all().exclude("id").std()) .select(["no_nulls", "nulls"]), df.select(pl.all().exclude("id").std()), @@ -245,10 +245,10 @@ def test_err_on_implode_and_agg() -> None: pl.InvalidOperationError, match=r"'implode' followed by an aggregation is not allowed", ): - df.groupby("type").agg(pl.col("type").implode().first().alias("foo")) + df.group_by("type").agg(pl.col("type").implode().first().alias("foo")) - # implode + function should be allowed in groupby - assert df.groupby("type", maintain_order=True).agg( + # implode + function should be allowed in group_by + assert df.group_by("type", maintain_order=True).agg( pl.col("type").implode().list.head().alias("foo") ).to_dict(False) == { "type": ["water", "fire", "earth"], @@ -265,7 +265,7 @@ def test_err_on_implode_and_agg() -> None: def test_mapped_literal_to_literal_9217() -> None: df = pl.DataFrame({"unique_id": ["a", "b"]}) - assert df.groupby(True).agg( + assert df.group_by(True).agg( pl.struct(pl.lit("unique_id").alias("unique_id")) ).to_dict(False) == {"literal": [True], "unique_id": [{"unique_id": "unique_id"}]} @@ -279,4 +279,4 @@ def test_sum_empty_and_null_set() -> None: df = pl.DataFrame({"a": [None, None, None], "b": [1, 1, 1]}) assert df.select(pl.sum("a")).item() == 0.0 - assert df.groupby("b").agg(pl.sum("a"))["a"].item() == 0.0 + assert df.group_by("b").agg(pl.sum("a"))["a"].item() == 0.0 diff --git a/py-polars/tests/unit/operations/test_apply.py 
b/py-polars/tests/unit/operations/test_apply.py index af6cb5946633..db1dc686155c 100644 --- a/py-polars/tests/unit/operations/test_apply.py +++ b/py-polars/tests/unit/operations/test_apply.py @@ -23,7 +23,7 @@ def test_apply_none() -> None: ) out = ( - df.groupby("g", maintain_order=True).agg( + df.group_by("g", maintain_order=True).agg( pl.apply( exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=lambda x: x[0] * x[1] + x[2].sum(), @@ -44,7 +44,7 @@ def func(s: Sequence[pl.Series]) -> pl.Series | None: return s[0] out = ( - df.groupby("g", maintain_order=True).agg( + df.group_by("g", maintain_order=True).agg( pl.apply( exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=func ).alias("multiple") @@ -72,7 +72,7 @@ class Foo: def __init__(self, payload: Any): self.payload = payload - out = df.groupby("groups").agg( + out = df.group_by("groups").agg( [ pl.apply( [pl.col("dates"), pl.col("names")], lambda s: Foo(dict(zip(s[0], s[1]))) @@ -98,7 +98,7 @@ def test_apply_arithmetic_consistency() -> None: with pytest.warns( PolarsInefficientApplyWarning, match="In this case, you can replace" ): - assert df.groupby("A").agg(pl.col("B").apply(lambda x: x + 1.0))[ + assert df.group_by("A").agg(pl.col("B").apply(lambda x: x + 1.0))[ "B" ].to_list() == [[3.0, 4.0]] @@ -135,7 +135,7 @@ def test_apply_numpy_out_3057() -> None: "y": [0.0, 1, 1.3, 2, 3, 4], } ) - result = df.groupby("id", maintain_order=True).agg( + result = df.group_by("id", maintain_order=True).agg( pl.apply(["y", "t"], lambda lst: np.trapz(y=lst[0], x=lst[1])).alias("result") ) expected = pl.DataFrame({"id": [0, 1], "result": [1.955, 13.0]}) @@ -220,7 +220,7 @@ def test_apply_type_propagation() -> None: "b": [{"c": 1, "d": 2}, {"c": 2, "d": 3}, {"c": None, "d": None}], } ) - .groupby("a", maintain_order=True) + .group_by("a", maintain_order=True) .agg( [ pl.when(pl.col("b").null_count() == 0) @@ -322,7 +322,7 @@ def test_apply_pass_name() -> None: def applyer(s: pl.Series) -> pl.Series: return pl.Series([mapper[s.name]]) - assert df.groupby("bar", maintain_order=True).agg( + assert df.group_by("bar", maintain_order=True).agg( [ pl.col("foo").apply(applyer, pass_name=True), ] @@ -395,7 +395,7 @@ def test_apply_10237() -> None: def test_apply_on_empty_col_10639() -> None: df = pl.DataFrame({"A": [], "B": []}) - res = df.groupby("B").agg( + res = df.group_by("B").agg( pl.col("A") .apply(lambda x: x, return_dtype=pl.Int32, strategy="threading") .alias("Foo") @@ -404,7 +404,7 @@ def test_apply_on_empty_col_10639() -> None: "B": [], "Foo": [], } - res = df.groupby("B").agg( + res = df.group_by("B").agg( pl.col("A") .apply(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") .alias("Foo") diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 6cd6a85e0e0a..4c39eeeefc24 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -25,17 +25,17 @@ def test_explode_multiple() -> None: assert_frame_equal(df.explode("a", "b"), expected) -def test_groupby_flatten_list() -> None: +def test_group_by_flatten_list() -> None: df = pl.DataFrame({"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]}) - result = df.groupby("group", maintain_order=True).agg(pl.col("values").flatten()) + result = df.group_by("group", maintain_order=True).agg(pl.col("values").flatten()) expected = pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]}) assert_frame_equal(result, expected) -def test_groupby_flatten_string() -> 
None: +def test_group_by_flatten_string() -> None: df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]}) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("values").str.explode() ) @@ -217,7 +217,7 @@ def test_explode_in_agg_context() -> None: assert ( df.with_row_count("row_nr") .explode("idxs") - .groupby("row_nr") + .group_by("row_nr") .agg(pl.col("array").flatten()) ).to_dict(False) == { "row_nr": [0, 1, 2], @@ -231,7 +231,7 @@ def test_explode_inner_lists_3985() -> None: ).lazy() assert ( - df.groupby("id") + df.group_by("id") .agg(pl.col("categories")) .with_columns(pl.col("categories").list.eval(pl.element().list.explode())) ).collect().to_dict(False) == {"id": [1], "categories": [["a", "b", "a", "c"]]} @@ -291,7 +291,7 @@ def test_logical_explode() -> None: {"cats": ["Value1", "Value2", "Value1"]}, schema_overrides={"cats": pl.Categorical}, ) - .groupby(1) + .group_by(1) .agg(pl.struct("cats")) .explode("cats") .unnest("cats") diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index 551138742e11..72ca0a0dfe93 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -32,19 +32,19 @@ def test_melt_values_predicate_pushdown() -> None: def test_filter_is_in_4572() -> None: df = pl.DataFrame({"id": [1, 2, 1, 2], "k": ["a"] * 2 + ["b"] * 2}) expected = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("k").filter(pl.col("k") == "a").implode()) .sort("id") ) result = ( - df.groupby("id") + df.group_by("id") .agg(pl.col("k").filter(pl.col("k").is_in(["a"])).implode()) .sort("id") ) assert_frame_equal(result, expected) result = ( df.sort("id") - .groupby("id") + .group_by("id") .agg(pl.col("k").filter(pl.col("k").is_in(["a"])).implode()) ) assert_frame_equal(result, expected) @@ -61,7 +61,7 @@ def test_filter_aggregation_any() -> None: ) result = ( - df.groupby("group") + df.group_by("group") .agg( pl.any_horizontal("pred_a", "pred_b"), pl.col("id") diff --git a/py-polars/tests/unit/operations/test_groupby.py b/py-polars/tests/unit/operations/test_group_by.py similarity index 76% rename from py-polars/tests/unit/operations/test_groupby.py rename to py-polars/tests/unit/operations/test_group_by.py index 0e5addbe1070..2be46e4dbde3 100644 --- a/py-polars/tests/unit/operations/test_groupby.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -17,7 +17,7 @@ from polars.testing import assert_frame_equal, assert_series_equal -def test_groupby() -> None: +def test_group_by() -> None: df = pl.DataFrame( { "a": ["a", "b", "a", "b", "b", "c"], @@ -26,16 +26,16 @@ def test_groupby() -> None: } ) - assert df.groupby("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1 + assert df.group_by("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1 - # Use lazy API in eager groupby - assert sorted(df.groupby("a").agg([pl.sum("b")]).rows()) == [ + # Use lazy API in eager group_by + assert sorted(df.group_by("a").agg([pl.sum("b")]).rows()) == [ ("a", 4), ("b", 11), ("c", 6), ] # test if it accepts a single expression - assert df.groupby("a", maintain_order=True).agg(pl.sum("b")).rows() == [ + assert df.group_by("a", maintain_order=True).agg(pl.sum("b")).rows() == [ ("a", 4), ("b", 11), ("c", 6), @@ -50,10 +50,10 @@ def test_groupby() -> None: ) # check if this query runs and thus column names propagate - df.groupby("b").agg(pl.col("c").forward_fill()).explode("c") + 
df.group_by("b").agg(pl.col("c").forward_fill()).explode("c") # get a specific column - result = df.groupby("b", maintain_order=True).agg(pl.count("a")) + result = df.group_by("b", maintain_order=True).agg(pl.count("a")) assert result.rows() == [("a", 2), ("b", 3)] assert result.columns == ["b", "a"] @@ -83,28 +83,28 @@ def df() -> pl.DataFrame: ("n_unique", [("a", 2, 2), ("b", 3, 2)]), ], ) -def test_groupby_shorthands( +def test_group_by_shorthands( df: pl.DataFrame, method: str, expected: list[tuple[Any]] ) -> None: - gb = df.groupby("b", maintain_order=True) + gb = df.group_by("b", maintain_order=True) result = getattr(gb, method)() assert result.rows() == expected - gb_lazy = df.lazy().groupby("b", maintain_order=True) + gb_lazy = df.lazy().group_by("b", maintain_order=True) result = getattr(gb_lazy, method)().collect() assert result.rows() == expected -def test_groupby_shorthand_quantile(df: pl.DataFrame) -> None: - result = df.groupby("b", maintain_order=True).quantile(0.5) +def test_group_by_shorthand_quantile(df: pl.DataFrame) -> None: + result = df.group_by("b", maintain_order=True).quantile(0.5) expected = [("a", 2.0, 1.0), ("b", 4.0, 1.0)] assert result.rows() == expected - result = df.lazy().groupby("b", maintain_order=True).quantile(0.5).collect() + result = df.lazy().group_by("b", maintain_order=True).quantile(0.5).collect() assert result.rows() == expected -def test_groupby_args() -> None: +def test_group_by_args() -> None: df = pl.DataFrame( { "a": ["a", "b", "a", "b", "b", "c"], @@ -114,30 +114,30 @@ def test_groupby_args() -> None: ) # Single column name - assert df.groupby("a").agg("b").columns == ["a", "b"] + assert df.group_by("a").agg("b").columns == ["a", "b"] # Column names as list expected = ["a", "b", "c"] - assert df.groupby(["a", "b"]).agg("c").columns == expected + assert df.group_by(["a", "b"]).agg("c").columns == expected # Column names as positional arguments - assert df.groupby("a", "b").agg("c").columns == expected + assert df.group_by("a", "b").agg("c").columns == expected # With keyword argument - assert df.groupby("a", "b", maintain_order=True).agg("c").columns == expected + assert df.group_by("a", "b", maintain_order=True).agg("c").columns == expected # Multiple aggregations as list - assert df.groupby("a").agg(["b", "c"]).columns == expected + assert df.group_by("a").agg(["b", "c"]).columns == expected # Multiple aggregations as positional arguments - assert df.groupby("a").agg("b", "c").columns == expected + assert df.group_by("a").agg("b", "c").columns == expected # Multiple aggregations as keyword arguments - assert df.groupby("a").agg(q="b", r="c").columns == ["a", "q", "r"] + assert df.group_by("a").agg(q="b", r="c").columns == ["a", "q", "r"] -def test_groupby_empty() -> None: +def test_group_by_empty() -> None: df = pl.DataFrame({"a": [1, 1, 2]}) - result = df.groupby("a").agg() + result = df.group_by("a").agg() expected = pl.DataFrame({"a": [1, 2]}) assert_frame_equal(result, expected, check_row_order=False) -def test_groupby_iteration() -> None: +def test_group_by_iteration() -> None: df = pl.DataFrame( { "foo": ["a", "b", "a", "b", "b", "c"], @@ -151,21 +151,21 @@ def test_groupby_iteration() -> None: [("b", 2, 5), ("b", 4, 3), ("b", 5, 2)], [("c", 6, 1)], ] - for i, (group, data) in enumerate(df.groupby("foo", maintain_order=True)): + for i, (group, data) in enumerate(df.group_by("foo", maintain_order=True)): assert group == expected_names[i] assert data.rows() == expected_rows[i] # Grouped by ALL columns should give groups of a single row 
- result = list(df.groupby(["foo", "bar", "baz"])) + result = list(df.group_by(["foo", "bar", "baz"])) assert len(result) == 6 # Iterating over groups should also work when grouping by expressions - result2 = list(df.groupby(["foo", pl.col("bar") * pl.col("baz")])) + result2 = list(df.group_by(["foo", pl.col("bar") * pl.col("baz")])) assert len(result2) == 5 - # Single column, alias in groupby + # Single column, alias in group_by df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6]}) - gb = df.groupby((pl.col("foo") // 2).alias("bar"), maintain_order=True) + gb = df.group_by((pl.col("foo") // 2).alias("bar"), maintain_order=True) result3 = [(group, df.rows()) for group, df in gb] expected3 = [(0, [(1,)]), (1, [(2,), (3,)]), (2, [(4,), (5,)]), (3, [(6,)])] assert result3 == expected3 @@ -184,27 +184,27 @@ def good_agg_parameters() -> list[pl.Expr | list[pl.Expr]]: @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_agg_input_types(lazy: bool) -> None: +def test_group_by_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, 4]}) df_or_lazy: pl.DataFrame | pl.LazyFrame = df.lazy() if lazy else df for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby("a").agg(bad_param) + result = df_or_lazy.group_by("a").agg(bad_param) if lazy: result.collect() # type: ignore[union-attr] expected = pl.DataFrame({"a": [1, 2], "b": [3, 7]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby("a", maintain_order=True).agg(good_param) + result = df_or_lazy.group_by("a", maintain_order=True).agg(good_param) if lazy: result = result.collect() # type: ignore[union-attr] assert_frame_equal(result, expected) @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: +def test_group_by_dynamic_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted( "index_column" ) @@ -212,7 +212,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby_dynamic( + result = df_or_lazy.group_by_dynamic( index_column="index_column", every="2i", closed="right" ).agg(bad_param) if lazy: @@ -221,7 +221,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: expected = pl.DataFrame({"index_column": [-2, 0, 2], "b": [1, 4, 2]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby_dynamic( + result = df_or_lazy.group_by_dynamic( index_column="index_column", every="2i", closed="right" ).agg(good_param) if lazy: @@ -229,7 +229,7 @@ def test_groupby_dynamic_agg_input_types(lazy: bool) -> None: assert_frame_equal(result, expected) -def test_groupby_sorted_empty_dataframe_3680() -> None: +def test_group_by_sorted_empty_dataframe_3680() -> None: df = ( pl.DataFrame( [ @@ -239,7 +239,7 @@ def test_groupby_sorted_empty_dataframe_3680() -> None: ) .lazy() .sort("key") - .groupby("key") + .group_by("key") .tail(1) .collect() ) @@ -248,7 +248,7 @@ def test_groupby_sorted_empty_dataframe_3680() -> None: assert df.schema == {"key": pl.Categorical, "val": pl.Float64} -def test_groupby_custom_agg_empty_list() -> None: +def test_group_by_custom_agg_empty_list() -> None: assert ( pl.DataFrame( [ @@ -256,7 +256,7 @@ def test_groupby_custom_agg_empty_list() -> None: pl.Series("val", [], dtype=pl.Float64), ] ) - .groupby("key") + .group_by("key") .agg( [ 
pl.col("val").mean().alias("mean"), @@ -268,7 +268,7 @@ def test_groupby_custom_agg_empty_list() -> None: ).dtypes == [pl.Categorical, pl.Float64, pl.Float64, pl.Float64, pl.Float64] -def test_apply_after_take_in_groupby_3869() -> None: +def test_apply_after_take_in_group_by_3869() -> None: assert ( pl.DataFrame( { @@ -277,20 +277,20 @@ def test_apply_after_take_in_groupby_3869() -> None: "v": [3, 1, 2, 5, 6, 4], } ) - .groupby("k", maintain_order=True) + .group_by("k", maintain_order=True) .agg( pl.col("v").take(pl.col("t").arg_max()).sqrt() ) # <- fails for sqrt, exp, log, pow, etc. ).to_dict(False) == {"k": ["a", "b"], "v": [1.4142135623730951, 2.0]} -def test_groupby_signed_transmutes() -> None: +def test_group_by_signed_transmutes() -> None: df = pl.DataFrame({"foo": [-1, -2, -3, -4, -5], "bar": [500, 600, 700, 800, 900]}) for dt in [pl.Int8, pl.Int16, pl.Int32, pl.Int64]: df = ( df.with_columns([pl.col("foo").cast(dt), pl.col("bar")]) - .groupby("foo", maintain_order=True) + .group_by("foo", maintain_order=True) .agg(pl.col("bar").median()) ) @@ -340,10 +340,10 @@ def test_unique_order() -> None: } -def test_groupby_dynamic_flat_agg_4814() -> None: +def test_group_by_dynamic_flat_agg_4814() -> None: df = pl.DataFrame({"a": [1, 2, 2], "b": [1, 8, 12]}).set_sorted("a") - assert df.groupby_dynamic("a", every="1i", period="2i").agg( + assert df.group_by_dynamic("a", every="1i", period="2i").agg( [ (pl.col("b").sum() / pl.col("a").sum()).alias("sum_ratio_1"), (pl.col("b").last() / pl.col("a").last()).alias("last_ratio_1"), @@ -365,7 +365,7 @@ def test_groupby_dynamic_flat_agg_4814() -> None: ], ) @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( +def test_group_by_dynamic_overlapping_groups_flat_apply_multiple_5038( every: str | timedelta, period: str | timedelta, time_zone: str | None ) -> None: res = ( @@ -382,7 +382,7 @@ def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( .with_columns(pl.col("a").dt.replace_time_zone(time_zone)) .lazy() .set_sorted("a") - .groupby_dynamic("a", every=every, period=period) + .group_by_dynamic("a", every=every, period=period) .agg([pl.col("b").var().sqrt().alias("corr")]) ) .collect() @@ -394,26 +394,26 @@ def test_groupby_dynamic_overlapping_groups_flat_apply_multiple_5038( assert res["a"] == [None] -def test_take_in_groupby() -> None: +def test_take_in_group_by() -> None: df = pl.DataFrame({"group": [1, 1, 1, 2, 2, 2], "values": [10, 200, 3, 40, 500, 6]}) - assert df.groupby("group").agg( + assert df.group_by("group").agg( pl.col("values").take(1) - pl.col("values").take(2) ).sort("group").to_dict(False) == {"group": [1, 2], "values": [197, 494]} -def test_groupby_wildcard() -> None: +def test_group_by_wildcard() -> None: df = pl.DataFrame( { "a": [1, 2], "b": [1, 2], } ) - assert df.groupby([pl.col("*")], maintain_order=True).agg( + assert df.group_by([pl.col("*")], maintain_order=True).agg( [pl.col("a").first().suffix("_agg")] ).to_dict(False) == {"a": [1, 2], "b": [1, 2], "a_agg": [1, 2]} -def test_groupby_all_masked_out() -> None: +def test_group_by_all_masked_out() -> None: df = pl.DataFrame( { "val": pl.Series( @@ -427,24 +427,24 @@ def test_groupby_all_masked_out() -> None: assert_frame_equal(parts[0], df) -def test_groupby_null_propagation_6185() -> None: +def test_group_by_null_propagation_6185() -> None: df_1 = pl.DataFrame({"A": [0, 0], "B": [1, 2]}) expr = pl.col("A").filter(pl.col("A") > 0) expected = {"B": [1, 2], "A": [None, None]} 
assert ( - df_1.groupby("B").agg((expr - expr.mean()).mean()).sort("B").to_dict(False) + df_1.group_by("B").agg((expr - expr.mean()).mean()).sort("B").to_dict(False) == expected ) -def test_groupby_when_then_with_binary_and_agg_in_pred_6202() -> None: +def test_group_by_when_then_with_binary_and_agg_in_pred_6202() -> None: df = pl.DataFrame( {"code": ["a", "b", "b", "b", "a"], "xx": [1.0, -1.5, -0.2, -3.9, 3.0]} ) assert ( - df.groupby("code", maintain_order=True).agg( + df.group_by("code", maintain_order=True).agg( [pl.when(pl.col("xx") > pl.min("xx")).then(True).otherwise(False)] ) ).to_dict(False) == { @@ -455,7 +455,7 @@ def test_groupby_when_then_with_binary_and_agg_in_pred_6202() -> None: @pytest.mark.parametrize("every", ["1h", timedelta(hours=1)]) @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: time_zone = tzinfo.key if tzinfo is not None else None df = pl.DataFrame( { @@ -473,7 +473,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - # Without 'by' argument result1 = [ (name, data.shape) - for name, data in df.groupby_dynamic("datetime", every=every, closed="left") + for name, data in df.group_by_dynamic("datetime", every=every, closed="left") ] expected1 = [ (datetime(2020, 1, 1, 10, tzinfo=tzinfo), (2, 3)), @@ -484,7 +484,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - # With 'by' argument result2 = [ (name, data.shape) - for name, data in df.groupby_dynamic( + for name, data in df.group_by_dynamic( "datetime", every=every, closed="left", by="a" ) ] @@ -498,7 +498,7 @@ def test_groupby_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) - @pytest.mark.parametrize("every", ["1h", timedelta(hours=1)]) @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) -> None: ldf = pl.LazyFrame( { "time": pl.date_range( @@ -511,7 +511,7 @@ def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) - } ) df = ( - ldf.groupby_dynamic("time", every=every, closed="right") + ldf.group_by_dynamic("time", every=every, closed="right") .agg( [ pl.col("time").min().alias("time_min"), @@ -541,20 +541,20 @@ def test_groupby_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) - @pytest.mark.slow() @pytest.mark.parametrize("dtype", [pl.Int32, pl.UInt32]) -def test_overflow_mean_partitioned_groupby_5194(dtype: pl.PolarsDataType) -> None: +def test_overflow_mean_partitioned_group_by_5194(dtype: pl.PolarsDataType) -> None: df = pl.DataFrame( [ pl.Series("data", [10_00_00_00] * 100_000, dtype=dtype), pl.Series("group", [1, 2] * 50_000, dtype=dtype), ] ) - assert df.groupby("group").agg(pl.col("data").mean()).sort(by="group").to_dict( + assert df.group_by("group").agg(pl.col("data").mean()).sort(by="group").to_dict( False ) == {"group": [1, 2], "data": [10000000.0, 10000000.0]} @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu"]) -def test_groupby_dynamic_elementwise_following_mean_agg_6904( +def test_group_by_dynamic_elementwise_following_mean_agg_6904( time_zone: str | None, ) -> None: df = ( @@ -569,7 +569,7 @@ def test_groupby_dynamic_elementwise_following_mean_agg_6904( 
.with_columns(pl.col("a").dt.replace_time_zone(time_zone)) .lazy() .set_sorted("a") - .groupby_dynamic("a", every="10s", period="100s") + .group_by_dynamic("a", every="10s", period="100s") .agg([pl.col("b").mean().sin().alias("c")]) .collect() ) @@ -587,7 +587,7 @@ def test_groupby_dynamic_elementwise_following_mean_agg_6904( ) -def test_groupby_multiple_column_reference() -> None: +def test_group_by_multiple_column_reference() -> None: # Issue #7181 df = pl.DataFrame( { @@ -595,7 +595,7 @@ def test_groupby_multiple_column_reference() -> None: "val": [1, 20, 100, 2000, 10000, 200000], } ) - res = df.groupby("gr").agg( + res = df.group_by("gr").agg( pl.col("val") + pl.col("val").shift().fill_null(0), ) @@ -618,14 +618,14 @@ def test_groupby_multiple_column_reference() -> None: ("quantile", [0.5], [1.0, None], pl.Float64), ], ) -def test_groupby_empty_groups( +def test_group_by_empty_groups( aggregation: str, args: list[object], expected_values: list[object], expected_dtype: pl.DataType, ) -> None: df = pl.DataFrame({"a": [1, 2], "b": [1, 2]}) - result = df.groupby("b", maintain_order=True).agg( + result = df.group_by("b", maintain_order=True).agg( getattr(pl.col("a").filter(pl.col("b") != 2), aggregation)(*args) ) expected = pl.DataFrame({"b": [1, 2], "a": expected_values}).with_columns( @@ -743,7 +743,7 @@ def test_perfect_hash_table_null_values_8663() -> None: dtype=pl.Categorical, ) - assert s.to_frame("a").groupby("a").agg(pl.col("a").alias("agg")).to_dict( + assert s.to_frame("a").group_by("a").agg(pl.col("a").alias("agg")).to_dict( False ) == { "a": [ @@ -833,9 +833,69 @@ def test_perfect_hash_table_null_values_8663() -> None: } -def test_groupby_partitioned_ending_cast(monkeypatch: Any) -> None: +def test_group_by_partitioned_ending_cast(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_FORCE_PARTITION", "1") df = pl.DataFrame({"a": [1] * 5, "b": [1] * 5}) - out = df.groupby(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) + out = df.group_by(["a", "b"]).agg(pl.count().cast(pl.Int64).alias("num")) expected = pl.DataFrame({"a": [1], "b": [1], "num": [5]}) assert_frame_equal(out, expected) + + +def test_groupby_deprecated() -> None: + df = pl.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + + with pytest.deprecated_call(): + result = df.groupby("a").agg(pl.sum("b")) + with pytest.deprecated_call(): + result_lazy = df.lazy().groupby("a").agg(pl.sum("b")).collect() + + expected = df.group_by("a").agg(pl.sum("b")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) + + +def test_groupby_rolling_deprecated() -> None: + df = pl.DataFrame( + { + "date": pl.date_range( + datetime(2020, 1, 1), datetime(2020, 1, 5), eager=True + ), + "value": [1, 2, 3, 4, 5], + } + ) + + with pytest.deprecated_call(): + result = df.groupby_rolling("date", period="2d").agg(pl.sum("value")) + with pytest.deprecated_call(): + result_lazy = ( + df.lazy() + .groupby_rolling("date", period="2d") + .agg(pl.sum("value")) + .collect() + ) + + expected = df.group_by_rolling("date", period="2d").agg(pl.sum("value")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) + + +def test_groupby_dynamic_deprecated() -> None: + df = pl.DataFrame( + { + "date": pl.date_range( + datetime(2020, 1, 1), datetime(2020, 1, 5), eager=True + ), + "value": [1, 2, 3, 4, 5], + } + ) + + with pytest.deprecated_call(): + result = df.groupby_dynamic("date", 
every="2d").agg(pl.sum("value")) + with pytest.deprecated_call(): + result_lazy = ( + df.lazy().groupby_dynamic("date", every="2d").agg(pl.sum("value")).collect() + ) + + expected = df.group_by_dynamic("date", every="2d").agg(pl.sum("value")) + assert_frame_equal(result, expected, check_row_order=False) + assert_frame_equal(result_lazy, expected, check_row_order=False) diff --git a/py-polars/tests/unit/operations/test_groupby_rolling.py b/py-polars/tests/unit/operations/test_group_by_rolling.py similarity index 84% rename from py-polars/tests/unit/operations/test_groupby_rolling.py rename to py-polars/tests/unit/operations/test_group_by_rolling.py index 36be0b12bf1e..37b44a3f6252 100644 --- a/py-polars/tests/unit/operations/test_groupby_rolling.py +++ b/py-polars/tests/unit/operations/test_group_by_rolling.py @@ -24,7 +24,7 @@ def good_agg_parameters() -> list[pl.Expr | list[pl.Expr]]: ] -def test_groupby_rolling_apply() -> None: +def test_group_by_rolling_apply() -> None: df = pl.DataFrame( { "a": [1, 2, 3, 4, 5], @@ -45,11 +45,11 @@ def apply(df: pl.DataFrame) -> pl.DataFrame: ] ) - out = df.groupby_rolling("a", period="2i").apply(apply, schema=df.schema) + out = df.group_by_rolling("a", period="2i").apply(apply, schema=df.schema) assert_frame_equal(out, expected) -def test_rolling_groupby_overlapping_groups() -> None: +def test_rolling_group_by_overlapping_groups() -> None: # this first aggregates overlapping groups so they cannot be naively flattened df = pl.DataFrame({"a": [41, 60, 37, 51, 52, 39, 40]}) @@ -57,7 +57,7 @@ def test_rolling_groupby_overlapping_groups() -> None: ( df.with_row_count() .with_columns(pl.col("row_nr").cast(pl.Int32)) - .groupby_rolling( + .group_by_rolling( index_column="row_nr", period="5i", ) @@ -73,7 +73,7 @@ def test_rolling_groupby_overlapping_groups() -> None: @pytest.mark.parametrize("lazy", [True, False]) -def test_groupby_rolling_agg_input_types(lazy: bool) -> None: +def test_group_by_rolling_agg_input_types(lazy: bool) -> None: df = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted( "index_column" ) @@ -81,7 +81,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: for bad_param in bad_agg_parameters(): with pytest.raises(TypeError): # noqa: PT012 - result = df_or_lazy.groupby_rolling( + result = df_or_lazy.group_by_rolling( index_column="index_column", period="2i" ).agg(bad_param) if lazy: @@ -90,7 +90,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: expected = pl.DataFrame({"index_column": [0, 1, 2, 3], "b": [1, 4, 4, 3]}) for good_param in good_agg_parameters(): - result = df_or_lazy.groupby_rolling( + result = df_or_lazy.group_by_rolling( index_column="index_column", period="2i" ).agg(good_param) if lazy: @@ -98,7 +98,7 @@ def test_groupby_rolling_agg_input_types(lazy: bool) -> None: assert_frame_equal(result, expected) -def test_groupby_rolling_negative_offset_3914() -> None: +def test_group_by_rolling_negative_offset_3914() -> None: df = pl.DataFrame( { "datetime": pl.date_range( @@ -106,7 +106,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: ), } ) - assert df.groupby_rolling(index_column="datetime", period="2d", offset="-4d").agg( + assert df.group_by_rolling(index_column="datetime", period="2d", offset="-4d").agg( pl.count().alias("count") )["count"].to_list() == [0, 0, 1, 2, 2] @@ -116,7 +116,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: } ) - assert df.groupby_rolling(index_column="ints", period="2i", offset="-5i").agg( + assert 
df.group_by_rolling(index_column="ints", period="2i", offset="-5i").agg( [pl.col("ints").alias("matches")] )["matches"].to_list() == [ [], @@ -143,7 +143,7 @@ def test_groupby_rolling_negative_offset_3914() -> None: @pytest.mark.parametrize("time_zone", [None, "US/Central"]) -def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> None: +def test_group_by_rolling_negative_offset_crossing_dst(time_zone: str | None) -> None: df = pl.DataFrame( { "datetime": pl.date_range( @@ -156,9 +156,9 @@ def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> "value": [1, 4, 9, 155], } ) - result = df.groupby_rolling(index_column="datetime", period="2d", offset="-1d").agg( - pl.col("value") - ) + result = df.group_by_rolling( + index_column="datetime", period="2d", offset="-1d" + ).agg(pl.col("value")) expected = pl.DataFrame( { "datetime": pl.date_range( @@ -188,7 +188,7 @@ def test_groupby_rolling_negative_offset_crossing_dst(time_zone: str | None) -> ("1d", "none", [[9], [155], [], []]), ], ) -def test_groupby_rolling_non_negative_offset_9077( +def test_group_by_rolling_non_negative_offset_9077( time_zone: str | None, offset: str, closed: ClosedInterval, @@ -206,7 +206,7 @@ def test_groupby_rolling_non_negative_offset_9077( "value": [1, 4, 9, 155], } ) - result = df.groupby_rolling( + result = df.group_by_rolling( index_column="datetime", period="2d", offset=offset, closed=closed ).agg(pl.col("value")) expected = pl.DataFrame( @@ -224,7 +224,7 @@ def test_groupby_rolling_non_negative_offset_9077( assert_frame_equal(result, expected) -def test_groupby_rolling_dynamic_sortedness_check() -> None: +def test_group_by_rolling_dynamic_sortedness_check() -> None: # when the by argument is passed, the sortedness flag # will be unset as the take shuffles data, so we must explicitly # check the sortedness @@ -236,12 +236,12 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: ) with pytest.raises(pl.ComputeError, match=r"input data is not sorted"): - df.groupby_dynamic("idx", every="2i", by="group").agg( + df.group_by_dynamic("idx", every="2i", by="group").agg( pl.col("idx").alias("idx1") ) with pytest.raises(pl.ComputeError, match=r"input data is not sorted"): - df.groupby_rolling("idx", period="2i", by="group").agg( + df.group_by_rolling("idx", period="2i", by="group").agg( pl.col("idx").alias("idx1") ) @@ -250,17 +250,17 @@ def test_groupby_rolling_dynamic_sortedness_check() -> None: pl.InvalidOperationError, match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): - df.groupby_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) + df.group_by_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1")) # no `by` argument with pytest.raises( pl.InvalidOperationError, match=r"argument in operation 'group_by_rolling' is not explicitly sorted", ): - df.groupby_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) + df.group_by_rolling("idx", period="2i").agg(pl.col("idx").alias("idx1")) -def test_groupby_rolling_empty_groups_9973() -> None: +def test_group_by_rolling_empty_groups_9973() -> None: dt1 = date(2001, 1, 1) dt2 = date(2001, 1, 2) @@ -287,7 +287,7 @@ def test_groupby_rolling_empty_groups_9973() -> None: } ) - out = data.groupby_rolling( + out = data.group_by_rolling( index_column="date", by="id", period="2d", diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index a1ffc113d9d5..24d28cc9c759 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ 
b/py-polars/tests/unit/operations/test_join.py @@ -65,7 +65,7 @@ def test_join_same_cat_src() -> None: data={"column": ["a", "a", "b"], "more": [1, 2, 3]}, schema=[("column", pl.Categorical), ("more", pl.Int32)], ) - df_agg = df.groupby("column").agg(pl.col("more").mean()) + df_agg = df.group_by("column").agg(pl.col("more").mean()) assert df.join(df_agg, on="column").to_dict(False) == { "column": ["a", "a", "b"], "more": [1, 2, 3], @@ -434,7 +434,7 @@ def test_semi_join_projection_pushdown_6455() -> None: } ).lazy() - latest = df.groupby("id").agg(pl.col("timestamp").max()) + latest = df.group_by("id").agg(pl.col("timestamp").max()) df = df.join(latest, on=["id", "timestamp"], how="semi") assert df.select(["id", "value"]).collect().to_dict(False) == { "id": [1, 2], diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 8021d360e92c..6ec901d661fb 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -54,7 +54,7 @@ def test_asof_join_projection_resolution_4606() -> None: a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy() b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy() joined_tbl = a.join_asof(b, on=pl.col("a").set_sorted(), by="b") - assert joined_tbl.groupby("a").agg( + assert joined_tbl.group_by("a").agg( [pl.col("c").sum().alias("c")] ).collect().columns == ["a", "c"] diff --git a/py-polars/tests/unit/operations/test_profile.py b/py-polars/tests/unit/operations/test_profile.py index ef7e8b1fd170..df70655b1c19 100644 --- a/py-polars/tests/unit/operations/test_profile.py +++ b/py-polars/tests/unit/operations/test_profile.py @@ -5,7 +5,7 @@ def test_profile_columns() -> None: ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}) # profile lazyframe operation/plan - lazy = ldf.groupby("a").agg(pl.implode("b")) + lazy = ldf.group_by("a").agg(pl.implode("b")) profiling_info = lazy.profile() # ┌──────────────┬───────┬─────┐ # │ node ┆ start ┆ end │ @@ -13,7 +13,7 @@ def test_profile_columns() -> None: # │ str ┆ u64 ┆ u64 │ # ╞══════════════╪═══════╪═════╡ # │ optimization ┆ 0 ┆ 69 │ - # │ groupby(a) ┆ 69 ┆ 342 │ + # │ group_by(a) ┆ 69 ┆ 342 │ # └──────────────┴───────┴─────┘ assert len(profiling_info) == 2 assert profiling_info[1].columns == ["node", "start", "end"] diff --git a/py-polars/tests/unit/operations/test_random.py b/py-polars/tests/unit/operations/test_random.py index 47022b7d66c8..a92dfbe69677 100644 --- a/py-polars/tests/unit/operations/test_random.py +++ b/py-polars/tests/unit/operations/test_random.py @@ -6,15 +6,15 @@ from polars.testing import assert_frame_equal, assert_series_equal -def test_shuffle_groupby_reseed() -> None: +def test_shuffle_group_by_reseed() -> None: def unique_shuffle_groups(n: int, seed: int | None) -> int: ls = [1, 2, 3] * n # 1, 2, 3, 1, 2, 3... groups = sorted(list(range(n)) * 3) # 0, 0, 0, 1, 1, 1, ... df = pl.DataFrame({"l": ls, "group": groups}) - shuffled = df.groupby("group", maintain_order=True).agg( + shuffled = df.group_by("group", maintain_order=True).agg( pl.col("l").shuffle(seed) ) - num_unique = shuffled.groupby("l").agg(pl.lit(0)).select(pl.count()) + num_unique = shuffled.group_by("l").agg(pl.lit(0)).select(pl.count()) return int(num_unique[0, 0]) assert unique_shuffle_groups(50, None) > 1 # Astronomically unlikely. 
diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 27bcdae19bf6..46ff6c6e5181 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -44,13 +44,13 @@ def example_df() -> pl.DataFrame: ["1d", "2d", "3d", timedelta(days=1), timedelta(days=2), timedelta(days=3)], ) @pytest.mark.parametrize("closed", ["left", "right", "none", "both"]) -def test_rolling_kernels_and_groupby_rolling( +def test_rolling_kernels_and_group_by_rolling( example_df: pl.DataFrame, period: str | timedelta, closed: ClosedInterval ) -> None: out1 = example_df.select( [ pl.col("dt"), - # this differs from groupby aggregation because the empty window is + # this differs from group_by aggregation because the empty window is # null here # where the sum aggregation of an empty set is 0 pl.col("values") @@ -64,7 +64,7 @@ def test_rolling_kernels_and_groupby_rolling( ) out2 = ( example_df.set_sorted("dt") - .groupby_rolling("dt", period=period, closed=closed) + .group_by_rolling("dt", period=period, closed=closed) .agg( [ pl.col("values").sum().alias("sum"), @@ -153,7 +153,7 @@ def test_rolling_negative_offset( "value": [1, 2, 3, 4], } ) - result = df.groupby_rolling("ts", period="2d", offset=offset, closed=closed).agg( + result = df.group_by_rolling("ts", period="2d", offset=offset, closed=closed).agg( pl.col("value") ) expected = pl.DataFrame( @@ -269,7 +269,7 @@ def test_rolling_extrema() -> None: } -def test_rolling_groupby_extrema() -> None: +def test_rolling_group_by_extrema() -> None: # ensure we hit different branches so create df = pl.DataFrame( @@ -279,7 +279,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").reverse().alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -318,7 +318,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -356,7 +356,7 @@ def test_rolling_groupby_extrema() -> None: ).with_columns(pl.col("col1").sort().alias("row_nr")) assert ( - df.groupby_rolling( + df.group_by_rolling( index_column="row_nr", period="3i", ) @@ -387,7 +387,7 @@ def test_rolling_slice_pushdown() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "a", "b"], "c": [1, 3, 5]}).lazy() df = ( df.sort("a") - .groupby_rolling( + .group_by_rolling( "a", by="b", period="2i", @@ -407,11 +407,11 @@ def test_rolling_slice_pushdown() -> None: } -def test_groupby_dynamic_slice_pushdown() -> None: +def test_group_by_dynamic_slice_pushdown() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "a", "b"], "c": [1, 3, 5]}).lazy() df = ( df.sort("a") - .groupby_dynamic( + .group_by_dynamic( "a", by="b", every="2i", @@ -439,7 +439,7 @@ def test_overlapping_groups_4628() -> None: } ) assert ( - df.groupby_rolling(index_column=pl.col("index").set_sorted(), period="3i").agg( + df.group_by_rolling(index_column=pl.col("index").set_sorted(), period="3i").agg( [ pl.col("val").diff(n=1).alias("val.diff"), (pl.col("val") - pl.col("val").shift(1)).alias("val - val.shift"), @@ -512,7 +512,7 @@ def test_rolling_var_numerical_stability_5197() -> None: (timedelta(days=3), timedelta(days=-1)), ], ) -def test_dynamic_groupby_timezone_awareness( +def test_dynamic_group_by_timezone_awareness( every: str | timedelta, offset: str | timedelta ) -> None: df = pl.DataFrame( @@ -531,7 +531,7 @@ def 
test_dynamic_groupby_timezone_awareness( ) assert ( - df.groupby_dynamic( + df.group_by_dynamic( "datetime", every=every, offset=offset, @@ -543,13 +543,13 @@ def test_dynamic_groupby_timezone_awareness( @pytest.mark.parametrize("tzinfo", [None, ZoneInfo("Asia/Kathmandu")]) -def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: +def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: # start by datapoint start = datetime(2022, 12, 16, tzinfo=tzinfo) stop = datetime(2022, 12, 16, hour=3, tzinfo=tzinfo) df = pl.DataFrame({"date": pl.date_range(start, stop, "30m", eager=True)}) - assert df.groupby_dynamic( + assert df.group_by_dynamic( "date", every="31m", include_boundaries=True, @@ -591,7 +591,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: {"date": pl.date_range(start, stop, "12h", eager=True)} ).with_columns(pl.col("date").dt.weekday().alias("day")) - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", period="3d", @@ -616,7 +616,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: "data_day": [1, 1], } # start by saturday - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", period="3d", @@ -642,7 +642,7 @@ def test_groupby_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None: } -def test_groupby_dynamic_by_monday_and_offset_5444() -> None: +def test_group_by_dynamic_by_monday_and_offset_5444() -> None: df = pl.DataFrame( { "date": [ @@ -659,7 +659,7 @@ def test_groupby_dynamic_by_monday_and_offset_5444() -> None: } ).with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").set_sorted()) - result = df.groupby_dynamic( + result = df.group_by_dynamic( "date", every="1w", offset="1d", by="label", start_by="monday" ).agg(pl.col("value").sum()) @@ -677,13 +677,15 @@ def test_groupby_dynamic_by_monday_and_offset_5444() -> None: # test empty result_empty = ( df.filter(pl.col("date") == date(1, 1, 1)) - .groupby_dynamic("date", every="1w", offset="1d", by="label", start_by="monday") + .group_by_dynamic( + "date", every="1w", offset="1d", by="label", start_by="monday" + ) .agg(pl.col("value").sum()) ) assert result_empty.schema == result.schema -def test_groupby_rolling_iter() -> None: +def test_group_by_rolling_iter() -> None: df = pl.DataFrame( { "date": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 5)], @@ -695,7 +697,7 @@ def test_groupby_rolling_iter() -> None: # Without 'by' argument result1 = [ (name, data.shape) - for name, data in df.groupby_rolling(index_column="date", period="2d") + for name, data in df.group_by_rolling(index_column="date", period="2d") ] expected1 = [ (date(2020, 1, 1), (1, 3)), @@ -707,7 +709,7 @@ def test_groupby_rolling_iter() -> None: # With 'by' argument result2 = [ (name, data.shape) - for name, data in df.groupby_rolling(index_column="date", period="2d", by="a") + for name, data in df.group_by_rolling(index_column="date", period="2d", by="a") ] expected2 = [ ((1, date(2020, 1, 1)), (1, 3)), @@ -717,18 +719,18 @@ def test_groupby_rolling_iter() -> None: assert result2 == expected2 -def test_groupby_rolling_negative_period() -> None: +def test_group_by_rolling_negative_period() -> None: df = pl.DataFrame({"ts": [datetime(2020, 1, 1)], "value": [1]}).with_columns( pl.col("ts").set_sorted() ) with pytest.raises( ComputeError, match="rolling window period should be strictly positive" ): - df.groupby_rolling("ts", period="-1d", offset="-1d").agg(pl.col("value")) + df.group_by_rolling("ts", period="-1d", 
offset="-1d").agg(pl.col("value")) with pytest.raises( ComputeError, match="rolling window period should be strictly positive" ): - df.lazy().groupby_rolling("ts", period="-1d", offset="-1d").agg( + df.lazy().group_by_rolling("ts", period="-1d", offset="-1d").agg( pl.col("value") ).collect() with pytest.raises(ComputeError, match="window size should be strictly positive"): @@ -747,10 +749,10 @@ def test_rolling_skew_window_offset() -> None: ] == 0.6612545648596286 -def test_rolling_kernels_groupby_dynamic_7548() -> None: +def test_rolling_kernels_group_by_dynamic_7548() -> None: assert pl.DataFrame( {"time": pl.arange(0, 4, eager=True), "value": pl.arange(0, 4, eager=True)} - ).groupby_dynamic("time", every="1i", period="3i").agg( + ).group_by_dynamic("time", every="1i", period="3i").agg( pl.col("value"), pl.col("value").min().alias("min_value"), pl.col("value").max().alias("max_value"), diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index 9d7eb6d41382..ff9346273d5d 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -314,7 +314,7 @@ def test_sorted_flag_unset_by_arithmetic_4937() -> None: } ) - assert df.sort("price").groupby("ts").agg( + assert df.sort("price").group_by("ts").agg( [ (pl.col("price") * pl.col("mask")).max().alias("pmax"), (pl.col("price") * pl.col("mask")).min().alias("pmin"), @@ -332,7 +332,7 @@ def test_unset_sorted_flag_after_extend() -> None: df1.extend(df2) assert not df1["Add"].flags["SORTED_ASC"] - df = df1.groupby("Add").agg([pl.col("Batch").min()]).sort("Add") + df = df1.group_by("Add").agg([pl.col("Batch").min()]).sort("Add") assert df["Add"].flags["SORTED_ASC"] assert df.to_dict(False) == {"Add": [37, 41], "Batch": [48, 49]} @@ -356,12 +356,12 @@ def test_sort_slice_fast_path_5245() -> None: } -def test_explicit_list_agg_sort_in_groupby() -> None: +def test_explicit_list_agg_sort_in_group_by() -> None: df = pl.DataFrame({"A": ["a", "a", "a", "b", "b", "a"], "B": [1, 2, 3, 4, 5, 6]}) # this was col().implode().sort() before we changed the logic - result = df.groupby("A").agg(pl.col("B").sort(descending=True)).sort("A") - expected = df.groupby("A").agg(pl.col("B").sort(descending=True)).sort("A") + result = df.group_by("A").agg(pl.col("B").sort(descending=True)).sort("A") + expected = df.group_by("A").agg(pl.col("B").sort(descending=True)).sort("A") assert_frame_equal(result, expected) @@ -388,7 +388,7 @@ def test_sorted_join_query_5406() -> None: df1 = df.sort(by=["Datetime", "RowId"]) filter1 = ( - df1.groupby(["Datetime", "Group"]) + df1.group_by(["Datetime", "Group"]) .agg([pl.all().sort_by("Value", descending=True).first()]) .sort(["Datetime", "RowId"]) ) @@ -535,7 +535,7 @@ def test_sort_by_logical() -> None: "num": [3, 4, 1], } ) - assert df.groupby("name").agg([pl.col("num").sort_by(["dt1", "dt2"])]).sort( + assert df.group_by("name").agg([pl.col("num").sort_by(["dt1", "dt2"])]).sort( "name" ).to_dict(False) == {"name": ["a", "b"], "num": [[3, 1], [4]]} @@ -647,11 +647,11 @@ def test_sort_top_k_fast_path() -> None: } -def test_sorted_flag_groupby_dynamic() -> None: +def test_sorted_flag_group_by_dynamic() -> None: df = pl.DataFrame({"ts": [date(2020, 1, 1), date(2020, 1, 2)], "val": [1, 2]}) assert ( ( - df.groupby_dynamic(pl.col("ts").set_sorted(), every="1d").agg( + df.group_by_dynamic(pl.col("ts").set_sorted(), every="1d").agg( pl.col("val").sum() ) ) diff --git a/py-polars/tests/unit/streaming/test_streaming.py 
b/py-polars/tests/unit/streaming/test_streaming.py index 8a1af3fc9b74..08203d438cc4 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -25,7 +25,7 @@ def test_streaming_categoricals_5921() -> None: pl.DataFrame({"X": ["a", "a", "a", "b", "b"], "Y": [2, 2, 2, 1, 1]}) .lazy() .with_columns(pl.col("X").cast(pl.Categorical)) - .groupby("X") + .group_by("X") .agg(pl.col("Y").min()) .sort("Y", descending=True) .collect(streaming=True) @@ -34,7 +34,7 @@ def test_streaming_categoricals_5921() -> None: out_eager = ( pl.DataFrame({"X": ["a", "a", "a", "b", "b"], "Y": [2, 2, 2, 1, 1]}) .with_columns(pl.col("X").cast(pl.Categorical)) - .groupby("X") + .group_by("X") .agg(pl.col("Y").min()) .sort("Y", descending=True) ) @@ -48,7 +48,7 @@ def test_streaming_block_on_literals_6054() -> None: df = pl.DataFrame({"col_1": [0] * 5 + [1] * 5}) s = pl.Series("col_2", list(range(10))) - assert df.lazy().with_columns(s).groupby("col_1").agg(pl.all().first()).collect( + assert df.lazy().with_columns(s).group_by("col_1").agg(pl.all().first()).collect( streaming=True ).sort("col_1").to_dict(False) == {"col_1": [0, 1], "col_2": [0, 5]} @@ -99,14 +99,14 @@ def test_streaming_literal_expansion() -> None: "y": ["a", "b"], "z": [1, 2], } - assert q.groupby(["x", "y"]).agg(pl.mean("z")).sort("y").collect( + assert q.group_by(["x", "y"]).agg(pl.mean("z")).sort("y").collect( streaming=True ).to_dict(False) == { "x": ["constant", "constant"], "y": ["a", "b"], "z": [1.0, 2.0], } - assert q.groupby(["x"]).agg(pl.mean("z")).collect().to_dict(False) == { + assert q.group_by(["x"]).agg(pl.mean("z")).collect().to_dict(False) == { "x": ["constant"], "z": [1.5], } @@ -187,7 +187,7 @@ def test_streaming_sortedness_propagation_9494() -> None: ) .lazy() .sort("when") - .groupby_dynamic("when", every="1mo") + .group_by_dynamic("when", every="1mo") .agg(pl.col("what").sum()) .collect(streaming=True) ).to_dict(False) == {"when": [date(2023, 5, 1), date(2023, 6, 1)], "what": [3, 3]} @@ -226,12 +226,12 @@ def test_streaming_generic_left_and_inner_join_from_disk(tmp_path: Path) -> None def test_streaming_9776() -> None: df = pl.DataFrame({"col_1": ["a"] * 1000, "ID": [None] + ["a"] * 999}) ordered = ( - df.groupby("col_1", "ID", maintain_order=True) + df.group_by("col_1", "ID", maintain_order=True) .count() .filter(pl.col("col_1") == "a") ) unordered = ( - df.groupby("col_1", "ID", maintain_order=False) + df.group_by("col_1", "ID", maintain_order=False) .count() .filter(pl.col("col_1") == "a") ) @@ -317,7 +317,7 @@ def test_null_sum_streaming_10455() -> None: "y": [None] * 10, } ) - assert df.lazy().groupby("x").sum().collect(streaming=True).to_dict(False) == { + assert df.lazy().group_by("x").sum().collect(streaming=True).to_dict(False) == { "x": [1], "y": [0.0], } @@ -331,7 +331,7 @@ def test_boolean_agg_schema() -> None: } ).lazy() - agg_df = df.groupby("x").agg(pl.col("y").max().alias("max_y")) + agg_df = df.group_by("x").agg(pl.col("y").max().alias("max_y")) for streaming in [True, False]: assert ( diff --git a/py-polars/tests/unit/streaming/test_streaming_cse.py b/py-polars/tests/unit/streaming/test_streaming_cse.py index f9aa25924217..a0bd1b0b77f1 100644 --- a/py-polars/tests/unit/streaming/test_streaming_cse.py +++ b/py-polars/tests/unit/streaming/test_streaming_cse.py @@ -48,7 +48,7 @@ def test_cse_expr_selection_streaming(monkeypatch: Any, capfd: Any) -> None: @pytest.mark.skip(reason="activate once fixed") -def test_cse_expr_groupby() -> None: +def 
test_cse_expr_group_by() -> None: q = pl.LazyFrame( { "a": [1, 2, 3, 4], @@ -60,7 +60,7 @@ def test_cse_expr_groupby() -> None: derived = pl.col("a") * pl.col("b") q = ( - q.groupby("a") + q.group_by("a") .agg(derived.sum().alias("sum"), derived.min().alias("min")) .sort("min") ) diff --git a/py-polars/tests/unit/streaming/test_streaming_groupby.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py similarity index 88% rename from py-polars/tests/unit/streaming/test_streaming_groupby.py rename to py-polars/tests/unit/streaming/test_streaming_group_by.py index a60d7854033c..48798fbea054 100644 --- a/py-polars/tests/unit/streaming/test_streaming_groupby.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -13,7 +13,7 @@ @pytest.mark.slow() -def test_streaming_groupby_sorted_fast_path_nulls_10273() -> None: +def test_streaming_group_by_sorted_fast_path_nulls_10273() -> None: df = pl.Series( name="x", values=( @@ -25,14 +25,14 @@ def test_streaming_groupby_sorted_fast_path_nulls_10273() -> None: assert ( df.set_sorted("x") .lazy() - .groupby("x") + .group_by("x") .agg(pl.count()) .collect(streaming=True) .sort("x") ).to_dict(False) == {"x": [None, 0, 1, 2, 3], "count": [100, 100, 100, 100, 100]} -def test_streaming_groupby_types() -> None: +def test_streaming_group_by_types() -> None: df = pl.DataFrame( { "person_id": [1, 1], @@ -47,7 +47,7 @@ def test_streaming_groupby_types() -> None: out = ( ( df.lazy() - .groupby(by) + .group_by(by) .agg( [ pl.col("person_name").first().alias("str_first"), @@ -107,7 +107,7 @@ def test_streaming_groupby_types() -> None: with pytest.raises(pl.DuplicateError): ( df.lazy() - .groupby("person_id") + .group_by("person_id") .agg( [ pl.col("person_name").first().alias("str_first"), @@ -123,7 +123,7 @@ def test_streaming_groupby_types() -> None: ) -def test_streaming_groupby_min_max() -> None: +def test_streaming_group_by_min_max() -> None: df = pl.DataFrame( { "person_id": [1, 2, 3, 4, 5, 6], @@ -132,7 +132,7 @@ def test_streaming_groupby_min_max() -> None: ) out = ( df.lazy() - .groupby("year") + .group_by("year") .agg([pl.min("person_id").alias("min"), pl.max("person_id").alias("max")]) .collect() .sort("year") @@ -144,22 +144,22 @@ def test_streaming_groupby_min_max() -> None: def test_streaming_non_streaming_gb() -> None: n = 100 df = pl.DataFrame({"a": np.random.randint(0, 20, n)}) - q = df.lazy().groupby("a").agg(pl.count()).sort("a") + q = df.lazy().group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").cast(pl.Utf8)) - q = q.groupby("a").agg(pl.count()).sort("a") + q = q.group_by("a").agg(pl.count()).sort("a") assert_frame_equal(q.collect(streaming=True), q.collect()) q = df.lazy().with_columns(pl.col("a").alias("b")) q = ( - q.groupby(["a", "b"]) + q.group_by(["a", "b"]) .agg(pl.count(), pl.col("a").sum().alias("sum_a")) .sort("a") ) assert_frame_equal(q.collect(streaming=True), q.collect()) -def test_streaming_groupby_sorted_fast_path() -> None: +def test_streaming_group_by_sorted_fast_path() -> None: a = np.random.randint(0, 20, 80) df = pl.DataFrame( { @@ -175,7 +175,7 @@ def test_streaming_groupby_sorted_fast_path() -> None: for df_ in [df, df_sorted]: out = ( df_.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.first("a").alias("first"), @@ -202,14 +202,16 @@ def random_integers() -> pl.Series: @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q1(monkeypatch: Any, random_integers: pl.Series) -> None: +def 
test_streaming_group_by_ooc_q1( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") result = ( s.to_frame() .lazy() - .groupby("a") + .group_by("a") .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -226,7 +228,9 @@ def test_streaming_groupby_ooc_q1(monkeypatch: Any, random_integers: pl.Series) @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) -> None: +def test_streaming_group_by_ooc_q2( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") @@ -234,7 +238,7 @@ def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) s.cast(str) .to_frame() .lazy() - .groupby("a") + .group_by("a") .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -251,14 +255,16 @@ def test_streaming_groupby_ooc_q2(monkeypatch: Any, random_integers: pl.Series) @pytest.mark.write_disk() -def test_streaming_groupby_ooc_q3(monkeypatch: Any, random_integers: pl.Series) -> None: +def test_streaming_group_by_ooc_q3( + monkeypatch: Any, random_integers: pl.Series +) -> None: s = random_integers monkeypatch.setenv("POLARS_FORCE_OOC", "1") result = ( pl.DataFrame({"a": s, "b": s}) .lazy() - .groupby(["a", "b"]) + .group_by(["a", "b"]) .agg(pl.first("a").alias("a_first"), pl.last("a").alias("a_last")) .sort("a") .collect(streaming=True) @@ -275,14 +281,14 @@ def test_streaming_groupby_ooc_q3(monkeypatch: Any, random_integers: pl.Series) assert_frame_equal(result, expected) -def test_streaming_groupby_struct_key() -> None: +def test_streaming_group_by_struct_key() -> None: df = pl.DataFrame( {"A": [1, 2, 3, 2], "B": ["google", "ms", "apple", "ms"], "C": [2, 3, 4, 3]} ) df1 = df.lazy().with_columns(pl.struct(["A", "C"]).alias("tuples")) - assert df1.groupby("tuples").agg(pl.count(), pl.col("B").first()).sort("B").collect( - streaming=True - ).to_dict(False) == { + assert df1.group_by("tuples").agg(pl.count(), pl.col("B").first()).sort( + "B" + ).collect(streaming=True).to_dict(False) == { "tuples": [{"A": 3, "C": 4}, {"A": 1, "C": 2}, {"A": 2, "C": 3}], "count": [1, 1, 2], "B": ["apple", "google", "ms"], @@ -290,7 +296,7 @@ def test_streaming_groupby_struct_key() -> None: @pytest.mark.slow() -def test_streaming_groupby_all_numeric_types_stability_8570() -> None: +def test_streaming_group_by_all_numeric_types_stability_8570() -> None: m = 1000 n = 1000 @@ -310,14 +316,14 @@ def test_streaming_groupby_all_numeric_types_stability_8570() -> None: dfd = ( dfc.lazy() .with_columns(pl.col("z").cast(dtype)) - .groupby(keys) + .group_by(keys) .agg(pl.col("z").sum().alias("z_sum")) .collect(streaming=True) ) assert dfd["z_sum"].sum() == dfc["z"].sum() -def test_streaming_groupby_categorical_aggregate() -> None: +def test_streaming_group_by_categorical_aggregate() -> None: with pl.StringCache(): out = ( pl.LazyFrame( @@ -335,7 +341,7 @@ def test_streaming_groupby_categorical_aggregate() -> None: ), } ) - .groupby(["a", "b"]) + .group_by(["a", "b"]) .agg([pl.col("a").first().alias("sum")]) .collect(streaming=True) ) @@ -356,11 +362,11 @@ def test_streaming_groupby_categorical_aggregate() -> None: } -def test_streaming_groupby_list_9758() -> None: +def test_streaming_group_by_list_9758() -> None: payload = {"a": [[1, 2]]} assert ( pl.LazyFrame(payload) - .groupby("a") + .group_by("a") .first() .collect(streaming=True) 
.to_dict(False) @@ -368,7 +374,7 @@ def test_streaming_groupby_list_9758() -> None: ) -def test_streaming_restart_non_streamable_groupby() -> None: +def test_streaming_restart_non_streamable_group_by() -> None: df = pl.DataFrame({"id": [1], "id2": [1], "id3": [1], "value": [1]}) res = ( df.lazy() @@ -377,7 +383,7 @@ def test_streaming_restart_non_streamable_groupby() -> None: (pl.col("id3") > pl.col("id3_right")) & (pl.col("id3") - pl.col("id3_right") < 30) ) - .groupby(["id2", "id3", "id3_right"]) + .group_by(["id2", "id3", "id3_right"]) .agg( pl.col("value").apply(lambda x: x).sum() * pl.col("value").sum() ) # non-streamable UDF + nested_agg @@ -386,7 +392,7 @@ def test_streaming_restart_non_streamable_groupby() -> None: assert """--- PIPELINE""" in res.explain(streaming=True) -def test_groupby_min_max_string_type() -> None: +def test_group_by_min_max_string_type() -> None: table = pl.from_dict({"a": [1, 1, 2, 2, 2], "b": ["a", "b", "c", "d", None]}) expected = {"a": [1, 2], "min": ["a", "c"], "max": ["b", "d"]} @@ -394,7 +400,7 @@ def test_groupby_min_max_string_type() -> None: for streaming in [True, False]: assert ( table.lazy() - .groupby("a") + .group_by("a") .agg([pl.min("b").alias("min"), pl.max("b").alias("max")]) .collect(streaming=streaming) .sort("a") diff --git a/py-polars/tests/unit/streaming/test_streaming_unique.py b/py-polars/tests/unit/streaming/test_streaming_unique.py index fce6a8402b03..c79a734464a3 100644 --- a/py-polars/tests/unit/streaming/test_streaming_unique.py +++ b/py-polars/tests/unit/streaming/test_streaming_unique.py @@ -34,7 +34,7 @@ def test_streaming_out_of_core_unique( # TODO: Re-enable this check when this issue is fixed: https://github.com/pola-rs/polars/issues/10466 _ = capfd.readouterr().err - # assert "OOC groupby started" in err + # assert "OOC group_by started" in err def test_streaming_unique(monkeypatch: Any, capfd: Any) -> None: diff --git a/py-polars/tests/unit/test_context.py b/py-polars/tests/unit/test_context.py index b384e2c19497..15cf03150a0c 100644 --- a/py-polars/tests/unit/test_context.py +++ b/py-polars/tests/unit/test_context.py @@ -9,7 +9,7 @@ def test_context_ignore_5867() -> None: .with_context(outer) ) assert ( - df.groupby("Category", maintain_order=True) + df.group_by("Category", maintain_order=True) .agg([(pl.col("Counts")).sum()]) .collect() .to_dict(False) diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index dd0e534e52fd..58232f9bc79e 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -49,7 +49,7 @@ def test_cse_schema_6081() -> None: orient="row", ).lazy() - min_value_by_group = df.groupby(["date", "id"]).agg( + min_value_by_group = df.group_by(["date", "id"]).agg( pl.col("value").min().alias("min_value") ) @@ -88,7 +88,7 @@ def test_cse_9630() -> None: joined_df2.select("key", pl.col("y").alias("value")), ] ) - .groupby("key") + .group_by("key") .agg( [ pl.col("value"), @@ -123,7 +123,7 @@ def test_schema_row_count_cse() -> None: csv_a.seek(0) df_a = pl.scan_csv(csv_a.name).with_row_count("Idx") - assert df_a.join(df_a, on="B").groupby( + assert df_a.join(df_a, on="B").group_by( "A", maintain_order=True ).all().collect().to_dict(False) == { "A": ["Gr1"], @@ -199,7 +199,7 @@ def test_windows_cse_excluded() -> None: @pytest.mark.skip() -def test_cse_groupby_10215() -> None: +def test_cse_group_by_10215() -> None: q = ( pl.DataFrame( { @@ -208,7 +208,7 @@ def test_cse_groupby_10215() -> None: } ) .lazy() - .groupby( + .group_by( "b", ) .agg( @@ -295,7 
+295,7 @@ def test_cse_10452() -> None: assert q.collect(comm_subexpr_elim=True).to_dict(False) == {"b": [13, 14, 15]} -def test_cse_groupby_ternary_10490() -> None: +def test_cse_group_by_ternary_10490() -> None: df = pl.DataFrame( { "a": [1, 1, 2, 2], @@ -306,7 +306,7 @@ def test_cse_groupby_ternary_10490() -> None: assert ( df.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.when(pl.col(col).is_null().all()).then(None).otherwise(1).alias(col) diff --git a/py-polars/tests/unit/test_datatypes.py b/py-polars/tests/unit/test_datatypes.py index 3d73b1b4ca4f..c6fc2ba3ff04 100644 --- a/py-polars/tests/unit/test_datatypes.py +++ b/py-polars/tests/unit/test_datatypes.py @@ -141,7 +141,7 @@ def test_conversion_dtype() -> None: pl.col("some_partition_column"), ] ) - .groupby(["some_partition_column"], maintain_order=True) + .group_by(["some_partition_column"], maintain_order=True) .agg([pl.col(["struct"])]) ) diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index 12cd4f84680c..6f3f47addbad 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -65,19 +65,19 @@ def test_empty_sort_by_args() -> None: def test_empty_9137() -> None: out = ( pl.DataFrame({"id": [], "value": []}) - .groupby("id") + .group_by("id") .agg(pl.col("value").pow(2).mean()) ) assert out.shape == (0, 2) assert out.dtypes == [pl.Float32, pl.Float32] -def test_empty_groupby_apply_err() -> None: +def test_empty_group_by_apply_err() -> None: df = pl.DataFrame(schema={"x": pl.Int64}) with pytest.raises( pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" ): - df.groupby("x").apply(lambda x: x) + df.group_by("x").apply(lambda x: x) def test_empty_list_namespace_output_9585() -> None: diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index f79c7752e0b8..037af5c92799 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -15,11 +15,11 @@ from polars.type_aliases import ConcatMethod -def test_error_on_empty_groupby() -> None: +def test_error_on_empty_group_by() -> None: with pytest.raises( pl.ComputeError, match="at least one key is required in a group_by operation" ): - pl.DataFrame({"x": [0, 0, 1, 1]}).groupby([]).agg(pl.count()) + pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.count()) def test_error_on_reducing_map() -> None: @@ -33,7 +33,7 @@ def test_error_on_reducing_map() -> None: r"the input length \(1\); consider using `apply` instead" ), ): - df.groupby("id").agg(pl.map(["t", "y"], np.trapz)) + df.group_by("id").agg(pl.map(["t", "y"], np.trapz)) df = pl.DataFrame({"x": [1, 2, 3, 4], "group": [1, 2, 1, 2]}) with pytest.raises( @@ -136,7 +136,7 @@ def test_projection_update_schema_missing_column() -> None: pl.DataFrame({"colA": ["a", "b", "c"], "colB": [1, 2, 3]}) .lazy() .filter(~pl.col("colC").is_null()) - .groupby(["colA"]) + .group_by(["colA"]) .agg([pl.col("colB").sum().alias("result")]) .collect() ) @@ -204,7 +204,7 @@ def test_error_on_double_agg() -> None: "b": [1, 2, 3, 4, 5], } ) - .groupby("a") + .group_by("a") .agg([getattr(pl.col("b").min(), e)()]) ) @@ -381,7 +381,7 @@ def test_sort_by_different_lengths() -> None: pl.ComputeError, match=r"the expression in `sort_by` argument must result in the same length", ): - df.groupby("group").agg( + df.group_by("group").agg( [ pl.col("col1").sort_by(pl.col("col2").unique()), ] @@ -391,7 +391,7 @@ def test_sort_by_different_lengths() -> None: pl.ComputeError, match=r"the expression in `sort_by` 
argument must result in the same length", ): - df.groupby("group").agg( + df.group_by("group").agg( [ pl.col("col1").sort_by(pl.col("col2").arg_unique()), ] @@ -568,7 +568,7 @@ def test_invalid_inner_type_cast_list() -> None: ), ], ) -def test_groupby_dynamic_validation(every: str, match: str) -> None: +def test_group_by_dynamic_validation(every: str, match: str) -> None: df = pl.DataFrame( { "index": [0, 0, 1, 1], @@ -578,7 +578,7 @@ def test_groupby_dynamic_validation(every: str, match: str) -> None: ) with pytest.raises(pl.ComputeError, match=match): - df.groupby_dynamic("index", by="group", every=every, period="2i").agg( + df.group_by_dynamic("index", by="group", every=every, period="2i").agg( pl.col("weight") ) @@ -602,12 +602,12 @@ def test_invalid_getitem_key_err() -> None: df["x", "y"] # type: ignore[index] -def test_invalid_groupby_arg() -> None: +def test_invalid_group_by_arg() -> None: df = pl.DataFrame({"a": [1]}) with pytest.raises( TypeError, match="specifying aggregations as a dictionary is not supported" ): - df.groupby(1).agg({"a": "sum"}) + df.group_by(1).agg({"a": "sum"}) def test_no_sorted_err() -> None: @@ -620,7 +620,7 @@ def test_no_sorted_err() -> None: pl.InvalidOperationError, match=r"argument in operation 'group_by_dynamic' is not explicitly sorted", ): - df.groupby_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) + df.group_by_dynamic("dt", every="1h").agg(pl.all().count().suffix("_foo")) def test_serde_validation() -> None: @@ -678,6 +678,6 @@ def test_sort_by_err_9259() -> None: schema={"a": pl.Float32, "b": pl.Float32, "c": pl.Float32}, ) with pytest.raises(pl.ComputeError): - df.lazy().groupby("c").agg( + df.lazy().group_by("c").agg( [pl.col("a").sort_by(pl.col("b").filter(pl.col("b") > 100)).sum()] ).collect() diff --git a/py-polars/tests/unit/test_expr_multi_cols.py b/py-polars/tests/unit/test_expr_multi_cols.py index a5d21f4cee3a..f3cc51ef36f4 100644 --- a/py-polars/tests/unit/test_expr_multi_cols.py +++ b/py-polars/tests/unit/test_expr_multi_cols.py @@ -81,7 +81,7 @@ def test_multiple_columns_length_9137() -> None: # list is larger than groups cmp_list = ["a", "b", "c"] - assert df.groupby("a").agg(pl.col("b").is_in(cmp_list)).to_dict(False) == { + assert df.group_by("a").agg(pl.col("b").is_in(cmp_list)).to_dict(False) == { "a": [1], "b": [[True, False]], } diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py index 5b2e878e82c0..a3233d797048 100644 --- a/py-polars/tests/unit/test_exprs.py +++ b/py-polars/tests/unit/test_exprs.py @@ -93,7 +93,7 @@ def test_prefix(fruits_cars: pl.DataFrame) -> None: def test_cumcount() -> None: df = pl.DataFrame([["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], schema=["A"]) - out = df.groupby("A", maintain_order=True).agg( + out = df.group_by("A", maintain_order=True).agg( [pl.col("A").cumcount(reverse=False).alias("foo")] ) @@ -103,10 +103,10 @@ def test_cumcount() -> None: def test_filter_where() -> None: df = pl.DataFrame({"a": [1, 2, 3, 1, 2, 3], "b": [4, 5, 6, 7, 8, 9]}) - result_where = df.groupby("a", maintain_order=True).agg( + result_where = df.group_by("a", maintain_order=True).agg( pl.col("b").where(pl.col("b") > 4).alias("c") ) - result_filter = df.groupby("a", maintain_order=True).agg( + result_filter = df.group_by("a", maintain_order=True).agg( pl.col("b").filter(pl.col("b") > 4).alias("c") ) expected = pl.DataFrame({"a": [1, 2, 3], "c": [[7], [5, 8], [6, 9]]}) @@ -127,7 +127,7 @@ def test_count_expr() -> None: assert out.shape == (1, 1) assert cast(int, out.item()) == 
5 - out = df.groupby("b", maintain_order=True).agg(pl.count()) + out = df.group_by("b", maintain_order=True).agg(pl.count()) assert out["b"].to_list() == ["a", "b"] assert out["count"].to_list() == [4, 1] @@ -169,7 +169,7 @@ def test_entropy() -> None: "id": [1, 2, 1, 4, 5, 4, 6], } ) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("id").entropy(normalize=True) ) expected = pl.DataFrame( @@ -178,7 +178,7 @@ def test_entropy() -> None: assert_frame_equal(result, expected) -def test_dot_in_groupby() -> None: +def test_dot_in_group_by() -> None: df = pl.DataFrame( { "group": ["a", "a", "a", "b", "b", "b"], @@ -187,7 +187,7 @@ def test_dot_in_groupby() -> None: } ) - result = df.groupby("group", maintain_order=True).agg( + result = df.group_by("group", maintain_order=True).agg( pl.col("x").dot("y").alias("dot") ) expected = pl.DataFrame({"group": ["a", "b"], "dot": [6, 15]}) @@ -364,7 +364,7 @@ def test_rank_so_4109() -> None: } ).sort(by=["id", "rank"]) - assert df.groupby("id").agg( + assert df.group_by("id").agg( [ pl.col("rank").alias("original"), pl.col("rank").rank(method="dense").alias("dense"), diff --git a/py-polars/tests/unit/test_fmt.py b/py-polars/tests/unit/test_fmt.py index ed9bc408ab5a..57d4f541e639 100644 --- a/py-polars/tests/unit/test_fmt.py +++ b/py-polars/tests/unit/test_fmt.py @@ -146,7 +146,7 @@ def test_date_list_fmt() -> None: df = df.with_columns(pl.col("mydate").str.strptime(pl.Date, "%Y-%m-%d")) assert ( - str(df.groupby("index", maintain_order=True).agg(pl.col("mydate"))["mydate"]) + str(df.group_by("index", maintain_order=True).agg(pl.col("mydate"))["mydate"]) == """shape: (3,) Series: 'mydate' [list[date]] [ diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index c43fc195ff56..a9361633075b 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -1074,7 +1074,7 @@ def test_to_init_repr() -> None: def test_untrusted_categorical_input() -> None: df = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) - assert pl.from_pandas(df).groupby("x").count().to_dict(False) == { + assert pl.from_pandas(df).group_by("x").count().to_dict(False) == { "x": ["x"], "count": [1], } diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index 90151c9bebb2..cb78de901ffe 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -39,7 +39,7 @@ def test_lazy() -> None: ).collect() # test if pl.list is available, this is `to_list` re-exported as list - eager = ldf.groupby("a").agg(pl.implode("b")).collect() + eager = ldf.group_by("a").agg(pl.implode("b")).collect() assert sorted(eager.rows()) == [(1, [[1.0]]), (2, [[2.0]]), (3, [[3.0]])] @@ -152,10 +152,10 @@ def test_or() -> None: assert out.rows() == [(1, 1.0), (3, 3.0)] -def test_groupby_apply() -> None: +def test_group_by_apply() -> None: ldf = ( pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 3.0]}) - .groupby("a") + .group_by("a") .apply(lambda df: df * 2.0, schema={"a": pl.Float64, "b": pl.Float64}) ) out = ldf.collect() @@ -194,7 +194,7 @@ def test_apply_custom_function() -> None: # two ways to determine the length groups. 
df = ( - ldf.groupby("fruits") + ldf.group_by("fruits") .agg( [ pl.col("cars") @@ -221,16 +221,16 @@ def test_apply_custom_function() -> None: assert_frame_equal(df, expected) -def test_groupby() -> None: +def test_group_by() -> None: ldf = pl.LazyFrame({"a": [1.0, None, 3.0, 4.0], "groups": ["a", "a", "b", "b"]}) expected = pl.DataFrame({"groups": ["a", "b"], "a": [1.0, 3.5]}) - out = ldf.groupby("groups").agg(pl.mean("a")).collect() + out = ldf.group_by("groups").agg(pl.mean("a")).collect() assert_frame_equal(out.sort(by="groups"), expected) # refer to column via pl.Expr - out = ldf.groupby(pl.col("groups")).agg(pl.mean("a")).collect() + out = ldf.group_by(pl.col("groups")).agg(pl.mean("a")).collect() assert_frame_equal(out.sort(by="groups"), expected) @@ -391,7 +391,7 @@ def test_fold_filter() -> None: assert out.rows() == [(1, 0), (2, 1), (3, 2)] -def test_head_groupby() -> None: +def test_head_group_by() -> None: commodity_prices = { "commodity": [ "Wheat", @@ -434,7 +434,7 @@ def test_head_groupby() -> None: keys = ["commodity", "location"] out = ( ldf.sort(by="price", descending=True) - .groupby(keys, maintain_order=True) + .group_by(keys, maintain_order=True) .agg([pl.col("*").exclude(keys).head(2).keep_name()]) .explode(pl.col("*").exclude(keys)) ) @@ -450,12 +450,12 @@ def test_head_groupby() -> None: ldf = pl.LazyFrame( {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]} ) - out = ldf.groupby("letters").tail(2).sort("letters") + out = ldf.group_by("letters").tail(2).sort("letters") assert_frame_equal( out.collect(), pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}), ) - out = ldf.groupby("letters").head(2).sort("letters") + out = ldf.group_by("letters").head(2).sort("letters") assert_frame_equal( out.collect(), pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}), @@ -555,10 +555,10 @@ def test_sort() -> None: assert_series_equal(ldf.collect()["a"], pl.Series("a", [1, 2, 2, 3])) -def test_custom_groupby() -> None: +def test_custom_group_by() -> None: ldf = pl.LazyFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]}) out = ( - ldf.groupby("b", maintain_order=True) + ldf.group_by("b", maintain_order=True) .agg([pl.col("a").apply(lambda x: x.sum(), return_dtype=pl.Int64)]) .collect() ) @@ -894,7 +894,7 @@ def test_argminmax() -> None: assert out["min"][0] == 0 out = ( - ldf.groupby("b", maintain_order=True) + ldf.group_by("b", maintain_order=True) .agg([pl.col("a").arg_min().alias("min"), pl.col("a").arg_max().alias("max")]) .collect() ) @@ -981,7 +981,7 @@ def test_spearman_corr() -> None: ) out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr(pl.col("prediction"), pl.col("target"), method="spearman").alias( "c" ), @@ -992,7 +992,7 @@ def test_spearman_corr() -> None: # we can also pass in column names directly out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr("prediction", "target", method="spearman").alias("c"), ) ).collect()["c"] @@ -1029,7 +1029,7 @@ def test_pearson_corr() -> None: ) out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr(pl.col("prediction"), pl.col("target"), method="pearson").alias( "c" ), @@ -1039,7 +1039,7 @@ def test_pearson_corr() -> None: # we can also pass in column names directly out = ( - ldf.groupby("era", maintain_order=True).agg( + ldf.group_by("era", maintain_order=True).agg( pl.corr("prediction", 
"target", method="pearson").alias("c"), ) ).collect()["c"] @@ -1178,7 +1178,7 @@ def test_group_lengths() -> None: } ) - result = ldf.groupby(["group"], maintain_order=True).agg( + result = ldf.group_by(["group"], maintain_order=True).agg( [ (pl.col("id").unique_counts() / pl.col("id").len()) .sum() @@ -1205,7 +1205,7 @@ def test_quantile_filtered_agg() -> None: "value": [1, 2, 3, 4, 1, 2, 3, 4], } ) - .groupby("group") + .group_by("group") .agg(pl.col("value").filter(pl.col("value") < 2).quantile(0.5)) .collect()["value"] .to_list() diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index 4312815eeaa6..2e276b837b01 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -10,7 +10,7 @@ def test_projection_on_semi_join_4789() -> None: ab = lfa.join(lfb, on="p", how="semi").inspect() - intermediate_agg = (ab.groupby("a").agg([pl.col("a").alias("seq")])).select( + intermediate_agg = (ab.group_by("a").agg([pl.col("a").alias("seq")])).select( ["a", "seq"] ) @@ -25,7 +25,7 @@ def test_melt_projection_pd_block_4997() -> None: .with_row_count() .lazy() .melt(id_vars="row_nr") - .groupby("row_nr") + .group_by("row_nr") .agg(pl.col("variable").alias("result")) .collect() ).to_dict(False) == {"row_nr": [0], "result": [["col1", "col2"]]} @@ -43,13 +43,13 @@ def test_double_projection_pushdown() -> None: ) -def test_groupby_projection_pushdown() -> None: +def test_group_by_projection_pushdown() -> None: assert ( "PROJECT 2/3 COLUMNS" in ( pl.DataFrame({"c0": [], "c1": [], "c2": []}) .lazy() - .groupby("c0") + .group_by("c0") .agg( [ pl.col("c1").sum().alias("sum(c1)"), @@ -132,14 +132,14 @@ def test_double_projection_union() -> None: } ).lazy() - # in this query the groupby projects only 2 columns, that's one + # in this query the group_by projects only 2 columns, that's one # less than the upstream projection so the union will fail if # the select node does not prune one column q = lf1.select(["a", "b", "c"]) q = pl.concat([q, lf2]) - q = q.groupby("c", maintain_order=True).agg([pl.col("a")]) + q = q.group_by("c", maintain_order=True).agg([pl.col("a")]) assert q.collect().to_dict(False) == { "c": [1, 2, 3], "a": [[1, 2, 5, 7], [3, 4, 6], [8]], @@ -253,7 +253,7 @@ def test_distinct_projection_pd_7578() -> None: } ) - q = df.lazy().unique().groupby("bar").agg(pl.count()) + q = df.lazy().unique().group_by("bar").agg(pl.count()) assert q.collect().sort("bar").to_dict(False) == { "bar": ["a", "b"], "count": [3, 2], diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py index 946c0f42e6f6..4c35ffa6e270 100644 --- a/py-polars/tests/unit/test_queries.py +++ b/py-polars/tests/unit/test_queries.py @@ -30,10 +30,10 @@ def test_sort_by_bools() -> None: assert out.shape == (3, 4) -def test_repeat_expansion_in_groupby() -> None: +def test_repeat_expansion_in_group_by() -> None: out = ( pl.DataFrame({"g": [1, 2, 2, 3, 3, 3]}) - .groupby("g", maintain_order=True) + .group_by("g", maintain_order=True) .agg(pl.repeat(1, pl.count()).cumsum()) .to_dict(False) ) @@ -48,7 +48,7 @@ def test_agg_after_head() -> None: expected = pl.DataFrame({"a": [1, 2, 3], "b": [6, 9, 21]}) for maintain_order in [True, False]: - out = df.groupby("a", maintain_order=maintain_order).agg( + out = df.group_by("a", maintain_order=maintain_order).agg( [pl.col("b").head(3).sum()] ) @@ -71,7 +71,7 @@ def test_overflow_uint16_agg_mean() -> None: pl.col("col3").cast(pl.UInt16), ] ) - .groupby(["col1"]) + .group_by(["col1"]) 
.agg(pl.col("col3").mean()) .to_dict(False) ) == {"col1": ["A"], "col3": [64.0]} @@ -86,7 +86,7 @@ def test_binary_on_list_agg_3345() -> None: ) assert ( - df.groupby(["group"], maintain_order=True) + df.group_by(["group"], maintain_order=True) .agg( [ ( @@ -109,12 +109,12 @@ def test_maintain_order_after_sampling() -> None: "value": [1, 3, 2, 3, 4, 5, 3, 4], } ) - assert df.groupby("type", maintain_order=True).agg(pl.col("value").sum()).to_dict( + assert df.group_by("type", maintain_order=True).agg(pl.col("value").sum()).to_dict( False ) == {"type": ["A", "B", "C", "D"], "value": [5, 8, 5, 7]} -def test_sorted_groupby_optimization(monkeypatch: Any) -> None: +def test_sorted_group_by_optimization(monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_NO_STREAMING_GROUPBY", "1") df = pl.DataFrame({"a": np.random.randint(0, 5, 20)}) @@ -124,11 +124,11 @@ def test_sorted_groupby_optimization(monkeypatch: Any) -> None: for descending in [True, False]: sorted_implicit = ( df.with_columns(pl.col("a").sort(descending=descending)) - .groupby("a") + .group_by("a") .agg(pl.count()) ) sorted_explicit = ( - df.groupby("a").agg(pl.count()).sort("a", descending=descending) + df.group_by("a").agg(pl.count()).sort("a", descending=descending) ) assert_frame_equal(sorted_explicit, sorted_implicit) @@ -147,7 +147,7 @@ def test_median_on_shifted_col_3522() -> None: assert diffs.select(pl.col("foo").median()).to_series()[0] == 36828.5 -def test_groupby_agg_equals_zero_3535() -> None: +def test_group_by_agg_equals_zero_3535() -> None: # setup test frame df = pl.DataFrame( data=[ @@ -165,7 +165,7 @@ def test_groupby_agg_equals_zero_3535() -> None: ], ) # group by the key, aggregating the two numeric cols - assert df.groupby(pl.col("key"), maintain_order=True).agg( + assert df.group_by(pl.col("key"), maintain_order=True).agg( [pl.col("val1").sum(), pl.col("val2").sum()] ).to_dict(False) == { "key": ["aa", "bb", "cc"], @@ -190,7 +190,7 @@ def demean_dot() -> pl.Expr: "y": [2, 0, 2, 0], } ) - .groupby("key") + .group_by("key") .agg( [ demean_dot(), @@ -228,7 +228,7 @@ def test_opaque_filter_on_lists_3784() -> None: ).lazy() df = df.with_columns(pl.col("str").cast(pl.Categorical)) - df_groups = df.groupby("group").agg([pl.col("str").alias("str_list")]) + df_groups = df.group_by("group").agg([pl.col("str").alias("str_list")]) pre = "A" succ = "B" @@ -263,7 +263,7 @@ def map_expr(name: str) -> pl.Expr: assert ( pl.DataFrame({"groups": [1, 2, 3, 4], "values": [None, None, 1, 2]}) - .groupby("groups", maintain_order=True) + .group_by("groups", maintain_order=True) .agg([map_expr("values")]) ).to_dict(False) == { "groups": [1, 2, 3, 4], diff --git a/py-polars/tests/unit/test_rows.py b/py-polars/tests/unit/test_rows.py index 0e01a91e1cbc..cfd364ee06a1 100644 --- a/py-polars/tests/unit/test_rows.py +++ b/py-polars/tests/unit/test_rows.py @@ -93,7 +93,7 @@ def test_rows_by_key() -> None: "b": [("b", "q", 2.5, 8), ("b", "q", 3.0, 7)], } assert df.rows_by_key("w", include_key=True) == { - key: grp.rows() for key, grp in df.groupby("w") + key: grp.rows() for key, grp in df.group_by("w") } assert df.rows_by_key("w", include_key=True, unique=True) == { "a": ("a", "k", 4.5, 6), @@ -135,7 +135,7 @@ def test_rows_by_key() -> None: ], } assert df.rows_by_key("w", named=True, include_key=True) == { - key: grp.rows(named=True) for key, grp in df.groupby("w") + key: grp.rows(named=True) for key, grp in df.group_by("w") } assert df.rows_by_key("w", named=True, include_key=True, unique=True) == { "a": {"w": "a", "x": "k", "y": 4.5, "z": 
6}, diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index eb12a3e6ad80..56de4581014a 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -14,7 +14,7 @@ def test_schema_on_agg() -> None: assert ( df.lazy() - .groupby("a") + .group_by("a") .agg( [ pl.col("b").min().alias("min"), @@ -97,7 +97,7 @@ def test_from_dicts_nested_nulls() -> None: def test_group_schema_err() -> None: df = pl.DataFrame({"foo": [None, 1, 2], "bar": [1, 2, 3]}).lazy() with pytest.raises(pl.ColumnNotFoundError): - df.groupby("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema + df.group_by("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema def test_schema_inference_from_rows() -> None: @@ -391,7 +391,7 @@ def sub_col_min(column: str, min_column: str) -> pl.Expr: q = ( df.lazy() - .groupby("group") + .group_by("group") .agg( [ sub_col_min("vals_num", "vals_num").alias("sub_num"), @@ -439,8 +439,8 @@ def test_schemas( for key, dtype in expected_select.items(): assert schema[key] == dtype - # test groupby schema - schema = df.groupby(pl.lit(1)).agg(expr).schema + # test group_by schema + schema = df.group_by(pl.lit(1)).agg(expr).schema for key, dtype in expected_gb.items(): assert schema[key] == dtype @@ -511,7 +511,7 @@ def test_lit_iter_schema() -> None: } ) - assert df.groupby("key").agg(pl.col("dates").unique() + timedelta(days=1)).to_dict( + assert df.group_by("key").agg(pl.col("dates").unique() + timedelta(days=1)).to_dict( False ) == { "key": ["A"], diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index 1d97e484b703..b1d2f6a3eed1 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -447,9 +447,9 @@ def test_selector_expr_dispatch() -> None: ) -def test_regex_expansion_groupby_9947() -> None: +def test_regex_expansion_group_by_9947() -> None: df = pl.DataFrame({"g": [3], "abc": [1], "abcd": [3]}) - assert df.groupby("g").agg(pl.col("^ab.*$")).columns == ["g", "abc", "abcd"] + assert df.group_by("g").agg(pl.col("^ab.*$")).columns == ["g", "abc", "abcd"] def test_regex_expansion_exclude_10002() -> None: diff --git a/py-polars/tests/unit/test_show_graph.py b/py-polars/tests/unit/test_show_graph.py index 09a9b9484933..f46d135e0792 100644 --- a/py-polars/tests/unit/test_show_graph.py +++ b/py-polars/tests/unit/test_show_graph.py @@ -10,6 +10,6 @@ def test_show_graph() -> None: "c": [6, 5, 4, 3, 2, 1], } ) - query = ldf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort("a") + query = ldf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort("a") out = query.show_graph(raw_output=True) assert isinstance(out, str) diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py index c3e5318a2fae..becaa874058c 100644 --- a/py-polars/tests/unit/test_sql.py +++ b/py-polars/tests/unit/test_sql.py @@ -412,7 +412,7 @@ def test_sql_trig() -> None: assert_frame_equal(left=df_result, right=res, atol=1e-5) -def test_sql_groupby(foods_ipc_path: Path) -> None: +def test_sql_group_by(foods_ipc_path: Path) -> None: lf = pl.scan_ipc(foods_ipc_path) c = pl.SQLContext(eager_execution=True) From 97ab4cd94a295180e64bdbaa6dda28ff0f27dcfd Mon Sep 17 00:00:00 2001 From: Ion Koutsouris Date: Tue, 22 Aug 2023 11:19:01 +0200 Subject: [PATCH 35/55] fix(python): Correctly handle time zones in `write_delta` (#10633) Co-authored-by: Stijn de Gooijer --- py-polars/polars/dataframe/frame.py | 3 ++- py-polars/polars/io/_utils.py 
| 2 +- py-polars/polars/io/delta.py | 7 ++++--- py-polars/tests/unit/io/test_delta.py | 30 +++++++++++++++++++++++++-- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 9c4f3656f7a9..c7d630b207de 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3516,7 +3516,8 @@ def write_delta( to which they can be cast. This affects the following data types: - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information - :class:`Utf8`, :class:`Binary`, and :class:`List` ('large' types) Polars columns are always nullable. To write data to a delta table with diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 4a59dd65353c..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index e04d1a01037f..d1a590aec96f 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -338,8 +338,6 @@ def _convert_pa_schema_to_delta(schema: pa.schema) -> pa.schema: pa.uint16(): pa.int16(), pa.uint32(): pa.int32(), pa.uint64(): pa.int64(), - pa.timestamp("ns"): pa.timestamp("us"), - pa.timestamp("ms"): pa.timestamp("us"), pa.large_string(): pa.string(), pa.large_binary(): pa.binary(), } @@ -350,7 +348,10 @@ def dtype_to_delta_dtype(dtype: pa.DataType) -> pa.DataType: return list_to_delta_dtype(dtype) elif isinstance(dtype, pa.StructType): return struct_to_delta_dtype(dtype) - + elif isinstance(dtype, pa.TimestampType): + # TODO: Support time zones when implemented by delta-rs. 
See: + # https://github.com/delta-io/delta-rs/issues/1598 + return pa.timestamp("us") try: return dtype_map[dtype] except KeyError: diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 581068f48b8a..6ab087171ea4 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -197,7 +197,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: pl.Series( "date_ns", [datetime(2010, 1, 1, 0, 0)], - dtype=pl.Datetime(time_unit="ns"), + dtype=pl.Datetime(time_unit="ns", time_zone="ETC"), ), pl.Series( "date_us", @@ -262,7 +262,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: [ pl.Field( "date_range", - pl.List(pl.Datetime(time_unit="ms", time_zone=None)), + pl.List(pl.Datetime(time_unit="ms", time_zone="UTC")), ), pl.Field( "date_us", pl.List(pl.Datetime(time_unit="ms", time_zone=None)) @@ -343,3 +343,29 @@ def test_write_delta_with_schema_10540(tmp_path: Path) -> None: pa_schema = pa.schema([("a", pa.int64())]) df.write_delta(tmp_path, delta_write_options={"schema": pa_schema}) + + +@pytest.mark.parametrize( + "expr", + [ + pl.datetime(2010, 1, 1, time_unit="us", time_zone="UTC"), + pl.datetime(2010, 1, 1, time_unit="ns", time_zone="EST"), + pl.datetime(2010, 1, 1, time_unit="ms", time_zone="Europe/Amsterdam"), + ], +) +def test_write_delta_with_tz_in_df(expr: pl.Expr, tmp_path: Path) -> None: + df = pl.select(expr) + + pa_schema = pa.schema([("datetime", pa.timestamp("us"))]) + + df.write_delta(tmp_path, mode="append") + # write second time because delta-rs also casts timestamp with tz to timestamp no tz + df.write_delta(tmp_path, mode="append") + + tbl = DeltaTable(tmp_path) + assert pa_schema == tbl.schema().to_pyarrow() + + result = pl.read_delta(str(tmp_path), version=0) + + expected = df.cast(pl.Datetime) + assert_frame_equal(result, expected) From dc2e61777112ff4643f21d7b8028800b715e365f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 11:33:29 +0200 Subject: [PATCH 36/55] feat(rust, python): add `truncate_ragged_lines` (#10660) --- crates/polars-error/Cargo.toml | 3 +++ crates/polars-error/src/constants.rs | 10 +++++++ crates/polars-error/src/lib.rs | 1 + crates/polars-io/Cargo.toml | 1 + crates/polars-io/src/csv/parser.rs | 16 ++++++++++-- crates/polars-io/src/csv/read.rs | 23 +++++++++++----- .../src/csv/read_impl/batched_mmap.rs | 3 +++ .../src/csv/read_impl/batched_read.rs | 3 +++ crates/polars-io/src/csv/read_impl/mod.rs | 20 +++++++++----- crates/polars-lazy/Cargo.toml | 2 +- crates/polars-lazy/src/frame/csv.rs | 10 +++++++ .../src/physical_plan/executors/scan/csv.rs | 1 + .../polars-pipe/src/executors/sources/csv.rs | 1 + .../polars-plan/src/logical_plan/builder.rs | 2 ++ .../polars-plan/src/logical_plan/options.rs | 1 + crates/polars/tests/it/io/csv.rs | 9 ++++--- py-polars/polars/dataframe/frame.py | 3 +++ py-polars/polars/io/csv/batched_reader.py | 2 ++ py-polars/polars/io/csv/functions.py | 8 ++++++ py-polars/polars/lazyframe/frame.py | 2 ++ py-polars/src/batched_csv.rs | 4 ++- py-polars/src/dataframe.rs | 4 ++- py-polars/src/lazyframe.rs | 4 ++- py-polars/tests/unit/io/test_csv.py | 26 +++++++++++++++++-- 24 files changed, 135 insertions(+), 24 deletions(-) create mode 100644 crates/polars-error/src/constants.rs diff --git a/crates/polars-error/Cargo.toml b/crates/polars-error/Cargo.toml index 47bf990d3e48..ce622dcccbe9 100644 --- a/crates/polars-error/Cargo.toml +++ b/crates/polars-error/Cargo.toml @@ -12,3 +12,6 @@ description = "Error 
definitions for the Polars DataFrame library" arrow = { workspace = true } regex = { workspace = true, optional = true } thiserror = { workspace = true } + +[features] +python = [] diff --git a/crates/polars-error/src/constants.rs b/crates/polars-error/src/constants.rs new file mode 100644 index 000000000000..473e9edfe55b --- /dev/null +++ b/crates/polars-error/src/constants.rs @@ -0,0 +1,10 @@ +//! Constant that help with creating error messages dependent on the host language. +#[cfg(feature = "python")] +pub static TRUE: &str = "True"; +#[cfg(feature = "python")] +pub static FALSE: &str = "False"; + +#[cfg(not(feature = "python"))] +pub static TRUE: &str = "true"; +#[cfg(not(feature = "python"))] +pub static FALSE: &str = "false"; diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index 3241c9faa54d..6cf86706ab44 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -1,3 +1,4 @@ +pub mod constants; mod warning; use std::borrow::Cow; diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 5587fbb55ef3..9b9822ec2adc 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -95,6 +95,7 @@ gcp = ["object_store/gcp", "cloud", "polars-core/gcp"] partition = ["polars-core/partition_by"] temporal = ["dtype-datetime", "dtype-date", "dtype-time"] simd = [] +python = ["polars-error/python"] [package.metadata.docs.rs] all-features = true diff --git a/crates/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs index d08dff9c540b..b89d5cbcb297 100644 --- a/crates/polars-io/src/csv/parser.rs +++ b/crates/polars-io/src/csv/parser.rs @@ -354,11 +354,12 @@ pub(super) fn parse_lines<'a>( comment_char: Option, quote_char: Option, eol_char: u8, - null_values: Option<&NullValuesCompiled>, missing_is_null: bool, + ignore_errors: bool, + mut truncate_ragged_lines: bool, + null_values: Option<&NullValuesCompiled>, projection: &[usize], buffers: &mut [Buffer<'a>], - ignore_errors: bool, n_lines: usize, // length of original schema schema_len: usize, @@ -368,6 +369,12 @@ pub(super) fn parse_lines<'a>( !projection.is_empty(), "at least one column should be projected" ); + // During projection pushdown we are not checking other csv fields. + // This would be very expensive and we don't care as we only want + // the projected columns. + if projection.len() != schema_len { + truncate_ragged_lines = true + } // we use the pointers to track the no of bytes read. let start = bytes.as_ptr() as usize; @@ -487,6 +494,11 @@ pub(super) fn parse_lines<'a>( if bytes.get(read_sol - 1) == Some(&eol_char) { bytes = &bytes[read_sol..]; } else { + if !truncate_ragged_lines && read_sol < bytes.len() { + polars_bail!(ComputeError: r#"found more fields than defined in 'Schema' + +Consider setting 'truncate_ragged_lines={}'."#, polars_error::constants::TRUE) + } let bytes_rem = skip_this_line( &bytes[read_sol - 1..], quote_char, diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs index ebaf2c4cd91c..9bf55ca3e06a 100644 --- a/crates/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -100,8 +100,6 @@ where { /// File or Stream object reader: R, - /// Aggregates chunk afterwards to a single chunk. - rechunk: bool, /// Stop reading from the csv after this number of rows is reached n_rows: Option, // used by error ignore logic @@ -112,8 +110,6 @@ where /// Optional column names to project/ select. 
columns: Option>, delimiter: Option, - has_header: bool, - ignore_errors: bool, pub(crate) schema: Option, encoding: CsvEncoding, n_threads: Option, @@ -122,17 +118,22 @@ where dtype_overwrite: Option<&'a [DataType]>, sample_size: usize, chunk_size: usize, - low_memory: bool, comment_char: Option, - eol_char: u8, null_values: Option, - missing_is_null: bool, predicate: Option>, quote_char: Option, skip_rows_after_header: usize, try_parse_dates: bool, row_count: Option, + /// Aggregates chunk afterwards to a single chunk. + rechunk: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, + missing_is_null: bool, + low_memory: bool, + has_header: bool, + ignore_errors: bool, + eol_char: u8, } impl<'a, R> CsvReader<'a, R> @@ -324,6 +325,12 @@ where self.predicate = predicate; self } + + /// Truncate lines that are longer than the schema. + pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { + self.truncate_ragged_lines = toggle; + self + } } impl<'a> CsvReader<'a, File> { @@ -374,6 +381,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> { std::mem::take(&mut self.row_count), self.try_parse_dates, self.raise_if_empty, + self.truncate_ragged_lines, ) } @@ -558,6 +566,7 @@ where try_parse_dates: false, row_count: None, raise_if_empty: true, + truncate_ragged_lines: false, } } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index 20f6f96018fb..a659f31d6c3c 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -161,6 +161,7 @@ impl<'a> CoreReader<'a> { missing_is_null: self.missing_is_null, to_cast: self.to_cast, ignore_errors: self.ignore_errors, + truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, delimiter: self.delimiter, @@ -186,6 +187,7 @@ pub struct BatchedCsvReaderMmap<'a> { eol_char: u8, null_values: Option, missing_is_null: bool, + truncate_ragged_lines: bool, to_cast: Vec, ignore_errors: bool, n_rows: Option, @@ -244,6 +246,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, self.chunk_size, stop_at_nbytes, self.starting_point_offset, diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 2c8a74a23969..88249222dcb4 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -244,6 +244,7 @@ impl<'a> CoreReader<'a> { missing_is_null: self.missing_is_null, to_cast: self.to_cast, ignore_errors: self.ignore_errors, + truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, delimiter: self.delimiter, @@ -271,6 +272,7 @@ pub struct BatchedCsvReaderRead<'a> { missing_is_null: bool, to_cast: Vec, ignore_errors: bool, + truncate_ragged_lines: bool, n_rows: Option, encoding: CsvEncoding, delimiter: u8, @@ -341,6 +343,7 @@ impl<'a> BatchedCsvReaderRead<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, self.chunk_size, stop_at_n_bytes, self.starting_point_offset, diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index 62aa3578aabf..6702d4779184 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -115,6 +115,7 @@ pub(crate) struct CoreReader<'a> { predicate: Option>, to_cast: Vec, row_count: Option, 
+ truncate_ragged_lines: bool, } impl<'a> fmt::Debug for CoreReader<'a> { @@ -206,6 +207,7 @@ impl<'a> CoreReader<'a> { row_count: Option, try_parse_dates: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PolarsResult> { #[cfg(any(feature = "decompress", feature = "decompress-fast"))] let mut reader_bytes = reader_bytes; @@ -303,6 +305,7 @@ impl<'a> CoreReader<'a> { predicate, to_cast, row_count, + truncate_ragged_lines, }) } @@ -609,11 +612,12 @@ impl<'a> CoreReader<'a> { self.comment_char, self.quote_char, self.eol_char, - self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, + ignore_errors, + self.null_values.as_ref(), projection, &mut buffers, - ignore_errors, chunk_size, self.schema.len(), &self.schema, @@ -683,6 +687,7 @@ impl<'a> CoreReader<'a> { self.encoding, self.null_values.as_ref(), self.missing_is_null, + self.truncate_ragged_lines, usize::MAX, stop_at_nbytes, starting_point_offset, @@ -725,11 +730,12 @@ impl<'a> CoreReader<'a> { self.comment_char, self.quote_char, self.eol_char, - self.null_values.as_ref(), self.missing_is_null, + self.ignore_errors, + self.truncate_ragged_lines, + self.null_values.as_ref(), &projection, &mut buffers, - self.ignore_errors, remaining_rows - 1, self.schema.len(), self.schema.as_ref(), @@ -811,6 +817,7 @@ fn read_chunk( encoding: CsvEncoding, null_values: Option<&NullValuesCompiled>, missing_is_null: bool, + truncate_ragged_lines: bool, chunk_size: usize, stop_at_nbytes: usize, starting_point_offset: Option, @@ -842,11 +849,12 @@ fn read_chunk( comment_char, quote_char, eol_char, - null_values, missing_is_null, + ignore_errors, + truncate_ragged_lines, + null_values, projection, &mut buffers, - ignore_errors, chunk_size, schema.len(), schema, diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index eb432e860d98..31414d8cd7f3 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -110,7 +110,7 @@ list_eval = [] cumulative_eval = [] chunked_ids = ["polars-plan/chunked_ids", "polars-core/chunked_ids"] list_to_struct = ["polars-plan/list_to_struct"] -python = ["pyo3", "polars-plan/python", "polars-core/python"] +python = ["pyo3", "polars-plan/python", "polars-core/python", "polars-io/python"] row_hash = ["polars-plan/row_hash"] string_justify = ["polars-plan/string_justify"] string_from_radix = ["polars-plan/string_from_radix"] diff --git a/crates/polars-lazy/src/frame/csv.rs b/crates/polars-lazy/src/frame/csv.rs index 5067226ebe57..1e1e97240dbf 100644 --- a/crates/polars-lazy/src/frame/csv.rs +++ b/crates/polars-lazy/src/frame/csv.rs @@ -26,6 +26,7 @@ pub struct LazyCsvReader<'a> { eol_char: u8, null_values: Option, missing_is_null: bool, + truncate_ragged_lines: bool, infer_schema_length: Option, rechunk: bool, skip_rows_after_header: usize, @@ -61,6 +62,7 @@ impl<'a> LazyCsvReader<'a> { row_count: None, try_parse_dates: false, raise_if_empty: true, + truncate_ragged_lines: false, } } @@ -208,6 +210,13 @@ impl<'a> LazyCsvReader<'a> { self } + /// Truncate lines that are longer than the schema. + #[must_use] + pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { + self.truncate_ragged_lines = toggle; + self + } + /// Modify a schema before we run the lazy scanning. /// /// Important! Run this function latest in the builder! @@ -280,6 +289,7 @@ impl LazyFileListReader for LazyCsvReader<'_> { self.row_count, self.try_parse_dates, self.raise_if_empty, + self.truncate_ragged_lines, )? 
.build() .into(); diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs index bd4f8b20d631..80b2b2e3aa95 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -45,6 +45,7 @@ impl CsvExec { .with_rechunk(self.file_options.rechunk) .with_row_count(std::mem::take(&mut self.file_options.row_count)) .with_try_parse_dates(self.options.try_parse_dates) + .truncate_ragged_lines(self.options.truncate_ragged_lines) .raise_if_empty(self.options.raise_if_empty) .finish() } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 46b6362e6856..a9e9f5352d1d 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -80,6 +80,7 @@ impl CsvSource { .with_chunk_size(chunk_size) .with_row_count(file_options.row_count) .with_try_parse_dates(options.try_parse_dates) + .truncate_ragged_lines(options.truncate_ragged_lines) .raise_if_empty(options.raise_if_empty); let reader = Box::new(reader); diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index 902ef9b6b91d..276f2aeb76d9 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -252,6 +252,7 @@ impl LogicalPlanBuilder { row_count: Option, try_parse_dates: bool, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PolarsResult { let path = path.into(); let mut file = polars_utils::open_file(&path).map_err(|e| { @@ -346,6 +347,7 @@ impl LogicalPlanBuilder { encoding, try_parse_dates, raise_if_empty, + truncate_ragged_lines, }, }, } diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index 9aef73892951..2c9a44446fe9 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -35,6 +35,7 @@ pub struct CsvParserOptions { pub encoding: CsvEncoding, pub try_parse_dates: bool, pub raise_if_empty: bool, + pub truncate_ragged_lines: bool, } #[cfg(feature = "parquet")] diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 74a66e320640..4c48d71921c6 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -568,7 +568,7 @@ fn test_comment_lines() -> PolarsResult<()> { #[test] fn test_null_values_argument() -> PolarsResult<()> { let csv = r"1,a,foo -null-value,b,bar, +null-value,b,bar 3,null-value,ham "; @@ -826,7 +826,10 @@ fn test_scientific_floats() -> PolarsResult<()> { fn test_tsv_header_offset() -> PolarsResult<()> { let csv = "foo\tbar\n\t1000011\t1\n\t1000026\t2\n\t1000949\t2"; let file = Cursor::new(csv); - let df = CsvReader::new(file).with_delimiter(b'\t').finish()?; + let df = CsvReader::new(file) + .truncate_ragged_lines(true) + .with_delimiter(b'\t') + .finish()?; assert_eq!(df.shape(), (3, 2)); assert_eq!(df.dtypes(), &[DataType::Utf8, DataType::Int64]); @@ -925,7 +928,7 @@ foo,bar .finish()?; assert_eq!(df.get_column_names(), &["foo", "bar"]); assert_eq!(df.shape(), (1, 2)); - let df = CsvReader::new(file).finish()?; + let df = CsvReader::new(file).truncate_ragged_lines(true).finish()?; assert_eq!(df.shape(), (5, 1)); Ok(()) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index c7d630b207de..96c70447637e 100644 --- 
a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -679,6 +679,7 @@ def _read_csv( sample_size: int = 1024, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> DataFrame: """ Read a CSV file into a DataFrame. @@ -751,6 +752,7 @@ def _read_csv( row_count_offset=row_count_offset, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) if columns is None: return scan.collect() @@ -792,6 +794,7 @@ def _read_csv( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) return self diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 27d55afb55e4..9f848981ec15 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -53,6 +53,7 @@ def __init__( eol_char: str = "\n", new_columns: Sequence[str] | None = None, raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ): path: str | None if isinstance(source, (str, Path)): @@ -100,6 +101,7 @@ def __init__( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) self.new_columns = new_columns diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 42039f416e8c..548a90d89a56 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -47,6 +47,7 @@ def read_csv( sample_size: int = 1024, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> DataFrame: """ Read a CSV file into a DataFrame. @@ -157,6 +158,8 @@ def read_csv( raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. + truncate_ragged_lines + Truncate lines that are longer than the schema. Returns ------- @@ -379,6 +382,7 @@ def read_csv( sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) if new_columns: @@ -704,6 +708,7 @@ def scan_csv( eol_char: str = "\n", new_columns: Sequence[str] | None = None, raise_if_empty: bool = True, + truncate_ragged_lines: bool = False, ) -> LazyFrame: """ Lazily read from a CSV file or multiple files via glob patterns. @@ -788,6 +793,8 @@ def scan_csv( raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, an empty LazyFrame (with no columns) is returned instead. + truncate_ragged_lines + Truncate lines that are longer than the schema. Returns ------- @@ -901,4 +908,5 @@ def with_column_names(_cols: list[str]) -> list[str]: try_parse_dates=try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 75703adbd228..25622fb163a2 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -343,6 +343,7 @@ def _scan_csv( try_parse_dates: bool = False, eol_char: str = "\n", raise_if_empty: bool = True, + truncate_ragged_lines: bool = True, ) -> Self: """ Lazily read from a CSV file or multiple files via glob patterns. 
@@ -385,6 +386,7 @@ def _scan_csv( try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, ) return self diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs index 7161ace32551..6114fb43a675 100644 --- a/py-polars/src/batched_csv.rs +++ b/py-polars/src/batched_csv.rs @@ -31,7 +31,7 @@ impl PyBatchedCsv { projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_count, - sample_size, eol_char, raise_if_empty) + sample_size, eol_char, raise_if_empty, truncate_ragged_lines) )] fn new( infer_schema_length: Option, @@ -60,6 +60,7 @@ impl PyBatchedCsv { sample_size: usize, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -118,6 +119,7 @@ impl PyBatchedCsv { .with_skip_rows_after_header(skip_rows_after_header) .with_row_count(row_count) .sample_size(sample_size) + .truncate_ragged_lines(truncate_ragged_lines) .raise_if_empty(raise_if_empty); let reader = if low_memory { diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 355b24d5bec8..3ce6754aefab 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -139,7 +139,7 @@ impl PyDataFrame { skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_count, sample_size, eol_char, raise_if_empty) + row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines) )] pub fn read_csv( py_f: &PyAny, @@ -169,6 +169,7 @@ impl PyDataFrame { sample_size: usize, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -229,6 +230,7 @@ impl PyDataFrame { .with_row_count(row_count) .sample_size(sample_size) .raise_if_empty(raise_if_empty) + .truncate_ragged_lines(truncate_ragged_lines) .finish() .map_err(PyPolarsErr::from)?; Ok(df.into()) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index ae7df9579ff0..be3c62ce4d97 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -147,7 +147,7 @@ impl PyLazyFrame { #[pyo3(signature = (path, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, - encoding, row_count, try_parse_dates, eol_char, raise_if_empty, + encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines ) )] fn new_from_csv( @@ -173,6 +173,7 @@ impl PyLazyFrame { try_parse_dates: bool, eol_char: &str, raise_if_empty: bool, + truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -207,6 +208,7 @@ impl PyLazyFrame { .with_try_parse_dates(try_parse_dates) .with_null_values(null_values) .with_missing_is_null(!missing_utf8_is_empty_string) + .truncate_ragged_lines(truncate_ragged_lines) .raise_if_empty(raise_if_empty); if let Some(lambda) = with_schema_modify { diff 
--git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index ca6574a31061..dd973193ea26 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -445,7 +445,7 @@ def test_compressed_csv(io_files_path: Path) -> None: """\ a,b,c 1,a,1.0 - 2,b,2.0, + 2,b,2.0 3,c,3.0 """ ) @@ -462,7 +462,7 @@ def test_compressed_csv(io_files_path: Path) -> None: # now from disk csv_file = io_files_path / "gzipped.csv" - out = pl.read_csv(str(csv_file)) + out = pl.read_csv(str(csv_file), truncate_ragged_lines=True) assert_frame_equal(out, expected) # now with column projection @@ -1472,3 +1472,25 @@ def test_ignore_errors_date_parser() -> None: dtypes={"date": pl.Date}, ignore_errors=False, ) + + +def test_csv_ragged_lines() -> None: + expected = {"column_1": ["A", "B", "C"]} + assert ( + pl.read_csv( + io.StringIO("A\nB,ragged\nC"), has_header=False, truncate_ragged_lines=True + ).to_dict(False) + == expected + ) + assert ( + pl.read_csv( + io.StringIO("A\nB\nC,ragged"), has_header=False, truncate_ragged_lines=True + ).to_dict(False) + == expected + ) + + for s in ["A\nB,ragged\nC", "A\nB\nC,ragged"]: + with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): + pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) + with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): + pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) From 767ebe8e48f7cf4fa085d2e45d8569ec3d311b7e Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 22 Aug 2023 14:03:02 +0400 Subject: [PATCH 37/55] feat(python): support `DataFrame` init from queries against user-instantiated database connections (#10649) --- py-polars/docs/source/reference/io.rst | 1 + py-polars/polars/__init__.py | 2 + py-polars/polars/convert.py | 12 +- py-polars/polars/io/__init__.py | 3 +- py-polars/polars/io/database.py | 300 +++++++++++++++-- py-polars/polars/type_aliases.py | 30 ++ py-polars/polars/utils/_construction.py | 8 +- py-polars/pyproject.toml | 2 +- py-polars/tests/unit/io/test_database.py | 241 -------------- py-polars/tests/unit/io/test_database_read.py | 307 ++++++++++++++++++ .../tests/unit/io/test_database_write.py | 103 ++++++ 11 files changed, 739 insertions(+), 270 deletions(-) delete mode 100644 py-polars/tests/unit/io/test_database.py create mode 100644 py-polars/tests/unit/io/test_database_read.py create mode 100644 py-polars/tests/unit/io/test_database_write.py diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index d83afcffd10e..243f0fe075f0 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -43,6 +43,7 @@ Database :toctree: api/ read_database + read_database_uri DataFrame.write_database JSON diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 01f03a68d68e..159394cec4d0 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -155,6 +155,7 @@ read_csv, read_csv_batched, read_database, + read_database_uri, read_delta, read_excel, read_ipc, @@ -248,6 +249,7 @@ "read_csv", "read_csv_batched", "read_database", + "read_database_uri", "read_delta", "read_excel", "read_ipc", diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 16ca69601012..bd8f8e86672a 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -2,7 +2,7 @@ import io import re -from itertools import zip_longest +from itertools import chain, 
zip_longest from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence, overload import polars._reexport as pl @@ -516,7 +516,7 @@ def from_arrow( | pa.Array | pa.ChunkedArray | pa.RecordBatch - | Iterable[pa.RecordBatch] + | Iterable[pa.RecordBatch | pa.Table] ), schema: SchemaDefinition | None = None, *, @@ -532,7 +532,7 @@ def from_arrow( Parameters ---------- data : :class:`pyarrow.Table`, :class:`pyarrow.Array`, one or more :class:`pyarrow.RecordBatch` - Data representing an Arrow Table, Array, or sequence of RecordBatches. + Data representing an Arrow Table, Array, or sequence of RecordBatches or Tables. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict The DataFrame schema may be declared in several ways: @@ -609,7 +609,11 @@ def from_arrow( data = [data] if isinstance(data, Iterable): return pl.DataFrame._from_arrow( - data=pa.Table.from_batches(data), + data=pa.Table.from_batches( + chain.from_iterable( + (b.to_batches() if isinstance(b, pa.Table) else [b]) for b in data + ) + ), rechunk=rechunk, schema=schema, schema_overrides=schema_overrides, diff --git a/py-polars/polars/io/__init__.py b/py-polars/polars/io/__init__.py index 7243007e82fc..995bc4552c55 100644 --- a/py-polars/polars/io/__init__.py +++ b/py-polars/polars/io/__init__.py @@ -2,7 +2,7 @@ from polars.io.avro import read_avro from polars.io.csv import read_csv, read_csv_batched, scan_csv -from polars.io.database import read_database +from polars.io.database import read_database, read_database_uri from polars.io.delta import read_delta, scan_delta from polars.io.excel import read_excel from polars.io.ipc import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc @@ -16,6 +16,7 @@ "read_csv", "read_csv_batched", "read_database", + "read_database_uri", "read_delta", "read_excel", "read_ipc", diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index e6cd357c56f5..49abbb505f28 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -3,20 +3,276 @@ import re import sys from importlib import import_module -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Iterable, Sequence, TypedDict from polars.convert import from_arrow -from polars.utils.deprecation import deprecate_renamed_parameter +from polars.utils.deprecation import ( + deprecate_renamed_parameter, + issue_deprecation_warning, +) if TYPE_CHECKING: + from types import TracebackType + + if sys.version_info >= (3, 11): + from typing import Self + else: + from typing_extensions import Self + from polars import DataFrame - from polars.type_aliases import DbReadEngine + from polars.dependencies import pyarrow as pa + from polars.type_aliases import ConnectionOrCursor, Cursor, DbReadEngine + + +class _DriverProperties_(TypedDict): + fetch_all: str + fetch_batches: str | None + exact_batch_size: bool | None + + +_ARROW_DRIVER_REGISTRY_: dict[str, _DriverProperties_] = { + "adbc_.*": { + "fetch_all": "fetch_arrow_table", + "fetch_batches": None, + "exact_batch_size": None, + }, + "databricks": { + "fetch_all": "fetchall_arrow", + "fetch_batches": "fetchmany_arrow", + "exact_batch_size": True, + }, + "snowflake": { + "fetch_all": "fetch_arrow_all", + "fetch_batches": "fetch_arrow_batches", + "exact_batch_size": False, + }, + "turbodbc": { + "fetch_all": "fetchallarrow", + "fetch_batches": "fetcharrowbatches", + "exact_batch_size": False, + }, +} + + +class ConnectionExecutor: + """Abstraction for querying databases with user-supplied connection objects.""" + + 
acquired_cursor = False + + def __init__(self, connection: ConnectionOrCursor) -> None: + self.driver = type(connection).__module__.split(".", 1)[0].lower() + self.cursor = self._normalise_cursor(connection) + self.result: Any = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + # iif we created it, close the cursor (NOT the connection) + if self.acquired_cursor: + self.cursor.close() + + def __repr__(self) -> str: + return f"<{type(self).__name__} module={self.driver!r}>" + + def _normalise_cursor(self, conn: ConnectionOrCursor) -> Cursor: + """Normalise a connection object such that we have the query executor.""" + if self.driver == "sqlalchemy" and type(conn).__name__ == "Engine": + # sqlalchemy engine; direct use is deprecated, so get the connection + self.acquired_cursor = True + return conn.connect() # type: ignore[union-attr] + elif hasattr(conn, "cursor"): + # connection has a dedicated cursor; prefer over direct execute + cursor = cursor() if callable(cursor := conn.cursor) else cursor + self.acquired_cursor = True + return cursor + elif hasattr(conn, "execute"): + # can execute directly (given cursor, sqlalchemy connection, etc) + return conn # type: ignore[return-value] + + raise TypeError( + f"Unrecognised connection {conn!r}; unable to find 'execute' method" + ) + + @staticmethod + def _fetch_arrow( + result: Cursor, fetch_method: str, batch_size: int | None + ) -> Iterable[pa.RecordBatch | pa.Table]: + """Iterate over the result set, fetching arrow data in batches.""" + size = (batch_size,) if batch_size else () + while result: # type: ignore[truthy-bool] + result = getattr(result, fetch_method)(*size) + yield result + + @staticmethod + def _fetchall_rows(result: Cursor) -> Iterable[Sequence[Any]]: + """Fetch row data in a single call, returning the complete result set.""" + rows = result.fetchall() + return ( + [tuple(row) for row in rows] + if rows and not isinstance(rows[0], (list, tuple)) + else rows + ) + + def _fetchmany_rows( + self, result: Cursor, batch_size: int | None + ) -> Iterable[Sequence[Any]]: + """Fetch row data incrementally, yielding over the complete result set.""" + while True: + rows = result.fetchmany(batch_size) + if not rows: + break + elif not isinstance(rows[0], (list, tuple)): + for row in rows: + yield tuple(row) + else: + yield from rows + + def _from_arrow(self, batch_size: int | None) -> DataFrame | None: + """Return resultset data in Arrow format for frame init.""" + from polars import DataFrame + + for driver, driver_properties in _ARROW_DRIVER_REGISTRY_.items(): + if re.match(f"^{driver}$", self.driver): + size = batch_size if driver_properties["exact_batch_size"] else None + fetch_batches = driver_properties["fetch_batches"] + return DataFrame( + self._fetch_arrow(self.result, fetch_batches, size) + if batch_size and fetch_batches is not None + else getattr(self.result, driver_properties["fetch_all"])() + ) + + if self.driver == "duckdb": + exec_kwargs = {"rows_per_batch": batch_size} if batch_size else {} + return DataFrame(self.result.arrow(**exec_kwargs)) + + return None + + def _from_rows(self, batch_size: int | None) -> DataFrame | None: + """Return resultset data row-wise for frame init.""" + from polars import DataFrame + + if hasattr(self.result, "fetchall"): + description = ( + self.result.cursor.description + if self.driver == "sqlalchemy" + else self.result.description + ) + 
column_names = [desc[0] for desc in description] + return DataFrame( + data=( + self._fetchall_rows(self.result) + if not batch_size + else self._fetchmany_rows(self.result, batch_size) + ), + schema=column_names, + orient="row", + ) + return None + + def execute(self, query: str) -> Self: + """Execute a query and reference the result set data.""" + if self.driver == "sqlalchemy": + from sqlalchemy.sql import text + + query = text(query) # type: ignore[assignment] + + if (result := self.cursor.execute(query)) is None: + result = self.cursor # some cursors execute in-place + + self.result = result + return self + + def to_frame(self, batch_size: int | None = None) -> DataFrame: + """ + Convert the result set to a DataFrame. + + Wherever possible we try to return arrow-native data directly; only + fall back to initialising with row-level data if no other option. + """ + if self.result is None: + raise RuntimeError("Cannot return a frame before executing a query") + + for frame_init in ( + self._from_arrow, # init from arrow-native data (most efficient option) + self._from_rows, # row-wise fallback covering sqlalchemy, dbapi2, pyodbc + ): + frame = frame_init(batch_size) + if frame is not None: + return frame + + raise NotImplementedError( + f"Currently no support for {self.driver!r} connection {self.cursor!r}" + ) @deprecate_renamed_parameter("connection_uri", "connection", version="0.18.9") -def read_database( +def read_database( # noqa: D417 + query: str, + connection: ConnectionOrCursor, + batch_size: int | None = None, + **kwargs: Any, +) -> DataFrame: + """ + Read the results of a SQL query into a DataFrame, given a connection object. + + Parameters + ---------- + query + String SQL query to execute. + connection + An instantiated connection (or cursor/client object) that the query can be + executed against. + batch_size + The number of rows to fetch each time as data is collected; if this option is + supported by the backend it will be passed to the underlying query execution + method (if the backend does not have such support it is ignored without error). + + Notes + ----- + This function supports a wide range of native database drivers (ranging from SQLite + to Snowflake), as well as libraries such as ADBC, SQLAlchemy and various flavours + of ODBC. If the backend supports returning Arrow data directly then this facility + will be used to efficiently instantiate the DataFrame; otherwise, the DataFrame + is initialised from row-wise data. + + Examples + -------- + Instantiate a DataFrame from a SQL query against a user-supplied connection: + + >>> df = pl.read_database( + ... query="SELECT * FROM test_data", + ... connection=conn, + ... ) # doctest: +SKIP + + See Also + -------- + read_database_uri : Create a DataFrame from a SQL query using a URI string. 
+ + """ + if isinstance(connection, str): + issue_deprecation_warning( + message="Use of a string URI with 'read_database' is deprecated; use 'read_database_uri' instead", + version="0.19.0", + ) + return read_database_uri(query, uri=connection, **kwargs) + elif kwargs: + raise ValueError( + f"'read_database' does not support arbitrary **kwargs: found {kwargs!r}" + ) + + with ConnectionExecutor(connection) as cx: + return cx.execute(query).to_frame(batch_size) + + +def read_database_uri( query: list[str] | str, - connection: str, + uri: str, *, partition_on: str | None = None, partition_range: tuple[int, int] | None = None, @@ -25,13 +281,13 @@ def read_database( engine: DbReadEngine | None = None, ) -> DataFrame: """ - Read the results of a SQL query into a DataFrame. + Read the results of a SQL query into a DataFrame, given a URI. Parameters ---------- query Raw SQL query (or queries). - connection + uri A connectorx or ADBC connection URI string that starts with the backend's driver name, for example: @@ -73,18 +329,18 @@ def read_database( Examples -------- - Read a DataFrame from a SQL query using a single thread: + Create a DataFrame from a SQL query using a single thread: >>> uri = "postgresql://username:password@server:port/database" >>> query = "SELECT * FROM lineitem" - >>> pl.read_database(query, uri) # doctest: +SKIP + >>> pl.read_database_uri(query, uri) # doctest: +SKIP - Read a DataFrame in parallel using 10 threads by automatically partitioning the - provided SQL on the partition column: + Create a DataFrame in parallel using 10 threads by automatically partitioning + the provided SQL on the partition column: >>> uri = "postgresql://username:password@server:port/database" >>> query = "SELECT * FROM lineitem" - >>> pl.read_database( + >>> pl.read_database_uri( ... query, ... uri, ... partition_on="partition_col", @@ -92,28 +348,32 @@ def read_database( ... engine="connectorx", ... ) # doctest: +SKIP - Read a DataFrame in parallel using 2 threads by explicitly providing two SQL - queries: + Create a DataFrame in parallel using 2 threads by explicitly providing two + SQL queries: >>> uri = "postgresql://username:password@server:port/database" >>> queries = [ ... "SELECT * FROM lineitem WHERE partition_col <= 10", ... "SELECT * FROM lineitem WHERE partition_col > 10", ... ] - >>> pl.read_database(queries, uri, engine="connectorx") # doctest: +SKIP + >>> pl.read_database_uri(queries, uri, engine="connectorx") # doctest: +SKIP Read data from Snowflake using the ADBC driver: - >>> df = pl.read_database( + >>> df = pl.read_database_uri( ... "SELECT * FROM test_table", ... "snowflake://user:pass@company-org/testdb/public?warehouse=test&role=myrole", ... engine="adbc", ... ) # doctest: +SKIP + See Also + -------- + read_database : Create a DataFrame from a SQL query using a connection object. 
+ """ # noqa: W505 - if not isinstance(connection, str): + if not isinstance(uri, str): raise TypeError( - f"expected connection to be a URI string; found {type(connection).__name__!r}" + f"expected connection to be a URI string; found {type(uri).__name__!r}" ) elif engine is None: engine = "connectorx" @@ -121,7 +381,7 @@ def read_database( if engine == "connectorx": return _read_sql_connectorx( query, - connection, + connection_uri=uri, partition_on=partition_on, partition_range=partition_range, partition_num=partition_num, @@ -130,7 +390,7 @@ def read_database( elif engine == "adbc": if not isinstance(query, str): raise ValueError("only a single SQL query string is accepted for adbc") - return _read_sql_adbc(query, connection) + return _read_sql_adbc(query, uri) else: raise ValueError( f"engine must be one of {{'connectorx', 'adbc'}}, got {engine!r}" diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 14597c0c6bb7..e87d7ade9b12 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -10,6 +10,7 @@ List, Literal, Mapping, + Protocol, Sequence, Tuple, Type, @@ -193,3 +194,32 @@ # typevars for core polars types PolarsType = TypeVar("PolarsType", "DataFrame", "LazyFrame", "Series", "Expr") FrameType = TypeVar("FrameType", "DataFrame", "LazyFrame") + + +# minimal protocol definitions that can reasonably represent +# an executable connection, cursor, or equivalent object +class BasicConnection(Protocol): # noqa: D101 + def close(self) -> None: + """Close the connection.""" + + def cursor(self, *args: Any, **kwargs: Any) -> Any: + """Return a cursor object.""" + + +class BasicCursor(Protocol): # noqa: D101 + def close(self) -> None: + """Close the cursor.""" + + def execute(self, *args: Any, **kwargs: Any) -> Any: + """Execute a query.""" + + +class Cursor(BasicCursor): # noqa: D101 + def fetchall(self, *args: Any, **kwargs: Any) -> Any: + """Fetch all results.""" + + def fetchmany(self, *args: Any, **kwargs: Any) -> Any: + """Fetch results in batches.""" + + +ConnectionOrCursor = Union[BasicConnection, BasicCursor, Cursor] diff --git a/py-polars/polars/utils/_construction.py b/py-polars/polars/utils/_construction.py index 9f847b6aa0d8..0fe0ce966e99 100644 --- a/py-polars/polars/utils/_construction.py +++ b/py-polars/polars/utils/_construction.py @@ -103,8 +103,9 @@ def type_hints(obj: type) -> dict[str, Any]: def is_namedtuple(cls: Any, annotated: bool = False) -> bool: """Check whether given class derives from NamedTuple.""" if all(hasattr(cls, attr) for attr in ("_fields", "_field_defaults", "_replace")): - if len(cls.__annotations__) == len(cls._fields) if annotated else True: - return all(isinstance(fld, str) for fld in cls._fields) + if not isinstance(cls._fields, property): + if not annotated or len(cls.__annotations__) == len(cls._fields): + return all(isinstance(fld, str) for fld in cls._fields) return False @@ -1491,7 +1492,8 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr if not original_schema: original_schema = list(df.schema.items()) if chunk_size != adaptive_chunk_size: - chunk_size = adaptive_chunk_size = n_chunk_elems // len(df.columns) + if (n_columns := len(df.columns)) > 0: + chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns else: df.vstack(frame_chunk, in_place=True) n_chunks += 1 diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index f30c9c05f3b8..350b82bf15f4 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -81,7 +81,7 
@@ module = [ "polars.polars", "pyarrow.*", "pydantic", - "sqlalchemy", + "sqlalchemy.*", "xlsx2csv", "xlsxwriter.*", "zoneinfo", diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py deleted file mode 100644 index 7e874032f2f5..000000000000 --- a/py-polars/tests/unit/io/test_database.py +++ /dev/null @@ -1,241 +0,0 @@ -from __future__ import annotations - -import sqlite3 -import sys -from datetime import date -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -if TYPE_CHECKING: - from polars.type_aliases import DbReadEngine, DbWriteEngine, DbWriteMode - - -@pytest.fixture() -def sample_df() -> pl.DataFrame: - return pl.DataFrame( - { - "id": [1, 2], - "name": ["misc", "other"], - "value": [100.0, -99.0], - "date": ["2020-01-01", "2021-12-31"], - } - ) - - -def create_temp_sqlite_db(test_db: str) -> None: - Path(test_db).unlink(missing_ok=True) - - # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or - # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that - # causes float rounding < py 3.11, hence we are only testing/storing simple values - # in this test db for now. as support improves, we can add/test additional dtypes). - - conn = sqlite3.connect(test_db) - # ┌─────┬───────┬───────┬────────────┐ - # │ id ┆ name ┆ value ┆ date │ - # │ --- ┆ --- ┆ --- ┆ --- │ - # │ i64 ┆ str ┆ f64 ┆ date │ - # ╞═════╪═══════╪═══════╪════════════╡ - # │ 1 ┆ misc ┆ 100.0 ┆ 2020-01-01 │ - # │ 2 ┆ other ┆ -99.0 ┆ 2021-12-31 │ - # └─────┴───────┴───────┴────────────┘ - conn.executescript( - """ - CREATE TABLE test_data ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - value FLOAT, - date DATE - ); - INSERT INTO test_data(name,value,date) - VALUES ('misc',100.0,'2020-01-01'), ('other',-99.5,'2021-12-31'); - """ - ) - conn.close() - - -@pytest.mark.write_disk() -@pytest.mark.parametrize( - ("engine", "expected_dtypes", "expected_dates"), - [ - pytest.param( - "connectorx", - { - "id": pl.Int64, - "name": pl.Utf8, - "value": pl.Float64, - "date": pl.Date, - }, - [date(2020, 1, 1), date(2021, 12, 31)], - ), - pytest.param( - "adbc", - { - "id": pl.Int64, - "name": pl.Utf8, - "value": pl.Float64, - "date": pl.Utf8, - }, - ["2020-01-01", "2021-12-31"], - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - ], -) -def test_read_database( - engine: DbReadEngine, - expected_dtypes: dict[str, pl.DataType], - expected_dates: list[date | str], - tmp_path: Path, -) -> None: - tmp_path.mkdir(exist_ok=True) - - test_db = str(tmp_path / "test.db") - create_temp_sqlite_db(test_db) - - df = pl.read_database( - connection=f"sqlite:///{test_db}", - query="SELECT * FROM test_data", - engine=engine, - ) - assert df.schema == expected_dtypes - assert df.shape == (2, 4) - assert df["date"].to_list() == expected_dates - - -@pytest.mark.parametrize( - ("engine", "query", "database", "errclass", "err"), - [ - pytest.param( - "not_engine", - "SELECT * FROM test_data", - "sqlite", - ValueError, - "engine must be one of {'connectorx', 'adbc'}, got 'not_engine'", - id="Not an available sql engine", - ), - pytest.param( - "adbc", - ["SELECT * FROM test_data", "SELECT * FROM test_data"], - "sqlite", - ValueError, - "only a single SQL query string is accepted for adbc", - id="Unavailable list of queries for adbc", - ), - 
pytest.param( - "adbc", - "SELECT * FROM test_data", - "mysql", - ImportError, - "ADBC mysql driver not detected", - id="Unavailable adbc driver", - ), - pytest.param( - "adbc", - "SELECT * FROM test_data", - sqlite3.connect(":memory:"), - TypeError, - "expected connection to be a URI string", - id="Invalid connection URI", - ), - ], -) -def test_read_database_exceptions( - engine: DbReadEngine, - query: str, - database: Any, - errclass: type, - err: str, - tmp_path: Path, -) -> None: - conn = f"{database}://test" if isinstance(database, str) else database - with pytest.raises(errclass, match=err): - pl.read_database( - connection=conn, - query=query, - engine=engine, - ) - - -@pytest.mark.write_disk() -@pytest.mark.parametrize( - ("engine", "mode"), - [ - pytest.param( - "adbc", - "create", - id="adbc_create", - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - pytest.param( - "adbc", - "append", - id="adbc_append", - marks=pytest.mark.skipif( - sys.version_info < (3, 9) or sys.platform == "win32", - reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", - ), - ), - pytest.param( - "sqlalchemy", - "create", - id="sa_create", - ), - pytest.param( - "sqlalchemy", - "append", - id="sa_append", - ), - ], -) -def test_write_database( - engine: DbWriteEngine, mode: DbWriteMode, sample_df: pl.DataFrame, tmp_path: Path -) -> None: - tmp_path.mkdir(exist_ok=True) - tmp_db = f"test_{engine}.db" - test_db = str(tmp_path / tmp_db) - - # note: test a table name that requires quotes to ensure that we handle - # it correctly (also supply an explicit db schema with/without quotes) - tbl_name = '"test-data"' - - sample_df.write_database( - table_name=f"main.{tbl_name}", - connection=f"sqlite:///{test_db}", - if_exists="replace", - engine=engine, - ) - if mode == "append": - sample_df.write_database( - table_name=f'"main".{tbl_name}', - connection=f"sqlite:///{test_db}", - if_exists="append", - engine=engine, - ) - sample_df = pl.concat([sample_df, sample_df]) - - result = pl.read_database(f"SELECT * FROM {tbl_name}", f"sqlite:///{test_db}") - sample_df = sample_df.with_columns(pl.col("date").cast(pl.Utf8)) - assert_frame_equal(sample_df, result) - - # check that some invalid parameters raise errors - for invalid_params in ( - {"table_name": "w.x.y.z"}, - {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, - ): - with pytest.raises((ValueError, NotImplementedError)): - sample_df.write_database( - connection=f"sqlite:///{test_db}", - engine=engine, - **invalid_params, # type: ignore[arg-type] - ) diff --git a/py-polars/tests/unit/io/test_database_read.py b/py-polars/tests/unit/io/test_database_read.py new file mode 100644 index 000000000000..c3f89f53762d --- /dev/null +++ b/py-polars/tests/unit/io/test_database_read.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +import os +import sqlite3 +import sys +from contextlib import suppress +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import pytest +from sqlalchemy import create_engine + +import polars as pl + +if TYPE_CHECKING: + from polars.type_aliases import DbReadEngine + + +def adbc_sqlite_connect(*args: Any, **kwargs: Any) -> Any: + with suppress(ModuleNotFoundError): # not available on 3.8/windows + from adbc_driver_sqlite.dbapi import connect + + return connect(*args, **kwargs) + + +@pytest.fixture() +def sample_df() -> pl.DataFrame: + return 
pl.DataFrame( + { + "id": [1, 2], + "name": ["misc", "other"], + "value": [100.0, -99.0], + "date": ["2020-01-01", "2021-12-31"], + } + ) + + +def create_temp_sqlite_db(test_db: str) -> None: + Path(test_db).unlink(missing_ok=True) + + # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or + # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that + # causes float rounding < py 3.11, hence we are only testing/storing simple values + # in this test db for now. as support improves, we can add/test additional dtypes). + + conn = sqlite3.connect(test_db) + # ┌─────┬───────┬───────┬────────────┐ + # │ id ┆ name ┆ value ┆ date │ + # │ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ str ┆ f64 ┆ date │ + # ╞═════╪═══════╪═══════╪════════════╡ + # │ 1 ┆ misc ┆ 100.0 ┆ 2020-01-01 │ + # │ 2 ┆ other ┆ -99.0 ┆ 2021-12-31 │ + # └─────┴───────┴───────┴────────────┘ + conn.executescript( + """ + CREATE TABLE test_data ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + value FLOAT, + date DATE + ); + INSERT INTO test_data(name,value,date) + VALUES ('misc',100.0,'2020-01-01'), ('other',-99.5,'2021-12-31'); + """ + ) + conn.close() + + +@pytest.mark.write_disk() +@pytest.mark.parametrize( + ("read_method", "engine_or_connection_init", "expected_dtypes", "expected_dates"), + [ + pytest.param( + "read_database_uri", + "connectorx", + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="uri: connectorx", + ), + pytest.param( + "read_database_uri", + "adbc", + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Utf8, + }, + ["2020-01-01", "2021-12-31"], + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + id="uri: adbc", + ), + pytest.param( + "read_database", + lambda path: sqlite3.connect(path, detect_types=True), + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="conn: sqlite3", + ), + pytest.param( + "read_database", + lambda path: create_engine( + f"sqlite:///{path}", + connect_args={"detect_types": sqlite3.PARSE_DECLTYPES}, + ).connect(), + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Date, + }, + [date(2020, 1, 1), date(2021, 12, 31)], + id="conn: sqlalchemy", + ), + pytest.param( + "read_database", + adbc_sqlite_connect, + { + "id": pl.Int64, + "name": pl.Utf8, + "value": pl.Float64, + "date": pl.Utf8, + }, + ["2020-01-01", "2021-12-31"], + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + id="conn: adbc", + ), + ], +) +def test_read_database( + read_method: str, + engine_or_connection_init: Any, + expected_dtypes: dict[str, pl.DataType], + expected_dates: list[date | str], + tmp_path: Path, +) -> None: + tmp_path.mkdir(exist_ok=True) + test_db = str(tmp_path / "test.db") + create_temp_sqlite_db(test_db) + + if read_method == "read_database_uri": + # instantiate the connection ourselves, using connectorx/adbc + df = pl.read_database_uri( + uri=f"sqlite:///{test_db}", + query="SELECT * FROM test_data", + engine=str(engine_or_connection_init), # type: ignore[arg-type] + ) + elif "adbc" in os.environ["PYTEST_CURRENT_TEST"]: + # externally instantiated adbc connections + with engine_or_connection_init(test_db) as conn, 
conn.cursor(): + df = pl.read_database(connection=conn, query="SELECT * FROM test_data") + else: + # other user-supplied connections + df = pl.read_database( + connection=engine_or_connection_init(test_db), + query="SELECT * FROM test_data", + ) + + assert df.schema == expected_dtypes + assert df.shape == (2, 4) + assert df["date"].to_list() == expected_dates + + +def test_read_database_mocked() -> None: + class MockConnection: + def __init__(self, driver: str) -> None: + self.__class__.__module__ = driver + self._cursor = MockCursor() + + def close(self) -> None: + pass + + def cursor(self) -> Any: + return self._cursor + + class MockCursor: + def __init__(self) -> None: + self.called: list[str] = [] + + def __getattr__(self, item: str) -> Any: + if "fetch" in item: + self.called.append(item) + return lambda *args, **kwargs: [] + super().__getattr__(item) # type: ignore[misc] + + def close(self) -> Any: + pass + + def execute(self, query: str) -> Any: + return self + + # since we don't have access to snowflake/databricks/etc from CI we + # mock them so we can check that we're calling the right methods + for driver, batch_size, expected_call in ( + ("snowflake", None, "fetch_arrow_all"), + ("snowflake", 10_000, "fetch_arrow_batches"), + ("databricks", None, "fetchall_arrow"), + ("databricks", 25_000, "fetchmany_arrow"), + ("turbodbc", None, "fetchallarrow"), + ("turbodbc", 50_000, "fetcharrowbatches"), + ("adbc_driver_postgresql", None, "fetch_arrow_table"), + ("adbc_driver_postgresql", 75_000, "fetch_arrow_table"), + ): + mc = MockConnection(driver) + pl.read_database( + connection=mc, + query="SELECT * FROM test_data", + batch_size=batch_size, + ) + assert expected_call in mc.cursor().called + + +@pytest.mark.parametrize( + ("read_method", "engine", "query", "database", "errclass", "err"), + [ + pytest.param( + "read_database_uri", + "not_an_engine", + "SELECT * FROM test_data", + "sqlite", + ValueError, + "engine must be one of {'connectorx', 'adbc'}, got 'not_an_engine'", + id="Not an available sql engine", + ), + pytest.param( + "read_database_uri", + "adbc", + ["SELECT * FROM test_data", "SELECT * FROM test_data"], + "sqlite", + ValueError, + "only a single SQL query string is accepted for adbc", + id="Unavailable list of queries for adbc", + ), + pytest.param( + "read_database_uri", + "adbc", + "SELECT * FROM test_data", + "mysql", + ImportError, + "ADBC mysql driver not detected", + id="Unavailable adbc driver", + ), + pytest.param( + "read_database_uri", + "adbc", + "SELECT * FROM test_data", + sqlite3.connect(":memory:"), + TypeError, + "expected connection to be a URI string", + id="Invalid connection URI", + ), + pytest.param( + "read_database", + None, + "SELECT * FROM imaginary_table", + sqlite3.connect(":memory:"), + sqlite3.OperationalError, + "no such table: imaginary_table", + id="Invalid read DB kwargs", + ), + pytest.param( + "read_database", + None, + "SELECT * FROM imaginary_table", + sys.getsizeof, # not a connection + TypeError, + "Unrecognised connection .* unable to find 'execute' method", + id="Invalid read DB kwargs", + ), + ], +) +def test_read_database_exceptions( + read_method: str, + engine: DbReadEngine | None, + query: str, + database: Any, + errclass: type, + err: str, + tmp_path: Path, +) -> None: + if read_method == "read_database_uri": + conn = f"{database}://test" if isinstance(database, str) else database + params = {"uri": conn, "query": query, "engine": engine} + else: + params = {"connection": database, "query": query} + + read_database = 
getattr(pl, read_method) + with pytest.raises(errclass, match=err): + read_database(**params) diff --git a/py-polars/tests/unit/io/test_database_write.py b/py-polars/tests/unit/io/test_database_write.py new file mode 100644 index 000000000000..9c7efbd9ce23 --- /dev/null +++ b/py-polars/tests/unit/io/test_database_write.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + +if TYPE_CHECKING: + from pathlib import Path + + from polars.type_aliases import DbWriteEngine, DbWriteMode + + +@pytest.fixture() +def sample_df() -> pl.DataFrame: + return pl.DataFrame( + { + "id": [1, 2], + "name": ["misc", "other"], + "value": [100.0, -99.0], + "date": ["2020-01-01", "2021-12-31"], + } + ) + + +@pytest.mark.write_disk() +@pytest.mark.parametrize( + ("engine", "mode"), + [ + pytest.param( + "adbc", + "create", + id="adbc_create", + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + ), + pytest.param( + "adbc", + "append", + id="adbc_append", + marks=pytest.mark.skipif( + sys.version_info < (3, 9) or sys.platform == "win32", + reason="adbc_driver_sqlite not available below Python 3.9 / on Windows", + ), + ), + pytest.param( + "sqlalchemy", + "create", + id="sa_create", + ), + pytest.param( + "sqlalchemy", + "append", + id="sa_append", + ), + ], +) +def test_write_database( + engine: DbWriteEngine, mode: DbWriteMode, sample_df: pl.DataFrame, tmp_path: Path +) -> None: + tmp_path.mkdir(exist_ok=True) + tmp_db = f"test_{engine}.db" + test_db = str(tmp_path / tmp_db) + + # note: test a table name that requires quotes to ensure that we handle + # it correctly (also supply an explicit db schema with/without quotes) + tbl_name = '"test-data"' + + sample_df.write_database( + table_name=f"main.{tbl_name}", + connection=f"sqlite:///{test_db}", + if_exists="replace", + engine=engine, + ) + if mode == "append": + sample_df.write_database( + table_name=f'"main".{tbl_name}', + connection=f"sqlite:///{test_db}", + if_exists="append", + engine=engine, + ) + sample_df = pl.concat([sample_df, sample_df]) + + result = pl.read_database_uri(f"SELECT * FROM {tbl_name}", f"sqlite:///{test_db}") + sample_df = sample_df.with_columns(pl.col("date").cast(pl.Utf8)) + assert_frame_equal(sample_df, result) + + # check that some invalid parameters raise errors + for invalid_params in ( + {"table_name": "w.x.y.z"}, + {"if_exists": "crunk", "table_name": f"main.{tbl_name}"}, + ): + with pytest.raises((ValueError, NotImplementedError)): + sample_df.write_database( + connection=f"sqlite:///{test_db}", + engine=engine, + **invalid_params, # type: ignore[arg-type] + ) From 7242fc1174a2937d31147d4c269512627955555a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 12:31:45 +0200 Subject: [PATCH 38/55] feat(python): csv: add schema argument (#10665) --- crates/polars-io/src/csv/read.rs | 4 ++-- crates/polars-io/src/csv/read_impl/batched_mmap.rs | 2 +- crates/polars-io/src/csv/read_impl/batched_read.rs | 2 +- crates/polars-lazy/src/frame/csv.rs | 6 +++--- crates/polars-pipe/src/executors/sources/csv.rs | 2 +- crates/polars/tests/it/io/csv.rs | 6 +++--- py-polars/polars/dataframe/frame.py | 3 +++ py-polars/polars/io/_utils.py | 2 +- py-polars/polars/io/csv/functions.py | 12 ++++++++++++ py-polars/polars/lazyframe/frame.py | 2 ++ py-polars/src/dataframe.rs | 4 +++- 
py-polars/src/lazyframe.rs | 4 +++- py-polars/tests/unit/io/test_csv.py | 13 +++++++++++++ 13 files changed, 48 insertions(+), 14 deletions(-) diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs index 9bf55ca3e06a..5f0b3a228596 100644 --- a/crates/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -181,8 +181,8 @@ where /// in the csv parser and expects a complete Schema. /// /// It is recommended to use [with_dtypes](Self::with_dtypes) instead. - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); + pub fn with_schema(mut self, schema: Option) -> Self { + self.schema = schema; self } diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index a659f31d6c3c..18824d5e08f1 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -308,7 +308,7 @@ pub fn to_batched_owned_mmap( ) -> OwnedBatchedCsvReaderMmap { // make sure that the schema is bound to the schema we have // we will keep ownership of the schema so that the lifetime remains bound to ourselves - let reader = reader.with_schema(schema.clone()); + let reader = reader.with_schema(Some(schema.clone())); // extend the lifetime // the lifetime was bound to schema, which we own and will store on the heap let reader = unsafe { diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 88249222dcb4..af3831f00b70 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -405,7 +405,7 @@ pub fn to_batched_owned_read( ) -> OwnedBatchedCsvReader { // make sure that the schema is bound to the schema we have // we will keep ownership of the schema so that the lifetime remains bound to ourselves - let reader = reader.with_schema(schema.clone()); + let reader = reader.with_schema(Some(schema.clone())); // extend the lifetime // the lifetime was bound to schema, which we own and will store on the heap let reader = unsafe { diff --git a/crates/polars-lazy/src/frame/csv.rs b/crates/polars-lazy/src/frame/csv.rs index 1e1e97240dbf..be497c336388 100644 --- a/crates/polars-lazy/src/frame/csv.rs +++ b/crates/polars-lazy/src/frame/csv.rs @@ -106,8 +106,8 @@ impl<'a> LazyCsvReader<'a> { /// Set the CSV file's schema #[must_use] - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); + pub fn with_schema(mut self, schema: Option) -> Self { + self.schema = schema; self } @@ -261,7 +261,7 @@ impl<'a> LazyCsvReader<'a> { } } - Ok(self.with_schema(Arc::new(schema))) + Ok(self.with_schema(Some(Arc::new(schema)))) } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index a9e9f5352d1d..8a6338827828 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -62,7 +62,7 @@ impl CsvSource { let reader = CsvReader::from_path(&path) .unwrap() .has_header(options.has_header) - .with_schema(self.schema.clone()) + .with_schema(Some(self.schema.clone())) .with_delimiter(options.delimiter) .with_ignore_errors(options.ignore_errors) .with_skip_rows(options.skip_rows) diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 4c48d71921c6..9df2115ed8d8 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -387,7 +387,7 @@ fn 
test_empty_bytes_to_dataframe() { let result = CsvReader::new(file) .has_header(false) .with_columns(Some(schema.iter_names().map(|s| s.to_string()).collect())) - .with_schema(Arc::new(schema)) + .with_schema(Some(Arc::new(schema))) .finish(); assert!(result.is_ok()) } @@ -416,11 +416,11 @@ fn test_missing_value() { let file = Cursor::new(csv); let df = CsvReader::new(file) .has_header(true) - .with_schema(Arc::new(Schema::from_iter([ + .with_schema(Some(Arc::new(Schema::from_iter([ Field::new("foo", DataType::UInt32), Field::new("bar", DataType::UInt32), Field::new("ham", DataType::UInt32), - ]))) + ])))) .finish() .unwrap(); assert_eq!(df.column("ham").unwrap().len(), 3) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 96c70447637e..a402ff35619e 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -662,6 +662,7 @@ def _read_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, + schema: None | SchemaDict = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -740,6 +741,7 @@ def _read_csv( quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes_dict, + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, @@ -795,6 +797,7 @@ def _read_csv( eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, + schema=schema, ) return self diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index ec3301bbd930..4a59dd65353c 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) # noqa: PTH207 + next(glob.iglob(file, recursive=True)) return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 548a90d89a56..57d03aebba0e 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -28,6 +28,7 @@ def read_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -83,6 +84,10 @@ def read_csv( Start reading after ``skip_rows`` lines. dtypes Overwrite dtypes for specific or all columns during schema inference. + schema + Provide the schema. This means that polars doesn't do schema inference. + This argument expects the complete schema, whereas ``dtypes`` can be used + to partially overwrite a schema. null_values Values to interpret as null values. 
You can provide a: @@ -365,6 +370,7 @@ def read_csv( quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, @@ -691,6 +697,7 @@ def scan_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: SchemaDict | Sequence[PolarsDataType] | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -741,6 +748,10 @@ def scan_csv( Overwrite dtypes during inference; should be a {colname:dtype,} dict or, if providing a list of strings to ``new_columns``, a list of dtypes of the same length. + schema + Provide the schema. This means that polars doesn't do schema inference. + This argument expects the complete schema, whereas ``dtypes`` can be used + to partially overwrite a schema. null_values Values to interpret as null values. You can provide a: @@ -892,6 +903,7 @@ def with_column_names(_cols: list[str]) -> list[str]: quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, # type: ignore[arg-type] + schema=schema, null_values=null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, ignore_errors=ignore_errors, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 25622fb163a2..eca59c91c51a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -327,6 +327,7 @@ def _scan_csv( quote_char: str | None = r'"', skip_rows: int = 0, dtypes: SchemaDict | None = None, + schema: SchemaDict | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, missing_utf8_is_empty_string: bool = False, ignore_errors: bool = False, @@ -387,6 +388,7 @@ def _scan_csv( eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, + schema=schema, ) return self diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 3ce6754aefab..5292ece6c3fd 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -139,7 +139,7 @@ impl PyDataFrame { skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines) + row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, schema) )] pub fn read_csv( py_f: &PyAny, @@ -170,6 +170,7 @@ impl PyDataFrame { eol_char: &str, raise_if_empty: bool, truncate_ragged_lines: bool, + schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -219,6 +220,7 @@ impl PyDataFrame { .with_path(path) .with_dtypes(overwrite_dtype.map(Arc::new)) .with_dtypes_slice(overwrite_dtype_slice.as_deref()) + .with_schema(schema.map(|schema| Arc::new(schema.0))) .low_memory(low_memory) .with_null_values(null_values) .with_missing_is_null(!missing_utf8_is_empty_string) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index be3c62ce4d97..288332db7d65 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -147,7 +147,7 @@ impl PyLazyFrame { #[pyo3(signature = (path, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, 
missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, - encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines + encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, schema ) )] fn new_from_csv( @@ -174,6 +174,7 @@ impl PyLazyFrame { eol_char: &str, raise_if_empty: bool, truncate_ragged_lines: bool, + schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); @@ -197,6 +198,7 @@ impl PyLazyFrame { .with_n_rows(n_rows) .with_cache(cache) .with_dtype_overwrite(overwrite_dtype.as_ref()) + .with_schema(schema.map(|schema| Arc::new(schema.0))) .low_memory(low_memory) .with_comment_char(comment_char) .with_quote_char(quote_char) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index dd973193ea26..dc24c2091d6e 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1494,3 +1494,16 @@ def test_csv_ragged_lines() -> None: pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) with pytest.raises(pl.ComputeError, match=r"found more fields than defined"): pl.read_csv(io.StringIO(s), has_header=False, truncate_ragged_lines=False) + + +def test_provide_schema() -> None: + # can be used to overload schema with ragged csv files + assert pl.read_csv( + io.StringIO("A\nB,ragged\nC"), + has_header=False, + schema={"A": pl.Utf8, "B": pl.Utf8, "C": pl.Utf8}, + ).to_dict(False) == { + "A": ["A", "B", "C"], + "B": [None, "ragged", None], + "C": [None, None, None], + } From ac12d3b14ba606dd91337962f966df77fec55b64 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 22 Aug 2023 16:44:41 +0400 Subject: [PATCH 39/55] docs(python): add "see also" entries to ne/eq_missing and update related examples (#10667) --- py-polars/polars/api.py | 48 +++--- py-polars/polars/config.py | 8 +- py-polars/polars/dataframe/frame.py | 58 ++++---- py-polars/polars/expr/datetime.py | 8 +- py-polars/polars/expr/expr.py | 88 +++++------ py-polars/polars/expr/string.py | 40 ++--- py-polars/polars/functions/lazy.py | 8 +- py-polars/polars/io/_utils.py | 2 +- py-polars/polars/io/csv/functions.py | 8 +- py-polars/polars/io/database.py | 16 +- py-polars/polars/lazyframe/frame.py | 8 +- py-polars/polars/selectors.py | 210 +++++++++++++-------------- py-polars/polars/series/datetime.py | 8 +- py-polars/polars/series/list.py | 8 +- py-polars/polars/series/series.py | 64 +++++++- py-polars/polars/series/string.py | 30 ++-- py-polars/polars/sql/context.py | 48 +++--- 17 files changed, 357 insertions(+), 303 deletions(-) diff --git a/py-polars/polars/api.py b/py-polars/polars/api.py index 903b34ac8535..7c8d368bed7f 100644 --- a/py-polars/polars/api.py +++ b/py-polars/polars/api.py @@ -78,6 +78,12 @@ def register_expr_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_dataframe_namespace: Register functionality on a DataFrame. + register_lazyframe_namespace: Register functionality on a LazyFrame. + register_series_namespace: Register functionality on a Series. 
+ Examples -------- >>> @pl.api.register_expr_namespace("pow_n") @@ -114,12 +120,6 @@ def register_expr_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ 64.001 ┆ 128 ┆ 64 ┆ 64 │ └────────┴───────────┴───────────┴──────────────┘ - See Also - -------- - register_dataframe_namespace: Register functionality on a DataFrame. - register_lazyframe_namespace: Register functionality on a LazyFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.Expr) @@ -133,6 +133,12 @@ def register_dataframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_lazyframe_namespace: Register functionality on a LazyFrame. + register_series_namespace: Register functionality on a Series. + Examples -------- >>> @pl.api.register_dataframe_namespace("split") @@ -214,12 +220,6 @@ def register_dataframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ yz ┆ 6 ┆ 7 ┆ 8 │ └─────┴─────┴─────┴─────┘] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_lazyframe_namespace: Register functionality on a LazyFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.DataFrame) @@ -233,6 +233,12 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_dataframe_namespace: Register functionality on a DataFrame. + register_series_namespace: Register functionality on a Series. + Examples -------- >>> @pl.api.register_lazyframe_namespace("types") @@ -319,12 +325,6 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: │ 6 ┆ 7 ┆ 8 │ └─────┴─────┴─────┘] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_dataframe_namespace: Register functionality on a DataFrame. - register_series_namespace: Register functionality on a Series. - """ return _create_namespace(name, pl.LazyFrame) @@ -338,6 +338,12 @@ def register_series_namespace(name: str) -> Callable[[type[NS]], type[NS]]: name Name under which the functionality will be accessed. + See Also + -------- + register_expr_namespace: Register functionality on an Expr. + register_dataframe_namespace: Register functionality on a DataFrame. + register_lazyframe_namespace: Register functionality on a LazyFrame. + Examples -------- >>> @pl.api.register_series_namespace("math") @@ -374,11 +380,5 @@ def register_series_namespace(name: str) -> Callable[[type[NS]], type[NS]]: 125 ] - See Also - -------- - register_expr_namespace: Register functionality on an Expr. - register_dataframe_namespace: Register functionality on a DataFrame. - register_lazyframe_namespace: Register functionality on a LazyFrame. - """ return _create_namespace(name, pl.Series) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 2634882b4599..6ada4300c22d 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -671,6 +671,10 @@ def set_tbl_hide_dtype_separator(cls, active: bool = True) -> type[Config]: """ Hide the '---' separator between the column names and column types. 
+ See Also + -------- + set_tbl_column_data_type_inline + Examples -------- >>> df = pl.DataFrame({"abc": [1.0, 2.5, 5.0], "xyz": [True, False, True]}) @@ -687,10 +691,6 @@ def set_tbl_hide_dtype_separator(cls, active: bool = True) -> type[Config]: # │ 5.0 ┆ true │ └─────┴───────┘ # └─────┴───────┘ - See Also - -------- - set_tbl_column_data_type_inline - """ os.environ["POLARS_FMT_TABLE_HIDE_COLUMN_SEPARATOR"] = str(int(active)) return cls diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a402ff35619e..383354272843 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1201,6 +1201,10 @@ def dtypes(self) -> list[PolarsDataType]: The datatypes can also be found in column headers when printing the DataFrame. + See Also + -------- + schema : Returns a {colname:dtype} mapping. + Examples -------- >>> df = pl.DataFrame( @@ -1224,10 +1228,6 @@ def dtypes(self) -> list[PolarsDataType]: │ 3 ┆ 8.0 ┆ c │ └─────┴─────┴─────┘ - See Also - -------- - schema : Returns a {colname:dtype} mapping. - """ return self._df.dtypes() @@ -1835,6 +1835,10 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: column Optional column index or name. + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -1845,10 +1849,6 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: >>> df.item(2, "b") 6 - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - """ if row is None and column is None: if self.shape != (1, 1): @@ -8816,6 +8816,12 @@ def row( You should NEVER use this method to iterate over a DataFrame; if you require row-iteration you should strongly prefer use of ``iter_rows()`` instead. + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + Examples -------- Specify an index to return the row at the given index as a tuple. @@ -8841,12 +8847,6 @@ def row( >>> df.row(by_predicate=(pl.col("ham") == "b")) (2, 7, 'b') - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - """ if index is not None and by_predicate is not None: raise ValueError( @@ -8927,6 +8927,11 @@ def rows( ------- list of tuples (default) or dictionaries of row values + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + Examples -------- >>> df = pl.DataFrame( @@ -8944,11 +8949,6 @@ def rows( {'x': 'b', 'y': 3, 'z': 6}, {'x': 'a', 'y': 4, 'z': 9}] - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - """ if named: # Load these into the local namespace for a minor performance boost @@ -8997,6 +8997,11 @@ def rows_by_key( truncated to microseconds on conversion to Python. If this matters to your use-case you should export to a different format (such as Arrow or NumPy). + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). 
+ iter_rows : Row iterator over frame data (does not materialise all rows). + Examples -------- >>> df = pl.DataFrame( @@ -9049,11 +9054,6 @@ def rows_by_key( {'w': 'b', 'x': 'q', 'y': 3.0, 'z': 7}], ('a', 'k'): [{'w': 'a', 'x': 'k', 'y': 4.5, 'z': 6}]}) - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - """ from polars.selectors import expand_selector, is_selector @@ -9168,6 +9168,11 @@ def iter_rows( ------- iterator of tuples (default) or dictionaries (if named) of python row values + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + Examples -------- >>> df = pl.DataFrame( @@ -9181,11 +9186,6 @@ def iter_rows( >>> [row["b"] for row in df.iter_rows(named=True)] [2, 4, 6] - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. - """ # load into the local namespace for a (minor) performance boost in the hot loops columns, get_row, dict_, zip_ = self.columns, self.row, dict, zip diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 31fc8f90f118..ca5ea8c9f7d9 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -461,6 +461,10 @@ def strftime(self, format: str) -> Expr: `_ for specification. Example: ``"%y-%m-%d"``. + See Also + -------- + to_string : The identical expression for which ``strftime`` is an alias. + Examples -------- >>> from datetime import datetime @@ -489,10 +493,6 @@ def strftime(self, format: str) -> Expr: │ 2020-05-01 00:00:00 ┆ 2020/05/01 00:00:00 │ └─────────────────────┴─────────────────────┘ - See Also - -------- - to_string : The identical expression for which ``strftime`` is an alias. - """ return self.to_string(format) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 2999118b0bb9..eec96107c24c 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -444,6 +444,11 @@ def arg_true(self) -> Self: Modifies number of rows returned, so will fail in combination with other expressions. Use as only expression in `select` / `with_columns`. + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) @@ -459,11 +464,6 @@ def arg_true(self) -> Self: │ 3 │ └─────┘ - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - """ return self._from_pyexpr(py_arg_where(self._pyexpr)) @@ -4225,21 +4225,22 @@ def eq_missing(self, other: Any) -> Self: ... } ... ) >>> df.with_columns( - ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), - ... ) - shape: (6, 3) - ┌──────┬──────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞══════╪══════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - │ null ┆ 5.0 ┆ false │ - │ null ┆ null ┆ true │ - └──────┴──────┴────────┘ + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ """ return self._from_pyexpr(self._pyexpr.eq_missing(self._to_expr(other)._pyexpr)) @@ -4439,21 +4440,22 @@ def ne_missing(self, other: Any) -> Self: ... } ... ) >>> df.with_columns( - ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), - ... ) - shape: (6, 3) - ┌──────┬──────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞══════╪══════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - │ null ┆ 5.0 ┆ true │ - │ null ┆ null ┆ false │ - └──────┴──────┴────────┘ + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ """ return self._from_pyexpr(self._pyexpr.neq_missing(self._to_expr(other)._pyexpr)) @@ -4514,6 +4516,10 @@ def floordiv(self, other: Any) -> Self: other Numeric literal or expression value. + See Also + -------- + truediv + Examples -------- >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) @@ -4534,10 +4540,6 @@ def floordiv(self, other: Any) -> Self: │ 5 ┆ 2.5 ┆ 2 │ └─────┴─────┴──────┘ - See Also - -------- - truediv - """ return self.__floordiv__(other) @@ -4650,6 +4652,10 @@ def truediv(self, other: Any) -> Self: 0/0: Invalid operation - mathematically undefined, returns NaN. n/0: On finite operands gives an exact infinite result, eg: ±infinity. + See Also + -------- + floordiv + Examples -------- >>> df = pl.DataFrame( @@ -4672,10 +4678,6 @@ def truediv(self, other: Any) -> Self: │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ └─────┴──────┴──────┴───────┘ - See Also - -------- - floordiv - """ return self.__truediv__(other) diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 11338913e987..5897eb77488e 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -801,6 +801,11 @@ def contains( `_ for additional information about the use of inline expression modifiers. + See Also + -------- + starts_with : Check if string values start with a substring. + ends_with : Check if string values end with a substring. + Examples -------- >>> df = pl.DataFrame({"a": ["Crab", "cat and dog", "rab$bit", None]}) @@ -821,11 +826,6 @@ def contains( │ null ┆ null ┆ null │ └─────────────┴───────┴─────────┘ - See Also - -------- - starts_with : Check if string values start with a substring. - ends_with : Check if string values end with a substring. - """ pattern = parse_as_expression(pattern, str_as_lit=True) return wrap_expr(self._pyexpr.str_contains(pattern, literal, strict)) @@ -839,6 +839,11 @@ def ends_with(self, suffix: str | Expr) -> Expr: suffix Suffix substring. 
+ See Also + -------- + contains : Check if string contains a substring that matches a regex. + starts_with : Check if string values start with a substring. + Examples -------- >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]}) @@ -868,11 +873,6 @@ def ends_with(self, suffix: str | Expr) -> Expr: │ mango │ └────────┘ - See Also - -------- - contains : Check if string contains a substring that matches a regex. - starts_with : Check if string values start with a substring. - """ suffix = parse_as_expression(suffix, str_as_lit=True) return wrap_expr(self._pyexpr.str_ends_with(suffix)) @@ -886,6 +886,11 @@ def starts_with(self, prefix: str | Expr) -> Expr: prefix Prefix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + ends_with : Check if string values end with a substring. + Examples -------- >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]}) @@ -915,11 +920,6 @@ def starts_with(self, prefix: str | Expr) -> Expr: │ apple │ └────────┘ - See Also - -------- - contains : Check if string contains a substring that matches a regex. - ends_with : Check if string values end with a substring. - """ prefix = parse_as_expression(prefix, str_as_lit=True) return wrap_expr(self._pyexpr.str_starts_with(prefix)) @@ -941,6 +941,11 @@ def json_extract( How many rows to parse to determine the schema. If ``None`` all rows are used. + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + Examples -------- >>> df = pl.DataFrame( @@ -959,11 +964,6 @@ def json_extract( │ {2,false} │ └─────────────┘ - See Also - -------- - json_path_match : Extract the first match of json string with provided JSONPath - expression. - """ if dtype is not None: dtype = py_type_to_dtype(dtype) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 78862b54abd5..2b8e399f9b36 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1924,6 +1924,10 @@ def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series Evaluate immediately and return a ``Series``. If set to ``False`` (default), return an expression instead. + See Also + -------- + Series.arg_true : Return indices where Series is True + Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) @@ -1939,10 +1943,6 @@ def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series 3 ] - See Also - -------- - Series.arg_true : Return indices where Series is True - """ if eager: if not isinstance(condition, pl.Series): diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 4a59dd65353c..ec3301bbd930 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,7 +17,7 @@ def _is_glob_pattern(file: str) -> bool: def _is_local_file(file: str) -> bool: try: - next(glob.iglob(file, recursive=True)) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 return True except StopIteration: return False diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 57d03aebba0e..177c8828cffe 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -170,6 +170,10 @@ def read_csv( ------- DataFrame + See Also + -------- + scan_csv : Lazily read from a CSV file or multiple files via glob patterns. 
+ Notes ----- This operation defaults to a `rechunk` operation at the end, meaning that @@ -177,10 +181,6 @@ def read_csv( Set `rechunk=False` if you are benchmarking the csv-reader. A `rechunk` is an expensive operation. - See Also - -------- - scan_csv : Lazily read from a CSV file or multiple files via glob patterns. - """ _check_arg_is_1byte("separator", separator, False) _check_arg_is_1byte("comment_char", comment_char, False) diff --git a/py-polars/polars/io/database.py b/py-polars/polars/io/database.py index 49abbb505f28..8babe712ce21 100644 --- a/py-polars/polars/io/database.py +++ b/py-polars/polars/io/database.py @@ -241,6 +241,10 @@ def read_database( # noqa: D417 will be used to efficiently instantiate the DataFrame; otherwise, the DataFrame is initialised from row-wise data. + See Also + -------- + read_database_uri : Create a DataFrame from a SQL query using a URI string. + Examples -------- Instantiate a DataFrame from a SQL query against a user-supplied connection: @@ -250,10 +254,6 @@ def read_database( # noqa: D417 ... connection=conn, ... ) # doctest: +SKIP - See Also - -------- - read_database_uri : Create a DataFrame from a SQL query using a URI string. - """ if isinstance(connection, str): issue_deprecation_warning( @@ -327,6 +327,10 @@ def read_database_uri( For ``adbc`` you will need to have installed ``pyarrow`` and the ADBC driver associated with the backend you are connecting to, eg: ``adbc-driver-postgresql``. + See Also + -------- + read_database : Create a DataFrame from a SQL query using a connection object. + Examples -------- Create a DataFrame from a SQL query using a single thread: @@ -366,10 +370,6 @@ def read_database_uri( ... engine="adbc", ... ) # doctest: +SKIP - See Also - -------- - read_database : Create a DataFrame from a SQL query using a connection object. - """ # noqa: W505 if not isinstance(uri, str): raise TypeError( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index eca59c91c51a..3788181da79a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -652,6 +652,10 @@ def dtypes(self) -> list[PolarsDataType]: """ Get dtypes of columns in LazyFrame. + See Also + -------- + schema : Returns a {colname:dtype} mapping. + Examples -------- >>> lf = pl.LazyFrame( @@ -664,10 +668,6 @@ def dtypes(self) -> list[PolarsDataType]: >>> lf.dtypes [Int64, Float64, Utf8] - See Also - -------- - schema : Returns a {colname:dtype} mapping. - """ return self._ldf.dtypes() diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index c792735f9b64..a3da7242c453 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -325,6 +325,11 @@ def all() -> SelectorType: """ Select all columns. + See Also + -------- + first : Select the first column in the current scope. + last : Select the last column in the current scope. + Examples -------- >>> from datetime import date @@ -363,11 +368,6 @@ def all() -> SelectorType: │ 2024-01-01 │ └────────────┘ - See Also - -------- - first : Select the first column in the current scope. - last : Select the last column in the current scope. - """ return _selector_proxy_(F.all(), name="all") @@ -378,6 +378,13 @@ def by_dtype( """ Select all columns matching the given dtypes. + See Also + -------- + integer : Select all integer columns. + float : Select all float columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. 
+ Examples -------- >>> from datetime import date @@ -431,13 +438,6 @@ def by_dtype( │ foo ┆ -3265500 │ └───────┴──────────┘ - See Also - -------- - integer : Select all integer columns. - float : Select all float columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - """ all_dtypes: list[PolarsDataType] = [] for tp in dtypes: @@ -465,6 +465,10 @@ def by_name(*names: str | Collection[str]) -> SelectorType: *names One or more names of columns to select. + See Also + -------- + by_dtype : Select all columns matching the given dtypes. + Examples -------- >>> import polars.selectors as cs @@ -503,10 +507,6 @@ def by_name(*names: str | Collection[str]) -> SelectorType: │ 5.5 ┆ true │ └─────┴───────┘ - See Also - -------- - by_dtype : Select all columns matching the given dtypes. - """ all_names = [] for nm in names: @@ -534,6 +534,12 @@ def contains(substring: str | Collection[str]) -> SelectorType: substring Substring(s) that matching column names should contain. + See Also + -------- + matches : Select all columns that match the given regex pattern. + ends_with : Select columns that end with the given substring(s). + starts_with : Select columns that start with the given substring(s). + Examples -------- >>> import polars.selectors as cs @@ -585,12 +591,6 @@ def contains(substring: str | Collection[str]) -> SelectorType: │ y ┆ true │ └─────┴───────┘ - See Also - -------- - matches : Select all columns that match the given regex pattern. - ends_with : Select columns that end with the given substring(s). - starts_with : Select columns that start with the given substring(s). - """ escaped_substring = _re_string(substring) raw_params = f"^.*{escaped_substring}.*$" @@ -861,6 +861,12 @@ def ends_with(*suffix: str) -> SelectorType: """ Select columns that end with the given substring(s). + See Also + -------- + contains : Select columns that contain the given literal substring(s). + matches : Select all columns that match the given regex pattern. + starts_with : Select columns that start with the given substring(s). + Parameters ---------- suffix @@ -917,12 +923,6 @@ def ends_with(*suffix: str) -> SelectorType: │ y ┆ 456 ┆ true │ └─────┴─────┴───────┘ - See Also - -------- - contains : Select columns that contain the given literal substring(s). - matches : Select all columns that match the given regex pattern. - starts_with : Select columns that start with the given substring(s). - """ escaped_suffix = _re_string(suffix) raw_params = f"^.*{escaped_suffix}$" @@ -938,6 +938,11 @@ def first() -> SelectorType: """ Select the first column in the current scope. + See Also + -------- + all : Select all columns. + last : Select the last column in the current scope. + Examples -------- >>> import polars.selectors as cs @@ -976,11 +981,6 @@ def first() -> SelectorType: │ 456 ┆ 5.5 ┆ 1 │ └─────┴─────┴─────┘ - See Also - -------- - all : Select all columns. - last : Select the last column in the current scope. - """ return _selector_proxy_(F.first(), name="first") @@ -989,6 +989,13 @@ def float() -> SelectorType: """ Select all float columns. + See Also + -------- + integer : Select all integer columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1028,13 +1035,6 @@ def float() -> SelectorType: │ y ┆ 456 │ └─────┴─────┘ - See Also - -------- - integer : Select all integer columns. - numeric : Select all numeric columns. 
- temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(FLOAT_DTYPES), @@ -1046,6 +1046,14 @@ def integer() -> SelectorType: """ Select all integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1084,14 +1092,6 @@ def integer() -> SelectorType: │ y ┆ 5.5 │ └─────┴─────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(INTEGER_DTYPES), @@ -1103,6 +1103,14 @@ def signed_integer() -> SelectorType: """ Select all signed integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer: Select all integer columns. + numeric : Select all numeric columns. + unsigned_integer: Select all unsigned integer columns. + Examples -------- >>> import polars.selectors as cs @@ -1153,14 +1161,6 @@ def signed_integer() -> SelectorType: │ -456 ┆ 6789 ┆ 4321 │ └──────┴──────┴──────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer: Select all integer columns. - numeric : Select all numeric columns. - unsigned_integer: Select all unsigned integer columns. - """ return _selector_proxy_( F.col(SIGNED_INTEGER_DTYPES), @@ -1172,6 +1172,14 @@ def unsigned_integer() -> SelectorType: """ Select all unsigned integer columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer: Select all integer columns. + numeric : Select all numeric columns. + signed_integer: Select all signed integer columns. + Examples -------- >>> import polars.selectors as cs @@ -1224,14 +1232,6 @@ def unsigned_integer() -> SelectorType: │ -456 ┆ 6789 ┆ 4321 │ └──────┴──────┴──────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer: Select all integer columns. - numeric : Select all numeric columns. - signed_integer: Select all signed integer columns. - """ return _selector_proxy_( F.col(UNSIGNED_INTEGER_DTYPES), @@ -1243,6 +1243,11 @@ def last() -> SelectorType: """ Select the last column in the current scope. + See Also + -------- + all : Select all columns. + first : Select the first column in the current scope. + Examples -------- >>> import polars.selectors as cs @@ -1281,11 +1286,6 @@ def last() -> SelectorType: │ y ┆ 456 ┆ 5.5 │ └─────┴─────┴─────┘ - See Also - -------- - all : Select all columns. - first : Select the first column in the current scope. - """ return _selector_proxy_(F.last(), name="last") @@ -1294,6 +1294,12 @@ def matches(pattern: str) -> SelectorType: """ Select all columns that match the given regex pattern. + See Also + -------- + contains : Select all columns that contain the given substring. + ends_with : Select all columns that end with the given substring(s). + starts_with : Select all columns that start with the given substring(s). + Parameters ---------- pattern @@ -1338,12 +1344,6 @@ def matches(pattern: str) -> SelectorType: │ y ┆ 1 │ └─────┴─────┘ - See Also - -------- - contains : Select all columns that contain the given substring. 
- ends_with : Select all columns that end with the given substring(s). - starts_with : Select all columns that start with the given substring(s). - """ if pattern == ".*": return all() @@ -1368,6 +1368,14 @@ def numeric() -> SelectorType: """ Select all numeric columns. + See Also + -------- + by_dtype : Select columns by dtype. + float : Select all float columns. + integer : Select all integer columns. + temporal : Select all temporal columns. + string : Select all string columns. + Examples -------- >>> import polars.selectors as cs @@ -1407,14 +1415,6 @@ def numeric() -> SelectorType: │ y │ └─────┘ - See Also - -------- - by_dtype : Select columns by dtype. - float : Select all float columns. - integer : Select all integer columns. - temporal : Select all temporal columns. - string : Select all string columns. - """ return _selector_proxy_( F.col(NUMERIC_DTYPES), @@ -1431,6 +1431,12 @@ def starts_with(*prefix: str) -> SelectorType: prefix Substring(s) that matching column names should start with. + See Also + -------- + contains : Select all columns that contain the given substring. + ends_with : Select all columns that end with the given substring(s). + matches : Select all columns that match the given regex pattern. + Examples -------- >>> import polars.selectors as cs @@ -1482,12 +1488,6 @@ def starts_with(*prefix: str) -> SelectorType: │ 2.0 ┆ 8 │ └─────┴─────┘ - See Also - -------- - contains : Select all columns that contain the given substring. - ends_with : Select all columns that end with the given substring(s). - matches : Select all columns that match the given regex pattern. - """ escaped_prefix = _re_string(prefix) raw_params = f"^{escaped_prefix}.*$" @@ -1503,6 +1503,14 @@ def string(include_categorical: bool = False) -> SelectorType: """ Select all Utf8 (and, optionally, Categorical) string columns. + See Also + -------- + by_dtype : Select all columns of a given dtype. + float : Select all float columns. + integer : Select all integer columns. + numeric : Select all numeric columns. + temporal : Select all temporal columns. + Examples -------- >>> import polars.selectors as cs @@ -1544,14 +1552,6 @@ def string(include_categorical: bool = False) -> SelectorType: │ yy ┆ b ┆ 6 ┆ 7.0 │ └─────┴─────┴─────┴──────┘ - See Also - -------- - by_dtype : Select all columns of a given dtype. - float : Select all float columns. - integer : Select all integer columns. - numeric : Select all numeric columns. - temporal : Select all temporal columns. - """ string_dtypes: list[PolarsDataType] = [Utf8] if include_categorical: @@ -1568,6 +1568,14 @@ def temporal() -> SelectorType: """ Select all temporal columns. + See Also + -------- + by_dtype : Select all columns of a given dtype. + float : Select all float columns. + integer : Select all integer columns. + numeric : Select all numeric columns. + string : Select all string columns. + Examples -------- >>> from datetime import date, time @@ -1619,14 +1627,6 @@ def temporal() -> SelectorType: │ 2.3456 │ └────────┘ - See Also - -------- - by_dtype : Select all columns of a given dtype. - float : Select all float columns. - integer : Select all integer columns. - numeric : Select all numeric columns. - string : Select all string columns. 
- """ return _selector_proxy_( F.col(TEMPORAL_DTYPES), diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 9db12fa7960d..d3c9e08c0d26 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -186,6 +186,10 @@ def strftime(self, format: str) -> Series: `_ for specification. Example: ``"%y-%m-%d"``. + See Also + -------- + to_string : The identical Series method for which ``strftime`` is an alias. + Examples -------- >>> from datetime import datetime @@ -202,10 +206,6 @@ def strftime(self, format: str) -> Series: "2020/05/01" ] - See Also - -------- - to_string : The identical Series method for which ``strftime`` is an alias. - """ return self.to_string(format) diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 76170741c761..a97c6a36a0c5 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -603,6 +603,10 @@ def set_difference(self, other: Series) -> Series: other Right hand side of the set operation. + See Also + -------- + polars.Series.list.diff: Calculates the n-th discrete difference of every sublist. + Examples -------- >>> a = pl.Series([[1, 2, 3], [], [None, 3], [5, 6, 7]]) @@ -617,10 +621,6 @@ def set_difference(self, other: Series) -> Series: [5, 7] ] - See Also - -------- - polars.Series.list.diff: Calculates the n-th discrete difference of every sublist. - """ # noqa: W505. def set_intersection(self, other: Series) -> Series: diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index e3b1025f9f95..efb12d163b21 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -608,16 +608,42 @@ def eq_missing(self, other: Expr) -> Expr: # type: ignore[misc] def eq_missing(self, other: Any) -> Self | Expr: """ - Method equivalent of equality operator ``expr == other`` where `None` == None`. + Method equivalent of equality operator ``series == other`` where `None` == None`. - This differs from default ``ne`` where null values are propagated. + This differs from the standard ``ne`` where null values are propagated. Parameters ---------- other A literal or expression value to compare with. - """ + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: 'a' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: 'a' [bool] + [ + false + true + true + ] + + """ # noqa: W505 def ne(self, other: Any) -> Self | Expr: """Method equivalent of operator expression ``series != other``.""" @@ -633,16 +659,42 @@ def ne_missing(self, other: Any) -> Self: def ne_missing(self, other: Any) -> Self | Expr: """ - Method equivalent of equality operator ``expr != other`` where `None` == None`. + Method equivalent of equality operator ``series != other`` where `None` == None`. - This differs from default ``ne`` where null values are propagated. + This differs from the standard ``ne`` where null values are propagated. Parameters ---------- other A literal or expression value to compare with. 
- """ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: 'a' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: 'a' [bool] + [ + true + false + false + ] + + """ # noqa: W505 def ge(self, other: Any) -> Self | Expr: """Method equivalent of operator expression ``series >= other``.""" diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index c77bf0b1bc4d..1ae8d9c9002e 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -449,6 +449,11 @@ def ends_with(self, suffix: str | Expr) -> Series: suffix Suffix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + starts_with : Check if string values start with a substring. + Examples -------- >>> s = pl.Series("fruits", ["apple", "mango", None]) @@ -461,11 +466,6 @@ def ends_with(self, suffix: str | Expr) -> Series: null ] - See Also - -------- - contains : Check if string contains a substring that matches a regex. - starts_with : Check if string values start with a substring. - """ def starts_with(self, prefix: str | Expr) -> Series: @@ -477,6 +477,11 @@ def starts_with(self, prefix: str | Expr) -> Series: prefix Prefix substring. + See Also + -------- + contains : Check if string contains a substring that matches a regex. + ends_with : Check if string values end with a substring. + Examples -------- >>> s = pl.Series("fruits", ["apple", "mango", None]) @@ -489,11 +494,6 @@ def starts_with(self, prefix: str | Expr) -> Series: null ] - See Also - -------- - contains : Check if string contains a substring that matches a regex. - ends_with : Check if string values end with a substring. - """ def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Series: @@ -555,6 +555,11 @@ def json_extract( How many rows to parse to determine the schema. If ``None`` all rows are used. + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + Examples -------- >>> s = pl.Series("json", ['{"a":1, "b": true}', None, '{"a":2, "b": false}']) @@ -567,11 +572,6 @@ def json_extract( {2,false} ] - See Also - -------- - json_path_match : Extract the first match of json string with provided JSONPath - expression. - """ def json_path_match(self, json_path: str) -> Series: diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index f4b39bb71c44..7e8ff0a6285b 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -285,6 +285,12 @@ def register(self, name: str, frame: DataFrame | LazyFrame) -> Self: frame eager/lazy frame to associate with this table name. + See Also + -------- + register_globals + register_many + unregister + Examples -------- >>> df = pl.DataFrame({"hello": ["world"]}) @@ -299,12 +305,6 @@ def register(self, name: str, frame: DataFrame | LazyFrame) -> Self: │ world │ └───────┘ - See Also - -------- - register_globals - register_many - unregister - """ if isinstance(frame, DataFrame): frame = frame.lazy() @@ -317,6 +317,12 @@ def register_globals(self, n: int | None = None) -> Self: Automatically maps variable names to table names. 
+ See Also + -------- + register + register_many + unregister + Parameters ---------- n @@ -349,12 +355,6 @@ def register_globals(self, n: int | None = None) -> Self: │ 1 ┆ x ┆ null │ └─────┴──────┴──────┘ - See Also - -------- - register - register_many - unregister - """ return self.register_many( frames=_get_stack_locals(of_type=(DataFrame, LazyFrame), n_objects=n) @@ -375,6 +375,12 @@ def register_many( **named_frames Named eager/lazy frames, provided as kwargs. + See Also + -------- + register + register_globals + unregister + Examples -------- >>> lf1 = pl.LazyFrame({"a": [1, 2, 3], "b": ["m", "n", "o"]}) @@ -393,12 +399,6 @@ def register_many( >>> ctx.register_many(tbl3=lf3, tbl4=lf4).tables() ['tbl1', 'tbl2', 'tbl3', 'tbl4'] - See Also - -------- - register - register_globals - unregister - """ frames = dict(frames or {}) frames.update(named_frames) @@ -438,6 +438,12 @@ def unregister(self, names: str | Collection[str]) -> Self: >>> ctx.tables() ['tbl0'] + See Also + -------- + register + register_globals + register_many + Examples -------- >>> df0 = pl.DataFrame({"ints": [9, 8, 7, 6, 5]}) @@ -457,12 +463,6 @@ def unregister(self, names: str | Collection[str]) -> Self: >>> ctx.unregister("test2").tables() [] - See Also - -------- - register - register_globals - register_many - """ if isinstance(names, str): names = [names] From 628c9eb0f4e269e89dd6d03e1574679e7a65b455 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 22 Aug 2023 17:57:22 +0200 Subject: [PATCH 40/55] feat(rust, python): support broadcasting in list set operations (#10668) --- .../polars-ops/src/chunked_array/list/sets.rs | 199 ++++++++++++++---- .../polars-plan/src/dsl/function_expr/list.rs | 2 +- crates/polars-plan/src/dsl/list.rs | 6 +- py-polars/src/expr/list.rs | 6 +- py-polars/tests/unit/namespaces/test_list.py | 21 ++ 5 files changed, 184 insertions(+), 50 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/list/sets.rs b/crates/polars-ops/src/chunked_array/list/sets.rs index fe3fff9a3a78..6029da06248c 100644 --- a/crates/polars-ops/src/chunked_array/list/sets.rs +++ b/crates/polars-ops/src/chunked_array/list/sets.rs @@ -36,17 +36,19 @@ impl<'a> MaterializeValues> for MutableBinaryArray { } } -fn set_operation( +fn set_operation( set: &mut PlIndexSet, set2: &mut PlIndexSet, a: I, - b: I, + b: J, out: &mut R, set_op: SetOperation, + broadcast_rhs: bool, ) -> usize where K: Eq + Hash + Copy, I: IntoIterator, + J: IntoIterator, R: MaterializeValues, { set.clear(); @@ -55,9 +57,12 @@ where match set_op { SetOperation::Intersection => { - set2.clear(); set.extend(a); - set2.extend(b); + // If broadcast `set2` should already be filled. + if !broadcast_rhs { + set2.clear(); + set2.extend(b); + } out.extend_buf(set.intersection(set2).copied()) }, SetOperation::Union => { @@ -73,11 +78,14 @@ where out.extend_buf(set.drain(..)) }, SetOperation::SymmetricDifference => { - set2.clear(); + // If broadcast `set2` should already be filled. 
+ if !broadcast_rhs { + set2.clear(); + set2.extend(b); + } // We could speed this up, but implementing ourselves, but we need to have a clonable // iterator as we need 2 passes set.extend(a); - set2.extend(b); out.extend_buf(set.symmetric_difference(set2).copied()) }, } @@ -115,14 +123,15 @@ fn primitive( offsets_b: &[i64], set_op: SetOperation, validity: Option, -) -> ListArray +) -> PolarsResult> where T: NativeType + Hash + Copy + Eq, { - assert_eq!(offsets_a.len(), offsets_b.len()); + let broadcast_lhs = offsets_a.len() == 2; + let broadcast_rhs = offsets_b.len() == 2; let mut set = Default::default(); - let mut set2 = Default::default(); + let mut set2: PlIndexSet> = Default::default(); let mut values_out = MutablePrimitiveArray::with_capacity(std::cmp::max( *offsets_a.last().unwrap(), @@ -131,7 +140,15 @@ where let mut offsets = Vec::with_capacity(std::cmp::max(offsets_a.len(), offsets_b.len())); offsets.push(0i64); - for i in 1..offsets_a.len() { + if broadcast_rhs { + set2.extend(b.into_iter().map(copied_opt)); + } + let offsets_slice = if offsets_a.len() > offsets_b.len() { + offsets_a + } else { + offsets_b + }; + for i in 1..offsets_slice.len() { unsafe { let start_a = *offsets_a.get_unchecked(i - 1) as usize; let end_a = *offsets_a.get_unchecked(i) as usize; @@ -139,20 +156,67 @@ where let start_b = *offsets_b.get_unchecked(i - 1) as usize; let end_b = *offsets_b.get_unchecked(i) as usize; - // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount - let a_iter = a - .into_iter() - .skip(start_a) - .take(end_a - start_a) - .map(copied_opt); - let b_iter = b - .into_iter() - .skip(start_b) - .take(end_b - start_b) - .map(copied_opt); - - let offset = - set_operation(&mut set, &mut set2, a_iter, b_iter, &mut values_out, set_op); + // The branches are the same every loop. + // We rely on branch prediction here. 
+ let offset = if broadcast_rhs { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a + .into_iter() + .skip(start_a) + .take(end_a - start_a) + .map(copied_opt); + let b_iter = b.into_iter().map(copied_opt); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + true, + ) + } else if broadcast_lhs { + let a_iter = a.into_iter().map(copied_opt); + + let b_iter = b + .into_iter() + .skip(start_b) + .take(end_b - start_b) + .map(copied_opt); + + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + } else { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a + .into_iter() + .skip(start_a) + .take(end_a - start_a) + .map(copied_opt); + + let b_iter = b + .into_iter() + .skip(start_b) + .take(end_b - start_b) + .map(copied_opt); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + }; + offsets.push(offset as i64); } } @@ -160,7 +224,7 @@ where let dtype = ListArray::::default_datatype(values_out.data_type().clone()); let values: PrimitiveArray = values_out.into(); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } fn binary( @@ -171,11 +235,11 @@ fn binary( set_op: SetOperation, validity: Option, as_utf8: bool, -) -> ListArray { - assert_eq!(offsets_a.len(), offsets_b.len()); - +) -> PolarsResult> { + let broadcast_lhs = offsets_a.len() == 2; + let broadcast_rhs = offsets_b.len() == 2; let mut set = Default::default(); - let mut set2 = Default::default(); + let mut set2: PlIndexSet> = Default::default(); let mut values_out = MutableBinaryArray::with_capacity(std::cmp::max( *offsets_a.last().unwrap(), @@ -184,7 +248,15 @@ fn binary( let mut offsets = Vec::with_capacity(std::cmp::max(offsets_a.len(), offsets_b.len())); offsets.push(0i64); - for i in 1..offsets_a.len() { + if broadcast_rhs { + set2.extend(b); + } + let offsets_slice = if offsets_a.len() > offsets_b.len() { + offsets_a + } else { + offsets_b + }; + for i in 1..offsets_slice.len() { unsafe { let start_a = *offsets_a.get_unchecked(i - 1) as usize; let end_a = *offsets_a.get_unchecked(i) as usize; @@ -192,12 +264,47 @@ fn binary( let start_b = *offsets_b.get_unchecked(i - 1) as usize; let end_b = *offsets_b.get_unchecked(i) as usize; - // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount - let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); - let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); - - let offset = - set_operation(&mut set, &mut set2, a_iter, b_iter, &mut values_out, set_op); + // The branches are the same every loop. + // We rely on branch prediction here. 
+ let offset = if broadcast_rhs { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); + let b_iter = b.into_iter(); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + true, + ) + } else if broadcast_lhs { + let a_iter = a.into_iter(); + let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + } else { + // going via skip iterator instead of slice doesn't heap alloc nor trigger a bitcount + let a_iter = a.into_iter().skip(start_a).take(end_a - start_a); + let b_iter = b.into_iter().skip(start_b).take(end_b - start_b); + set_operation( + &mut set, + &mut set2, + a_iter, + b_iter, + &mut values_out, + set_op, + false, + ) + }; offsets.push(offset as i64); } } @@ -214,10 +321,10 @@ fn binary( ) }; let dtype = ListArray::::default_datatype(values.data_type().clone()); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } else { let dtype = ListArray::::default_datatype(values.data_type().clone()); - ListArray::new(dtype, offsets, values.boxed(), validity) + Ok(ListArray::new(dtype, offsets, values.boxed(), validity)) } } @@ -234,7 +341,7 @@ fn array_set_operation( a: &ListArray, b: &ListArray, set_op: SetOperation, -) -> ListArray { +) -> PolarsResult> { let offsets_a = a.offsets().as_slice(); let offsets_b = b.offsets().as_slice(); @@ -266,7 +373,7 @@ fn array_set_operation( binary(a, b, offsets_a, offsets_b, set_op, validity, false) }, ArrowDataType::Boolean => { - todo!("boolean type not yet supported in list union operations") + polars_bail!(InvalidOperation: "boolean type not yet supported in list 'set' operations") }, _ => { with_match_physical_integer_type!(dtype.into(), |$T| { @@ -279,13 +386,19 @@ fn array_set_operation( } } -pub fn list_set_operation(a: &ListChunked, b: &ListChunked, set_op: SetOperation) -> ListChunked { +pub fn list_set_operation( + a: &ListChunked, + b: &ListChunked, + set_op: SetOperation, +) -> PolarsResult { + polars_ensure!(a.len() == b.len() || b.len() == 1 || a.len() == 1, ShapeMismatch: "column lengths don't match"); + // we use the unsafe variant because we want to keep the nested logical types type. 
unsafe { - arity::binary_unchecked_same_type( + arity::try_binary_unchecked_same_type( a, b, - |a, b| array_set_operation(a, b, set_op).boxed(), + |a, b| array_set_operation(a, b, set_op).map(|arr| arr.boxed()), false, false, ) diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index 1aed68b37cd9..a2f4dca9f007 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -258,7 +258,7 @@ pub(super) fn sum(s: &Series) -> PolarsResult { pub(super) fn set_operation(s: &[Series], set_type: SetOperation) -> PolarsResult { let s0 = &s[0]; let s1 = &s[1]; - Ok(list_set_operation(s0.list()?, s1.list()?, set_type).into_series()) + list_set_operation(s0.list()?, s1.list()?, set_type).map(|ca| ca.into_series()) } #[cfg(feature = "list_any_all")] diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 6299dc3d58ef..8665c76c03d5 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -349,21 +349,21 @@ impl ListNameSpace { /// Return the SET DIFFERENCE between both list arrays. #[cfg(feature = "list_sets")] - pub fn difference>(self, other: E) -> Expr { + pub fn set_difference>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::Difference) } /// Return the SET INTERSECTION between both list arrays. #[cfg(feature = "list_sets")] - pub fn intersection>(self, other: E) -> Expr { + pub fn set_intersection>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::Intersection) } /// Return the SET SYMMETRIC DIFFERENCE between both list arrays. #[cfg(feature = "list_sets")] - pub fn symmetric_difference>(self, other: E) -> Expr { + pub fn set_symmetric_difference>(self, other: E) -> Expr { let other = other.into(); self.set_operation(other, SetOperation::SymmetricDifference) } diff --git a/py-polars/src/expr/list.rs b/py-polars/src/expr/list.rs index b005e1bf1881..1838bd291903 100644 --- a/py-polars/src/expr/list.rs +++ b/py-polars/src/expr/list.rs @@ -158,10 +158,10 @@ impl PyExpr { fn list_set_operation(&self, other: PyExpr, operation: Wrap) -> Self { let e = self.inner.clone().list(); match operation.0 { - SetOperation::Intersection => e.intersection(other.inner), - SetOperation::Difference => e.difference(other.inner), + SetOperation::Intersection => e.set_intersection(other.inner), + SetOperation::Difference => e.set_difference(other.inner), SetOperation::Union => e.union(other.inner), - SetOperation::SymmetricDifference => e.symmetric_difference(other.inner), + SetOperation::SymmetricDifference => e.set_symmetric_difference(other.inner), } .into() } diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 7a9cbd7505b8..786e2f6290de 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ -514,6 +514,27 @@ def test_list_set_operations() -> None: assert r2 == exp +def test_list_set_operations_broadcast() -> None: + df = pl.DataFrame( + { + "a": [[2, 3, 3], [3, 1], [1, 2, 3]], + } + ) + + assert df.with_columns( + pl.col("a").list.set_intersection(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[2], [1], [1, 2]]} + assert df.with_columns( + pl.col("a").list.set_union(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[2, 3, 1], [3, 1, 2], [1, 2, 3]]} + assert df.with_columns( + 
pl.col("a").list.set_difference(pl.lit(pl.Series([[1, 2]]))) + ).to_dict(False) == {"a": [[3], [3], [3]]} + assert df.with_columns( + pl.lit(pl.Series("a", [[1, 2]])).list.set_difference("a") + ).to_dict(False) == {"a": [[1], [2], []]} + + def test_list_take_oob_10079() -> None: df = pl.DataFrame( { From d0cd5234125b7e9430ff00a0629c636dc57edf20 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 22 Aug 2023 18:10:19 +0200 Subject: [PATCH 41/55] refactor(python): Remove `deprecate_renamed_methods` util (#10537) --- .../reference/dataframe/descriptive.rst | 2 + .../source/reference/expressions/list.rst | 4 + .../reference/lazyframe/miscellaneous.rst | 1 + .../reference/lazyframe/modify_select.rst | 2 + .../docs/source/reference/series/list.rst | 4 + py-polars/polars/dataframe/frame.py | 45 ++++++++++-- py-polars/polars/expr/list.py | 68 ++++++++++++----- py-polars/polars/lazyframe/frame.py | 46 +++++++++--- py-polars/polars/series/list.py | 68 ++++++++++++----- py-polars/polars/utils/deprecation.py | 73 +------------------ py-polars/tests/unit/test_serde.py | 2 +- .../tests/unit/utils/test_deprecation.py | 25 ------- 12 files changed, 189 insertions(+), 151 deletions(-) diff --git a/py-polars/docs/source/reference/dataframe/descriptive.rst b/py-polars/docs/source/reference/dataframe/descriptive.rst index eab9cb1248f7..42e7b0ed7789 100644 --- a/py-polars/docs/source/reference/dataframe/descriptive.rst +++ b/py-polars/docs/source/reference/dataframe/descriptive.rst @@ -6,6 +6,8 @@ Descriptive .. autosummary:: :toctree: api/ + DataFrame.approx_n_unique + DataFrame.approx_unique DataFrame.describe DataFrame.glimpse DataFrame.estimated_size diff --git a/py-polars/docs/source/reference/expressions/list.rst b/py-polars/docs/source/reference/expressions/list.rst index 9d87a6c831a3..2710b8d56c80 100644 --- a/py-polars/docs/source/reference/expressions/list.rst +++ b/py-polars/docs/source/reference/expressions/list.rst @@ -17,11 +17,13 @@ The following methods are available under the `expr.list` attribute. Expr.list.contains Expr.list.count_match Expr.list.diff + Expr.list.difference Expr.list.eval Expr.list.explode Expr.list.first Expr.list.get Expr.list.head + Expr.list.intersection Expr.list.join Expr.list.last Expr.list.lengths @@ -37,7 +39,9 @@ The following methods are available under the `expr.list` attribute. Expr.list.slice Expr.list.sort Expr.list.sum + Expr.list.symmetric_difference Expr.list.tail Expr.list.take Expr.list.to_struct + Expr.list.union Expr.list.unique diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index 35e385f3430a..77051b2dd589 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -24,3 +24,4 @@ Read/write logical plan LazyFrame.from_json LazyFrame.read_json LazyFrame.serialize + LazyFrame.write_json diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index 2257467fb127..5fa34dea8ad0 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -6,6 +6,8 @@ Manipulation/selection .. 
autosummary:: :toctree: api/ + LazyFrame.approx_n_unique + LazyFrame.approx_unique LazyFrame.bottom_k LazyFrame.clear LazyFrame.clone diff --git a/py-polars/docs/source/reference/series/list.rst b/py-polars/docs/source/reference/series/list.rst index 9f29aab5181d..46942ab076b9 100644 --- a/py-polars/docs/source/reference/series/list.rst +++ b/py-polars/docs/source/reference/series/list.rst @@ -17,12 +17,14 @@ The following methods are available under the `Series.list` attribute. Series.list.contains Series.list.count_match Series.list.diff + Series.list.difference Series.list.eval Series.list.explode Series.list.first Series.list.get Series.list.head Series.list.join + Series.list.intersection Series.list.last Series.list.lengths Series.list.max @@ -37,7 +39,9 @@ The following methods are available under the `Series.list` attribute. Series.list.slice Series.list.sort Series.list.sum + Series.list.symmetric_difference Series.list.tail Series.list.take Series.list.to_struct + Series.list.union Series.list.unique diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 383354272843..72ee90dce7aa 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -85,7 +85,6 @@ from polars.utils.deprecation import ( deprecate_function, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, ) from polars.utils.various import ( @@ -179,10 +178,6 @@ P = ParamSpec("P") -@deprecate_renamed_methods( - mapping={"approx_unique": "approx_n_unique"}, - versions={"approx_unique": "0.18.12"}, -) class DataFrame: """ Two-dimensional data structure representing data as a table with rows and columns. @@ -8560,9 +8555,47 @@ def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> i struct_fields = F.all() if (subset is None) else subset expr = F.struct(struct_fields) # type: ignore[call-overload] - df = self.lazy().select(expr.n_unique()).collect() + df = self.lazy().select(expr.n_unique()).collect(no_optimization=True) return 0 if df.is_empty() else df.row(0)[0] + def approx_n_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + """ + return self.lazy().approx_n_unique().collect(no_optimization=True) + + @deprecate_renamed_function("approx_n_unique", version="0.18.12") + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + return self.approx_n_unique() + def rechunk(self) -> Self: """ Rechunk the data in this DataFrame to a contiguous allocation. 
diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 269366c413e8..5f0e491c2abb 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -7,7 +7,7 @@ from polars import functions as F from polars.utils._parse_expr_input import parse_as_expression from polars.utils._wrap import wrap_expr -from polars.utils.deprecation import deprecate_renamed_methods +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: from datetime import date, datetime, time @@ -16,20 +16,6 @@ from polars.type_aliases import IntoExpr, NullBehavior, ToStructStrategy -@deprecate_renamed_methods( - { - "difference": "set_difference", - "symmetric_difference": "set_symmetric_difference", - "intersection": "set_intersection", - "union": "set_union", - }, - versions={ - "difference": "0.18.10", - "symmetric_difference": "0.18.10", - "intersection": "0.18.10", - "union": "0.18.10", - }, -) class ExprListNameSpace: """Namespace for list related expressions.""" @@ -893,7 +879,7 @@ def eval(self, expr: Expr, *, parallel: bool = False) -> Expr: """ return wrap_expr(self._pyexpr.list_eval(expr._pyexpr, parallel)) - def set_union(self, other: Expr | IntoExpr) -> Expr: + def set_union(self, other: IntoExpr) -> Expr: """ Compute the SET UNION between the elements in this list and the elements of ``other``. @@ -929,7 +915,7 @@ def set_union(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "union")) - def set_difference(self, other: Expr | IntoExpr) -> Expr: + def set_difference(self, other: IntoExpr) -> Expr: """ Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. @@ -967,7 +953,7 @@ def set_difference(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "difference")) - def set_intersection(self, other: Expr | IntoExpr) -> Expr: + def set_intersection(self, other: IntoExpr) -> Expr: """ Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. @@ -1003,7 +989,7 @@ def set_intersection(self, other: Expr | IntoExpr) -> Expr: other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "intersection")) - def set_symmetric_difference(self, other: Expr | IntoExpr) -> Expr: + def set_symmetric_difference(self, other: IntoExpr) -> Expr: """ Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. @@ -1038,3 +1024,47 @@ def set_symmetric_difference(self, other: Expr | IntoExpr) -> Expr: """ # noqa: W505. other = parse_as_expression(other, str_as_lit=False) return wrap_expr(self._pyexpr.list_set_operation(other, "symmetric_difference")) + + @deprecate_renamed_function("set_union", version="0.18.10") + def union(self, other: IntoExpr) -> Expr: + """ + Compute the SET UNION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_union``. + + """ # noqa: W505 + return self.set_union(other) + + @deprecate_renamed_function("set_difference", version="0.18.10") + def difference(self, other: IntoExpr) -> Expr: + """ + Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_difference``. 
+ + """ # noqa: W505 + return self.set_difference(other) + + @deprecate_renamed_function("set_intersection", version="0.18.10") + def intersection(self, other: IntoExpr) -> Expr: + """ + Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_intersection``. + + """ # noqa: W505 + return self.set_intersection(other) + + @deprecate_renamed_function("set_symmetric_difference", version="0.18.10") + def symmetric_difference(self, other: IntoExpr) -> Expr: + """ + Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Expr.list.set_symmetric_difference``. + + """ # noqa: W505 + return self.set_symmetric_difference(other) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 3788181da79a..fefe12c27ef0 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -60,7 +60,6 @@ from polars.utils.deprecation import ( deprecate_function, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, ) from polars.utils.various import ( @@ -116,16 +115,6 @@ P = ParamSpec("P") -@deprecate_renamed_methods( - mapping={ - "approx_unique": "approx_n_unique", - "write_json": "serialize", - }, - versions={ - "approx_unique": "0.18.12", - "write_json": "0.18.12", - }, -) class LazyFrame: """ Representation of a Lazy computation graph/query against a DataFrame. @@ -869,6 +858,30 @@ def serialize(self, file: IOBase | str | Path | None = None) -> str | None: self._ldf.serialize(file) return None + @overload + def write_json(self, file: None = ...) -> str: + ... + + @overload + def write_json(self, file: IOBase | str | Path) -> None: + ... + + @deprecate_renamed_function("serialize", version="0.18.12") + def write_json(self, file: IOBase | str | Path | None = None) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + return self.serialize(file) + def pipe( self, function: Callable[Concatenate[LazyFrame, P], T], @@ -4150,6 +4163,17 @@ def approx_n_unique(self) -> Self: """ return self.select(F.all().approx_n_unique()) + @deprecate_renamed_function("approx_n_unique", version="0.18.12") + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + return self.approx_n_unique() + def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ Add a column at index 0 that counts the rows. 
diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index a97c6a36a0c5..0288c60a4471 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -5,7 +5,7 @@ from polars import functions as F from polars.series.utils import expr_dispatch from polars.utils._wrap import wrap_s -from polars.utils.deprecation import deprecate_renamed_methods +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: from datetime import date, datetime, time @@ -16,20 +16,6 @@ @expr_dispatch -@deprecate_renamed_methods( - { - "difference": "set_difference", - "symmetric_difference": "set_symmetric_difference", - "intersection": "set_intersection", - "union": "set_union", - }, - versions={ - "difference": "0.18.10", - "symmetric_difference": "0.18.10", - "intersection": "0.18.10", - "union": "0.18.10", - }, -) class ListNameSpace: """Namespace for list related methods.""" @@ -592,7 +578,7 @@ def set_union(self, other: Series) -> Series: [5, 6, 7, 8] ] - """ # noqa: W505. + """ # noqa: W505 def set_difference(self, other: Series) -> Series: """ @@ -621,7 +607,7 @@ def set_difference(self, other: Series) -> Series: [5, 7] ] - """ # noqa: W505. + """ # noqa: W505 def set_intersection(self, other: Series) -> Series: """ @@ -646,7 +632,7 @@ def set_intersection(self, other: Series) -> Series: [6] ] - """ # noqa: W505. + """ # noqa: W505 def set_symmetric_difference(self, other: Series) -> Series: """ @@ -657,4 +643,48 @@ def set_symmetric_difference(self, other: Series) -> Series: other Right hand side of the set operation. - """ # noqa: W505. + """ # noqa: W505 + + @deprecate_renamed_function("set_union", version="0.18.10") + def union(self, other: Series) -> Series: + """ + Compute the SET UNION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_union``. + + """ # noqa: W505 + return self.set_union(other) + + @deprecate_renamed_function("set_difference", version="0.18.10") + def difference(self, other: Series) -> Series: + """ + Compute the SET DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_difference``. + + """ # noqa: W505 + return self.set_difference(other) + + @deprecate_renamed_function("set_intersection", version="0.18.10") + def intersection(self, other: Series) -> Series: + """ + Compute the SET INTERSECTION between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_intersection``. + + """ # noqa: W505 + return self.set_intersection(other) + + @deprecate_renamed_function("set_symmetric_difference", version="0.18.10") + def symmetric_difference(self, other: Series) -> Series: + """ + Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of ``other``. + + .. deprecated:: 0.18.10 + This method has been renamed to ``Series.list.set_symmetric_difference``. 
+ + """ # noqa: W505 + return self.set_symmetric_difference(other) diff --git a/py-polars/polars/utils/deprecation.py b/py-polars/polars/utils/deprecation.py index 1307b4a49c4e..db1948cf8913 100644 --- a/py-polars/polars/utils/deprecation.py +++ b/py-polars/polars/utils/deprecation.py @@ -2,8 +2,8 @@ import inspect import warnings -from functools import partial, wraps -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from functools import wraps +from typing import TYPE_CHECKING, Callable, TypeVar from polars.utils.various import find_stacklevel @@ -58,15 +58,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: def deprecate_renamed_function( new_name: str, *, version: str ) -> Callable[[Callable[P, T]], Callable[P, T]]: - """ - Decorator to mark a function as deprecated due to being renamed. - - Notes - ----- - For deprecating renamed class methods, use the ``deprecate_renamed_methods`` - class decorator instead. - - """ + """Decorator to mark a function as deprecated due to being renamed.""" return deprecate_function(f"It has been renamed to `{new_name}`.", version=version) @@ -119,65 +111,6 @@ def _rename_keyword_argument( kwargs[new_name] = kwargs.pop(old_name) -def deprecate_renamed_methods( - mapping: dict[str, str | tuple[str, dict[str, Any]]], *, versions: dict[str, str] -) -> Callable[[type[T]], type[T]]: - """ - Class decorator to mark methods as deprecated due to being renamed. - - This allows for the deprecated method to be deleted. It will remain available - to users, but will no longer show up in auto-complete suggestions. - - If the arguments of the method are being renamed as well, use in conjunction with - `deprecate_renamed_parameter`. - - If the new method has different default values for some keyword arguments, supply - the old default values as a dictionary in the mapping like so:: - - @deprecate_renamed_methods( - {"old_method": ("new_method", {"flag": False})}, - versions={"old_method": "1.0.0"}, - ) - class Foo: - def new_method(flag=True): - ... - - Parameters - ---------- - mapping - Mapping of deprecated method names to new method names. - versions - For each deprecated method name, the Polars version number in which it was - deprecated. This argument is used to help developers determine when to remove - the deprecated functionality. - - """ - - def _redirecting_getattr_(obj: T, item: Any) -> Any: - if isinstance(item, str) and item in mapping: - new_item = mapping[item] - new_item_name = new_item if isinstance(new_item, str) else new_item[0] - class_name = type(obj).__name__ - issue_deprecation_warning( - f"`{class_name}.{item}` is deprecated." 
- f" It has been renamed to `{class_name}.{new_item_name}`.", - version=versions[item], - ) - item = new_item_name - - attr = obj.__getattribute__(item) - if isinstance(new_item, tuple): - attr = partial(attr, **new_item[1]) - return attr - - def decorate(cls: type[T]) -> type[T]: - # note: __getattr__ is only invoked if item isn't found on the class - cls.__getattr__ = _redirecting_getattr_ # type: ignore[attr-defined] - return cls - - return decorate - - def deprecate_nonkeyword_arguments( allowed_args: list[str] | None = None, message: str | None = None, *, version: str ) -> Callable[[Callable[P, T]], Callable[P, T]]: diff --git a/py-polars/tests/unit/test_serde.py b/py-polars/tests/unit/test_serde.py index f5bee11eb449..4d6a245c20a0 100644 --- a/py-polars/tests/unit/test_serde.py +++ b/py-polars/tests/unit/test_serde.py @@ -30,7 +30,7 @@ def test_lazyframe_deprecated_serde() -> None: lf = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}).lazy().select(pl.col("a")) with pytest.deprecated_call(): - json = lf.write_json() # type: ignore[attr-defined] + json = lf.write_json() with pytest.deprecated_call(): result_from = pl.LazyFrame.from_json(json) with pytest.deprecated_call(): diff --git a/py-polars/tests/unit/utils/test_deprecation.py b/py-polars/tests/unit/utils/test_deprecation.py index 7929f1d466ca..4a489660ba3e 100644 --- a/py-polars/tests/unit/utils/test_deprecation.py +++ b/py-polars/tests/unit/utils/test_deprecation.py @@ -9,7 +9,6 @@ deprecate_function, deprecate_nonkeyword_arguments, deprecate_renamed_function, - deprecate_renamed_methods, deprecate_renamed_parameter, issue_deprecation_warning, warn_closed_future_change, @@ -52,30 +51,6 @@ def hello(oof: str, rab: str, ham: str) -> None: assert "rab" in str(recwarn[1].message) -def test_deprecate_renamed_methods() -> None: - # one-to-one redirection - @deprecate_renamed_methods({"foo": "bar"}, versions={"foo": "1.0.0"}) - class DemoClass1: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - with pytest.deprecated_call(): - result = DemoClass1().foo() # type: ignore[attr-defined] - assert result == "baz" - - # redirection with **kwargs - @deprecate_renamed_methods( - {"foo": ("bar", {"upper": True})}, versions={"foo": "1.0.0"} - ) - class DemoClass2: - def bar(self, upper: bool = False) -> str: - return "BAZ" if upper else "baz" - - with pytest.deprecated_call(): - result = DemoClass2().foo() # type: ignore[attr-defined] - assert result == "BAZ" - - class Foo: # noqa: D101 @deprecate_nonkeyword_arguments(allowed_args=["self", "baz"], version="0.1.2") def bar( # noqa: D102 From 641c5d7d1acf74c4150b3938769ba6bf0dc8dccf Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 12:29:20 +0800 Subject: [PATCH 42/55] fix(rust, python): Sorted Utf8Chunked max_str and min_str should consider null value (#10675) --- .../src/chunked_array/ops/aggregate/mod.rs | 32 ++++++++++++++++--- py-polars/tests/unit/series/test_series.py | 8 +++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index 5ed07614951a..f020ed81f7c9 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -450,8 +450,20 @@ impl Utf8Chunked { return None; } match self.is_sorted_flag() { - IsSorted::Ascending => self.get(self.len() - 1), - IsSorted::Descending => self.get(0), + IsSorted::Ascending => { + 
self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, IsSorted::Not => self .downcast_iter() .filter_map(compute::aggregate::max_string) @@ -463,8 +475,20 @@ impl Utf8Chunked { return None; } match self.is_sorted_flag() { - IsSorted::Ascending => self.get(0), - IsSorted::Descending => self.get(self.len() - 1), + IsSorted::Ascending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, IsSorted::Not => self .downcast_iter() .filter_map(compute::aggregate::min_string) diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index d8b6336ac9ed..4db1b6fd85af 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -980,6 +980,14 @@ def test_fill_null() -> None: assert out.dtypes == [pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64] +def test_utf8_series_min_max_10674() -> None: + utf8_series = pl.Series("b", ["a", None, "c", None, "e"], dtype=pl.Utf8) + assert utf8_series.min() == "a" + assert utf8_series.max() == "e" + assert utf8_series.sort(descending=False).min() == "a" + assert utf8_series.sort(descending=True).max() == "e" + + def test_fill_nan() -> None: nan = float("nan") a = pl.Series("a", [1.0, nan, 2.0, nan, 3.0]) From 37531d5409d34ca76f0624dd2a05df94a5661b62 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 13:59:46 +0800 Subject: [PATCH 43/55] feat(rust, python): Support min and max strategy for binary & str columns fill null (#10673) --- .../src/chunked_array/ops/aggregate/mod.rs | 74 +++++++++++++++---- .../src/chunked_array/ops/fill_null.rs | 6 ++ py-polars/tests/unit/series/test_series.py | 8 ++ 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index f020ed81f7c9..048c4267a913 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -138,14 +138,14 @@ where IsSorted::Ascending => { self.last_non_null().and_then(|idx| { // Safety: - // first_non_null returns in bound index + // last_non_null returns in bound index unsafe { self.get_unchecked(idx) } }) }, IsSorted::Descending => { self.first_non_null().and_then(|idx| { // Safety: - // last returns in bound index + // first_non_null returns in bound index unsafe { self.get_unchecked(idx) } }) }, @@ -509,27 +509,69 @@ impl ChunkAggSeries for Utf8Chunked { } } +impl BinaryChunked { + pub(crate) fn max_binary(&self) -> Option<&[u8]> { + if self.is_empty() { + return None; + } + match self.is_sorted_flag() { + IsSorted::Ascending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Not => self + .downcast_iter() + 
.filter_map(compute::aggregate::max_binary) + .fold_first_(|acc, v| if acc > v { acc } else { v }), + } + } + + pub(crate) fn min_binary(&self) -> Option<&[u8]> { + if self.is_empty() { + return None; + } + match self.is_sorted_flag() { + IsSorted::Ascending => { + self.first_non_null().and_then(|idx| { + // Safety: + // first_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Descending => { + self.last_non_null().and_then(|idx| { + // Safety: + // last_non_null returns in bound index + unsafe { self.get_unchecked(idx) } + }) + }, + IsSorted::Not => self + .downcast_iter() + .filter_map(compute::aggregate::min_binary) + .fold_first_(|acc, v| if acc < v { acc } else { v }), + } + } +} + impl ChunkAggSeries for BinaryChunked { fn sum_as_series(&self) -> Series { BinaryChunked::full_null(self.name(), 1).into_series() } fn max_as_series(&self) -> Series { - Series::new( - self.name(), - &[self - .downcast_iter() - .filter_map(compute::aggregate::max_binary) - .fold_first_(|acc, v| if acc > v { acc } else { v })], - ) + Series::new(self.name(), [self.max_binary()]) } fn min_as_series(&self) -> Series { - Series::new( - self.name(), - &[self - .downcast_iter() - .filter_map(compute::aggregate::min_binary) - .fold_first_(|acc, v| if acc < v { acc } else { v })], - ) + Series::new(self.name(), [self.min_binary()]) } } diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index 0e6539f7e5c2..440fb6591ea8 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -363,6 +363,12 @@ fn fill_null_binary(ca: &BinaryChunked, strategy: FillNullStrategy) -> PolarsRes out.rename(ca.name()); Ok(out) }, + FillNullStrategy::Min => { + ca.fill_null_with_values(ca.min_binary().ok_or_else(err_fill_null)?) + }, + FillNullStrategy::Max => { + ca.fill_null_with_values(ca.max_binary().ok_or_else(err_fill_null)?) 
+ }, strat => polars_bail!(InvalidOperation: "fill-null strategy {:?} is not supported", strat), } } diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 4db1b6fd85af..c72bd054f8ce 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -927,6 +927,14 @@ def test_fill_null() -> None: assert a.fill_null(strategy="backward").to_list() == [0.0, 1.0, 2.0, 2.0, 3.0, 3.0] assert a.fill_null(strategy="mean").to_list() == [0.0, 1.0, 1.5, 2.0, 1.5, 3.0] + b = pl.Series("b", ["a", None, "c", None, "e"]) + assert b.fill_null(strategy="min").to_list() == ["a", "a", "c", "a", "e"] + assert b.fill_null(strategy="max").to_list() == ["a", "e", "c", "e", "e"] + + c = pl.Series("c", [b"a", None, b"c", None, b"e"]) + assert c.fill_null(strategy="min").to_list() == [b"a", b"a", b"c", b"a", b"e"] + assert c.fill_null(strategy="max").to_list() == [b"a", b"e", b"c", b"e", b"e"] + df = pl.DataFrame( [ pl.Series("i32", [1, 2, None], dtype=pl.Int32), From 67d5328b77d846927ebb165880792de4c812dc54 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 08:01:38 +0200 Subject: [PATCH 44/55] docs(python): Fix minor issue with `sink_parquet` docs (#10669) --- py-polars/polars/lazyframe/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index fefe12c27ef0..e5b90b51d568 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1838,15 +1838,13 @@ def sink_parquet( - "gzip" : min-level: 0, max-level: 10. - "brotli" : min-level: 0, max-level: 11. - "zstd" : min-level: 1, max-level: 22. - statistics Write statistics to the parquet headers. This requires extra compute. row_group_size Size of the row groups in number of rows. If None (default), the chunks of the `DataFrame` are used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. If None and ``use_pyarrow=True``, the row group size - will be the minimum of the DataFrame size and 64 * 1024 * 1024. + writing speeds. data_pagesize_limit Size limit of individual data pages. 
If not set defaults to 1024 * 1024 bytes From 76820a8c122eb2e853c5b88926a22f044c2c003f Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Wed, 23 Aug 2023 23:38:17 +0800 Subject: [PATCH 45/55] fix(rust, python): Set the correct fast_explode flag for ListUtf8ChunkedBuilder (#10684) --- .../src/chunked_array/builder/list/binary.rs | 3 +++ py-polars/tests/unit/operations/test_explode.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/crates/polars-core/src/chunked_array/builder/list/binary.rs b/crates/polars-core/src/chunked_array/builder/list/binary.rs index 0f9610db3d00..00564cdc94d4 100644 --- a/crates/polars-core/src/chunked_array/builder/list/binary.rs +++ b/crates/polars-core/src/chunked_array/builder/list/binary.rs @@ -48,6 +48,9 @@ impl ListUtf8ChunkedBuilder { #[inline] pub(crate) fn append(&mut self, ca: &Utf8Chunked) { + if ca.is_empty() { + self.fast_explode = false; + } let value_builder = self.builder.mut_values(); value_builder.try_extend(ca).unwrap(); self.builder.try_push_valid().unwrap(); diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 4c39eeeefc24..2e5aa0679188 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -315,3 +315,17 @@ def test_explode_array() -> None: for ex in ("a", ~cs.integer()): out = df.explode(ex).collect() # type: ignore[arg-type] assert_frame_equal(out, expected) + + +def test_utf8_list_agg_explode() -> None: + df = pl.DataFrame({"a": [[None], ["b"]]}) + + df = df.select( + pl.col("a").list.eval(pl.element().filter(pl.element().is_not_null())) + ) + assert not df["a"].flags["FAST_EXPLODE"] + + df2 = pl.DataFrame({"a": [[], ["b"]]}) + + assert_frame_equal(df, df2) + assert_frame_equal(df.explode("a"), df2.explode("a")) From 9bfa5b6db9b112328c657651c2def6543ae46473 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 17:44:37 +0200 Subject: [PATCH 46/55] test(python): Update for new pyarrow `13.0.0` behavior (#10691) --- py-polars/polars/dataframe/frame.py | 4 +--- py-polars/tests/unit/test_interop.py | 14 +++++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 72ee90dce7aa..8f021b3eaef6 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -9276,14 +9276,12 @@ def iter_slices(self, n_rows: int = 10_000) -> Iterator[DataFrame]: >>> for frame in df.iter_slices(n_rows=15_000): ... record_batch = frame.to_arrow().to_batches()[0] - ... print(record_batch, "\n<< ", len(record_batch)) + ... print(f"{record_batch.schema}\n<< {len(record_batch)}") ... 
- pyarrow.RecordBatch a: int32 b: date32[day] c: large_string << 15000 - pyarrow.RecordBatch a: int32 b: date32[day] c: large_string diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py index a9361633075b..039afb6d8a0b 100644 --- a/py-polars/tests/unit/test_interop.py +++ b/py-polars/tests/unit/test_interop.py @@ -569,13 +569,13 @@ def test_to_pandas() -> None: ) pd_out = df.to_pandas() pd_out_dtypes_expected = [ - np.uint8, - np.float64, - np.float64, - np.dtype("datetime64[ns]"), - np.object_, - np.object_, - np.dtype("datetime64[ns]"), + np.dtype(np.uint8), + np.dtype(np.float64), + np.dtype(np.float64), + np.dtype("datetime64[ms]"), + np.dtype(np.object_), + np.dtype(np.object_), + np.dtype("datetime64[us]"), pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False), pd.CategoricalDtype(categories=["e", "f"], ordered=False), ] From f80e6e018624b8d40b6144179d95d9b74ed607a6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 23 Aug 2023 18:49:54 +0200 Subject: [PATCH 47/55] feat(python): Explicitly implement `Protocol` for interchange classes (#10688) --- py-polars/polars/interchange/buffer.py | 4 +- py-polars/polars/interchange/column.py | 4 +- py-polars/polars/interchange/dataframe.py | 7 +- py-polars/polars/interchange/protocol.py | 142 +++++++++++++++++++--- 4 files changed, 137 insertions(+), 20 deletions(-) diff --git a/py-polars/polars/interchange/buffer.py b/py-polars/polars/interchange/buffer.py index 5ee3b55d7db6..46c6bf12dc8f 100644 --- a/py-polars/polars/interchange/buffer.py +++ b/py-polars/polars/interchange/buffer.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from polars.interchange.protocol import DlpackDeviceType, DtypeKind +from polars.interchange.protocol import Buffer, DlpackDeviceType, DtypeKind from polars.interchange.utils import polars_dtype_to_dtype if TYPE_CHECKING: @@ -11,7 +11,7 @@ from polars import Series -class PolarsBuffer: +class PolarsBuffer(Buffer): """ A buffer object backed by a Polars Series consisting of a single chunk. diff --git a/py-polars/polars/interchange/column.py b/py-polars/polars/interchange/column.py index 8cf81b0b33c8..c7a945b1977f 100644 --- a/py-polars/polars/interchange/column.py +++ b/py-polars/polars/interchange/column.py @@ -4,7 +4,7 @@ from polars.datatypes import Categorical from polars.interchange.buffer import PolarsBuffer -from polars.interchange.protocol import ColumnNullType, DtypeKind, Endianness +from polars.interchange.protocol import Column, ColumnNullType, DtypeKind, Endianness from polars.interchange.utils import polars_dtype_to_dtype from polars.utils._wrap import wrap_s @@ -16,7 +16,7 @@ from polars.interchange.protocol import CategoricalDescription, ColumnBuffers, Dtype -class PolarsColumn: +class PolarsColumn(Column): """ A column object backed by a Polars Series. diff --git a/py-polars/polars/interchange/dataframe.py b/py-polars/polars/interchange/dataframe.py index 2d43a1353901..56ed4337d6f0 100644 --- a/py-polars/polars/interchange/dataframe.py +++ b/py-polars/polars/interchange/dataframe.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING from polars.interchange.column import PolarsColumn +from polars.interchange.protocol import DataFrame as InterchangeDataFrame if TYPE_CHECKING: from collections.abc import Iterator @@ -13,7 +14,7 @@ from polars import DataFrame -class PolarsDataFrame: +class PolarsDataFrame(InterchangeDataFrame): """ A dataframe object backed by a Polars DataFrame. 
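# Editor's note (not part of the patch): the hunks above and below make the
# concrete interchange classes inherit explicitly from the `Protocol` classes
# declared in `polars/interchange/protocol.py`. Below is a minimal, self-contained
# sketch of that pattern; the names `Frame` and `PolarsFrame` are hypothetical
# stand-ins for illustration only, not the real interchange surface.
from __future__ import annotations

from typing import Protocol


class Frame(Protocol):
    """Protocol: declares the required methods, with docstring-only bodies."""

    def num_rows(self) -> int | None:
        """Return the number of rows, if known."""

    def num_columns(self) -> int:
        """Return the number of columns."""


class PolarsFrame(Frame):
    """Concrete class that subclasses the protocol explicitly.

    Runtime behaviour is unchanged, but static type checkers can now verify
    that the implementation matches the protocol's method signatures.
    """

    def __init__(self, shape: tuple[int, int]) -> None:
        self._shape = shape

    def num_rows(self) -> int | None:
        return self._shape[0]

    def num_columns(self) -> int:
        return self._shape[1]


if __name__ == "__main__":
    frame = PolarsFrame((3, 2))
    assert frame.num_rows() == 3
    assert frame.num_columns() == 2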
@@ -27,6 +28,8 @@ class PolarsDataFrame: """ + version = 0 + def __init__(self, df: DataFrame, *, allow_copy: bool = True): self._df = df self._allow_copy = allow_copy @@ -124,7 +127,7 @@ def get_columns(self) -> Iterator[PolarsColumn]: def select_columns(self, indices: Sequence[int]) -> PolarsDataFrame: """ - Create a new DataFrame by selecting a subset of columns by index. + Create a new dataframe by selecting a subset of columns by index. Parameters ---------- diff --git a/py-polars/polars/interchange/protocol.py b/py-polars/polars/interchange/protocol.py index de51804c2f63..4d7a85bfe83b 100644 --- a/py-polars/polars/interchange/protocol.py +++ b/py-polars/polars/interchange/protocol.py @@ -1,7 +1,16 @@ from __future__ import annotations from enum import IntEnum -from typing import TYPE_CHECKING, Literal, Tuple, TypedDict +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + Literal, + Protocol, + Sequence, + Tuple, + TypedDict, +) if TYPE_CHECKING: import sys @@ -15,6 +24,19 @@ from typing_extensions import TypeAlias +class DlpackDeviceType(IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + class DtypeKind(IntEnum): """ Integer enum for data types. @@ -105,19 +127,6 @@ class CategoricalDescription(TypedDict): categories: PolarsColumn -class DlpackDeviceType(IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - - class Endianness: """Enum indicating the byte-order of a data type.""" @@ -125,3 +134,108 @@ class Endianness: BIG = ">" NATIVE = "=" NA = "|" + + +class Buffer(Protocol): + """Interchange buffer object.""" + + @property + def bufsize(self) -> int: + """Buffer size in bytes.""" + + @property + def ptr(self) -> int: + """Pointer to start of the buffer as an integer.""" + + def __dlpack__(self) -> Any: + """Represent this structure as DLPack interface.""" + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """Device type and device ID for where the data in the buffer resides.""" + + +class Column(Protocol): + """Interchange column object.""" + + def size(self) -> int: + """Size of the column in elements.""" + + @property + def offset(self) -> int: + """Offset of the first element with respect to the start of the underlying buffer.""" # noqa: W505 + + @property + def dtype(self) -> Dtype: + """Data type of the column.""" + + @property + def describe_categorical(self) -> CategoricalDescription: + """Description of the categorical data type of the column.""" + + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """Description of the null representation the column uses.""" + + @property + def null_count(self) -> int | None: + """Number of null elements, if known.""" + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the column.""" + + def num_chunks(self) -> int: + """Return the number of chunks the column consists of.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]: + """Return an iterator yielding the column chunks.""" + + def get_buffers(self) -> ColumnBuffers: + """Return a dictionary containing the underlying buffers.""" + + +class DataFrame(Protocol): + """Interchange dataframe object.""" + + @property + def version(self) -> int: + """Version of the protocol.""" + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True 
+ ) -> DataFrame: + """Construct a new dataframe object, potentially changing the parameters.""" + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the dataframe.""" + + def num_columns(self) -> int: + """Return the number of columns in the dataframe.""" + + def num_rows(self) -> int | None: + """Return the number of rows in the dataframe, if available.""" + + def num_chunks(self) -> int: + """Return the number of chunks the dataframe consists of..""" + + def column_names(self) -> Iterable[str]: + """Return the column names.""" + + def get_column(self, i: int) -> Column: + """Return the column at the indicated position.""" + + def get_column_by_name(self, name: str) -> Column: + """Return the column with the given name.""" + + def get_columns(self) -> Iterable[Column]: + """Return an iterator yielding the columns.""" + + def select_columns(self, indices: Sequence[int]) -> DataFrame: + """Create a new dataframe by selecting a subset of columns by index.""" + + def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + """Create a new dataframe by selecting a subset of columns by name.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]: + """Return an iterator yielding the chunks of the dataframe.""" From 18736fa0e7956a0abaa91cf0ac766995a0cb8fae Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 03:42:49 +0800 Subject: [PATCH 48/55] fix(rust, python): re-sort buffer when update window swap the whole buffer (#10696) --- .../src/kernels/rolling/window.rs | 3 +- .../tests/unit/datatypes/test_temporal.py | 31 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/kernels/rolling/window.rs b/crates/polars-arrow/src/kernels/rolling/window.rs index c9083c6cf9ba..8f4c0be1b9fb 100644 --- a/crates/polars-arrow/src/kernels/rolling/window.rs +++ b/crates/polars-arrow/src/kernels/rolling/window.rs @@ -30,7 +30,8 @@ impl<'a, T: NativeType + IsFloat + PartialOrd> SortedBuf<'a, T> { if start >= self.last_end { self.buf.clear(); let new_window = self.slice.get_unchecked(start..end); - self.buf.extend_from_slice(new_window) + self.buf.extend_from_slice(new_window); + sort_buf(&mut self.buf); } else { // remove elements that should leave the window for idx in self.last_start..start { diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 1b6767dc07bc..269b3947bc7d 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -637,6 +637,37 @@ def test_explode_date() -> None: ] +def test_groupy_by_dynamic_median_10695() -> None: + df = pl.DataFrame( + { + "timestamp": pl.date_range( + datetime(2023, 8, 22, 15, 44, 30), + datetime(2023, 8, 22, 15, 48, 50), + "20s", + eager=True, + ), + "foo": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + } + ) + + assert df.group_by_dynamic( + index_column="timestamp", + every="60s", + period="3m", + ).agg( + pl.col("foo").median() + ).to_dict(False) == { + "timestamp": [ + datetime(2023, 8, 22, 15, 44), + datetime(2023, 8, 22, 15, 45), + datetime(2023, 8, 22, 15, 46), + datetime(2023, 8, 22, 15, 47), + datetime(2023, 8, 22, 15, 48), + ], + "foo": [1.0, 1.0, 1.0, 1.0, 1.0], + } + + def test_group_by_dynamic_when_conversion_crosses_dates_7274() -> None: df = ( pl.DataFrame( From 068231592ca1f35b3f363371bba135a03485c709 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 13:30:47 +0800 Subject: [PATCH 49/55] fix(rust, 
python): Reused input series in rolling_apply should not be orderly (#10694) --- .../src/chunked_array/ops/rolling_window.rs | 6 ++++-- py-polars/tests/unit/operations/test_rolling.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/rolling_window.rs b/crates/polars-core/src/chunked_array/ops/rolling_window.rs index 487ecc503624..0bc0267b40dc 100644 --- a/crates/polars-core/src/chunked_array/ops/rolling_window.rs +++ b/crates/polars-core/src/chunked_array/ops/rolling_window.rs @@ -119,9 +119,10 @@ mod inner_mod { unsafe { *ptr = arr_window; } + // reset flags as we reuse this container + series_container.clear_settings(); // ensure the length is correct series_container._get_inner_mut().compute_len(); - let s = if size == options.window_size { f(&series_container.multiply(&weights_series).unwrap()) } else { @@ -166,9 +167,10 @@ mod inner_mod { unsafe { *ptr = arr_window; } + // reset flags as we reuse this container + series_container.clear_settings(); // ensure the length is correct series_container._get_inner_mut().compute_len(); - let s = f(&series_container); let out = self.unpack_series_matching_type(&s)?; builder.append_option(out.get(0)); diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 46ff6c6e5181..4eb6b6c06e8a 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -792,6 +792,20 @@ def test_rolling_window_size_9160() -> None: ).to_list() == [1] +def test_rolling_apply_clear_reuse_series_state_10681() -> None: + df = pl.DataFrame({"a": [1, 1, 1, 1, 2, 2, 2, 2], "b": [0, 1, 11.0, 7, 4, 2, 3, 8]}) + assert df.with_columns( + pl.col("b") + .rolling_apply(lambda s: s.min(), window_size=3, min_periods=2) + .over("a") + .alias("min") + ).to_dict(False) == { + "a": [1, 1, 1, 1, 2, 2, 2, 2], + "b": [0.0, 1.0, 11.0, 7.0, 4.0, 2.0, 3.0, 8.0], + "min": [None, 0.0, 0.0, 1.0, None, 2.0, 2.0, 2.0], + } + + def test_rolling_empty_window_9406() -> None: datecol = pl.Series( "d", From 576ed0d0cd1d710d42b4ea5363f1eee6226be31b Mon Sep 17 00:00:00 2001 From: Marshall Date: Thu, 24 Aug 2023 01:32:05 -0400 Subject: [PATCH 50/55] fix(python): raise exception with invalid `on` arg type for join_asof (#10690) --- py-polars/polars/dataframe/frame.py | 15 +++++++++++ .../tests/unit/operations/test_join_asof.py | 26 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 8f021b3eaef6..47a7af4260ca 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5792,6 +5792,21 @@ def join_asof( f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" ) + if on is not None: + if not isinstance(on, (str, pl.Expr)): + raise TypeError( + f"expected `on` to be str or Expr, got {type(on).__name__!r}" + ) + else: + if not isinstance(left_on, (str, pl.Expr)): + raise TypeError( + f"expected `left_on` to be str or Expr, got {type(left_on).__name__!r}" + ) + elif not isinstance(right_on, (str, pl.Expr)): + raise TypeError( + f"expected `right_on` to be str or Expr, got {type(right_on).__name__!r}" + ) + return ( self.lazy() .join_asof( diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 6ec901d661fb..2c43c4d10cfc 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ 
b/py-polars/tests/unit/operations/test_join_asof.py @@ -1024,3 +1024,29 @@ def test_join_asof_by_argument_parsing() -> None: ) assert_frame_equal(by_list2, by_list) assert_frame_equal(by_tuple2, by_list) + + +def test_join_asof_invalid_args() -> None: + df1 = pl.DataFrame( + { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + ).set_sorted("a") + df2 = pl.DataFrame( + { + "a": [1, 2, 3], + "c": [1, 2, 3], + } + ).set_sorted("a") + + with pytest.raises(TypeError, match="expected `on` to be str or Expr, got 'list'"): + df1.join_asof(df2, on=["a"]) # type: ignore[arg-type] + with pytest.raises( + TypeError, match="expected `left_on` to be str or Expr, got 'list'" + ): + df1.join_asof(df2, left_on=["a"], right_on="a") # type: ignore[arg-type] + with pytest.raises( + TypeError, match="expected `right_on` to be str or Expr, got 'list'" + ): + df1.join_asof(df2, left_on="a", right_on=["a"]) # type: ignore[arg-type] From a9a87a4ac46138fe0a901910b7c0cb6ed5924373 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 24 Aug 2023 13:43:12 +0800 Subject: [PATCH 51/55] fix(rust, python): Cast small int type when scan csv in streaming mode. (#10679) --- crates/polars-pipe/src/executors/sources/csv.rs | 2 +- py-polars/tests/unit/streaming/test_streaming_io.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 8a6338827828..1053ff1d236c 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -62,7 +62,7 @@ impl CsvSource { let reader = CsvReader::from_path(&path) .unwrap() .has_header(options.has_header) - .with_schema(Some(self.schema.clone())) + .with_dtypes(Some(self.schema.clone())) .with_delimiter(options.delimiter) .with_ignore_errors(options.ignore_errors) .with_skip_rows(options.skip_rows) diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index 888faeb6d2c4..de91a0a36b9e 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -27,3 +27,12 @@ def test_scan_slice_streaming(io_files_path: Path) -> None: foods_file_path = io_files_path / "foods1.csv" df = pl.scan_csv(foods_file_path).head(5).collect(streaming=True) assert df.shape == (5, 4) + + +@pytest.mark.parametrize("dtype", [pl.Int8, pl.UInt8, pl.Int16, pl.UInt16]) +def test_scan_csv_overwrite_small_dtypes( + io_files_path: Path, dtype: pl.DataType +) -> None: + file_path = io_files_path / "foods1.csv" + df = pl.scan_csv(file_path, dtypes={"sugars_g": dtype}).collect(streaming=True) + assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float64, dtype] From abab4970d8e4895309f3f66ccf4c1d8e84cbfdfd Mon Sep 17 00:00:00 2001 From: Vasanthakumar Vijayasekaran Date: Thu, 24 Aug 2023 17:48:53 +0530 Subject: [PATCH 52/55] fix(rust): fix bug when providing custom labels and opting for duplicates in qcut (#10686) --- crates/polars-ops/src/series/ops/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 947aa40891f1..ac49dffb7216 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -127,7 +127,7 @@ pub fn qcut( Some( ll.into_iter() .enumerate() - .filter(|(i, _)| *i == 0 || *i == blen || qbreaks[*i] != qbreaks[i - 1]) + .filter(|(i, _)| *i == 0 || *i == blen - 1 || qbreaks[*i] != qbreaks[i - 1]) 
.unzip::<_, _, Vec<_>, Vec<_>>() .1, ) From a4427dff5aae48d6c8b52ca6c1f3d7e2fb670aed Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 24 Aug 2023 19:12:44 +0200 Subject: [PATCH 53/55] ci: Clear GitHub Actions caches weekly (#10715) --- .github/workflows/clear-caches.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/clear-caches.yml diff --git a/.github/workflows/clear-caches.yml b/.github/workflows/clear-caches.yml new file mode 100644 index 000000000000..f6a001c35419 --- /dev/null +++ b/.github/workflows/clear-caches.yml @@ -0,0 +1,19 @@ +# Clearing caches regularly takes care of Rust caches growing to problematic size over time + +name: Clear caches + +on: + schedule: + - cron: '0 4 * * MON' + workflow_dispatch: + +jobs: + clear-caches: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Clear all caches + run: gh cache delete --all + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 21e8cf0f3f6a781cecf4ee9504e99c1edf3648fc Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 24 Aug 2023 18:15:07 +0100 Subject: [PATCH 54/55] perf(rust, python): parse time zones outside of downcast_iter() in replace_time_zone (#10713) --- crates/polars-arrow/src/kernels/time.rs | 29 ++++--------------- .../datetime/replace_time_zone.rs | 18 ++++++++++-- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/crates/polars-arrow/src/kernels/time.rs b/crates/polars-arrow/src/kernels/time.rs index b9cc64d612cf..b9774d932020 100644 --- a/crates/polars-arrow/src/kernels/time.rs +++ b/crates/polars-arrow/src/kernels/time.rs @@ -7,7 +7,6 @@ use arrow::temporal_conversions::{ }; use chrono::{LocalResult, NaiveDateTime, TimeZone}; use chrono_tz::Tz; -use polars_error::polars_bail; use crate::error::PolarsResult; @@ -37,11 +36,11 @@ fn convert_to_naive_local( } } -fn convert_to_timestamp( - from_tz: Tz, - to_tz: Tz, +pub fn replace_time_zone( arr: &PrimitiveArray, tu: TimeUnit, + from_tz: &Tz, + to_tz: &Tz, use_earliest: Option, ) -> PolarsResult> { let res = match tu { @@ -49,7 +48,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_ms_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_millis()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_millis()) }, ArrowDataType::Int64, ), @@ -57,7 +56,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_us_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_micros()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_micros()) }, ArrowDataType::Int64, ), @@ -65,7 +64,7 @@ fn convert_to_timestamp( arr, |value| { let ndt = timestamp_ns_to_datetime(value); - Ok(convert_to_naive_local(&from_tz, &to_tz, ndt, use_earliest)?.timestamp_nanos()) + Ok(convert_to_naive_local(from_tz, to_tz, ndt, use_earliest)?.timestamp_nanos()) }, ArrowDataType::Int64, ), @@ -73,19 +72,3 @@ fn convert_to_timestamp( }; Ok(res?) 
} - -pub fn replace_time_zone( - arr: &PrimitiveArray, - tu: TimeUnit, - from: &str, - to: &str, - use_earliest: Option, -) -> PolarsResult> { - match from.parse::() { - Ok(from_tz) => match to.parse::() { - Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu, use_earliest), - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", to), - }, - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", from), - } -} diff --git a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs index 77954a64e065..82d2785f82a0 100644 --- a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs +++ b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs @@ -1,16 +1,28 @@ +use chrono_tz::Tz; use polars_arrow::kernels::replace_time_zone as replace_time_zone_kernel; use polars_core::prelude::*; +fn parse_time_zone(s: &str) -> PolarsResult { + s.parse() + .map_err(|e| polars_err!(ComputeError: format!("unable to parse time zone: '{s}': {e}"))) +} + pub fn replace_time_zone( ca: &DatetimeChunked, time_zone: Option<&str>, use_earliest: Option, ) -> PolarsResult { let out: PolarsResult<_> = { - let from = ca.time_zone().as_deref().unwrap_or("UTC"); - let to = time_zone.unwrap_or("UTC"); + let from_tz = parse_time_zone(ca.time_zone().as_deref().unwrap_or("UTC"))?; + let to_tz = parse_time_zone(time_zone.unwrap_or("UTC"))?; let chunks = ca.downcast_iter().map(|arr| { - replace_time_zone_kernel(arr, ca.time_unit().to_arrow(), from, to, use_earliest) + replace_time_zone_kernel( + arr, + ca.time_unit().to_arrow(), + &from_tz, + &to_tz, + use_earliest, + ) }); let out = ChunkedArray::try_from_chunk_iter(ca.name(), chunks)?; Ok(out.into_datetime(ca.time_unit(), time_zone.map(|x| x.to_string()))) From 5d1b28a227cc6da70e94d359bd5e862e49f2a7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9?= Date: Fri, 25 Aug 2023 04:20:33 +0200 Subject: [PATCH 55/55] doc(rust): Fix typo in `upsample` docs (#8285) --- crates/polars-time/src/upsample.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index d6cde94ee98d..bd170a6bc213 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -19,7 +19,7 @@ pub trait PolarsUpsample { /// * `every` - interval will start 'every' duration /// * `offset` - change the start of the date_range by this offset. /// - /// The `period` and `offset` arguments are created with + /// The `every` and `offset` arguments are created with /// the following string language: /// - 1ns (1 nanosecond) /// - 1us (1 microsecond) @@ -33,11 +33,14 @@ pub trait PolarsUpsample { /// - 1q (1 calendar quarter) /// - 1y (1 calendar year) /// - 1i (1 index count) + /// /// Or combine them: /// "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + /// /// Suffix with `"_saturating"` to saturate dates with days too /// large for their month to the last day of the month (e.g. /// 2022-02-29 to 2022-02-28). + /// /// By "calendar day", we mean the corresponding time on the next /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter", @@ -59,7 +62,7 @@ pub trait PolarsUpsample { /// * `every` - interval will start 'every' duration /// * `offset` - change the start of the date_range by this offset. 
/// - /// The `period` and `offset` arguments are created with + /// The `every` and `offset` arguments are created with /// the following string language: /// - 1ns (1 nanosecond) /// - 1us (1 microsecond) @@ -73,11 +76,14 @@ pub trait PolarsUpsample { /// - 1q (1 calendar quarter) /// - 1y (1 calendar year) /// - 1i (1 index count) + /// /// Or combine them: /// "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + /// /// Suffix with `"_saturating"` to saturate dates with days too /// large for their month to the last day of the month (e.g. /// 2022-02-29 to 2022-02-28). + /// /// By "calendar day", we mean the corresponding time on the next /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter",
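# Editor's note (not part of the patch): a hedged usage sketch of the duration
# strings documented in the `upsample` doc comment above, shown from the Python
# side. It assumes a Polars version from around this patch series; exact output
# formatting may differ.
from datetime import datetime

import polars as pl

df = pl.DataFrame(
    {
        "time": [datetime(2021, 2, 1), datetime(2021, 4, 1), datetime(2021, 6, 1)],
        "value": [1, 2, 3],
    }
).sort("time")

# `every="1mo"` uses the calendar-aware "1 calendar month" interval described in
# the doc comment; rows are inserted for the missing months (March, May) with
# null values, which can then be filled, e.g. with a forward fill.
upsampled = df.upsample(time_column="time", every="1mo")
print(upsampled.fill_null(strategy="forward"))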