Skip to content

Commit

Permalink
feat(rust, python): add label argument to group_by_dynamic, and d…
Browse files Browse the repository at this point in the history
…eprecate `truncate`
  • Loading branch information
MarcoGorelli committed Sep 26, 2023
1 parent 1f0450a commit 2efddf0
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 24 deletions.
30 changes: 17 additions & 13 deletions crates/polars-time/src/group_by/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub struct DynamicGroupOptions {
/// Offset window boundaries.
pub offset: Duration,
/// Truncate the time column values to the window.
pub truncate: bool,
pub label: Label,
/// Add the boundaries to the dataframe.
pub include_boundaries: bool,
pub closed_window: ClosedWindow,
Expand All @@ -46,7 +46,7 @@ impl Default for DynamicGroupOptions {
every: Duration::new(1),
period: Duration::new(1),
offset: Duration::new(1),
truncate: true,
label: Label::Left,
include_boundaries: false,
closed_window: ClosedWindow::Left,
start_by: Default::default(),
Expand Down Expand Up @@ -290,8 +290,10 @@ impl Wrap<&DataFrame> {
include_lower_bound = true;
include_upper_bound = true;
}
if options.truncate {
if options.label == Label::Left {
include_lower_bound = true;
} else if options.label == Label::Right {
include_upper_bound = true;
}

let mut update_bounds =
Expand Down Expand Up @@ -481,28 +483,30 @@ impl Wrap<&DataFrame> {
}

let lower = lower_bound.map(|lower| Int64Chunked::new_vec(LB_NAME, lower));
let upper = upper_bound.map(|upper| Int64Chunked::new_vec(UP_NAME, upper));

if options.truncate {
if options.label == Label::Left {
let mut lower = lower.clone().unwrap();
if by.is_empty() {
lower.set_sorted_flag(IsSorted::Ascending)
}
dt = lower.with_name(dt.name());
} else if options.label == Label::Right {
let mut upper = upper.clone().unwrap();
if by.is_empty() {
upper.set_sorted_flag(IsSorted::Ascending)
}
dt = upper.with_name(dt.name());
}

if let (true, Some(mut lower), Some(upper)) =
(options.include_boundaries, lower, upper_bound)
if let (true, Some(mut lower), Some(mut upper)) = (options.include_boundaries, lower, upper)
{
let mut upper = Int64Chunked::new_vec(UP_NAME, upper)
.into_datetime(tu, tz.clone())
.into_series();

if by.is_empty() {
lower.set_sorted_flag(IsSorted::Ascending);
upper.set_sorted_flag(IsSorted::Ascending);
}
by.push(lower.into_datetime(tu, tz.clone()).into_series());
by.push(upper);
by.push(upper.into_datetime(tu, tz.clone()).into_series());
}

dt.into_datetime(tu, None)
Expand Down Expand Up @@ -824,7 +828,7 @@ mod test {
every: Duration::parse("1h"),
period: Duration::parse("1h"),
offset: Duration::parse("0h"),
truncate: true,
label: Label::Left,
include_boundaries: true,
closed_window: ClosedWindow::Both,
start_by: Default::default(),
Expand Down Expand Up @@ -939,7 +943,7 @@ mod test {
every: Duration::parse("6d"),
period: Duration::parse("6d"),
offset: Duration::parse("0h"),
truncate: true,
label: Label::Left,
include_boundaries: true,
closed_window: ClosedWindow::Both,
start_by: Default::default(),
Expand Down
8 changes: 8 additions & 0 deletions crates/polars-time/src/windows/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ pub enum ClosedWindow {
None,
}

#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Label {
Left,
Right,
Datapoint,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum StartBy {
Expand Down
13 changes: 12 additions & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5239,9 +5239,10 @@ def group_by_dynamic(
every: str | timedelta,
period: str | timedelta | None = None,
offset: str | timedelta | None = None,
truncate: bool = True,
truncate: bool | None = None,
include_boundaries: bool = False,
closed: ClosedInterval = "left",
label: str = "left",
by: IntoExpr | Iterable[IntoExpr] | None = None,
start_by: StartBy = "window",
check_sorted: bool = True,
Expand Down Expand Up @@ -5285,12 +5286,21 @@ def group_by_dynamic(
Defaults to negative `every`.
truncate
truncate the time value to the window lower bound
.. deprecated:: 0.19.4
Use `label` instead.
include_boundaries
Add the lower and upper bound of the window to the "_lower_bound" and
"_upper_bound" columns. This will impact performance because it's harder to
parallelize
closed : {'left', 'right', 'both', 'none'}
Define which sides of the temporal interval are closed (inclusive).
label : {'left', 'right', 'datapoint'}
Define which label to use for the window:
- 'left': lower boundary of the window
- 'right': upper boundary of the window
- 'datapoint': the first value of the index column in the given window
by
Also group by this column/these columns
start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
Expand Down Expand Up @@ -5570,6 +5580,7 @@ def group_by_dynamic(
period=period,
offset=offset,
truncate=truncate,
label=label,
include_boundaries=include_boundaries,
closed=closed,
by=by,
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/dataframe/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,9 +955,10 @@ def __init__(
every: str | timedelta,
period: str | timedelta | None,
offset: str | timedelta | None,
truncate: bool,
truncate: bool | None,
include_boundaries: bool,
closed: ClosedInterval,
label: str,
by: IntoExpr | Iterable[IntoExpr] | None,
start_by: StartBy,
check_sorted: bool,
Expand All @@ -972,6 +973,7 @@ def __init__(
self.period = period
self.offset = offset
self.truncate = truncate
self.label = label
self.include_boundaries = include_boundaries
self.closed = closed
self.by = by
Expand All @@ -989,6 +991,7 @@ def __iter__(self) -> Self:
period=self.period,
offset=self.offset,
truncate=self.truncate,
label=self.label,
include_boundaries=self.include_boundaries,
closed=self.closed,
by=self.by,
Expand Down Expand Up @@ -1052,6 +1055,7 @@ def agg(
period=self.period,
offset=self.offset,
truncate=self.truncate,
label=self.label,
include_boundaries=self.include_boundaries,
closed=self.closed,
by=self.by,
Expand Down
31 changes: 28 additions & 3 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import contextlib
import os
import warnings
from datetime import date, datetime, time, timedelta
from io import BytesIO, StringIO
from pathlib import Path
Expand Down Expand Up @@ -66,6 +67,7 @@
_in_notebook,
_prepare_row_count_args,
_process_null_values,
find_stacklevel,
normalize_filepath,
)

Expand Down Expand Up @@ -2931,9 +2933,10 @@ def group_by_dynamic(
every: str | timedelta,
period: str | timedelta | None = None,
offset: str | timedelta | None = None,
truncate: bool = True,
truncate: bool | None = None,
include_boundaries: bool = False,
closed: ClosedInterval = "left",
label: str = "left",
by: IntoExpr | Iterable[IntoExpr] | None = None,
start_by: StartBy = "window",
check_sorted: bool = True,
Expand Down Expand Up @@ -2977,12 +2980,21 @@ def group_by_dynamic(
Defaults to negative `every`.
truncate
truncate the time value to the window lower bound
.. deprecated:: 0.19.4
Use `label` instead.
include_boundaries
Add the lower and upper bound of the window to the "_lower_bound" and
"_upper_bound" columns. This will impact performance because it's harder to
parallelize
closed : {'right', 'left', 'both', 'none'}
closed : {'left', 'right', 'both', 'none'}
Define which sides of the temporal interval are closed (inclusive).
label : {'left', 'right', 'datapoint'}
Define which label to use for the window:
- 'left': lower boundary of the window
- 'right': upper boundary of the window
- 'datapoint': the first value of the index column in the given window
by
Also group by this column/these columns
start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
Expand Down Expand Up @@ -3255,6 +3267,19 @@ def group_by_dynamic(
└─────────────────┴─────────────────┴─────┴─────────────────┘
""" # noqa: W505
if truncate is not None:
if truncate:
label = "left"
else:
label = "datapoint"
warnings.warn(
f"`truncate` is deprecated and will be removed in a future version. "
f"Please replace `truncate={truncate}` with `label='{label}'` to "
"silence this warning.",
DeprecationWarning,
stacklevel=find_stacklevel(),
)

index_column = parse_as_expression(index_column)
if offset is None:
offset = _negate_duration(_timedelta_to_pl_duration(every))
Expand All @@ -3272,7 +3297,7 @@ def group_by_dynamic(
every,
period,
offset,
truncate,
label,
include_boundaries,
closed,
pyexprs_by,
Expand Down
10 changes: 8 additions & 2 deletions py-polars/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ impl PyLazyFrame {
every: &str,
period: &str,
offset: &str,
truncate: bool,
label: &str,
include_boundaries: bool,
closed: Wrap<ClosedWindow>,
by: Vec<PyExpr>,
Expand All @@ -664,14 +664,20 @@ impl PyLazyFrame {
.map(|pyexpr| pyexpr.inner)
.collect::<Vec<_>>();
let ldf = self.ldf.clone();
let label = match label {
"left" => Label::Left,
"right" => Label::Right,
"datapoint" => Label::Datapoint,
_ => unreachable!(),
};
let lazy_gb = ldf.group_by_dynamic(
index_column.inner,
by,
DynamicGroupOptions {
every: Duration::parse(every),
period: Duration::parse(period),
offset: Duration::parse(offset),
truncate,
label,
include_boundaries,
closed_window,
start_by: start_by.0,
Expand Down
20 changes: 16 additions & 4 deletions py-polars/tests/unit/operations/rolling/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ def test_dynamic_group_by_timezone_awareness(
offset=offset,
closed="right",
include_boundaries=True,
truncate=False,
label="datapoint",
).agg(pl.col("value").last())
).dtypes == [pl.Datetime("ns", "UTC")] * 3 + [pl.Int64]

Expand All @@ -554,7 +554,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None:
"date",
every="31m",
include_boundaries=True,
truncate=False,
label="datapoint",
start_by="datapoint",
).agg(pl.count()).to_dict(False) == {
"_lower_boundary": [
Expand Down Expand Up @@ -598,7 +598,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None:
period="3d",
include_boundaries=True,
start_by="monday",
truncate=False,
label="datapoint",
).agg([pl.count(), pl.col("day").first().alias("data_day")])
assert result.to_dict(False) == {
"_lower_boundary": [
Expand All @@ -623,7 +623,7 @@ def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None:
period="3d",
include_boundaries=True,
start_by="saturday",
truncate=False,
label="datapoint",
).agg([pl.count(), pl.col("day").first().alias("data_day")])
assert result.to_dict(False) == {
"_lower_boundary": [
Expand Down Expand Up @@ -686,6 +686,18 @@ def test_group_by_dynamic_by_monday_and_offset_5444() -> None:
assert result_empty.schema == result.schema


def test_group_by_dynamic_truncate_to_label_deprecation() -> None:
df = pl.LazyFrame({"ts": [], "n": []})
with pytest.warns(
DeprecationWarning, match="replace `truncate=False` with `label='datapoint'`"
):
df.group_by_dynamic("ts", every="1d", truncate=False)
with pytest.warns(
DeprecationWarning, match="replace `truncate=True` with `label='left'`"
):
df.group_by_dynamic("ts", every="1d", truncate=True)


def test_group_by_rolling_iter() -> None:
df = pl.DataFrame(
{
Expand Down

0 comments on commit 2efddf0

Please sign in to comment.