From 873d18edd0eaae78b3755f8f4226ab2d8f17f5ce Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 3 Aug 2023 06:08:00 +0200 Subject: [PATCH] feat(rust!,python): Extend `datetime` expression function with time zone/time unit parameters (#10235) --- .../src/dsl/function_expr/datetime.rs | 6 + .../polars-plan/src/dsl/function_expr/mod.rs | 12 ++ .../src/dsl/function_expr/schema.rs | 10 ++ .../src/dsl/function_expr/temporal.rs | 107 +++++++++++++++ .../polars-plan/src/dsl/functions/temporal.rs | 127 +++++++----------- py-polars/polars/expr/datetime.py | 14 +- py-polars/polars/functions/as_datatype.py | 48 ++++--- py-polars/polars/series/datetime.py | 14 +- py-polars/src/functions/lazy.rs | 10 +- .../tests/unit/functions/test_as_datatype.py | 48 +++++++ 10 files changed, 289 insertions(+), 107 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index c1d9054e702b..7f57f1f4c010 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -64,6 +64,11 @@ pub enum TemporalFunction { closed: ClosedWindow, }, Combine(TimeUnit), + DatetimeFunction { + time_unit: TimeUnit, + time_zone: Option, + use_earliest: Option, + }, } impl Display for TemporalFunction { @@ -105,6 +110,7 @@ impl Display for TemporalFunction { DateRanges { .. } => return write!(f, "date_ranges"), TimeRange { .. } => return write!(f, "time_range"), TimeRanges { .. } => return write!(f, "time_ranges"), + DatetimeFunction { .. } => return write!(f, "datetime"), Combine(_) => "combine", }; write!(f, "dt.{s}") diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 3dd74b58331f..30add44729da 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -799,6 +799,18 @@ impl From for SpecialEq> { None ) } + DatetimeFunction { + time_unit, + time_zone, + use_earliest, + } => { + map_as_slice!( + temporal::datetime, + &time_unit, + time_zone.as_deref(), + use_earliest + ) + } } } } diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index 82dd321fd853..2533c8f7e047 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -96,6 +96,16 @@ impl FunctionExpr { DataType::List(Box::new(DataType::Time)), )); } + DatetimeFunction { + time_unit, + time_zone, + use_earliest: _, + } => { + return Ok(Field::new( + "datetime", + DataType::Datetime(*time_unit, time_zone.clone()), + )); + } Combine(tu) => match mapper.with_same_dtype().unwrap().dtype { DataType::Datetime(_, tz) => DataType::Datetime(*tu, tz), DataType::Date => DataType::Datetime(*tu, None), diff --git a/crates/polars-plan/src/dsl/function_expr/temporal.rs b/crates/polars-plan/src/dsl/function_expr/temporal.rs index ae07aa4fd636..bfe2d8f94981 100644 --- a/crates/polars-plan/src/dsl/function_expr/temporal.rs +++ b/crates/polars-plan/src/dsl/function_expr/temporal.rs @@ -6,6 +6,113 @@ use polars_time::prelude::*; use super::*; +pub(super) fn datetime( + s: &[Series], + time_unit: &TimeUnit, + time_zone: Option<&str>, + use_earliest: Option, +) -> PolarsResult { + use polars_core::export::chrono::NaiveDate; + use polars_core::utils::CustomIterTools; + + let year = &s[0]; + let month = &s[1]; + let day = &s[2]; + let hour = &s[3]; + let minute = &s[4]; + let second = &s[5]; + let microsecond = &s[6]; + + let max_len = s.iter().map(|s| s.len()).max().unwrap(); + + let mut year = year.cast(&DataType::Int32)?; + if year.len() < max_len { + year = year.new_from_index(0, max_len) + } + let year = year.i32()?; + + let mut month = month.cast(&DataType::UInt32)?; + if month.len() < max_len { + month = month.new_from_index(0, max_len); + } + let month = month.u32()?; + + let mut day = day.cast(&DataType::UInt32)?; + if day.len() < max_len { + day = day.new_from_index(0, max_len); + } + let day = day.u32()?; + + let mut hour = hour.cast(&DataType::UInt32)?; + if hour.len() < max_len { + hour = hour.new_from_index(0, max_len); + } + let hour = hour.u32()?; + + let mut minute = minute.cast(&DataType::UInt32)?; + if minute.len() < max_len { + minute = minute.new_from_index(0, max_len); + } + let minute = minute.u32()?; + + let mut second = second.cast(&DataType::UInt32)?; + if second.len() < max_len { + second = second.new_from_index(0, max_len); + } + let second = second.u32()?; + + let mut microsecond = microsecond.cast(&DataType::UInt32)?; + if microsecond.len() < max_len { + microsecond = microsecond.new_from_index(0, max_len); + } + let microsecond = microsecond.u32()?; + + let ca: Int64Chunked = year + .into_iter() + .zip(month) + .zip(day) + .zip(hour) + .zip(minute) + .zip(second) + .zip(microsecond) + .map(|((((((y, m), d), h), mnt), s), us)| { + if let (Some(y), Some(m), Some(d), Some(h), Some(mnt), Some(s), Some(us)) = + (y, m, d, h, mnt, s, us) + { + NaiveDate::from_ymd_opt(y, m, d) + .and_then(|nd| nd.and_hms_micro_opt(h, mnt, s, us)) + .map(|ndt| match time_unit { + TimeUnit::Milliseconds => ndt.timestamp_millis(), + TimeUnit::Microseconds => ndt.timestamp_micros(), + TimeUnit::Nanoseconds => ndt.timestamp_nanos(), + }) + } else { + None + } + }) + .collect_trusted(); + + let ca = match time_zone { + #[cfg(feature = "timezones")] + Some(_) => { + let mut ca = ca.into_datetime(*time_unit, None); + ca = replace_time_zone(&ca, time_zone, use_earliest)?; + ca + } + _ => { + polars_ensure!( + time_zone.is_none() && use_earliest.is_none(), + ComputeError: "cannot make use of the `time_zone` and `use_earliest` arguments without the 'timezones' feature enabled." + ); + ca.into_datetime(*time_unit, None) + } + }; + + let mut s = ca.into_series(); + s.rename("datetime"); + Ok(s) +} + #[cfg(feature = "date_offset")] pub(super) fn date_offset(s: Series, offset: Duration) -> PolarsResult { let preserve_sortedness: bool; diff --git a/crates/polars-plan/src/dsl/functions/temporal.rs b/crates/polars-plan/src/dsl/functions/temporal.rs index 2b035d265cc7..890f79b796dc 100644 --- a/crates/polars-plan/src/dsl/functions/temporal.rs +++ b/crates/polars-plan/src/dsl/functions/temporal.rs @@ -36,6 +36,26 @@ pub struct DatetimeArgs { pub minute: Expr, pub second: Expr, pub microsecond: Expr, + pub time_unit: TimeUnit, + pub time_zone: Option, + pub use_earliest: Option, +} + +impl Default for DatetimeArgs { + fn default() -> Self { + Self { + year: lit(1970), + month: lit(1), + day: lit(1), + hour: lit(0), + minute: lit(0), + second: lit(0), + microsecond: lit(0), + time_unit: TimeUnit::Microseconds, + time_zone: None, + use_earliest: None, + } + } } impl DatetimeArgs { @@ -47,10 +67,7 @@ impl DatetimeArgs { year, month, day, - hour: lit(0), - minute: lit(0), - second: lit(0), - microsecond: lit(0), + ..Default::default() } } @@ -78,14 +95,26 @@ impl DatetimeArgs { impl_unit_setter!(with_minute(minute)); impl_unit_setter!(with_second(second)); impl_unit_setter!(with_microsecond(microsecond)); + + pub fn with_time_unit(self, time_unit: TimeUnit) -> Self { + Self { time_unit, ..self } + } + #[cfg(feature = "timezones")] + pub fn with_time_zone(self, time_zone: Option) -> Self { + Self { time_zone, ..self } + } + #[cfg(feature = "timezones")] + pub fn with_use_earliest(self, use_earliest: Option) -> Self { + Self { + use_earliest, + ..self + } + } } /// Construct a column of `Datetime` from the provided [`DatetimeArgs`]. #[cfg(feature = "temporal")] pub fn datetime(args: DatetimeArgs) -> Expr { - use polars_core::export::chrono::NaiveDate; - use polars_core::utils::CustomIterTools; - let year = args.year; let month = args.month; let day = args.day; @@ -93,87 +122,27 @@ pub fn datetime(args: DatetimeArgs) -> Expr { let minute = args.minute; let second = args.second; let microsecond = args.microsecond; + let time_unit = args.time_unit; + let time_zone = args.time_zone; + let use_earliest = args.use_earliest; - let function = SpecialEq::new(Arc::new(move |s: &mut [Series]| { - assert_eq!(s.len(), 7); - let max_len = s.iter().map(|s| s.len()).max().unwrap(); - let mut year = s[0].cast(&DataType::Int32)?; - if year.len() < max_len { - year = year.new_from_index(0, max_len) - } - let year = year.i32()?; - let mut month = s[1].cast(&DataType::UInt32)?; - if month.len() < max_len { - month = month.new_from_index(0, max_len); - } - let month = month.u32()?; - let mut day = s[2].cast(&DataType::UInt32)?; - if day.len() < max_len { - day = day.new_from_index(0, max_len); - } - let day = day.u32()?; - let mut hour = s[3].cast(&DataType::UInt32)?; - if hour.len() < max_len { - hour = hour.new_from_index(0, max_len); - } - let hour = hour.u32()?; - - let mut minute = s[4].cast(&DataType::UInt32)?; - if minute.len() < max_len { - minute = minute.new_from_index(0, max_len); - } - let minute = minute.u32()?; - - let mut second = s[5].cast(&DataType::UInt32)?; - if second.len() < max_len { - second = second.new_from_index(0, max_len); - } - let second = second.u32()?; - - let mut microsecond = s[6].cast(&DataType::UInt32)?; - if microsecond.len() < max_len { - microsecond = microsecond.new_from_index(0, max_len); - } - let microsecond = microsecond.u32()?; + let input = vec![year, month, day, hour, minute, second, microsecond]; - let ca: Int64Chunked = year - .into_iter() - .zip(month) - .zip(day) - .zip(hour) - .zip(minute) - .zip(second) - .zip(microsecond) - .map(|((((((y, m), d), h), mnt), s), us)| { - if let (Some(y), Some(m), Some(d), Some(h), Some(mnt), Some(s), Some(us)) = - (y, m, d, h, mnt, s, us) - { - NaiveDate::from_ymd_opt(y, m, d) - .and_then(|nd| nd.and_hms_micro_opt(h, mnt, s, us)) - .map(|ndt| ndt.timestamp_micros()) - } else { - None - } - }) - .collect_trusted(); - - Ok(Some( - ca.into_datetime(TimeUnit::Microseconds, None).into_series(), - )) - }) as Arc); - - Expr::AnonymousFunction { - input: vec![year, month, day, hour, minute, second, microsecond], - function, - output_type: GetOutput::from_type(DataType::Datetime(TimeUnit::Microseconds, None)), + Expr::Function { + input, + function: FunctionExpr::TemporalExpr(TemporalFunction::DatetimeFunction { + time_unit, + time_zone, + use_earliest, + }), options: FunctionOptions { collect_groups: ApplyOptions::ApplyFlat, + allow_rename: true, input_wildcard_expansion: true, fmt_str: "datetime", ..Default::default() }, } - .alias("datetime") } /// Arguments used by `duration` in order to produce an `Expr` of `Duration` diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 09e970c74a76..36c15a00f872 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -47,9 +47,10 @@ def truncate( Offset the window use_earliest Determine how to deal with ambiguous datetimes: - - None (default): raise; - - True: use the earliest datetime; - - False: use the latest datetime. + + - ``None`` (default): raise + - ``True``: use the earliest datetime + - ``False``: use the latest datetime Notes ----- @@ -1507,9 +1508,10 @@ def replace_time_zone( Time zone for the `Datetime` expression. Pass `None` to unset time zone. use_earliest Determine how to deal with ambiguous datetimes: - - None (default): raise; - - True: use the earliest datetime; - - False: use the latest datetime. + + - ``None`` (default): raise + - ``True``: use the earliest datetime + - ``False``: use the latest datetime Examples -------- diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py index 72fa9b622b51..9a69dea0ec04 100644 --- a/py-polars/polars/functions/as_datatype.py +++ b/py-polars/polars/functions/as_datatype.py @@ -20,17 +20,21 @@ from typing import Literal from polars import Expr, Series - from polars.type_aliases import IntoExpr, SchemaDict + from polars.type_aliases import IntoExpr, SchemaDict, TimeUnit def datetime_( - year: Expr | str | int, - month: Expr | str | int, - day: Expr | str | int, - hour: Expr | str | int | None = None, - minute: Expr | str | int | None = None, - second: Expr | str | int | None = None, - microsecond: Expr | str | int | None = None, + year: int | IntoExpr, + month: int | IntoExpr, + day: int | IntoExpr, + hour: int | IntoExpr | None = None, + minute: int | IntoExpr | None = None, + second: int | IntoExpr | None = None, + microsecond: int | IntoExpr | None = None, + *, + time_unit: TimeUnit = "us", + time_zone: str | None = None, + use_earliest: bool | None = None, ) -> Expr: """ Create a Polars literal expression of type Datetime. @@ -38,19 +42,30 @@ def datetime_( Parameters ---------- year - column or literal. + Column or literal. month - column or literal, ranging from 1-12. + Column or literal, ranging from 1-12. day - column or literal, ranging from 1-31. + Column or literal, ranging from 1-31. hour - column or literal, ranging from 0-23. + Column or literal, ranging from 0-23. minute - column or literal, ranging from 0-59. + Column or literal, ranging from 0-59. second - column or literal, ranging from 0-59. + Column or literal, ranging from 0-59. microsecond - column or literal, ranging from 0-999999. + Column or literal, ranging from 0-999999. + time_unit : {'us', 'ms', 'ns'} + Time unit of the resulting expression. + time_zone + Time zone of the resulting expression. + use_earliest + Determine how to deal with ambiguous datetimes: + + - ``None`` (default): raise + - ``True``: use the earliest datetime + - ``False``: use the latest datetime + Returns ------- @@ -80,6 +95,9 @@ def datetime_( minute, second, microsecond, + time_unit, + time_zone, + use_earliest, ) ) diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 2fe18143109d..92f054b6a803 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -1158,9 +1158,10 @@ def replace_time_zone( Time zone for the `Datetime` Series. Pass `None` to unset time zone. use_earliest Determine how to deal with ambiguous datetimes: - - None (default): raise; - - True: use the earliest datetime; - - False: use the latest datetime. + + - ``None`` (default): raise + - ``True``: use the earliest datetime + - ``False``: use the latest datetime Examples -------- @@ -1611,9 +1612,10 @@ def truncate( Offset the window use_earliest Determine how to deal with ambiguous datetimes: - - None (default): raise; - - True: use the earliest datetime; - - False: use the latest datetime. + + - ``None`` (default): raise + - ``True``: use the earliest datetime + - ``False``: use the latest datetime Notes ----- diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index 904d4813837e..176fa448b961 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -186,7 +186,9 @@ pub fn cumreduce(lambda: PyObject, exprs: Vec) -> PyExpr { dsl::cumreduce_exprs(func, exprs).into() } +#[allow(clippy::too_many_arguments)] #[pyfunction] +#[pyo3(signature = (year, month, day, hour=None, minute=None, second=None, microsecond=None, time_unit=Wrap(TimeUnit::Microseconds), time_zone=None, use_earliest=None))] pub fn datetime( year: PyExpr, month: PyExpr, @@ -195,12 +197,15 @@ pub fn datetime( minute: Option, second: Option, microsecond: Option, + time_unit: Wrap, + time_zone: Option, + use_earliest: Option, ) -> PyExpr { let year = year.inner; let month = month.inner; let day = day.inner; - set_unwrapped_or_0!(hour, minute, second, microsecond); + let time_unit = time_unit.0; let args = DatetimeArgs { year, @@ -210,6 +215,9 @@ pub fn datetime( minute, second, microsecond, + time_unit, + time_zone, + use_earliest, }; dsl::datetime(args).into() } diff --git a/py-polars/tests/unit/functions/test_as_datatype.py b/py-polars/tests/unit/functions/test_as_datatype.py index 73a027f04093..86fbb0fdfd14 100644 --- a/py-polars/tests/unit/functions/test_as_datatype.py +++ b/py-polars/tests/unit/functions/test_as_datatype.py @@ -1,10 +1,20 @@ +from __future__ import annotations + from datetime import date, datetime +from typing import TYPE_CHECKING import pytest import polars as pl from polars.testing import assert_frame_equal, assert_series_equal +if TYPE_CHECKING: + from zoneinfo import ZoneInfo + + from polars.type_aliases import TimeUnit +else: + from polars.utils.convert import get_zoneinfo as ZoneInfo + def test_date_datetime() -> None: df = pl.DataFrame( @@ -24,6 +34,44 @@ def test_date_datetime() -> None: assert_series_equal(out["h2"], df["hour"].rename("h2")) +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_datetime_time_unit(time_unit: TimeUnit) -> None: + result = pl.datetime(2022, 1, 2, time_unit=time_unit) + + assert pl.select(result.dt.year()).item() == 2022 + assert pl.select(result.dt.month()).item() == 1 + assert pl.select(result.dt.day()).item() == 2 + + +@pytest.mark.parametrize("time_zone", [None, "Europe/Amsterdam", "UTC"]) +def test_datetime_time_zone(time_zone: str | None) -> None: + result = pl.datetime(2022, 1, 2, 10, time_zone=time_zone) + + assert pl.select(result.dt.year()).item() == 2022 + assert pl.select(result.dt.month()).item() == 1 + assert pl.select(result.dt.day()).item() == 2 + assert pl.select(result.dt.hour()).item() == 10 + + +def test_datetime_ambiguous_time_zone() -> None: + expr = pl.datetime(2018, 10, 28, 2, 30, time_zone="Europe/Brussels") + + with pytest.raises(pl.ArrowError): + pl.select(expr) + + +def test_datetime_ambiguous_time_zone_use_earliest() -> None: + expr = pl.datetime( + 2018, 10, 28, 2, 30, time_zone="Europe/Brussels", use_earliest=True + ) + + result = pl.select(expr).item() + + expected = datetime(2018, 10, 28, 2, 30, tzinfo=ZoneInfo("Europe/Brussels")) + assert result == expected + assert result.fold == 0 + + def test_time() -> None: df = pl.DataFrame( {