diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs
index bbabbe279439..23edbd1c9056 100644
--- a/crates/polars-arrow/src/compute/cast/mod.rs
+++ b/crates/polars-arrow/src/compute/cast/mod.rs
@@ -585,9 +585,11 @@ pub fn cast(
             LargeUtf8 => Ok(Box::new(utf8_to_large_utf8(
                 array.as_any().downcast_ref().unwrap(),
             ))),
-            Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::<i32>(array),
-            Timestamp(TimeUnit::Nanosecond, Some(tz)) => {
-                utf8_to_timestamp_ns_dyn::<i32>(array, tz.clone())
+            Timestamp(time_unit, None) => {
+                utf8_to_naive_timestamp_dyn::<i32>(array, time_unit.to_owned())
+            },
+            Timestamp(time_unit, Some(time_zone)) => {
+                utf8_to_timestamp_dyn::<i32>(array, time_zone.clone(), time_unit.to_owned())
             },
             _ => polars_bail!(InvalidOperation:
                 "casting from {from_type:?} to {to_type:?} not supported",
@@ -612,9 +614,11 @@ pub fn cast(
                 to_type.clone(),
             )
             .boxed()),
-            Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::<i64>(array),
-            Timestamp(TimeUnit::Nanosecond, Some(tz)) => {
-                utf8_to_timestamp_ns_dyn::<i64>(array, tz.clone())
+            Timestamp(time_unit, None) => {
+                utf8_to_naive_timestamp_dyn::<i64>(array, time_unit.to_owned())
+            },
+            Timestamp(time_unit, Some(time_zone)) => {
+                utf8_to_timestamp_dyn::<i64>(array, time_zone.clone(), time_unit.to_owned())
             },
             _ => polars_bail!(InvalidOperation:
                 "casting from {from_type:?} to {to_type:?} not supported",
diff --git a/crates/polars-arrow/src/compute/cast/utf8_to.rs b/crates/polars-arrow/src/compute/cast/utf8_to.rs
index 6af294c00e44..cc92c975c192 100644
--- a/crates/polars-arrow/src/compute/cast/utf8_to.rs
+++ b/crates/polars-arrow/src/compute/cast/utf8_to.rs
@@ -3,11 +3,11 @@ use polars_error::PolarsResult;
 
 use super::CastOptions;
 use crate::array::*;
-use crate::datatypes::DataType;
+use crate::datatypes::{DataType, TimeUnit};
 use crate::offset::Offset;
 use crate::temporal_conversions::{
-    utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_,
-    utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE,
+    utf8_to_naive_timestamp as utf8_to_naive_timestamp_, utf8_to_timestamp as utf8_to_timestamp_,
+    EPOCH_DAYS_FROM_CE,
 };
 use crate::types::NativeType;
 
@@ -110,34 +110,40 @@ pub fn utf8_to_dictionary<O: Offset, K: DictionaryKey>(
     Ok(array.into())
 }
 
-pub(super) fn utf8_to_naive_timestamp_ns_dyn<O: Offset>(
+pub(super) fn utf8_to_naive_timestamp_dyn<O: Offset>(
     from: &dyn Array,
+    time_unit: TimeUnit,
 ) -> PolarsResult<Box<dyn Array>> {
     let from = from.as_any().downcast_ref().unwrap();
-    Ok(Box::new(utf8_to_naive_timestamp_ns::<O>(from)))
+    Ok(Box::new(utf8_to_naive_timestamp::<O>(from, time_unit)))
 }
 
-/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting
-pub fn utf8_to_naive_timestamp_ns<O: Offset>(from: &Utf8Array<O>) -> PrimitiveArray<i64> {
-    utf8_to_naive_timestamp_ns_(from, RFC3339)
+/// [`crate::temporal_conversions::utf8_to_timestamp`] applied for RFC3339 formatting
+pub fn utf8_to_naive_timestamp<O: Offset>(
+    from: &Utf8Array<O>,
+    time_unit: TimeUnit,
+) -> PrimitiveArray<i64> {
+    utf8_to_naive_timestamp_(from, RFC3339, time_unit)
 }
 
-pub(super) fn utf8_to_timestamp_ns_dyn<O: Offset>(
+pub(super) fn utf8_to_timestamp_dyn<O: Offset>(
     from: &dyn Array,
     timezone: String,
+    time_unit: TimeUnit,
 ) -> PolarsResult<Box<dyn Array>> {
     let from = from.as_any().downcast_ref().unwrap();
-    utf8_to_timestamp_ns::<O>(from, timezone)
+    utf8_to_timestamp::<O>(from, timezone, time_unit)
         .map(Box::new)
         .map(|x| x as Box<dyn Array>)
 }
 
-/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting
-pub fn utf8_to_timestamp_ns<O: Offset>(
+/// [`crate::temporal_conversions::utf8_to_timestamp`] applied for RFC3339 formatting
+pub fn utf8_to_timestamp<O: Offset>(
     from: &Utf8Array<O>,
     timezone: String,
+    time_unit: TimeUnit,
 ) -> PolarsResult<PrimitiveArray<i64>> {
-    utf8_to_timestamp_ns_(from, RFC3339, timezone)
+    utf8_to_timestamp_(from, RFC3339, timezone, time_unit)
 }
 
 /// Conversion of utf8
diff --git a/crates/polars-arrow/src/temporal_conversions.rs b/crates/polars-arrow/src/temporal_conversions.rs
index bcaa4875363d..02270c2ef661 100644
--- a/crates/polars-arrow/src/temporal_conversions.rs
+++ b/crates/polars-arrow/src/temporal_conversions.rs
@@ -321,17 +321,6 @@ pub fn parse_offset(offset: &str) -> PolarsResult<FixedOffset> {
         .expect("FixedOffset::east out of bounds"))
 }
 
-/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp with timezone.
-/// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`).
-#[inline]
-pub fn utf8_to_timestamp_ns_scalar<T: chrono::TimeZone>(
-    value: &str,
-    fmt: &str,
-    tz: &T,
-) -> Option<i64> {
-    utf8_to_timestamp_scalar(value, fmt, tz, &TimeUnit::Nanosecond)
-}
-
 /// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp with timezone.
 /// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`).
 /// Returns in scale `tz` of `TimeUnit`.
@@ -362,12 +351,6 @@ pub fn utf8_to_timestamp_scalar<T: chrono::TimeZone>(
     }
 }
 
-/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp without timezone.
-#[inline]
-pub fn utf8_to_naive_timestamp_ns_scalar(value: &str, fmt: &str) -> Option<i64> {
-    utf8_to_naive_timestamp_scalar(value, fmt, &TimeUnit::Nanosecond)
-}
-
 /// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp without timezone.
 /// Returns in scale `tz` of `TimeUnit`.
 #[inline]
@@ -386,18 +369,18 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) ->
         .ok()
 }
 
-fn utf8_to_timestamp_ns_impl<O: Offset, T: chrono::TimeZone>(
+fn utf8_to_timestamp_impl<O: Offset, T: chrono::TimeZone>(
     array: &Utf8Array<O>,
     fmt: &str,
-    timezone: String,
+    time_zone: String,
     tz: T,
+    time_unit: TimeUnit,
 ) -> PrimitiveArray<i64> {
     let iter = array
         .iter()
-        .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz)));
+        .map(|x| x.and_then(|x| utf8_to_timestamp_scalar(x, fmt, &tz, &time_unit)));
 
-    PrimitiveArray::from_trusted_len_iter(iter)
-        .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone)))
+    PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, Some(time_zone)))
 }
 
 /// Parses `value` to a [`chrono_tz::Tz`] with the Arrow's definition of timestamp with a timezone.
@@ -411,20 +394,22 @@ pub fn parse_offset_tz(timezone: &str) -> PolarsResult<chrono_tz::Tz> {
 
 #[cfg(feature = "chrono-tz")]
 #[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))]
-fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
+fn chrono_tz_utf_to_timestamp<O: Offset>(
     array: &Utf8Array<O>,
     fmt: &str,
-    timezone: String,
+    time_zone: String,
+    time_unit: TimeUnit,
 ) -> PolarsResult<PrimitiveArray<i64>> {
-    let tz = parse_offset_tz(&timezone)?;
-    Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz))
+    let tz = parse_offset_tz(&time_zone)?;
+    Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit))
 }
 
 #[cfg(not(feature = "chrono-tz"))]
-fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
+fn chrono_tz_utf_to_timestamp<O: Offset>(
     _: &Utf8Array<O>,
     _: &str,
     timezone: String,
+    _: TimeUnit,
 ) -> PolarsResult<PrimitiveArray<i64>> {
     panic!("timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)")
 }
@@ -432,22 +417,23 @@ fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
 /// Parses a [`Utf8Array`] to a timeozone-aware timestamp, i.e. [`PrimitiveArray<i64>`] with type `Timestamp(Nanosecond, Some(timezone))`.
 /// # Implementation
 /// * parsed values with timezone other than `timezone` are converted to `timezone`.
-/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timezones.
+/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp`] to parse naive timezones.
 /// * Null elements remain null; non-parsable elements are null.
 /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`.
 /// # Error
 /// This function errors iff `timezone` is not parsable to an offset.
-pub fn utf8_to_timestamp_ns<O: Offset>(
+pub fn utf8_to_timestamp<O: Offset>(
     array: &Utf8Array<O>,
     fmt: &str,
-    timezone: String,
+    time_zone: String,
+    time_unit: TimeUnit,
 ) -> PolarsResult<PrimitiveArray<i64>> {
-    let tz = parse_offset(timezone.as_str());
+    let tz = parse_offset(time_zone.as_str());
 
     if let Ok(tz) = tz {
-        Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz))
+        Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit))
     } else {
-        chrono_tz_utf_to_timestamp_ns(array, fmt, timezone)
+        chrono_tz_utf_to_timestamp(array, fmt, time_zone, time_unit)
     }
 }
 
@@ -455,15 +441,16 @@ pub fn utf8_to_timestamp_ns<O: Offset>(
 /// [`PrimitiveArray<i64>`] with type `Timestamp(Nanosecond, None)`.
 /// Timezones are ignored.
 /// Null elements remain null; non-parsable elements are set to null.
-pub fn utf8_to_naive_timestamp_ns<O: Offset>(
+pub fn utf8_to_naive_timestamp<O: Offset>(
     array: &Utf8Array<O>,
     fmt: &str,
+    time_unit: TimeUnit,
 ) -> PrimitiveArray<i64> {
     let iter = array
         .iter()
-        .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt)));
+        .map(|x| x.and_then(|x| utf8_to_naive_timestamp_scalar(x, fmt, &time_unit)));
 
-    PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None))
+    PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None))
 }
 
 fn add_month(year: i32, month: u32, months: i32) -> chrono::NaiveDate {
diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs
index bf06636829f7..2c216a69731a 100644
--- a/crates/polars-core/src/chunked_array/cast.rs
+++ b/crates/polars-core/src/chunked_array/cast.rs
@@ -7,6 +7,7 @@ use arrow::compute::cast::CastOptions;
 use crate::chunked_array::categorical::CategoricalChunkedBuilder;
 #[cfg(feature = "timezones")]
 use crate::chunked_array::temporal::validate_time_zone;
+use crate::prelude::DataType::Datetime;
 use crate::prelude::*;
 
 pub(crate) fn cast_chunks(
@@ -195,6 +196,33 @@ impl ChunkCast for Utf8Chunked {
                    polars_bail!(ComputeError: "expected 'precision' or 'scale' when casting to Decimal")
                },
            },
+            #[cfg(feature = "dtype-date")]
+            DataType::Date => {
+                let result = cast_chunks(&self.chunks, data_type, true)?;
+                let out = Series::try_from((self.name(), result))?;
+                Ok(out)
+            },
+            #[cfg(feature = "dtype-datetime")]
+            DataType::Datetime(time_unit, time_zone) => {
+                let out = match time_zone {
+                    #[cfg(feature = "timezones")]
+                    Some(time_zone) => {
+                        validate_time_zone(time_zone)?;
+                        let result = cast_chunks(
+                            &self.chunks,
+                            &Datetime(time_unit.to_owned(), Some(time_zone.clone())),
+                            true,
+                        )?;
+                        Series::try_from((self.name(), result))
+                    },
+                    _ => {
+                        let result =
+                            cast_chunks(&self.chunks, &Datetime(time_unit.to_owned(), None), true)?;
+                        Series::try_from((self.name(), result))
+                    },
+                };
+                out
+            },
             _ => cast_impl(self.name(), &self.chunks, data_type),
         }
     }
diff --git a/py-polars/tests/unit/operations/test_cast.py b/py-polars/tests/unit/operations/test_cast.py
new file mode 100644
index 000000000000..00000bdbdc73
--- /dev/null
+++ b/py-polars/tests/unit/operations/test_cast.py
@@ -0,0 +1,109 @@
+from datetime import date, datetime
+
+import pytest
+
+import polars as pl
+from polars.exceptions import ComputeError
+from polars.testing import assert_frame_equal
+
+
+def test_utf8_date() -> None:
+    df = pl.DataFrame({"x1": ["2021-01-01"]}).with_columns(
+        **{"x1-date": pl.col("x1").cast(pl.Date)}
+    )
+    expected = pl.DataFrame({"x1-date": [date(2021, 1, 1)]})
+    out = df.select(pl.col("x1-date"))
+    assert_frame_equal(expected, out)
+
+
+def test_invalid_utf8_date() -> None:
+    df = pl.DataFrame({"x1": ["2021-01-aa"]})
+
+    with pytest.raises(ComputeError):
+        df.with_columns(**{"x1-date": pl.col("x1").cast(pl.Date)})
+
+
+def test_utf8_datetime() -> None:
+    df = pl.DataFrame(
+        {"x1": ["2021-12-19T00:39:57", "2022-12-19T16:39:57"]}
+    ).with_columns(
+        **{
+            "x1-datetime-ns": pl.col("x1").cast(pl.Datetime(time_unit="ns")),
+            "x1-datetime-ms": pl.col("x1").cast(pl.Datetime(time_unit="ms")),
+            "x1-datetime-us": pl.col("x1").cast(pl.Datetime(time_unit="us")),
+        }
+    )
+    first_row = datetime(year=2021, month=12, day=19, hour=00, minute=39, second=57)
+    second_row = datetime(year=2022, month=12, day=19, hour=16, minute=39, second=57)
+    expected = pl.DataFrame(
+        {
+            "x1-datetime-ns": [first_row, second_row],
+            "x1-datetime-ms": [first_row, second_row],
+            "x1-datetime-us": [first_row, second_row],
+        }
+    ).select(
+        pl.col("x1-datetime-ns").dt.cast_time_unit("ns"),
+        pl.col("x1-datetime-ms").dt.cast_time_unit("ms"),
+        pl.col("x1-datetime-us").dt.cast_time_unit("us"),
+    )
+
+    out = df.select(
+        pl.col("x1-datetime-ns"), pl.col("x1-datetime-ms"), pl.col("x1-datetime-us")
+    )
+    assert_frame_equal(expected, out)
+
+
+def test_invalid_utf8_datetime() -> None:
+    df = pl.DataFrame({"x1": ["2021-12-19 00:39:57", "2022-12-19 16:39:57"]})
+    with pytest.raises(ComputeError):
+        df.with_columns(
+            **{"x1-datetime-ns": pl.col("x1").cast(pl.Datetime(time_unit="ns"))}
+        )
+
+
+def test_utf8_datetime_timezone() -> None:
+    ccs_tz = "America/Caracas"
+    stg_tz = "America/Santiago"
+    utc_tz = "UTC"
+    df = pl.DataFrame(
+        {"x1": ["1996-12-19T16:39:57 +00:00", "2022-12-19T00:39:57 +00:00"]}
+    ).with_columns(
+        **{
+            "x1-datetime-ns": pl.col("x1").cast(
+                pl.Datetime(time_unit="ns", time_zone=ccs_tz)
+            ),
+            "x1-datetime-ms": pl.col("x1").cast(
+                pl.Datetime(time_unit="ms", time_zone=stg_tz)
+            ),
+            "x1-datetime-us": pl.col("x1").cast(
+                pl.Datetime(time_unit="us", time_zone=utc_tz)
+            ),
+        }
+    )
+
+    expected = pl.DataFrame(
+        {
+            "x1-datetime-ns": [
+                datetime(year=1996, month=12, day=19, hour=12, minute=39, second=57),
+                datetime(year=2022, month=12, day=18, hour=20, minute=39, second=57),
+            ],
+            "x1-datetime-ms": [
+                datetime(year=1996, month=12, day=19, hour=13, minute=39, second=57),
+                datetime(year=2022, month=12, day=18, hour=21, minute=39, second=57),
+            ],
+            "x1-datetime-us": [
+                datetime(year=1996, month=12, day=19, hour=16, minute=39, second=57),
+                datetime(year=2022, month=12, day=19, hour=00, minute=39, second=57),
+            ],
+        }
+    ).select(
+        pl.col("x1-datetime-ns").dt.cast_time_unit("ns").dt.replace_time_zone(ccs_tz),
+        pl.col("x1-datetime-ms").dt.cast_time_unit("ms").dt.replace_time_zone(stg_tz),
+        pl.col("x1-datetime-us").dt.cast_time_unit("us").dt.replace_time_zone(utc_tz),
+    )
+
+    out = df.select(
+        pl.col("x1-datetime-ns"), pl.col("x1-datetime-ms"), pl.col("x1-datetime-us")
+    )
+
+    assert_frame_equal(expected, out)
diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py
index 1805a2221e8b..bb484ba4ae52 100644
--- a/py-polars/tests/unit/test_lazy.py
+++ b/py-polars/tests/unit/test_lazy.py
@@ -1341,7 +1341,7 @@ def test_quadratic_behavior_4736() -> None:
     ldf.select(reduce(add, (pl.col(fld) for fld in ldf.columns)))
 
 
-@pytest.mark.parametrize("input_dtype", [pl.Utf8, pl.Int64, pl.Float64])
+@pytest.mark.parametrize("input_dtype", [pl.Int64, pl.Float64])
 def test_from_epoch(input_dtype: pl.PolarsDataType) -> None:
     ldf = pl.LazyFrame(
         [
@@ -1381,6 +1381,23 @@ def test_from_epoch(input_dtype: pl.PolarsDataType) -> None:
         _ = ldf.select(pl.from_epoch(ts_col, time_unit="s2"))  # type: ignore[call-overload]
 
 
+def test_from_epoch_str() -> None:
+    ldf = pl.LazyFrame(
+        [
+            pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(pl.Utf8),
+            pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(pl.Utf8),
+        ]
+    )
+
+    with pytest.raises(ComputeError):
+        ldf.select(
+            [
+                pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
+                pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
+            ]
+        ).collect()
+
+
 def test_cumagg_types() -> None:
     ldf = pl.LazyFrame({"a": [1, 2], "b": [True, False], "c": [1.3, 2.4]})
     cumsum_lf = ldf.select(