Skip to content

Commit

Permalink
feat(rust): casting utf8 to temporal (#12072)
Browse files Browse the repository at this point in the history
Co-authored-by: MarcoGorelli <[email protected]>
  • Loading branch information
brayanjuls and MarcoGorelli authored Oct 28, 2023
1 parent ec2876a commit 440e093
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 56 deletions.
16 changes: 10 additions & 6 deletions crates/polars-arrow/src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -585,9 +585,11 @@ pub fn cast(
LargeUtf8 => Ok(Box::new(utf8_to_large_utf8(
array.as_any().downcast_ref().unwrap(),
))),
Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::<i32>(array),
Timestamp(TimeUnit::Nanosecond, Some(tz)) => {
utf8_to_timestamp_ns_dyn::<i32>(array, tz.clone())
Timestamp(time_unit, None) => {
utf8_to_naive_timestamp_dyn::<i32>(array, time_unit.to_owned())
},
Timestamp(time_unit, Some(time_zone)) => {
utf8_to_timestamp_dyn::<i32>(array, time_zone.clone(), time_unit.to_owned())
},
_ => polars_bail!(InvalidOperation:
"casting from {from_type:?} to {to_type:?} not supported",
Expand All @@ -612,9 +614,11 @@ pub fn cast(
to_type.clone(),
)
.boxed()),
Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::<i64>(array),
Timestamp(TimeUnit::Nanosecond, Some(tz)) => {
utf8_to_timestamp_ns_dyn::<i64>(array, tz.clone())
Timestamp(time_unit, None) => {
utf8_to_naive_timestamp_dyn::<i64>(array, time_unit.to_owned())
},
Timestamp(time_unit, Some(time_zone)) => {
utf8_to_timestamp_dyn::<i64>(array, time_zone.clone(), time_unit.to_owned())
},
_ => polars_bail!(InvalidOperation:
"casting from {from_type:?} to {to_type:?} not supported",
Expand Down
32 changes: 19 additions & 13 deletions crates/polars-arrow/src/compute/cast/utf8_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ use polars_error::PolarsResult;

use super::CastOptions;
use crate::array::*;
use crate::datatypes::DataType;
use crate::datatypes::{DataType, TimeUnit};
use crate::offset::Offset;
use crate::temporal_conversions::{
utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_,
utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE,
utf8_to_naive_timestamp as utf8_to_naive_timestamp_, utf8_to_timestamp as utf8_to_timestamp_,
EPOCH_DAYS_FROM_CE,
};
use crate::types::NativeType;

Expand Down Expand Up @@ -110,34 +110,40 @@ pub fn utf8_to_dictionary<O: Offset, K: DictionaryKey>(
Ok(array.into())
}

pub(super) fn utf8_to_naive_timestamp_ns_dyn<O: Offset>(
pub(super) fn utf8_to_naive_timestamp_dyn<O: Offset>(
from: &dyn Array,
time_unit: TimeUnit,
) -> PolarsResult<Box<dyn Array>> {
let from = from.as_any().downcast_ref().unwrap();
Ok(Box::new(utf8_to_naive_timestamp_ns::<O>(from)))
Ok(Box::new(utf8_to_naive_timestamp::<O>(from, time_unit)))
}

/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting
pub fn utf8_to_naive_timestamp_ns<O: Offset>(from: &Utf8Array<O>) -> PrimitiveArray<i64> {
utf8_to_naive_timestamp_ns_(from, RFC3339)
/// [`crate::temporal_conversions::utf8_to_naive_timestamp`] applied for RFC3339 formatting
pub fn utf8_to_naive_timestamp<O: Offset>(
from: &Utf8Array<O>,
time_unit: TimeUnit,
) -> PrimitiveArray<i64> {
utf8_to_naive_timestamp_(from, RFC3339, time_unit)
}

pub(super) fn utf8_to_timestamp_ns_dyn<O: Offset>(
pub(super) fn utf8_to_timestamp_dyn<O: Offset>(
from: &dyn Array,
timezone: String,
time_unit: TimeUnit,
) -> PolarsResult<Box<dyn Array>> {
let from = from.as_any().downcast_ref().unwrap();
utf8_to_timestamp_ns::<O>(from, timezone)
utf8_to_timestamp::<O>(from, timezone, time_unit)
.map(Box::new)
.map(|x| x as Box<dyn Array>)
}

/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting
pub fn utf8_to_timestamp_ns<O: Offset>(
/// [`crate::temporal_conversions::utf8_to_timestamp`] applied for RFC3339 formatting
pub fn utf8_to_timestamp<O: Offset>(
from: &Utf8Array<O>,
timezone: String,
time_unit: TimeUnit,
) -> PolarsResult<PrimitiveArray<i64>> {
utf8_to_timestamp_ns_(from, RFC3339, timezone)
utf8_to_timestamp_(from, RFC3339, timezone, time_unit)
}

/// Conversion of utf8
Expand Down
59 changes: 23 additions & 36 deletions crates/polars-arrow/src/temporal_conversions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,17 +321,6 @@ pub fn parse_offset(offset: &str) -> PolarsResult<FixedOffset> {
.expect("FixedOffset::east out of bounds"))
}

/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp with timezone.
/// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`).
#[inline]
pub fn utf8_to_timestamp_ns_scalar<T: chrono::TimeZone>(
value: &str,
fmt: &str,
tz: &T,
) -> Option<i64> {
utf8_to_timestamp_scalar(value, fmt, tz, &TimeUnit::Nanosecond)
}

/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp with timezone.
/// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`).
/// Returns the value in the scale of the given `TimeUnit`.
Expand Down Expand Up @@ -362,12 +351,6 @@ pub fn utf8_to_timestamp_scalar<T: chrono::TimeZone>(
}
}

/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp without timezone.
#[inline]
pub fn utf8_to_naive_timestamp_ns_scalar(value: &str, fmt: &str) -> Option<i64> {
utf8_to_naive_timestamp_scalar(value, fmt, &TimeUnit::Nanosecond)
}

/// Parses `value` to `Option<i64>` consistent with the Arrow's definition of timestamp without timezone.
/// Returns the value in the scale of the `TimeUnit` given by `tu`.
#[inline]
Expand All @@ -386,18 +369,18 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) ->
.ok()
}

fn utf8_to_timestamp_ns_impl<O: Offset, T: chrono::TimeZone>(
fn utf8_to_timestamp_impl<O: Offset, T: chrono::TimeZone>(
array: &Utf8Array<O>,
fmt: &str,
timezone: String,
time_zone: String,
tz: T,
time_unit: TimeUnit,
) -> PrimitiveArray<i64> {
let iter = array
.iter()
.map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz)));
.map(|x| x.and_then(|x| utf8_to_timestamp_scalar(x, fmt, &tz, &time_unit)));

PrimitiveArray::from_trusted_len_iter(iter)
.to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone)))
PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, Some(time_zone)))
}

/// Parses `value` to a [`chrono_tz::Tz`] with the Arrow's definition of timestamp with a timezone.
Expand All @@ -411,59 +394,63 @@ pub fn parse_offset_tz(timezone: &str) -> PolarsResult<chrono_tz::Tz> {

#[cfg(feature = "chrono-tz")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono-tz")))]
fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
fn chrono_tz_utf_to_timestamp<O: Offset>(
array: &Utf8Array<O>,
fmt: &str,
timezone: String,
time_zone: String,
time_unit: TimeUnit,
) -> PolarsResult<PrimitiveArray<i64>> {
let tz = parse_offset_tz(&timezone)?;
Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz))
let tz = parse_offset_tz(&time_zone)?;
Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit))
}

#[cfg(not(feature = "chrono-tz"))]
fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
fn chrono_tz_utf_to_timestamp<O: Offset>(
_: &Utf8Array<O>,
_: &str,
timezone: String,
_: TimeUnit,
) -> PolarsResult<PrimitiveArray<i64>> {
panic!("timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)")
}

/// Parses a [`Utf8Array`] to a timezone-aware timestamp, i.e. [`PrimitiveArray<i64>`] with type `Timestamp(time_unit, Some(time_zone))`.
/// # Implementation
/// * parsed values with timezone other than `timezone` are converted to `timezone`.
/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timezones.
/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp`] to parse naive timestamps.
/// * Null elements remain null; non-parsable elements are null.
/// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`.
/// # Error
/// This function errors iff `timezone` is not parsable to an offset.
pub fn utf8_to_timestamp_ns<O: Offset>(
pub fn utf8_to_timestamp<O: Offset>(
array: &Utf8Array<O>,
fmt: &str,
timezone: String,
time_zone: String,
time_unit: TimeUnit,
) -> PolarsResult<PrimitiveArray<i64>> {
let tz = parse_offset(timezone.as_str());
let tz = parse_offset(time_zone.as_str());

if let Ok(tz) = tz {
Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz))
Ok(utf8_to_timestamp_impl(array, fmt, time_zone, tz, time_unit))
} else {
chrono_tz_utf_to_timestamp_ns(array, fmt, timezone)
chrono_tz_utf_to_timestamp(array, fmt, time_zone, time_unit)
}
}

/// Parses a [`Utf8Array`] to naive timestamp, i.e.
/// [`PrimitiveArray<i64>`] with type `Timestamp(time_unit, None)`.
/// Timezones are ignored.
/// Null elements remain null; non-parsable elements are set to null.
pub fn utf8_to_naive_timestamp_ns<O: Offset>(
pub fn utf8_to_naive_timestamp<O: Offset>(
array: &Utf8Array<O>,
fmt: &str,
time_unit: TimeUnit,
) -> PrimitiveArray<i64> {
let iter = array
.iter()
.map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt)));
.map(|x| x.and_then(|x| utf8_to_naive_timestamp_scalar(x, fmt, &time_unit)));

PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None))
PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(time_unit, None))
}

fn add_month(year: i32, month: u32, months: i32) -> chrono::NaiveDate {
Expand Down
28 changes: 28 additions & 0 deletions crates/polars-core/src/chunked_array/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use arrow::compute::cast::CastOptions;
use crate::chunked_array::categorical::CategoricalChunkedBuilder;
#[cfg(feature = "timezones")]
use crate::chunked_array::temporal::validate_time_zone;
use crate::prelude::DataType::Datetime;
use crate::prelude::*;

pub(crate) fn cast_chunks(
Expand Down Expand Up @@ -195,6 +196,33 @@ impl ChunkCast for Utf8Chunked {
polars_bail!(ComputeError: "expected 'precision' or 'scale' when casting to Decimal")
},
},
#[cfg(feature = "dtype-date")]
DataType::Date => {
let result = cast_chunks(&self.chunks, data_type, true)?;
let out = Series::try_from((self.name(), result))?;
Ok(out)
},
#[cfg(feature = "dtype-datetime")]
DataType::Datetime(time_unit, time_zone) => {
let out = match time_zone {
#[cfg(feature = "timezones")]
Some(time_zone) => {
validate_time_zone(time_zone)?;
let result = cast_chunks(
&self.chunks,
&Datetime(time_unit.to_owned(), Some(time_zone.clone())),
true,
)?;
Series::try_from((self.name(), result))
},
_ => {
let result =
cast_chunks(&self.chunks, &Datetime(time_unit.to_owned(), None), true)?;
Series::try_from((self.name(), result))
},
};
out
},
_ => cast_impl(self.name(), &self.chunks, data_type),
}
}
Expand Down
109 changes: 109 additions & 0 deletions py-polars/tests/unit/operations/test_cast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from datetime import date, datetime

import pytest

import polars as pl
from polars.exceptions import ComputeError
from polars.testing import assert_frame_equal


def test_utf8_date() -> None:
    """A ``YYYY-MM-DD`` string cast to ``pl.Date`` becomes the matching date."""
    frame = pl.DataFrame({"x1": ["2021-01-01"]})
    casted = frame.with_columns(pl.col("x1").cast(pl.Date).alias("x1-date"))

    # Only the freshly cast column is compared; the source column is dropped.
    result = casted.select("x1-date")
    expected = pl.DataFrame({"x1-date": [date(2021, 1, 1)]})
    assert_frame_equal(expected, result)


def test_invalid_utf8_date() -> None:
    """Casting a string that is not a date to ``pl.Date`` raises ``ComputeError``."""
    frame = pl.DataFrame({"x1": ["2021-01-aa"]})
    cast_to_date = pl.col("x1").cast(pl.Date).alias("x1-date")

    with pytest.raises(ComputeError):
        frame.with_columns(cast_to_date)


def test_utf8_datetime() -> None:
    """Casting ``T``-separated datetime strings to naive ``pl.Datetime`` works for ns, ms and us."""
    columns = ["x1-datetime-ns", "x1-datetime-ms", "x1-datetime-us"]
    first_row = datetime(2021, 12, 19, 0, 39, 57)
    second_row = datetime(2022, 12, 19, 16, 39, 57)

    source = pl.DataFrame({"x1": ["2021-12-19T00:39:57", "2022-12-19T16:39:57"]})
    out = source.with_columns(
        pl.col("x1").cast(pl.Datetime(time_unit="ns")).alias("x1-datetime-ns"),
        pl.col("x1").cast(pl.Datetime(time_unit="ms")).alias("x1-datetime-ms"),
        pl.col("x1").cast(pl.Datetime(time_unit="us")).alias("x1-datetime-us"),
    ).select(columns)

    # Every column holds the same two instants; align each expected column's
    # time unit with the unit requested by the corresponding cast.
    expected = pl.DataFrame({name: [first_row, second_row] for name in columns}).select(
        pl.col("x1-datetime-ns").dt.cast_time_unit("ns"),
        pl.col("x1-datetime-ms").dt.cast_time_unit("ms"),
        pl.col("x1-datetime-us").dt.cast_time_unit("us"),
    )

    assert_frame_equal(expected, out)


def test_invalid_utf8_datetime() -> None:
    """Space-separated datetime strings are rejected by the Utf8 -> Datetime cast."""
    # Unlike the accepted inputs elsewhere in this file, these use a space
    # instead of "T" between the date and the time.
    frame = pl.DataFrame({"x1": ["2021-12-19 00:39:57", "2022-12-19 16:39:57"]})
    cast_to_ns = pl.col("x1").cast(pl.Datetime(time_unit="ns")).alias("x1-datetime-ns")

    with pytest.raises(ComputeError):
        frame.with_columns(cast_to_ns)


def test_utf8_datetime_timezone() -> None:
    """Strings carrying a UTC offset are cast to tz-aware datetimes in the target zone."""
    ccs_tz, stg_tz, utc_tz = "America/Caracas", "America/Santiago", "UTC"

    source = pl.DataFrame(
        {"x1": ["1996-12-19T16:39:57 +00:00", "2022-12-19T00:39:57 +00:00"]}
    )
    out = source.with_columns(
        pl.col("x1")
        .cast(pl.Datetime(time_unit="ns", time_zone=ccs_tz))
        .alias("x1-datetime-ns"),
        pl.col("x1")
        .cast(pl.Datetime(time_unit="ms", time_zone=stg_tz))
        .alias("x1-datetime-ms"),
        pl.col("x1")
        .cast(pl.Datetime(time_unit="us", time_zone=utc_tz))
        .alias("x1-datetime-us"),
    ).select("x1-datetime-ns", "x1-datetime-ms", "x1-datetime-us")

    # Expected local wall-clock readings of the two +00:00 inputs in each zone.
    caracas_local = [
        datetime(1996, 12, 19, 12, 39, 57),
        datetime(2022, 12, 18, 20, 39, 57),
    ]
    santiago_local = [
        datetime(1996, 12, 19, 13, 39, 57),
        datetime(2022, 12, 18, 21, 39, 57),
    ]
    utc_local = [
        datetime(1996, 12, 19, 16, 39, 57),
        datetime(2022, 12, 19, 0, 39, 57),
    ]

    # The naive expected values are pinned to the cast's time unit and then
    # labelled with the target zone via replace_time_zone.
    expected = pl.DataFrame(
        {
            "x1-datetime-ns": caracas_local,
            "x1-datetime-ms": santiago_local,
            "x1-datetime-us": utc_local,
        }
    ).select(
        pl.col("x1-datetime-ns").dt.cast_time_unit("ns").dt.replace_time_zone(ccs_tz),
        pl.col("x1-datetime-ms").dt.cast_time_unit("ms").dt.replace_time_zone(stg_tz),
        pl.col("x1-datetime-us").dt.cast_time_unit("us").dt.replace_time_zone(utc_tz),
    )

    assert_frame_equal(expected, out)
Loading

0 comments on commit 440e093

Please sign in to comment.