Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fixed-offset datetimes weren't being parsed #1303

Merged
merged 2 commits into from
Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 49 additions & 12 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,51 @@
Implementation.CUDF,
Implementation.MODIN,
}
# Matches datetime dtype strings of the form "datetime64[<unit>]" or
# "datetime64[<unit>, <time zone>]", capturing ``time_unit`` and (optionally)
# ``time_zone``.
#
# The zone-name character class covers letters, digits, "_", "/", "+" and "-"
# so that names such as "America/New_York", "Etc/GMT+5" and
# "America/Port-au-Prince" are accepted, and the pytz alternative accepts a
# leading minus sign so negative fixed offsets ("pytz.FixedOffset(-60)") parse.
PD_DATETIME_RGX = r"""^
    datetime64\[
        (?P<time_unit>s|ms|us|ns)            # Time unit: s, ms, us, or ns
        (?:,                                 # Begin optional timezone section
            \s*                              # Optional whitespace after the comma
            (?P<time_zone>                   # Start named group for the timezone
                [a-zA-Z0-9_\/+-]+            # Zone name, e.g. UTC, America/New_York, Etc/GMT+5
                (?:[+-]\d{2}:\d{2})?         # Optional offset in +HH:MM or -HH:MM form
                |                            # OR
                pytz\.FixedOffset\(-?\d+\)   # pytz.FixedOffset(<minutes>), minutes may be negative
            )                                # End time_zone group
        )?                                   # End optional timezone section
    \]                                       # Closing bracket for datetime64
$"""
# re.VERBOSE makes the layout whitespace and inline comments above insignificant.
PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)
# Matches timestamp dtype strings of the form "timestamp[<unit>][pyarrow]" or
# "timestamp[<unit>, tz=<time zone>][pyarrow]", capturing ``time_unit`` and
# (optionally) ``time_zone``.
#
# The zone-name character class covers letters, digits, "_", "/", "+" and "-"
# so that names such as "America/New_York" and "Etc/GMT-14" are accepted; a bare
# "+HH:MM" / "-HH:MM" offset is still matched through the optional offset group.
PA_DATETIME_RGX = r"""^
    timestamp\[
        (?P<time_unit>s|ms|us|ns)            # Time unit: s, ms, us, or ns
        (?:,                                 # Begin optional timezone section
            \s?tz=                           # "tz=" prefix, optionally preceded by one whitespace
            (?P<time_zone>                   # Start named group for the timezone
                [a-zA-Z0-9_\/+-]*            # Zone name, e.g. UTC, America/New_York (may be empty)
                (?:                          # Begin optional offset group
                    [+-]\d{2}:\d{2}          # Offset in +HH:MM or -HH:MM form
                )?                           # End optional offset group
            )                                # End time_zone group
        )?                                   # End optional timezone section
    \]                                       # Closing bracket for timestamp
    \[pyarrow\]                              # Literal "[pyarrow]" suffix
$"""
# re.VERBOSE makes the layout whitespace and inline comments above insignificant.
PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)
# Matches duration dtype strings of the form "timedelta64[<unit>]", where the
# unit is one of s, ms, us or ns; the unit is captured in the ``time_unit`` group.
PD_DURATION_RGX = r"""^
timedelta64\[
(?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns
\] # Closing bracket for timedelta64
$"""

# Compiled with re.VERBOSE, so whitespace and the inline "#" comments inside the
# pattern are ignored.
PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)
# Matches duration dtype strings of the form "duration[<unit>][pyarrow]", where
# the unit is one of s, ms, us or ns; the unit is captured in the ``time_unit``
# group.
PA_DURATION_RGX = r"""^
duration\[
(?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns
\] # Closing bracket for duration
\[pyarrow\] # Literal string "[pyarrow]"
$"""
# Compiled with re.VERBOSE, so whitespace and the inline "#" comments inside the
# pattern are ignored.
PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)


def validate_column_comparand(index: Any, other: Any) -> Any:
Expand Down Expand Up @@ -223,14 +268,6 @@ def native_to_narwhals_dtype(
) -> DType:
dtype = str(native_column.dtype)

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"

pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -269,14 +306,14 @@ def native_to_narwhals_dtype(
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
match_ := PATTERN_PA_DATETIME.match(dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
match_ := PATTERN_PA_DURATION.match(dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
Expand Down
21 changes: 21 additions & 0 deletions tests/dtypes_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,24 @@ def test_pandas_inplace_modification_1267(request: pytest.FixtureRequest) -> Non
assert snw.dtype == nw.Int64
s[0] = 999.5
assert snw.dtype == nw.Float64


def test_pandas_fixed_offset_1302() -> None:
    """Check that a fixed-offset (+01:00) datetime Series gets a narwhals Datetime dtype.

    The expected time-zone string depends on the pandas major version, and the
    pyarrow-backed conversion is only exercised on pandas >= 2.
    """
    timestamps = pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])
    is_pandas_2_plus = PANDAS_VERSION >= (2,)

    # Default pandas-backed Series.
    default_dtype = nw.from_native(pd.Series(timestamps), series_only=True).dtype
    if is_pandas_2_plus:
        assert default_dtype == nw.Datetime("ns", "UTC+01:00")
    else:  # pragma: no cover
        assert default_dtype == nw.Datetime("ns", "pytz.FixedOffset(60)")

    if not is_pandas_2_plus:  # pragma: no cover
        return

    # PyArrow-backed Series obtained through convert_dtypes.
    arrow_series = pd.Series(timestamps).convert_dtypes(dtype_backend="pyarrow")
    arrow_dtype = nw.from_native(arrow_series, series_only=True).dtype
    assert arrow_dtype == nw.Datetime("ns", "+01:00")
Loading