From d2725cb7b746b686fc4671cef06b653efa91847e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 2 Nov 2024 10:27:14 +0000 Subject: [PATCH 1/2] fix: fix parsing of fixed-offset timezones --- narwhals/_pandas_like/utils.py | 61 +++++++++++++++++++++++++++------- tests/dtypes_test.py | 15 +++++++++ 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 99181bc1e..7241cd814 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -30,6 +30,51 @@ Implementation.CUDF, Implementation.MODIN, } +PD_DATETIME_RGX = r"""^ + datetime64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s? # Optional whitespace after comma + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]+ # Match timezone name, e.g., UTC, America/New_York + (?: # Begin optional non-capturing group for offset + [+-]\d{2}:\d{2} # Match offset in format +HH:MM or -HH:MM + )? # End optional offset group + ) # End time_zone group + )? # End optional timezone group + \] # Closing bracket +$""" +PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE) +PA_DATETIME_RGX = r"""^ + timestamp\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + (?:, # Begin non-capturing group for optional timezone + \s?tz= # Match "tz=" prefix + (?P<time_zone> # Start named group for timezone + [a-zA-Z\/]* # Match timezone name (e.g., UTC, America/New_York) + (?: # Begin optional non-capturing group for offset + [+-]\d{2}:\d{2} # Match offset in format +HH:MM or -HH:MM + )? # End optional offset group + ) # End time_zone group + )?
# End optional timezone group + \] # Closing bracket for timestamp + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE) +PD_DURATION_RGX = r"""^ + timedelta64\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for timedelta64 +$""" + +PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE) +PA_DURATION_RGX = r"""^ + duration\[ + (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns + \] # Closing bracket for duration + \[pyarrow\] # Literal string "[pyarrow]" +$""" +PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE) def validate_column_comparand(index: Any, other: Any) -> Any: @@ -223,14 +268,6 @@ def native_to_narwhals_dtype( ) -> DType: dtype = str(native_column.dtype) - pd_datetime_rgx = ( - r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$" - ) - pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$" - - pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$" - pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$" - if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}: @@ -269,14 +306,14 @@ def native_to_narwhals_dtype( return dtypes.Boolean() if dtype == "category" or dtype.startswith("dictionary<"): return dtypes.Categorical() - if (match_ := re.match(pd_datetime_rgx, dtype)) or ( - match_ := re.match(pa_datetime_rgx, dtype) + if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( + match_ := PATTERN_PA_DATETIME.match(dtype) ): dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] dt_time_zone: str | None = match_.group("time_zone") return dtypes.Datetime(dt_time_unit, dt_time_zone) - if (match_ := re.match(pd_duration_rgx, dtype)) or ( - match_ := re.match(pa_duration_rgx, dtype) + if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( + match_ :=
PATTERN_PA_DURATION.match(dtype) ): du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] return dtypes.Duration(du_time_unit) diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index 0d6363aee..bb8439b12 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -176,3 +176,18 @@ def test_pandas_inplace_modification_1267(request: pytest.FixtureRequest) -> Non assert snw.dtype == nw.Int64 s[0] = 999.5 assert snw.dtype == nw.Float64 + + +def test_pandas_fixed_offset_1302() -> None: + result = nw.from_native( + pd.Series(pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])), + series_only=True, + ).dtype + assert result == nw.Datetime("ns", "UTC+01:00") + result = nw.from_native( + pd.Series(pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])).convert_dtypes( + dtype_backend="pyarrow" + ), + series_only=True, + ).dtype + assert result == nw.Datetime("ns", "+01:00") From 62c1ac47c7eaa196d3fbe9922bfbd771d2660d63 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 2 Nov 2024 10:33:07 +0000 Subject: [PATCH 2/2] versions --- narwhals/_pandas_like/utils.py | 10 +++++----- tests/dtypes_test.py | 22 ++++++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 7241cd814..8074413d7 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -34,15 +34,15 @@ datetime64\[ (?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns (?:, # Begin non-capturing group for optional timezone - \s? # Optional whitespace after comma + \s* # Optional whitespace after comma (?P<time_zone> # Start named group for timezone [a-zA-Z\/]+ # Match timezone name, e.g., UTC, America/New_York - (?: # Begin optional non-capturing group for offset - [+-]\d{2}:\d{2} # Match offset in format +HH:MM or -HH:MM - )? # End optional offset group + (?:[+-]\d{2}:\d{2})?
# Optional offset in format +HH:MM or -HH:MM + | # OR + pytz\.FixedOffset\(\d+\) # Match pytz.FixedOffset with integer offset in parentheses ) # End time_zone group )? # End optional timezone group - \] # Closing bracket + \] # Closing bracket for datetime64 $""" PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE) PA_DATETIME_RGX = r"""^ diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index bb8439b12..2993521b9 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -183,11 +183,17 @@ def test_pandas_fixed_offset_1302() -> None: pd.Series(pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])), series_only=True, ).dtype - assert result == nw.Datetime("ns", "UTC+01:00") - result = nw.from_native( - pd.Series(pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"])).convert_dtypes( - dtype_backend="pyarrow" - ), - series_only=True, - ).dtype - assert result == nw.Datetime("ns", "+01:00") + if PANDAS_VERSION >= (2,): + assert result == nw.Datetime("ns", "UTC+01:00") + else: # pragma: no cover + assert result == nw.Datetime("ns", "pytz.FixedOffset(60)") + if PANDAS_VERSION >= (2,): + result = nw.from_native( + pd.Series( + pd.to_datetime(["2020-01-01T00:00:00.000000000+01:00"]) + ).convert_dtypes(dtype_backend="pyarrow"), + series_only=True, + ).dtype + assert result == nw.Datetime("ns", "+01:00") + else: # pragma: no cover + pass