From fae83f24128f797c028338fc3912a21a0acc4f11 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:59:00 -0800 Subject: [PATCH 1/8] Allow zoneinfo objects --- fastparquet/test/test_converted_types.py | 12 +++++++ fastparquet/util.py | 42 +++++++++++++----------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/fastparquet/test/test_converted_types.py b/fastparquet/test/test_converted_types.py index 4c539a67..bce9b6bf 100644 --- a/fastparquet/test/test_converted_types.py +++ b/fastparquet/test/test_converted_types.py @@ -2,6 +2,7 @@ """test_converted_types.py - tests for decoding data to their logical data types.""" import datetime import os.path +import zoneinfo import numpy as np import pandas as pd @@ -169,6 +170,17 @@ def test_tz_nonstring(tmpdir): assert (event_df == round).all().all() +def test_tz_zoneinfo(tmpdir): + dti = pd.DatetimeIndex([pd.Timestamp(2020, 1, 1)]).tz_localize(zoneinfo.ZoneInfo("UTC")) + df = pd.DataFrame(dti) + fn = '{}/{}.parquet'.format(tmpdir, 'zoneinfo_tmp') + df.to_parquet(fn, compression='uncompressed', engine='fastparquet') + result = pd.read_parquet(fn, engine="fastparquet") + result_dtype = result.iloc[:, 0].dtype + assert isinstance(result_dtype, pd.DatetimeTZDtype) + assert str(result_dtype.tz) == "UTC" + + def test_pandas_simple_type(tmpdir): import pandas as pd fn = os.path.join(tmpdir, "out.parquet") diff --git a/fastparquet/util.py b/fastparquet/util.py index 4e1c3115..9923e775 100644 --- a/fastparquet/util.py +++ b/fastparquet/util.py @@ -8,6 +8,7 @@ import operator import re import numbers +import zoneinfo import numpy as np import pandas as pd @@ -417,25 +418,28 @@ def get_column_metadata(column, name, object_dtype=None): 'ordered': column.cat.ordered, } dtype = column.cat.codes.dtype - elif hasattr(dtype, 'tz'): - try: - stz = str(dtype.tz) - if "UTC" in stz and ":" in stz: - extra_metadata = {'timezone': stz.strip("UTC")} - elif len(str(stz)) == 3: # like "UTC", "CET", ... - extra_metadata = {'timezone': str(stz)} - elif getattr(dtype.tz, "zone", False): - extra_metadata = {'timezone': dtype.tz.zone} - elif "pytz" not in stz: - pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz) - extra_metadata = {'timezone': stz} - elif "Offset" in stz: - extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"} - else: - raise KeyError - except Exception as e: - raise ValueError("Time-zone information could not be serialised: " - "%s, please use another" % str(dtype.tz)) from e + elif isinstance(dtype, pd.DatetimeTZDtype): + if isinstance(dtype.tz, zoneinfo.ZoneInfo): + extra_metadata = {'timezone': dtype.tz.zone.key} + else: + try: + stz = str(dtype.tz) + if "UTC" in stz and ":" in stz: + extra_metadata = {'timezone': stz.strip("UTC")} + elif len(str(stz)) == 3: # like "UTC", "CET", ... + extra_metadata = {'timezone': str(stz)} + elif getattr(dtype.tz, "zone", False): + extra_metadata = {'timezone': dtype.tz.zone} + elif "pytz" not in stz: + pd.Series([pd.to_datetime('now', utc=True)]).dt.tz_localize(stz) + extra_metadata = {'timezone': stz} + elif "Offset" in stz: + extra_metadata = {'timezone': f"{dtype.tz._minutes // 60:+03}:00"} + else: + raise KeyError + except Exception as e: + raise ValueError("Time-zone information could not be serialised: " + "%s, please use another" % str(dtype.tz)) from e else: extra_metadata = None From c114015e2c369240a7a5240cf267884a72b107c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:00:08 -0800 Subject: [PATCH 2/8] Remove zone --- fastparquet/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastparquet/util.py b/fastparquet/util.py index 9923e775..043688d9 100644 --- a/fastparquet/util.py +++ b/fastparquet/util.py @@ -420,7 +420,7 @@ def get_column_metadata(column, name, object_dtype=None): dtype = column.cat.codes.dtype elif isinstance(dtype, pd.DatetimeTZDtype): if isinstance(dtype.tz, zoneinfo.ZoneInfo): - extra_metadata = {'timezone': dtype.tz.zone.key} + extra_metadata = {'timezone': dtype.tz.key} else: try: stz = str(dtype.tz) From a50dbdb3c3b7c659b9c25284e8c84e2353c26954 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 5 Feb 2024 13:57:59 -0800 Subject: [PATCH 3/8] Ensure column name is string --- fastparquet/test/test_converted_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastparquet/test/test_converted_types.py b/fastparquet/test/test_converted_types.py index bce9b6bf..ef03dc44 100644 --- a/fastparquet/test/test_converted_types.py +++ b/fastparquet/test/test_converted_types.py @@ -172,7 +172,7 @@ def test_tz_nonstring(tmpdir): def test_tz_zoneinfo(tmpdir): dti = pd.DatetimeIndex([pd.Timestamp(2020, 1, 1)]).tz_localize(zoneinfo.ZoneInfo("UTC")) - df = pd.DataFrame(dti) + df = pd.DataFrame({"a": dti}) fn = '{}/{}.parquet'.format(tmpdir, 'zoneinfo_tmp') df.to_parquet(fn, compression='uncompressed', engine='fastparquet') result = pd.read_parquet(fn, engine="fastparquet") From 3b4d4db18e85f4e247485ba4681ddfca63e8a9da Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:00:40 -0800 Subject: [PATCH 4/8] Try bumping cibuildwheel --- .github/workflows/wheel.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml index d1b97832..867787e0 100644 --- a/.github/workflows/wheel.yml +++ b/.github/workflows/wheel.yml @@ -52,7 +52,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: joerick/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.5 - uses: actions/upload-artifact@v3 with: @@ -107,7 +107,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: joerick/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.5 - uses: actions/upload-artifact@v3 with: @@ -162,7 +162,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: joerick/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.5 - uses: actions/upload-artifact@v3 with: @@ -217,7 +217,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: joerick/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.5 - uses: actions/upload-artifact@v3 with: @@ -251,7 +251,7 @@ jobs: python-version: "3.11" - name: Build wheels - uses: joerick/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.5 - uses: actions/upload-artifact@v3 with: From 2fc02f1944646ba7ca2871e34a32ded1b75b1d2f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:04:47 -0700 Subject: [PATCH 5/8] Revert "Try bumping cibuildwheel" This reverts commit 3b4d4db18e85f4e247485ba4681ddfca63e8a9da. --- .github/workflows/wheel.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml index 867787e0..d1b97832 100644 --- a/.github/workflows/wheel.yml +++ b/.github/workflows/wheel.yml @@ -52,7 +52,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: joerick/cibuildwheel@v2.16.2 - uses: actions/upload-artifact@v3 with: @@ -107,7 +107,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: joerick/cibuildwheel@v2.16.2 - uses: actions/upload-artifact@v3 with: @@ -162,7 +162,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: joerick/cibuildwheel@v2.16.2 - uses: actions/upload-artifact@v3 with: @@ -217,7 +217,7 @@ jobs: python -m pip install delvewheel cython - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: joerick/cibuildwheel@v2.16.2 - uses: actions/upload-artifact@v3 with: @@ -251,7 +251,7 @@ jobs: python-version: "3.11" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: joerick/cibuildwheel@v2.16.2 - uses: actions/upload-artifact@v3 with: From 59486c7dcff27082d2ae6afee1a0e05dc2e5e1d3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:10:32 -0700 Subject: [PATCH 6/8] Ensure column is name --- fastparquet/test/test_converted_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastparquet/test/test_converted_types.py b/fastparquet/test/test_converted_types.py index ef03dc44..9e11f997 100644 --- a/fastparquet/test/test_converted_types.py +++ b/fastparquet/test/test_converted_types.py @@ -171,7 +171,7 @@ def test_tz_nonstring(tmpdir): def test_tz_zoneinfo(tmpdir): - dti = pd.DatetimeIndex([pd.Timestamp(2020, 1, 1)]).tz_localize(zoneinfo.ZoneInfo("UTC")) + dti = pd.DatetimeIndex([pd.Timestamp(2020, 1, 1)], name="a").tz_localize(zoneinfo.ZoneInfo("UTC")) df = pd.DataFrame({"a": dti}) fn = '{}/{}.parquet'.format(tmpdir, 'zoneinfo_tmp') df.to_parquet(fn, compression='uncompressed', engine='fastparquet') From df20bcc6e54c409aa72507f4670b086fba6ebad4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:01:32 -0700 Subject: [PATCH 7/8] Turn query planning off --- fastparquet/test/test_api.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py index 8b3b2e9b..007beced 100644 --- a/fastparquet/test/test_api.py +++ b/fastparquet/test/test_api.py @@ -132,6 +132,7 @@ def test_sorted_row_group_columns(tempdir): def test_sorted_row_group_columns_with_filters(tempdir): # fails up to 2021.08.1 + dask = pytest.importorskip('dask') dd = pytest.importorskip('dask.dataframe') # create dummy dataframe df = pd.DataFrame({'unique': [0, 0, 1, 1, 2, 2, 3, 3], @@ -140,11 +141,12 @@ def test_sorted_row_group_columns_with_filters(tempdir): 'id1', 'id2', 'id1', 'id2']}, index=[0, 0, 1, 1, 2, 2, 3, 3]) - df = dd.from_pandas(df, npartitions=2) - fn = os.path.join(tempdir, 'foo.parquet') - df.to_parquet(fn, - engine='fastparquet', - partition_on=['id']) + with dask.config.set({"dataframe.query-planning": False}): + df = dd.from_pandas(df, npartitions=2) + fn = os.path.join(tempdir, 'foo.parquet') + df.to_parquet(fn, + engine='fastparquet', + partition_on=['id']) # load ParquetFile pf = ParquetFile(fn) filters = [('id', '==', 'id1')] From 4cb25dc2e5135d5bde1542a0cc9e14d62eae5c8e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:30:34 -0700 Subject: [PATCH 8/8] add xfail --- fastparquet/test/test_api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py index 007beced..62cf749c 100644 --- a/fastparquet/test/test_api.py +++ b/fastparquet/test/test_api.py @@ -130,6 +130,11 @@ def test_sorted_row_group_columns(tempdir): assert result == expected +@pytest.mark.xfail( + reason="Not supported by dask expressions", + raises=NotImplementedError, + strict=True, +) def test_sorted_row_group_columns_with_filters(tempdir): # fails up to 2021.08.1 dask = pytest.importorskip('dask')