From 1e16c172dd9dcf005990aa4393251e467e7f2957 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 1 Jul 2024 11:15:06 +0100 Subject: [PATCH] chore: add Missing pyarrow methods script (#371) --- .pre-commit-config.yaml | 5 ++ utils/check_backend_completeness.py | 135 ++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 utils/check_backend_completeness.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d87472b1..8178cd851 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,3 +34,8 @@ repos: language: pygrep files: ^narwhals/ exclude: ^narwhals/dependencies\.py + - id: check-arrow-backend-completeness + name: check-arrow-backend-completeness + pass_filenames: false + entry: python -m utils.check_backend_completeness + language: python diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py new file mode 100644 index 000000000..a5352dbfc --- /dev/null +++ b/utils/check_backend_completeness.py @@ -0,0 +1,135 @@ +""" +Hopefully temporary script which tracks which methods we're missing +for the PyArrow table backend. + +If you implement a method, please remove it from the `MISSING` list. +""" + +# ruff: noqa +import sys + +import narwhals as nw +from narwhals._arrow.dataframe import ArrowDataFrame + +MISSING = [ + "DataFrame.collect", + "DataFrame.drop", + "DataFrame.drop_nulls", + "DataFrame.filter", + "DataFrame.group_by", + "DataFrame.head", + "DataFrame.is_duplicated", + "DataFrame.is_empty", + "DataFrame.is_unique", + "DataFrame.item", + "DataFrame.iter_rows", + "DataFrame.join", + "DataFrame.lazy", + "DataFrame.null_count", + "DataFrame.pipe", + "DataFrame.rename", + "DataFrame.sort", + "DataFrame.tail", + "DataFrame.to_dict", + "DataFrame.to_numpy", + "DataFrame.to_pandas", + "DataFrame.unique", + "DataFrame.with_columns", + "DataFrame.with_row_index", + "DataFrame.write_parquet", + "Series.all", + "Series.any", + "Series.cast", + "Series.cat", + "Series.diff", + "Series.drop_nulls", + "Series.fill_null", + "Series.filter", + "Series.from_iterable", + "Series.head", + "Series.is_between", + "Series.is_duplicated", + "Series.is_empty", + "Series.is_first_distinct", + "Series.is_in", + "Series.is_last_distinct", + "Series.is_null", + "Series.is_sorted", + "Series.is_unique", + "Series.item", + "Series.len", + "Series.max", + "Series.mean", + "Series.min", + "Series.n_unique", + "Series.null_count", + "Series.quantile", + "Series.round", + "Series.sample", + "Series.shift", + "Series.sort", + "Series.std", + "Series.str", + "Series.sum", + "Series.tail", + "Series.to_frame", + "Series.to_pandas", + "Series.unique", + "Series.value_counts", + "Series.zip_with", +] + + +class MockDataFrame: + # Make a little mock object so we can instantiate + # PandasDataFrame without having pandas installed + def __init__(self, dataframe): ... + + def __narwhals_dataframe__(self): + return self + + @property + def columns(self): + return [] + + @property + def loc(self): + return self + + def __getitem__(self, *args): + return MockSeries(self) + + +class MockSeries: + # Make a little mock object so we can instantiate + # nw.DataFrame without having dataframe libraries + # installed + def __init__(self, series): ... + + def __narwhals_series__(self): + return self + + @property + def name(self): + return "a" + + +if __name__ == "__main__": + missing = [] + + df_pa = ArrowDataFrame(MockDataFrame({"a": [1, 2, 3]})) + df_pd = nw.DataFrame(MockDataFrame({"a": [1, 2, 3]}), is_polars=True) + pa_methods = [f"DataFrame.{x}" for x in df_pa.__dir__() if not x.startswith("_")] + pd_methods = [f"DataFrame.{x}" for x in df_pd.__dir__() if not x.startswith("_")] + missing.extend([x for x in pd_methods if x not in pa_methods and x not in MISSING]) + + ser_pa = df_pa["a"] + ser_pd = df_pd["a"] + pa_methods = [f"Series.{x}" for x in ser_pa.__dir__() if not x.startswith("_")] + pd_methods = [f"Series.{x}" for x in ser_pd.__dir__() if not x.startswith("_")] + missing.extend([x for x in pd_methods if x not in pa_methods and x not in MISSING]) + + if missing: + print(sorted(missing)) + sys.exit(1) + sys.exit(0)