chore: add Missing pyarrow methods script (#371)

narwhals-dev · Jul 1, 2024 · 1e16c17 · 1e16c17
1 parent 8e23177
commit 1e16c17
Show file tree

Hide file tree

Showing 2 changed files with 140 additions and 0 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -34,3 +34,8 @@ repos:
       language: pygrep
       files: ^narwhals/
       exclude: ^narwhals/dependencies\.py
+    - id: check-arrow-backend-completeness
+      name: check-arrow-backend-completeness
+      pass_filenames: false
+      entry: python -m utils.check_backend_completeness
+      language: python
diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py
@@ -0,0 +1,135 @@
+"""
+Hopefully temporary script which tracks which methods we're missing
+for the PyArrow table backend.
+
+If you implement a method, please remove it from the `MISSING` list.
+"""
+
+# ruff: noqa
+import sys
+
+import narwhals as nw
+from narwhals._arrow.dataframe import ArrowDataFrame
+
+MISSING = [  # "DataFrame.<name>" / "Series.<name>" entries the check accepts as absent from the Arrow backend
+    "DataFrame.collect",
+    "DataFrame.drop",
+    "DataFrame.drop_nulls",
+    "DataFrame.filter",
+    "DataFrame.group_by",
+    "DataFrame.head",
+    "DataFrame.is_duplicated",
+    "DataFrame.is_empty",
+    "DataFrame.is_unique",
+    "DataFrame.item",
+    "DataFrame.iter_rows",
+    "DataFrame.join",
+    "DataFrame.lazy",
+    "DataFrame.null_count",
+    "DataFrame.pipe",
+    "DataFrame.rename",
+    "DataFrame.sort",
+    "DataFrame.tail",
+    "DataFrame.to_dict",
+    "DataFrame.to_numpy",
+    "DataFrame.to_pandas",
+    "DataFrame.unique",
+    "DataFrame.with_columns",
+    "DataFrame.with_row_index",
+    "DataFrame.write_parquet",
+    "Series.all",
+    "Series.any",
+    "Series.cast",
+    "Series.cat",
+    "Series.diff",
+    "Series.drop_nulls",
+    "Series.fill_null",
+    "Series.filter",
+    "Series.from_iterable",
+    "Series.head",
+    "Series.is_between",
+    "Series.is_duplicated",
+    "Series.is_empty",
+    "Series.is_first_distinct",
+    "Series.is_in",
+    "Series.is_last_distinct",
+    "Series.is_null",
+    "Series.is_sorted",
+    "Series.is_unique",
+    "Series.item",
+    "Series.len",
+    "Series.max",
+    "Series.mean",
+    "Series.min",
+    "Series.n_unique",
+    "Series.null_count",
+    "Series.quantile",
+    "Series.round",
+    "Series.sample",
+    "Series.shift",
+    "Series.sort",
+    "Series.std",
+    "Series.str",
+    "Series.sum",
+    "Series.tail",
+    "Series.to_frame",
+    "Series.to_pandas",
+    "Series.unique",
+    "Series.value_counts",
+    "Series.zip_with",
+]
+
+
+class MockDataFrame:
+    """Data-free stand-in for a native dataframe.
+
+    This script passes instances to `ArrowDataFrame` and `nw.DataFrame` so
+    that those wrappers can be constructed and have their attribute names
+    listed; the mock itself never holds any data.
+    """
+
+    def __init__(self, dataframe):
+        """Accept `dataframe` and ignore it; the mock stores nothing."""
+
+    def __getitem__(self, *_key):
+        # Whatever key is requested, answer with a mock series built from
+        # this frame.
+        return MockSeries(self)
+
+    def __narwhals_dataframe__(self):
+        # NOTE(review): presumably the hook narwhals calls to obtain a
+        # compliant dataframe -- confirm against the narwhals internals.
+        return self
+
+    @property
+    def loc(self):
+        # Returning the frame itself routes `mock.loc[...]` to `__getitem__`.
+        return self
+
+    @property
+    def columns(self):
+        # The mock never reports any columns.
+        return []
+
+
+class MockSeries:
+    """Data-free stand-in for a native series.
+
+    `MockDataFrame.__getitem__` returns one of these for any key, so series
+    wrappers can be obtained without a real dataframe library supplying the
+    underlying object.
+    """
+
+    def __init__(self, series):
+        """Accept `series` and ignore it; the mock stores nothing."""
+
+    @property
+    def name(self):
+        # Every mock series reports the same fixed name.
+        return "a"
+
+    def __narwhals_series__(self):
+        # NOTE(review): presumably the hook narwhals calls to obtain a
+        # compliant series -- confirm against the narwhals internals.
+        return self
+
+
+def _public_names(obj, prefix):
+    """Return `"<prefix>.<attr>"` for each attribute of `obj` not starting with `_`."""
+    return [f"{prefix}.{attr}" for attr in dir(obj) if not attr.startswith("_")]
+
+
+def _untracked(reference, implemented, known_missing):
+    """Return the names in `reference` that nothing accounts for.
+
+    A name is accounted for when it appears in `implemented` or in
+    `known_missing`. The order of `reference` is preserved.
+    """
+    accounted_for = set(implemented) | set(known_missing)
+    return [name for name in reference if name not in accounted_for]
+
+
+if __name__ == "__main__":
+    # Only the attribute names of the wrappers are inspected, so both are
+    # built around the data-free mock.
+    df_arrow = ArrowDataFrame(MockDataFrame({"a": [1, 2, 3]}))
+    # `nw.DataFrame` supplies the reference API the Arrow backend is
+    # compared against.
+    df_ref = nw.DataFrame(MockDataFrame({"a": [1, 2, 3]}), is_polars=True)
+    ser_arrow = df_arrow["a"]
+    ser_ref = df_ref["a"]
+
+    implemented = _public_names(df_arrow, "DataFrame") + _public_names(ser_arrow, "Series")
+    reference = _public_names(df_ref, "DataFrame") + _public_names(ser_ref, "Series")
+    missing = _untracked(reference, implemented, MISSING)
+
+    # Entries the Arrow backend now exposes should be removed from `MISSING`.
+    # This is reported on stderr only, so the exit code still depends solely
+    # on untracked gaps.
+    stale = sorted(set(MISSING) & set(implemented))
+    if stale:
+        print(f"Implemented but still listed in MISSING: {stale}", file=sys.stderr)
+
+    if missing:
+        print(sorted(missing))
+        sys.exit(1)
+    sys.exit(0)