From b72a823ca34ee1f6fcd7d3ca737ee5614b4fcba4 Mon Sep 17 00:00:00 2001
From: Uchenna Ugoh <61969079+ugohuche@users.noreply.github.com>
Date: Thu, 11 Jul 2024 09:27:40 +0100
Subject: [PATCH 1/5] docs: add TPC-H Query 17 and 18 (#485)

* TPCH Queries 9 and 10

* update id

* strip notebook output

* TPC-H Query 18

* TPC-H Query 17

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 tpch/notebooks/q17/execute.ipynb        | 357 ++++++++++++++++++++++++
 tpch/notebooks/q17/kernel-metadata.json |  15 +
 tpch/notebooks/q18/execute.ipynb        | 211 ++++++++++++++
 tpch/notebooks/q18/kernel-metadata.json |  15 +
 4 files changed, 598 insertions(+)
 create mode 100644 tpch/notebooks/q17/execute.ipynb
 create mode 100644 tpch/notebooks/q17/kernel-metadata.json
 create mode 100644 tpch/notebooks/q18/execute.ipynb
 create mode 100644 tpch/notebooks/q18/kernel-metadata.json

diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb
new file mode 100644
index 0000000000..958c7f5bef
--- /dev/null
+++ b/tpch/notebooks/q17/execute.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 33.390992,
+     "end_time": "2024-03-22T17:24:15.601719",
+     "exception": false,
+     "start_time": "2024-03-22T17:23:42.210727",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 0.907754,
+     "end_time": "2024-03-22T17:24:39.053873",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:38.146119",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import polars as pl\n",
+    "\n",
+    "pd.options.mode.copy_on_write = True\n",
+    "pd.options.future.infer_string = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 0.021725,
+     "end_time": "2024-03-22T17:24:39.080999",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:39.059274",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Any\n",
+    "import narwhals as nw\n",
+    "\n",
+    "def q17(\n",
+    "    lineitem_ds_raw: Any,\n",
+    "    part_ds_raw: Any\n",
+    ") -> Any:\n",
+    "    \"\"\"TPC-H Q17 via Narwhals: sum(l_extendedprice) / 7.0 over low-quantity line items.\"\"\"\n",
+    "    lineitem_ds = nw.from_native(lineitem_ds_raw)\n",
+    "    part_ds = nw.from_native(part_ds_raw)\n",
+    "    \n",
+    "    var1 = \"Brand#23\"\n",
+    "    var2 = \"MED BOX\"\n",
+    "    # Keep only parts of the target brand and container, then attach their line items.\n",
+    "    query1 = (\n",
+    "        part_ds.filter(nw.col(\"p_brand\") == var1)\n",
+    "        .filter(nw.col(\"p_container\") == var2)\n",
+    "        .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n",
+    "    )\n",
+    "    # Per part: 0.2 * mean(l_quantity); keep line items below that threshold and total their price.\n",
+    "    final_query = (\n",
+    "        query1.group_by(\"p_partkey\")\n",
+    "        .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n",
+    "        .select(nw.col(\"p_partkey\").alias(\"key\"), nw.col(\"avg_quantity\"))\n",
+    "        .join(query1, left_on=\"key\", right_on=\"p_partkey\")\n",
+    "        .filter(nw.col(\"l_quantity\") < nw.col(\"avg_quantity\"))\n",
+    "        .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "    return nw.to_native(final_query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 0.013325,
+     "end_time": "2024-03-22T17:24:39.099766",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:39.086441",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n",
+    "lineitem = dir_ + 'lineitem.parquet'\n",
+    "part = dir_ + 'part.parquet'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 0.014284,
+     "end_time": "2024-03-22T17:24:39.119737",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:39.105453",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "IO_FUNCS = {\n",
+    "    'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n",
+    "    'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n",
+    "    'polars[eager]': lambda x: pl.read_parquet(x),\n",
+    "    'polars[lazy]': lambda x: pl.scan_parquet(x),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.005113,
+     "end_time": "2024-03-22T17:24:39.130472",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:39.125359",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## pandas via Narwhals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 196.786925,
+     "end_time": "2024-03-22T17:27:55.922832",
+     "exception": false,
+     "start_time": "2024-03-22T17:24:39.135907",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tool = 'pandas'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q17(fn(lineitem), fn(part))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.005184,
+     "end_time": "2024-03-22T17:27:55.933407",
+     "exception": false,
+     "start_time": "2024-03-22T17:27:55.928223",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## pandas, pyarrow dtypes, via Narwhals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 158.748353,
+     "end_time": "2024-03-22T17:30:34.688289",
+     "exception": false,
+     "start_time": "2024-03-22T17:27:55.939936",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tool = 'pandas[pyarrow]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q17(fn(lineitem), fn(part))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.005773,
+     "end_time": "2024-03-22T17:30:34.7003",
+     "exception": false,
+     "start_time": "2024-03-22T17:30:34.694527",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Polars read_parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 37.821116,
+     "end_time": "2024-03-22T17:31:12.527466",
+     "exception": false,
+     "start_time": "2024-03-22T17:30:34.70635",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tool = 'polars[eager]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q17(fn(lineitem), fn(part))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.005515,
+     "end_time": "2024-03-22T17:31:12.539068",
+     "exception": false,
+     "start_time": "2024-03-22T17:31:12.533553",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Polars scan_parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "papermill": {
+     "duration": 4.800698,
+     "end_time": "2024-03-22T17:31:17.346813",
+     "exception": false,
+     "start_time": "2024-03-22T17:31:12.546115",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tool = 'polars[lazy]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q17(fn(lineitem), fn(part)).collect()\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "with open('results.json', 'w') as fd:\n",
+    "    json.dump(results, fd)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kaggle": {
+   "accelerator": "none",
+   "dataSources": [
+    {
+     "sourceId": 167796934,
+     "sourceType": "kernelVersion"
+    }
+   ],
+   "dockerImageVersionId": 30673,
+   "isGpuEnabled": false,
+   "isInternetEnabled": true,
+   "language": "python",
+   "sourceType": "notebook"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "duration": 458.423327,
+   "end_time": "2024-03-22T17:31:18.077306",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "__notebook__.ipynb",
+   "output_path": "__notebook__.ipynb",
+   "parameters": {},
+   "start_time": "2024-03-22T17:23:39.653979",
+   "version": "2.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tpch/notebooks/q17/kernel-metadata.json b/tpch/notebooks/q17/kernel-metadata.json
new file mode 100644
index 0000000000..0fd73368cb
--- /dev/null
+++ b/tpch/notebooks/q17/kernel-metadata.json
@@ -0,0 +1,15 @@
+{
+  "id": "marcogorelli/narwhals-tpch-q17-s2",
+  "title": "Narwhals TPCH Q17 S2",
+  "code_file": "execute.ipynb",
+  "language": "python",
+  "kernel_type": "notebook",
+  "is_private": "false",
+  "enable_gpu": "false",
+  "enable_tpu": "false",
+  "enable_internet": "true",
+  "dataset_sources": [],
+  "competition_sources": [],
+  "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"],
+  "model_sources": []
+}
\ No newline at end of file
diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb
new file mode 100644
index 0000000000..21557c957b
--- /dev/null
+++ b/tpch/notebooks/q18/execute.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import polars as pl\n",
+    "\n",
+    "pd.options.mode.copy_on_write = True\n",
+    "pd.options.future.infer_string = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Any\n",
+    "import narwhals as nw\n",
+    "\n",
+    "def q18(\n",
+    "    customer_ds_raw: Any,\n",
+    "    lineitem_ds_raw: Any,\n",
+    "    orders_ds_raw: Any\n",
+    ") -> Any:\n",
+    "    \"\"\"TPC-H Q18 via Narwhals: first 100 orders (o_totalprice desc, o_orderdate asc) with summed l_quantity > 300.\"\"\"\n",
+    "    customer_ds = nw.from_native(customer_ds_raw)\n",
+    "    lineitem_ds = nw.from_native(lineitem_ds_raw)\n",
+    "    orders_ds = nw.from_native(orders_ds_raw)\n",
+    "    \n",
+    "    var1 = 300\n",
+    "    # Order keys whose line items sum to more than var1 units.\n",
+    "    query1 = (\n",
+    "        lineitem_ds.group_by(\"l_orderkey\")\n",
+    "        .agg(nw.col(\"l_quantity\").sum().alias(\"sum_quantity\"))\n",
+    "        .filter(nw.col(\"sum_quantity\") > var1)\n",
+    "    )\n",
+    "    # The semi-join keeps only those orders; quantity is then re-summed per customer/order group.\n",
+    "    q_final = (\n",
+    "        orders_ds.join(query1, left_on=\"o_orderkey\", right_on=\"l_orderkey\", how=\"semi\")\n",
+    "        .join(lineitem_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n",
+    "        .join(customer_ds, left_on=\"o_custkey\", right_on=\"c_custkey\")\n",
+    "        .group_by(\"c_name\", \"o_custkey\", \"o_orderkey\", \"o_orderdate\", \"o_totalprice\")\n",
+    "        .agg(nw.col(\"l_quantity\").sum().alias(\"col6\"))\n",
+    "        .select(\n",
+    "            nw.col(\"c_name\"),\n",
+    "            nw.col(\"o_custkey\").alias(\"c_custkey\"),\n",
+    "            nw.col(\"o_orderkey\"),\n",
+    "            nw.col(\"o_orderdate\").alias(\"o_orderdat\"),\n",
+    "            nw.col(\"o_totalprice\"),\n",
+    "            nw.col(\"col6\"),\n",
+    "        )\n",
+    "        .sort(by=[\"o_totalprice\", \"o_orderdat\"], descending=[True, False])\n",
+    "        .head(100)\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "    return nw.to_native(q_final)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n",
+    "customer = dir_ + 'customer.parquet'\n",
+    "lineitem = dir_ + 'lineitem.parquet'\n",
+    "orders = dir_ + 'orders.parquet'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "IO_FUNCS = {\n",
+    "    'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n",
+    "    'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n",
+    "    'polars[eager]': lambda x: pl.read_parquet(x),\n",
+    "    'polars[lazy]': lambda x: pl.scan_parquet(x),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## pandas via Narwhals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool = 'pandas'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## pandas, pyarrow dtypes, via Narwhals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool = 'pandas[pyarrow]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Polars read_parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool = 'polars[eager]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Polars scan_parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool = 'polars[lazy]'\n",
+    "fn = IO_FUNCS[tool]\n",
+    "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders)).collect()\n",
+    "results[tool] = timings.all_runs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "with open('results.json', 'w') as fd:\n",
+    "    json.dump(results, fd)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tpch/notebooks/q18/kernel-metadata.json b/tpch/notebooks/q18/kernel-metadata.json
new file mode 100644
index 0000000000..e1c11e53c5
--- /dev/null
+++ b/tpch/notebooks/q18/kernel-metadata.json
@@ -0,0 +1,15 @@
+{
+  "id": "marcogorelli/narwhals-tpch-q18-s2",
+  "title": "Narwhals TPCH Q18 S2",
+  "code_file": "execute.ipynb",
+  "language": "python",
+  "kernel_type": "notebook",
+  "is_private": "false",
+  "enable_gpu": "false",
+  "enable_tpu": "false",
+  "enable_internet": "true",
+  "dataset_sources": [],
+  "competition_sources": [],
+  "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"],
+  "model_sources": []
+}
\ No newline at end of file

From 7e72d88ac6f731ed3aaf8a29d607141471a4c286 Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Thu, 11 Jul 2024 10:33:05 +0200
Subject: [PATCH 2/5] feat: pyarrow `to_numpy`, `to_dict`, `with_row_index`
 (#487)

* feat: to_numpy,to_dict,pipe,with_row_index

* pipe comes from upstream
---
 narwhals/_arrow/dataframe.py        | 26 ++++++++++++++++++++++++++
 tests/frame/pipe_test.py            |  4 ++--
 tests/frame/to_dict_test.py         |  8 ++++----
 tests/frame/to_numpy_test.py        |  4 ++--
 tests/frame/with_row_index_test.py  |  6 +++---
 utils/check_backend_completeness.py |  3 ---
 6 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
index 9aab773816..bf912b0d10 100644
--- a/narwhals/_arrow/dataframe.py
+++ b/narwhals/_arrow/dataframe.py
@@ -195,6 +195,32 @@ def sort(
     def to_pandas(self) -> Any:
         return self._native_dataframe.to_pandas()
 
+    def to_numpy(self) -> Any:
+        import numpy as np
+        # One NumPy array per table column, stacked as the columns of a 2-D array.
+        return np.column_stack([col.to_numpy() for col in self._native_dataframe.columns])
+
+    def to_dict(self, *, as_series: bool) -> Any:
+        df = self._native_dataframe
+        # Lazy (name, column) pairs; exactly one branch below consumes them.
+        names_and_values = zip(df.column_names, df.columns)
+        if as_series:  # values wrapped as ArrowSeries
+            from narwhals._arrow.series import ArrowSeries
+
+            return {
+                name: ArrowSeries(col, name=name, backend_version=self._backend_version)
+                for name, col in names_and_values
+            }
+        else:  # values converted with to_pylist()
+            return {name: col.to_pylist() for name, col in names_and_values}
+
+    def with_row_index(self, name: str) -> Self:
+        pa = get_pyarrow()
+        df = self._native_dataframe
+        # 0-based row numbers. NOTE(review): added last; Polars adds it first - confirm.
+        row_indices = pa.array(range(df.num_rows))
+        return self._from_native_dataframe(df.append_column(name, row_indices))
+
     def lazy(self) -> Self:
         return self
 
diff --git a/tests/frame/pipe_test.py b/tests/frame/pipe_test.py
index 6f49966b97..9dd66f10a9 100644
--- a/tests/frame/pipe_test.py
+++ b/tests/frame/pipe_test.py
@@ -9,8 +9,8 @@
 }
 
 
-def test_pipe(constructor: Any) -> None:
-    df = nw.from_native(constructor(data))
+def test_pipe(constructor_with_pyarrow: Any) -> None:
+    df = nw.from_native(constructor_with_pyarrow(data))
     columns = df.lazy().collect().columns
     result = df.pipe(lambda _df: _df.select([x for x in columns if len(x) == 2]))
     expected = {"ab": ["foo", "bars"]}
diff --git a/tests/frame/to_dict_test.py b/tests/frame/to_dict_test.py
index b0950c5c9f..8fa31c3361 100644
--- a/tests/frame/to_dict_test.py
+++ b/tests/frame/to_dict_test.py
@@ -3,16 +3,16 @@
 import narwhals.stable.v1 as nw
 
 
-def test_to_dict(constructor: Any) -> None:
+def test_to_dict(constructor_with_pyarrow: Any) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]}
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_with_pyarrow(data), eager_only=True)
     result = df.to_dict(as_series=False)
     assert result == data
 
 
-def test_to_dict_as_series(constructor: Any) -> None:
+def test_to_dict_as_series(constructor_with_pyarrow: Any) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]}
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_with_pyarrow(data), eager_only=True)
     result = df.to_dict(as_series=True)
     assert isinstance(result["a"], nw.Series)
     assert isinstance(result["b"], nw.Series)
diff --git a/tests/frame/to_numpy_test.py b/tests/frame/to_numpy_test.py
index 6f516334d6..5cfa69bf71 100644
--- a/tests/frame/to_numpy_test.py
+++ b/tests/frame/to_numpy_test.py
@@ -7,9 +7,9 @@
 import narwhals.stable.v1 as nw
 
 
-def test_convert_numpy(constructor: Any) -> None:
+def test_convert_numpy(constructor_with_pyarrow: Any) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}
-    df_raw = constructor(data)
+    df_raw = constructor_with_pyarrow(data)
     result = nw.from_native(df_raw, eager_only=True).to_numpy()
 
     expected = np.array([[1, 3, 2], [4, 4, 6], [7.1, 8, 9]]).T
diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py
index 1b0ad5792b..ef54557bd3 100644
--- a/tests/frame/with_row_index_test.py
+++ b/tests/frame/with_row_index_test.py
@@ -9,9 +9,9 @@
 }
 
 
-def test_with_row_index(constructor: Any) -> None:
-    result = nw.from_native(constructor(data)).with_row_index()
+def test_with_row_index(constructor_with_pyarrow: Any) -> None:
+    result = nw.from_native(constructor_with_pyarrow(data)).with_row_index()
     expected = {"a": ["foo", "bars"], "ab": ["foo", "bars"], "index": [0, 1]}
     compare_dicts(result, expected)
-    result = nw.from_native(constructor(data)).lazy().with_row_index()
+    result = nw.from_native(constructor_with_pyarrow(data)).lazy().with_row_index()
     compare_dicts(result, expected)
diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py
index 924115fbb3..abed808c6a 100644
--- a/utils/check_backend_completeness.py
+++ b/utils/check_backend_completeness.py
@@ -25,10 +25,7 @@
     "DataFrame.pipe",
     "DataFrame.rename",
     "DataFrame.tail",
-    "DataFrame.to_dict",
-    "DataFrame.to_numpy",
     "DataFrame.unique",
-    "DataFrame.with_row_index",
     "DataFrame.write_parquet",
     "Series.drop_nulls",
     "Series.fill_null",

From e5dc47431be30f49d7e1c0771193a2fdc714ed31 Mon Sep 17 00:00:00 2001
From: Bruno Conde Kind <condekind@users.noreply.github.com>
Date: Thu, 11 Jul 2024 05:40:18 -0300
Subject: [PATCH 3/5] Moved/renamed test_expr_unary to
 expr/unary_test::test_unary (#486)

* Moved/renamed test_expr_unary to expr/unary_test::test_unary

* Update tests/expr/unary_test.py

Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>

---------

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
---
 tests/expr/unary_test.py   | 22 ++++++++++++++++++++++
 tests/frame/test_common.py | 18 ------------------
 2 files changed, 22 insertions(+), 18 deletions(-)
 create mode 100644 tests/expr/unary_test.py

diff --git a/tests/expr/unary_test.py b/tests/expr/unary_test.py
new file mode 100644
index 0000000000..c130844364
--- /dev/null
+++ b/tests/expr/unary_test.py
@@ -0,0 +1,22 @@
+from typing import Any
+
+import narwhals as nw
+from tests.utils import compare_dicts
+
+
+def test_unary(constructor_with_lazy: Any) -> None:
+    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
+    result = (
+        nw.from_native(constructor_with_lazy(data))
+        .with_columns(  # one reduction per new column
+            a_mean=nw.col("a").mean(),
+            a_sum=nw.col("a").sum(),
+            b_nunique=nw.col("b").n_unique(),
+            z_min=nw.col("z").min(),
+            z_max=nw.col("z").max(),
+        )
+        .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique())
+    )
+    result_native = nw.to_native(result)
+    expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]}
+    compare_dicts(result_native, expected)  # a single value per column is expected
diff --git a/tests/frame/test_common.py b/tests/frame/test_common.py
index 69be5e6d1c..99a16de605 100644
--- a/tests/frame/test_common.py
+++ b/tests/frame/test_common.py
@@ -180,24 +180,6 @@ def test_expr_binary(df_raw: Any) -> None:
     compare_dicts(result_native, expected)
 
 
-@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_lazy])
-def test_expr_unary(df_raw: Any) -> None:
-    result = (
-        nw.from_native(df_raw)
-        .with_columns(
-            a_mean=nw.col("a").mean(),
-            a_sum=nw.col("a").sum(),
-            b_nunique=nw.col("b").n_unique(),
-            z_min=nw.col("z").min(),
-            z_max=nw.col("z").max(),
-        )
-        .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique())
-    )
-    result_native = nw.to_native(result)
-    expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]}
-    compare_dicts(result_native, expected)
-
-
 @pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd, df_lazy])
 def test_expr_transform(df_raw: Any) -> None:
     result = nw.from_native(df_raw).with_columns(

From 013be1a5bc4f7442c094b41033726611a7e51fd9 Mon Sep 17 00:00:00 2001
From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com>
Date: Thu, 11 Jul 2024 10:42:16 +0200
Subject: [PATCH 4/5] bug: fix `__getitem__` for tuple of row and col keys
 (#462)

---
 narwhals/_arrow/dataframe.py       | 20 +++++++++++++++++++-
 narwhals/_pandas_like/dataframe.py | 27 ++++++++++++++++++++++++++-
 narwhals/dataframe.py              | 15 +++++++++++----
 narwhals/stable/v1.py              |  5 ++++-
 tests/frame/slice_test.py          | 22 ++++++++++++++++++++++
 5 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
index bf912b0d10..525402b8e1 100644
--- a/narwhals/_arrow/dataframe.py
+++ b/narwhals/_arrow/dataframe.py
@@ -72,13 +72,21 @@ def get_column(self, name: str) -> ArrowSeries:
             backend_version=self._backend_version,
         )
 
+    @overload
+    def __getitem__(self, item: tuple[Sequence[int], str | int]) -> ArrowSeries: ...  # type: ignore[overload-overlap]
+
+    @overload
+    def __getitem__(self, item: Sequence[int]) -> ArrowDataFrame: ...
+
     @overload
     def __getitem__(self, item: str) -> ArrowSeries: ...
 
     @overload
     def __getitem__(self, item: slice) -> ArrowDataFrame: ...
 
-    def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame:
+    def __getitem__(
+        self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
+    ) -> ArrowSeries | ArrowDataFrame:
         if isinstance(item, str):
             from narwhals._arrow.series import ArrowSeries
 
@@ -87,6 +95,16 @@ def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame:
                 name=item,
                 backend_version=self._backend_version,
             )
+        elif isinstance(item, tuple) and len(item) == 2:
+            from narwhals._arrow.series import ArrowSeries
+
+            # PyArrow columns are always strings
+            col_name = item[1] if isinstance(item[1], str) else self.columns[item[1]]
+            return ArrowSeries(
+                self._native_dataframe[col_name].take(item[0]),
+                name=col_name,
+                backend_version=self._backend_version,
+            )
 
         elif isinstance(item, slice):
             if item.step is not None and item.step != 1:
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
index a889885c15..6be62d3e03 100644
--- a/narwhals/_pandas_like/dataframe.py
+++ b/narwhals/_pandas_like/dataframe.py
@@ -99,13 +99,21 @@ def get_column(self, name: str) -> PandasLikeSeries:
             backend_version=self._backend_version,
         )
 
+    @overload
+    def __getitem__(self, item: tuple[Sequence[int], str | int]) -> PandasLikeSeries: ...  # type: ignore[overload-overlap]
+
+    @overload
+    def __getitem__(self, item: Sequence[int]) -> PandasLikeDataFrame: ...
+
     @overload
     def __getitem__(self, item: str) -> PandasLikeSeries: ...
 
     @overload
     def __getitem__(self, item: slice) -> PandasLikeDataFrame: ...
 
-    def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFrame:
+    def __getitem__(
+        self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
+    ) -> PandasLikeSeries | PandasLikeDataFrame:
         if isinstance(item, str):
             from narwhals._pandas_like.series import PandasLikeSeries
 
@@ -115,6 +123,23 @@ def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFra
                 backend_version=self._backend_version,
             )
 
+        elif isinstance(item, tuple) and len(item) == 2:
+            from narwhals._pandas_like.series import PandasLikeSeries
+
+            if isinstance(item[1], str):
+                native_series = self._native_dataframe.loc[item]
+            elif isinstance(item[1], int):
+                native_series = self._native_dataframe.iloc[item]
+            else:  # pragma: no cover
+                msg = f"Expected str or int, got: {type(item[1])}"
+                raise TypeError(msg)
+
+            return PandasLikeSeries(
+                native_series,
+                implementation=self._implementation,
+                backend_version=self._backend_version,
+            )
+
         elif isinstance(item, (slice, Sequence)) or (
             (np := get_numpy()) is not None
             and isinstance(item, np.ndarray)
diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
index e255844a98..e863803686 100644
--- a/narwhals/dataframe.py
+++ b/narwhals/dataframe.py
@@ -456,7 +456,10 @@ def get_column(self, name: str) -> Series:
         )
 
     @overload
-    def __getitem__(self, item: Sequence[int]) -> Series: ...
+    def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ...  # type: ignore[overload-overlap]
+
+    @overload
+    def __getitem__(self, item: Sequence[int]) -> Self: ...
 
     @overload
     def __getitem__(self, item: str) -> Series: ...
@@ -464,7 +467,9 @@ def __getitem__(self, item: str) -> Series: ...
     @overload
     def __getitem__(self, item: slice) -> Self: ...
 
-    def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
+    def __getitem__(
+        self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
+    ) -> Series | Self:
         """
         Extract column or slice of DataFrame.
 
@@ -473,7 +478,9 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
 
                 - str: extract column
                 - slice or Sequence of integers: slice rows from dataframe.
-
+                - tuple of Sequence of integers and str or int: slice rows and extract column at the same time.
+                  If the second element of the tuple is an integer, it is interpreted as the column index. Otherwise,
+                  it is interpreted as the column name.
         Notes:
             In contrast with Polars, pandas allows non-string column names.
             If you don't know whether the column name you're trying to extract
@@ -508,7 +515,7 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
                 2
             ]
         """
-        if isinstance(item, str):
+        if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2):
             from narwhals.series import Series
 
             return Series(
diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py
index 86602a1de0..e27a494852 100644
--- a/narwhals/stable/v1.py
+++ b/narwhals/stable/v1.py
@@ -67,7 +67,10 @@ class DataFrame(NwDataFrame[IntoDataFrameT]):
     """
 
     @overload
-    def __getitem__(self, item: Sequence[int]) -> Series: ...
+    def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ...  # type: ignore[overload-overlap]
+
+    @overload
+    def __getitem__(self, item: Sequence[int]) -> Self: ...
 
     @overload
     def __getitem__(self, item: str) -> Series: ...
diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py
index 45390c561e..4a911142ef 100644
--- a/tests/frame/slice_test.py
+++ b/tests/frame/slice_test.py
@@ -5,8 +5,10 @@
 import polars as pl
 import pyarrow as pa
 import pytest
+from pandas.testing import assert_series_equal
 
 import narwhals.stable.v1 as nw
+from narwhals.utils import parse_version
 from tests.utils import compare_dicts
 
 data = {
@@ -69,3 +71,23 @@ def test_gather_pandas_index() -> None:
     result = nw.from_native(df, eager_only=True)[[1, 2]]
     expected = {"a": [1, 2], "b": [4, 2]}
     compare_dicts(result, expected)
+
+
+def test_gather_rows_cols(constructor_with_pyarrow: Any) -> None:
+    native_df = constructor_with_pyarrow(data)
+    df = nw.from_native(native_df, eager_only=True)
+    is_pandas_wo_pyarrow = parse_version(pd.__version__) < parse_version("1.0.0")
+    if isinstance(native_df, pa.Table) or is_pandas_wo_pyarrow:
+        # PyArrowSeries do not have `to_pandas`
+        result = df[[0, 3, 1], 1].to_numpy()  # int key: column by position
+        expected = np.array([11, 14, 12])
+        assert np.array_equal(result, expected)
+        result = df[np.array([0, 3, 1]), "b"].to_numpy()  # str key: column by name
+        assert np.array_equal(result, expected)
+    else:  # expected index: 0..2 for Polars, row labels 0, 3, 1 otherwise
+        result = df[[0, 3, 1], 1].to_pandas()
+        expected_index = range(3) if isinstance(native_df, pl.DataFrame) else [0, 3, 1]
+        expected = pd.Series([11, 14, 12], name="b", index=expected_index)
+        assert_series_equal(result, expected, check_dtype=False)
+        result = df[np.array([0, 3, 1]), "b"].to_pandas()
+        assert_series_equal(result, expected, check_dtype=False)

From 66a1909e4a7ed3f56df09af14be40e5a0c09ad60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= <deamarialeon@gmail.com>
Date: Thu, 11 Jul 2024 10:51:19 +0200
Subject: [PATCH 5/5] Fix link (#483)

---
 docs/overhead.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/overhead.md b/docs/overhead.md
index 7edfb1f950..1477f6fa67 100644
--- a/docs/overhead.md
+++ b/docs/overhead.md
@@ -10,7 +10,7 @@ vs running pandas via Narwhals:
 
 ![Comparison of pandas vs "pandas via Narwhals" timings on TPC-H queries showing neglibile overhead](https://github.com/narwhals-dev/narwhals/assets/33491632/71029c26-4121-43bb-90fb-5ac1c16ab8a2)
 
-[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2-w-native)'s the code to
+[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2)'s the code to
 reproduce the plot above, check the input
 sources for notebooks which run each individual query, along with
 the data sources.