From b72a823ca34ee1f6fcd7d3ca737ee5614b4fcba4 Mon Sep 17 00:00:00 2001 From: Uchenna Ugoh <61969079+ugohuche@users.noreply.github.com> Date: Thu, 11 Jul 2024 09:27:40 +0100 Subject: [PATCH 1/5] docs: add TPC-H Query 17 and 18 (#485) * TPCH Queries 9 and 10 * update id * strip notebook output * TPC-H Query 18 * TPC-H Query 17 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tpch/notebooks/q17/execute.ipynb | 357 ++++++++++++++++++++++++ tpch/notebooks/q17/kernel-metadata.json | 15 + tpch/notebooks/q18/execute.ipynb | 211 ++++++++++++++ tpch/notebooks/q18/kernel-metadata.json | 15 + 4 files changed, 598 insertions(+) create mode 100644 tpch/notebooks/q17/execute.ipynb create mode 100644 tpch/notebooks/q17/kernel-metadata.json create mode 100644 tpch/notebooks/q18/execute.ipynb create mode 100644 tpch/notebooks/q18/kernel-metadata.json diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb new file mode 100644 index 000000000..958c7f5be --- /dev/null +++ b/tpch/notebooks/q17/execute.ipynb @@ -0,0 +1,357 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + 
"import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q17(\n", + " lineitem_ds_raw: Any,\n", + " part_ds_raw: Any\n", + ") -> Any:\n", + "\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " part_ds = nw.from_native(part_ds_raw)\n", + " \n", + " var1 = \"Brand#23\"\n", + " var2 = \"MED BOX\"\n", + " \n", + " query1 = (\n", + " part_ds.filter(nw.col(\"p_brand\") == var1)\n", + " .filter(nw.col(\"p_container\") == var2)\n", + " .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n", + " )\n", + " \n", + " final_query = (\n", + " query1.group_by(\"p_partkey\")\n", + " .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n", + " .select(nw.col(\"p_partkey\").alias(\"key\"), nw.col(\"avg_quantity\"))\n", + " .join(query1, left_on=\"key\", right_on=\"p_partkey\")\n", + " .filter(nw.col(\"l_quantity\") < nw.col(\"avg_quantity\"))\n", + " .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n", + " )\n", + "\n", + "\n", + " return nw.to_native(final_query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "part = dir_ + 'part.parquet'" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": 
"2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + 
"with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q17/kernel-metadata.json b/tpch/notebooks/q17/kernel-metadata.json new file mode 100644 index 000000000..0fd73368c --- /dev/null +++ b/tpch/notebooks/q17/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q17-s2", + "title": "Narwhals TPCH Q17 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb new file mode 100644 index 000000000..21557c957 --- /dev/null +++ b/tpch/notebooks/q18/execute.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q18(\n", + " customer_ds_raw: Any,\n", + " lineitem_ds_raw: Any,\n", + " orders_ds_raw: Any\n", + ") -> Any:\n", + "\n", + " customer_ds = nw.from_native(customer_ds_raw)\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " \n", + " var1 = 300\n", + "\n", + " query1 = (\n", + " lineitem_ds.group_by(\"l_orderkey\")\n", + " .agg(nw.col(\"l_quantity\").sum().alias(\"sum_quantity\"))\n", + " .filter(nw.col(\"sum_quantity\") > var1)\n", + " )\n", + "\n", + " q_final = (\n", + " orders_ds.join(query1, left_on=\"o_orderkey\", right_on=\"l_orderkey\", how=\"semi\")\n", + " .join(lineitem_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " .join(customer_ds, left_on=\"o_custkey\", right_on=\"c_custkey\")\n", + " .group_by(\"c_name\", \"o_custkey\", \"o_orderkey\", \"o_orderdate\", \"o_totalprice\")\n", + " .agg(nw.col(\"l_quantity\").sum().alias(\"col6\"))\n", + " .select(\n", + " nw.col(\"c_name\"),\n", + " nw.col(\"o_custkey\").alias(\"c_custkey\"),\n", + " nw.col(\"o_orderkey\"),\n", + " nw.col(\"o_orderdate\").alias(\"o_orderdat\"),\n", + " nw.col(\"o_totalprice\"),\n", + " nw.col(\"col6\"),\n", + " )\n", + " .sort(by=[\"o_totalprice\", \"o_orderdat\"], descending=[True, False])\n", + " .head(100)\n", + " )\n", + "\n", + "\n", + " return nw.to_native(q_final)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "customer = dir_ + 'customer.parquet'\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "orders = dir_ + 'orders.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tpch/notebooks/q18/kernel-metadata.json b/tpch/notebooks/q18/kernel-metadata.json new file mode 100644 index 000000000..e1c11e53c --- /dev/null +++ b/tpch/notebooks/q18/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q18-s2", + "title": "Narwhals TPCH Q18 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file From 7e72d88ac6f731ed3aaf8a29d607141471a4c286 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:33:05 +0200 Subject: [PATCH 2/5] feat: pyarrow `to_numpy`, `to_dict`, `with_row_index` (#487) * feat: to_numpy,to_dict,pipe,with_row_index * pipe comes from upstream --- narwhals/_arrow/dataframe.py | 26 ++++++++++++++++++++++++++ tests/frame/pipe_test.py | 4 ++-- tests/frame/to_dict_test.py | 8 ++++---- tests/frame/to_numpy_test.py | 4 ++-- tests/frame/with_row_index_test.py | 6 +++--- utils/check_backend_completeness.py | 3 --- 6 
files changed, 37 insertions(+), 14 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 9aab77381..bf912b0d1 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -195,6 +195,32 @@ def sort( def to_pandas(self) -> Any: return self._native_dataframe.to_pandas() + def to_numpy(self) -> Any: + import numpy as np + + return np.column_stack([col.to_numpy() for col in self._native_dataframe.columns]) + + def to_dict(self, *, as_series: bool) -> Any: + df = self._native_dataframe + + names_and_values = zip(df.column_names, df.columns) + if as_series: + from narwhals._arrow.series import ArrowSeries + + return { + name: ArrowSeries(col, name=name, backend_version=self._backend_version) + for name, col in names_and_values + } + else: + return {name: col.to_pylist() for name, col in names_and_values} + + def with_row_index(self, name: str) -> Self: + pa = get_pyarrow() + df = self._native_dataframe + + row_indices = pa.array(range(df.num_rows)) + return self._from_native_dataframe(df.append_column(name, row_indices)) + def lazy(self) -> Self: return self diff --git a/tests/frame/pipe_test.py b/tests/frame/pipe_test.py index 6f49966b9..9dd66f10a 100644 --- a/tests/frame/pipe_test.py +++ b/tests/frame/pipe_test.py @@ -9,8 +9,8 @@ } -def test_pipe(constructor: Any) -> None: - df = nw.from_native(constructor(data)) +def test_pipe(constructor_with_pyarrow: Any) -> None: + df = nw.from_native(constructor_with_pyarrow(data)) columns = df.lazy().collect().columns result = df.pipe(lambda _df: _df.select([x for x in columns if len(x) == 2])) expected = {"ab": ["foo", "bars"]} diff --git a/tests/frame/to_dict_test.py b/tests/frame/to_dict_test.py index b0950c5c9..8fa31c336 100644 --- a/tests/frame/to_dict_test.py +++ b/tests/frame/to_dict_test.py @@ -3,16 +3,16 @@ import narwhals.stable.v1 as nw -def test_to_dict(constructor: Any) -> None: +def test_to_dict(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], 
"b": [4, 4, 6], "c": [7.0, 8, 9]} - df = nw.from_native(constructor(data), eager_only=True) + df = nw.from_native(constructor_with_pyarrow(data), eager_only=True) result = df.to_dict(as_series=False) assert result == data -def test_to_dict_as_series(constructor: Any) -> None: +def test_to_dict_as_series(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]} - df = nw.from_native(constructor(data), eager_only=True) + df = nw.from_native(constructor_with_pyarrow(data), eager_only=True) result = df.to_dict(as_series=True) assert isinstance(result["a"], nw.Series) assert isinstance(result["b"], nw.Series) diff --git a/tests/frame/to_numpy_test.py b/tests/frame/to_numpy_test.py index 6f516334d..5cfa69bf7 100644 --- a/tests/frame/to_numpy_test.py +++ b/tests/frame/to_numpy_test.py @@ -7,9 +7,9 @@ import narwhals.stable.v1 as nw -def test_convert_numpy(constructor: Any) -> None: +def test_convert_numpy(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} - df_raw = constructor(data) + df_raw = constructor_with_pyarrow(data) result = nw.from_native(df_raw, eager_only=True).to_numpy() expected = np.array([[1, 3, 2], [4, 4, 6], [7.1, 8, 9]]).T diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py index 1b0ad5792..ef54557bd 100644 --- a/tests/frame/with_row_index_test.py +++ b/tests/frame/with_row_index_test.py @@ -9,9 +9,9 @@ } -def test_with_row_index(constructor: Any) -> None: - result = nw.from_native(constructor(data)).with_row_index() +def test_with_row_index(constructor_with_pyarrow: Any) -> None: + result = nw.from_native(constructor_with_pyarrow(data)).with_row_index() expected = {"a": ["foo", "bars"], "ab": ["foo", "bars"], "index": [0, 1]} compare_dicts(result, expected) - result = nw.from_native(constructor(data)).lazy().with_row_index() + result = nw.from_native(constructor_with_pyarrow(data)).lazy().with_row_index() compare_dicts(result, 
expected) diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py index 924115fbb..abed808c6 100644 --- a/utils/check_backend_completeness.py +++ b/utils/check_backend_completeness.py @@ -25,10 +25,7 @@ "DataFrame.pipe", "DataFrame.rename", "DataFrame.tail", - "DataFrame.to_dict", - "DataFrame.to_numpy", "DataFrame.unique", - "DataFrame.with_row_index", "DataFrame.write_parquet", "Series.drop_nulls", "Series.fill_null", From e5dc47431be30f49d7e1c0771193a2fdc714ed31 Mon Sep 17 00:00:00 2001 From: Bruno Conde Kind Date: Thu, 11 Jul 2024 05:40:18 -0300 Subject: [PATCH 3/5] Moved/renamed test_expr_unary to expr/unary_test::test_unary (#486) * Moved/renamed test_expr_unary to expr/unary_test::test_unary * Update tests/expr/unary_test.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- tests/expr/unary_test.py | 22 ++++++++++++++++++++++ tests/frame/test_common.py | 18 ------------------ 2 files changed, 22 insertions(+), 18 deletions(-) create mode 100644 tests/expr/unary_test.py diff --git a/tests/expr/unary_test.py b/tests/expr/unary_test.py new file mode 100644 index 000000000..c13084436 --- /dev/null +++ b/tests/expr/unary_test.py @@ -0,0 +1,22 @@ +from typing import Any + +import narwhals as nw +from tests.utils import compare_dicts + + +def test_unary(constructor_with_lazy: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + result = ( + nw.from_native(constructor_with_lazy(data)) + .with_columns( + a_mean=nw.col("a").mean(), + a_sum=nw.col("a").sum(), + b_nunique=nw.col("b").n_unique(), + z_min=nw.col("z").min(), + z_max=nw.col("z").max(), + ) + .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) + ) + result_native = nw.to_native(result) + expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]} 
+ compare_dicts(result_native, expected) diff --git a/tests/frame/test_common.py b/tests/frame/test_common.py index 69be5e6d1..99a16de60 100644 --- a/tests/frame/test_common.py +++ b/tests/frame/test_common.py @@ -180,24 +180,6 @@ def test_expr_binary(df_raw: Any) -> None: compare_dicts(result_native, expected) -@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_lazy]) -def test_expr_unary(df_raw: Any) -> None: - result = ( - nw.from_native(df_raw) - .with_columns( - a_mean=nw.col("a").mean(), - a_sum=nw.col("a").sum(), - b_nunique=nw.col("b").n_unique(), - z_min=nw.col("z").min(), - z_max=nw.col("z").max(), - ) - .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) - ) - result_native = nw.to_native(result) - expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]} - compare_dicts(result_native, expected) - - @pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd, df_lazy]) def test_expr_transform(df_raw: Any) -> None: result = nw.from_native(df_raw).with_columns( From 013be1a5bc4f7442c094b41033726611a7e51fd9 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:42:16 +0200 Subject: [PATCH 4/5] bug: fix `__getitem__` for tuple of row and col keys (#462) --- narwhals/_arrow/dataframe.py | 20 +++++++++++++++++++- narwhals/_pandas_like/dataframe.py | 27 ++++++++++++++++++++++++++- narwhals/dataframe.py | 15 +++++++++++---- narwhals/stable/v1.py | 5 ++++- tests/frame/slice_test.py | 22 ++++++++++++++++++++++ 5 files changed, 82 insertions(+), 7 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index bf912b0d1..525402b8e 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -72,13 +72,21 @@ def get_column(self, name: str) -> ArrowSeries: backend_version=self._backend_version, ) + @overload + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> ArrowSeries: ... 
# type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> ArrowDataFrame: ... + @overload def __getitem__(self, item: str) -> ArrowSeries: ... @overload def __getitem__(self, item: slice) -> ArrowDataFrame: ... - def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, str): from narwhals._arrow.series import ArrowSeries @@ -87,6 +95,16 @@ def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame: name=item, backend_version=self._backend_version, ) + elif isinstance(item, tuple) and len(item) == 2: + from narwhals._arrow.series import ArrowSeries + + # PyArrow columns are always strings + col_name = item[1] if isinstance(item[1], str) else self.columns[item[1]] + return ArrowSeries( + self._native_dataframe[col_name].take(item[0]), + name=col_name, + backend_version=self._backend_version, + ) elif isinstance(item, slice): if item.step is not None and item.step != 1: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index a889885c1..6be62d3e0 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -99,13 +99,21 @@ def get_column(self, name: str) -> PandasLikeSeries: backend_version=self._backend_version, ) + @overload + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> PandasLikeSeries: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> PandasLikeDataFrame: ... + @overload def __getitem__(self, item: str) -> PandasLikeSeries: ... @overload def __getitem__(self, item: slice) -> PandasLikeDataFrame: ... 
- def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFrame: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries @@ -115,6 +123,23 @@ def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFra backend_version=self._backend_version, ) + elif isinstance(item, tuple) and len(item) == 2: + from narwhals._pandas_like.series import PandasLikeSeries + + if isinstance(item[1], str): + native_series = self._native_dataframe.loc[item] + elif isinstance(item[1], int): + native_series = self._native_dataframe.iloc[item] + else: # pragma: no cover + msg = f"Expected str or int, got: {type(item[1])}" + raise TypeError(msg) + + return PandasLikeSeries( + native_series, + implementation=self._implementation, + backend_version=self._backend_version, + ) + elif isinstance(item, (slice, Sequence)) or ( (np := get_numpy()) is not None and isinstance(item, np.ndarray) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index e255844a9..e86380368 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -456,7 +456,10 @@ def get_column(self, name: str) -> Series: ) @overload - def __getitem__(self, item: Sequence[int]) -> Series: ... + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> Self: ... @overload def __getitem__(self, item: str) -> Series: ... @@ -464,7 +467,9 @@ def __getitem__(self, item: str) -> Series: ... @overload def __getitem__(self, item: slice) -> Self: ... - def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> Series | Self: """ Extract column or slice of DataFrame. 
@@ -473,7 +478,9 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: - str: extract column - slice or Sequence of integers: slice rows from dataframe. - + - tuple of Sequence of integers and str or int: slice rows and extract column at the same time. + If the second element of the tuple is an integer, it is interpreted as the column index. Otherwise, + it is interpreted as the column name. Notes: In contrast with Polars, pandas allows non-string column names. If you don't know whether the column name you're trying to extract @@ -508,7 +515,7 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: 2 ] """ - if isinstance(item, str): + if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2): from narwhals.series import Series return Series( diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 86602a1de..e27a49485 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -67,7 +67,10 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): """ @overload - def __getitem__(self, item: Sequence[int]) -> Series: ... + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> Self: ... @overload def __getitem__(self, item: str) -> Series: ... 
diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py index 45390c561..4a911142e 100644 --- a/tests/frame/slice_test.py +++ b/tests/frame/slice_test.py @@ -5,8 +5,10 @@ import polars as pl import pyarrow as pa import pytest +from pandas.testing import assert_series_equal import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import compare_dicts data = { @@ -69,3 +71,23 @@ def test_gather_pandas_index() -> None: result = nw.from_native(df, eager_only=True)[[1, 2]] expected = {"a": [1, 2], "b": [4, 2]} compare_dicts(result, expected) + + +def test_gather_rows_cols(constructor_with_pyarrow: Any) -> None: + native_df = constructor_with_pyarrow(data) + df = nw.from_native(native_df, eager_only=True) + is_pandas_wo_pyarrow = parse_version(pd.__version__) < parse_version("1.0.0") + if isinstance(native_df, pa.Table) or is_pandas_wo_pyarrow: + # PyArrowSeries do not have `to_pandas` + result = df[[0, 3, 1], 1].to_numpy() + expected = np.array([11, 14, 12]) + assert np.array_equal(result, expected) + result = df[np.array([0, 3, 1]), "b"].to_numpy() + assert np.array_equal(result, expected) + else: + result = df[[0, 3, 1], 1].to_pandas() + expected_index = range(3) if isinstance(native_df, pl.DataFrame) else [0, 3, 1] + expected = pd.Series([11, 14, 12], name="b", index=expected_index) + assert_series_equal(result, expected, check_dtype=False) + result = df[np.array([0, 3, 1]), "b"].to_pandas() + assert_series_equal(result, expected, check_dtype=False) From 66a1909e4a7ed3f56df09af14be40e5a0c09ad60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Thu, 11 Jul 2024 10:51:19 +0200 Subject: [PATCH 5/5] Fix link (#483) --- docs/overhead.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/overhead.md b/docs/overhead.md index 7edfb1f95..1477f6fa6 100644 --- a/docs/overhead.md +++ b/docs/overhead.md @@ -10,7 +10,7 @@ vs running pandas via Narwhals: ![Comparison of pandas vs "pandas 
via Narwhals" timings on TPC-H queries showing neglibile overhead](https://github.com/narwhals-dev/narwhals/assets/33491632/71029c26-4121-43bb-90fb-5ac1c16ab8a2) -[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2-w-native)'s the code to +[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2)'s the code to reproduce the plot above, check the input sources for notebooks which run each individual query, along with the data sources.