From c997006b5842c64076f8cc179ecedc355996ffa5 Mon Sep 17 00:00:00 2001 From: Uchenna Ugoh <61969079+ugohuche@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:47:48 +0100 Subject: [PATCH] TPCH Queries 9 and 10 (#407) * TPCH Queries 9 and 10 * update id * strip notebook output --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- tpch/notebooks/q10/execute.ipynb | 382 ++++++++++++++++++++++++ tpch/notebooks/q10/kernel-metadata.json | 15 + tpch/notebooks/q9/execute.ipynb | 374 +++++++++++++++++++++++ tpch/notebooks/q9/kernel-metadata.json | 15 + 4 files changed, 786 insertions(+) create mode 100644 tpch/notebooks/q10/execute.ipynb create mode 100644 tpch/notebooks/q10/kernel-metadata.json create mode 100644 tpch/notebooks/q9/execute.ipynb create mode 100644 tpch/notebooks/q9/kernel-metadata.json diff --git a/tpch/notebooks/q10/execute.ipynb b/tpch/notebooks/q10/execute.ipynb new file mode 100644 index 000000000..307f69e7a --- /dev/null +++ b/tpch/notebooks/q10/execute.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import narwhals as nw\n", + "\n", + "def q10(\n", + " customer_ds_raw: Any,\n", + " nation_ds_raw: Any,\n", + " lineitem_ds_raw: Any,\n", + " orders_ds_raw: Any,\n", + ") -> Any:\n", + "\n", + " nation_ds = nw.from_native(nation_ds_raw)\n", + " line_item_ds = nw.from_native(lineitem_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " customer_ds = nw.from_native(customer_ds_raw)\n", + " \n", + " var1 = datetime(1993, 10, 1)\n", + " var2 = datetime(1994, 1, 1)\n", + "\n", + " result = (\n", + " customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " .join(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " .join(nation_ds, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + " .filter(nw.col(\"o_orderdate\").is_between(var1, var2, closed=\"left\"))\n", + " .filter(nw.col(\"l_returnflag\") == \"R\")\n", + " .with_columns(\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", + " .alias(\"revenue\")\n", + " )\n", + " .group_by(\n", + " \"c_custkey\",\n", + " \"c_name\",\n", + " \"c_acctbal\",\n", + " \"c_phone\",\n", + " \"n_name\",\n", + " \"c_address\",\n", + " \"c_comment\",\n", + " )\n", + " .agg(nw.sum(\"revenue\"))\n", + " .select(\n", + " \"c_custkey\",\n", + " \"c_name\",\n", + " \"revenue\",\n", + " \"c_acctbal\",\n", + " \"n_name\",\n", + " \"c_address\",\n", + " \"c_phone\",\n", + " \"c_comment\",\n", + " )\n", + " .sort(by=\"revenue\", descending=True)\n", + " .head(20)\n", + " )\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "nation = dir_ + 'nation.parquet'\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "orders = dir_ + 'orders.parquet'\n", + "customer = dir_ + 'customer.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q10/kernel-metadata.json b/tpch/notebooks/q10/kernel-metadata.json new file mode 100644 index 000000000..5fa48ff5e --- /dev/null +++ b/tpch/notebooks/q10/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q10-s2", + "title": "Narwhals TPCH Q10 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file diff --git a/tpch/notebooks/q9/execute.ipynb b/tpch/notebooks/q9/execute.ipynb new file mode 100644 index 000000000..d7412426c --- /dev/null +++ b/tpch/notebooks/q9/execute.ipynb @@ -0,0 +1,374 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q9(\n", + " part_ds_raw: Any,\n", + " partsupp_ds_raw: Any,\n", + " nation_ds_raw: Any,\n", + " lineitem_ds_raw: Any,\n", + " orders_ds_raw: Any,\n", + " supplier_ds_raw: Any,\n", + ") -> Any:\n", + "\n", + " part_ds = nw.from_native(part_ds_raw)\n", + " nation_ds = nw.from_native(nation_ds_raw)\n", + " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " supplier_ds = nw.from_native(supplier_ds_raw)\n", + "\n", + " result = (\n", + " part_ds.join(partsupp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", + " .join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", + " .join(\n", + " lineitem_ds,\n", + " left_on=[\"p_partkey\", \"ps_suppkey\"],\n", + " right_on=[\"l_partkey\", \"l_suppkey\"],\n", + " )\n", + " .join(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", + " .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " .filter(nw.col(\"p_name\").str.contains(\"green\"))\n", + " .select(\n", + " nw.col(\"n_name\").alias(\"nation\"),\n", + " nw.col(\"o_orderdate\").dt.year().alias(\"o_year\"),\n", + " (\n", + " nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))\n", + " - nw.col(\"ps_supplycost\") * nw.col(\"l_quantity\")\n", + " ).alias(\"amount\")\n", + " )\n", + " .group_by(\"nation\", \"o_year\")\n", + " .agg(nw.sum(\"amount\").alias(\"sum_profit\"))\n", + " .sort(by=[\"nation\", \"o_year\"], descending=[False, True])\n", + " )\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "nation = dir_ + 'nation.parquet'\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "orders = dir_ + 'orders.parquet'\n", + "supplier = dir_ + 'supplier.parquet'\n", + "part = dir_ + 'part.parquet'\n", + "partsupp = dir_ + 'partsupp.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q9/kernel-metadata.json b/tpch/notebooks/q9/kernel-metadata.json new file mode 100644 index 000000000..bdbebcfeb --- /dev/null +++ b/tpch/notebooks/q9/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q9-s2", + "title": "Narwhals TPCH Q9 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file