From 7b78c1da931dc5b1ff9cf5676c6ca2c8c8b48522 Mon Sep 17 00:00:00 2001 From: Uchenna Ugoh <61969079+ugohuche@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:14:55 +0100 Subject: [PATCH] Added TPC-H Q13 and Q22 --- tpch/notebooks/q13/execute.ipynb | 396 +++++++++++++++++++++++ tpch/notebooks/q13/kernel-metadata.json | 15 + tpch/notebooks/q22/execute.ipynb | 409 ++++++++++++++++++++++++ tpch/notebooks/q22/kernel-metadata.json | 15 + 4 files changed, 835 insertions(+) create mode 100644 tpch/notebooks/q13/execute.ipynb create mode 100644 tpch/notebooks/q13/kernel-metadata.json create mode 100644 tpch/notebooks/q22/execute.ipynb create mode 100644 tpch/notebooks/q22/kernel-metadata.json diff --git a/tpch/notebooks/q13/execute.ipynb b/tpch/notebooks/q13/execute.ipynb new file mode 100644 index 000000000..1b2526132 --- /dev/null +++ b/tpch/notebooks/q13/execute.ipynb @@ -0,0 +1,396 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": { + "papermill": { + "duration": 46.481932, + "end_time": "2024-07-25T23:10:36.288698", + "exception": false, + "start_time": "2024-07-25T23:09:49.806766", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": { + "papermill": { + "duration": 0.522877, + "end_time": "2024-07-25T23:10:36.819167", + "exception": false, + "start_time": "2024-07-25T23:10:36.296290", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "papermill": { + "duration": 0.02756, + "end_time": "2024-07-25T23:10:36.854110", + "exception": false, + "start_time": "2024-07-25T23:10:36.826550", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q13(\n", + " customer_ds_raw: Any,\n", + " orders_ds_raw: Any\n", + ") -> Any:\n", + "\n", + " customer_ds = nw.from_native(customer_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " \n", + " var1 = \"special\"\n", + " var2 = \"requests\"\n", + "\n", + " orders_ds = orders_ds.filter(~nw.col(\"o_comment\").str.contains(f\"{var1}.*{var2}\"))\n", + " \n", + " result = (\n", + " customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\", how=\"left\")\n", + " .group_by(\"c_custkey\")\n", + " .agg(nw.col(\"o_orderkey\").len().alias(\"c_count\"))\n", + " .group_by(\"c_count\")\n", + " .agg(nw.col(\"c_count\").len().alias(\"len\"))\n", + " .select(nw.col(\"c_count\"), nw.col(\"len\").alias(\"custdist\"))\n", + " .sort(by=[\"custdist\", \"c_count\"], descending=[True, True])\n", + " )\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": { + "papermill": { + "duration": 0.016452, + "end_time": "2024-07-25T23:10:36.878001", + "exception": false, + "start_time": "2024-07-25T23:10:36.861549", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "customer = dir_ + 'customer.parquet'\n", + "orders = dir_ + 'orders.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "papermill": { + "duration": 0.016664, + "end_time": "2024-07-25T23:10:36.902043", + "exception": false, + "start_time": "2024-07-25T23:10:36.885379", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "papermill": { + "duration": 0.015614, + "end_time": "2024-07-25T23:10:36.924894", + "exception": false, + "start_time": "2024-07-25T23:10:36.909280", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": { + "papermill": { + "duration": 0.006996, + "end_time": "2024-07-25T23:10:36.939302", + "exception": false, + "start_time": "2024-07-25T23:10:36.932306", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "papermill": { + "duration": 44.926739, + "end_time": "2024-07-25T23:11:21.873211", + "exception": false, + "start_time": "2024-07-25T23:10:36.946472", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q13(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": { + "papermill": { + "duration": 0.006853, + "end_time": "2024-07-25T23:11:21.887504", + "exception": false, + "start_time": "2024-07-25T23:11:21.880651", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "papermill": { + "duration": 44.733391, + "end_time": "2024-07-25T23:12:06.628137", + "exception": false, + "start_time": "2024-07-25T23:11:21.894746", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q13(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "papermill": { + "duration": 0.006896, + "end_time": "2024-07-25T23:12:06.642200", + "exception": false, + "start_time": "2024-07-25T23:12:06.635304", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "papermill": { + "duration": 13.901571, + "end_time": "2024-07-25T23:12:20.550910", + "exception": false, + "start_time": "2024-07-25T23:12:06.649339", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q13(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "papermill": { + "duration": 0.0075, + "end_time": "2024-07-25T23:12:20.566105", + "exception": false, + "start_time": "2024-07-25T23:12:20.558605", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "papermill": { + "duration": 10.393542, + "end_time": "2024-07-25T23:12:30.967063", + "exception": false, + "start_time": "2024-07-25T23:12:20.573521", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q13(fn(customer), fn(orders)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": { + "papermill": { + "duration": 0.007808, + "end_time": "2024-07-25T23:12:30.982613", + "exception": false, + "start_time": "2024-07-25T23:12:30.974805", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "papermill": { + "duration": 0.017013, + "end_time": "2024-07-25T23:12:31.007464", + "exception": false, + "start_time": "2024-07-25T23:12:30.990451", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 164.526043, + "end_time": "2024-07-25T23:12:31.536428", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-07-25T23:09:47.010385", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tpch/notebooks/q13/kernel-metadata.json b/tpch/notebooks/q13/kernel-metadata.json new file mode 100644 index 000000000..6511ee943 --- /dev/null +++ b/tpch/notebooks/q13/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q22-s2", + "title": "Narwhals TPCH Q22 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file diff --git a/tpch/notebooks/q22/execute.ipynb b/tpch/notebooks/q22/execute.ipynb new file mode 100644 index 000000000..337c1bf7d --- /dev/null +++ b/tpch/notebooks/q22/execute.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": { + "papermill": { + "duration": 49.860489, + "end_time": "2024-07-26T12:50:30.931456", + "exception": false, + "start_time": "2024-07-26T12:49:41.070967", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": { + "papermill": { + "duration": 0.563589, + "end_time": "2024-07-26T12:50:31.502451", + "exception": false, + "start_time": "2024-07-26T12:50:30.938862", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "papermill": { + "duration": 0.031352, + "end_time": "2024-07-26T12:50:31.541220", + "exception": false, + "start_time": "2024-07-26T12:50:31.509868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q22(\n", + " customer_ds_raw: Any,\n", + " orders_ds_raw: Any,\n", + ") -> Any:\n", + "\n", + " customer_ds = nw.from_native(customer_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " \n", + " \n", + " query1 = (\n", + " customer_ds.with_columns(nw.col(\"c_phone\").str.slice(0, 2).alias(\"cntrycode\"))\n", + " .filter(nw.col(\"cntrycode\").str.contains(\"13|31|23|29|30|18|17\"))\n", + " .select(\"c_acctbal\", \"c_custkey\", \"cntrycode\")\n", + " )\n", + "\n", + " query2 = query1.filter(nw.col(\"c_acctbal\") > 0.0).select(\n", + " nw.col(\"c_acctbal\").mean().alias(\"avg_acctbal\")\n", + " )\n", + "\n", + " query3 = orders_ds.select(nw.col(\"o_custkey\").unique()).with_columns(\n", + " nw.col(\"o_custkey\").alias(\"c_custkey\")\n", + " )\n", + "\n", + " final_query = (\n", + " query1.join(query3, left_on=\"c_custkey\", right_on=\"c_custkey\", how=\"left\")\n", + " .filter(nw.col(\"o_custkey\").is_null())\n", + " .join(query2, how=\"cross\")\n", + " .filter(nw.col(\"c_acctbal\") > nw.col(\"avg_acctbal\"))\n", + " .group_by(\"cntrycode\")\n", + " .agg(\n", + " nw.col(\"c_acctbal\").count().alias(\"numcust\"),\n", + " nw.col(\"c_acctbal\").sum().round(2).alias(\"totacctbal\"),\n", + " )\n", + " .sort(\"cntrycode\")\n", + " )\n", + "\n", + " return nw.to_native(final_query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": { + "papermill": { + "duration": 0.016247, + "end_time": "2024-07-26T12:50:31.564892", + "exception": false, + "start_time": "2024-07-26T12:50:31.548645", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "customer = dir_ + 'customer.parquet'\n", + "orders = dir_ + 'orders.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "papermill": { + "duration": 0.017416, + "end_time": "2024-07-26T12:50:31.589507", + "exception": false, + "start_time": "2024-07-26T12:50:31.572091", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "papermill": { + "duration": 0.015402, + "end_time": "2024-07-26T12:50:31.612243", + "exception": false, + "start_time": "2024-07-26T12:50:31.596841", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": { + "papermill": { + "duration": 0.006963, + "end_time": "2024-07-26T12:50:31.626579", + "exception": false, + "start_time": "2024-07-26T12:50:31.619616", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "papermill": { + "duration": 10.559142, + "end_time": "2024-07-26T12:50:42.192870", + "exception": false, + "start_time": "2024-07-26T12:50:31.633728", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q22(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": { + "papermill": { + "duration": 0.008478, + "end_time": "2024-07-26T12:50:42.209200", + "exception": false, + "start_time": "2024-07-26T12:50:42.200722", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "papermill": { + "duration": 7.07373, + "end_time": "2024-07-26T12:50:49.292349", + "exception": false, + "start_time": "2024-07-26T12:50:42.218619", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q22(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "papermill": { + "duration": 0.00828, + "end_time": "2024-07-26T12:50:49.309227", + "exception": false, + "start_time": "2024-07-26T12:50:49.300947", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "papermill": { + "duration": 6.202911, + "end_time": "2024-07-26T12:50:55.520546", + "exception": false, + "start_time": "2024-07-26T12:50:49.317635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q22(fn(customer), fn(orders))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "papermill": { + "duration": 0.008436, + "end_time": "2024-07-26T12:50:55.537904", + "exception": false, + "start_time": "2024-07-26T12:50:55.529468", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "papermill": { + "duration": 12.591074, + "end_time": "2024-07-26T12:51:08.137474", + "exception": false, + "start_time": "2024-07-26T12:50:55.546400", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q22(fn(customer), fn(orders)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": { + "papermill": { + "duration": 0.008232, + "end_time": "2024-07-26T12:51:08.154223", + "exception": false, + "start_time": "2024-07-26T12:51:08.145991", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "papermill": { + "duration": 0.017523, + "end_time": "2024-07-26T12:51:08.180173", + "exception": false, + "start_time": "2024-07-26T12:51:08.162650", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 90.628471, + "end_time": "2024-07-26T12:51:08.710737", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-07-26T12:49:38.082266", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tpch/notebooks/q22/kernel-metadata.json b/tpch/notebooks/q22/kernel-metadata.json new file mode 100644 index 000000000..6511ee943 --- /dev/null +++ b/tpch/notebooks/q22/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q22-s2", + "title": "Narwhals TPCH Q22 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file