diff --git a/tpch/notebooks/q11/execute.ipynb b/tpch/notebooks/q11/execute.ipynb index bb2051a8f..fec9ee27e 100644 --- a/tpch/notebooks/q11/execute.ipynb +++ b/tpch/notebooks/q11/execute.ipynb @@ -1 +1,381 @@ -{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:32:59.927716Z","iopub.execute_input":"2024-07-05T10:32:59.928639Z","iopub.status.idle":"2024-07-05T10:33:14.029566Z","shell.execute_reply.started":"2024-07-05T10:32:59.928593Z","shell.execute_reply":"2024-07-05T10:33:14.028082Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.0.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (0.9.28)\nRequirement already satisfied: numpy>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.032036Z","iopub.execute_input":"2024-07-05T10:33:14.032455Z","iopub.status.idle":"2024-07-05T10:33:14.039074Z","shell.execute_reply.started":"2024-07-05T10:33:14.032409Z","shell.execute_reply":"2024-07-05T10:33:14.037818Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\n\ndef q11(\n partsupp_ds_raw: Any,\n nation_ds_raw: Any,\n supplier_ds_raw: Any,\n) -> Any:\n\n nation_ds = nw.from_native(nation_ds_raw)\n partsupp_ds = nw.from_native(partsupp_ds_raw)\n supplier_ds = nw.from_native(supplier_ds_raw)\n\n \n var1 = \"GERMANY\"\n var2 = 0.0001\n\n q1 = (\n partsupp_ds.join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n .filter(nw.col(\"n_name\") == var1)\n )\n q2 = q1.select(\n (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).sum().round(2).alias(\"tmp\")\n * var2\n )\n\n q_final = (\n q1.with_columns(\n (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\"))\n .alias(\"value\")\n )\n .group_by(\"ps_partkey\")\n .agg(\n nw.sum(\"value\")\n )\n .join(q2, how=\"cross\")\n .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n .select(\"ps_partkey\", \"value\")\n .sort(\"value\", descending=True)\n )\n\n return nw.to_native(q_final)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.040616Z","iopub.execute_input":"2024-07-05T10:33:14.040982Z","iopub.status.idle":"2024-07-05T10:33:14.056075Z","shell.execute_reply.started":"2024-07-05T10:33:14.040950Z","shell.execute_reply":"2024-07-05T10:33:14.054409Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nnation = dir_ + 'nation.parquet'\nsupplier = dir_ + 'supplier.parquet'\npartsupp = dir_ + 'partsupp.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.059211Z","iopub.execute_input":"2024-07-05T10:33:14.059609Z","iopub.status.idle":"2024-07-05T10:33:14.068385Z","shell.execute_reply.started":"2024-07-05T10:33:14.059578Z","shell.execute_reply":"2024-07-05T10:33:14.067130Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.069813Z","iopub.execute_input":"2024-07-05T10:33:14.070174Z","iopub.status.idle":"2024-07-05T10:33:14.085661Z","shell.execute_reply.started":"2024-07-05T10:33:14.070142Z","shell.execute_reply":"2024-07-05T10:33:14.084444Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:14.086962Z","iopub.execute_input":"2024-07-05T10:33:14.087330Z","iopub.status.idle":"2024-07-05T10:33:14.096595Z","shell.execute_reply.started":"2024-07-05T10:33:14.087298Z","shell.execute_reply":"2024-07-05T10:33:14.095430Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.098215Z","iopub.execute_input":"2024-07-05T10:33:14.098662Z","iopub.status.idle":"2024-07-05T10:33:27.624855Z","shell.execute_reply.started":"2024-07-05T10:33:14.098622Z","shell.execute_reply":"2024-07-05T10:33:27.623747Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"1.66 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:27.626162Z","iopub.execute_input":"2024-07-05T10:33:27.626489Z","iopub.status.idle":"2024-07-05T10:33:40.866730Z","shell.execute_reply.started":"2024-07-05T10:33:27.626463Z","shell.execute_reply":"2024-07-05T10:33:40.865296Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"1.67 s ± 86.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:40.868234Z","iopub.execute_input":"2024-07-05T10:33:40.868584Z","iopub.status.idle":"2024-07-05T10:33:48.072165Z","shell.execute_reply.started":"2024-07-05T10:33:40.868552Z","shell.execute_reply":"2024-07-05T10:33:48.071018Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stdout","text":"890 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier)).collect()\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:48.075257Z","iopub.execute_input":"2024-07-05T10:33:48.075613Z","iopub.status.idle":"2024-07-05T10:33:56.980125Z","shell.execute_reply.started":"2024-07-05T10:33:48.075582Z","shell.execute_reply":"2024-07-05T10:33:56.979088Z"},"trusted":true},"execution_count":22,"outputs":[{"name":"stdout","text":"110 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:56.981599Z","iopub.execute_input":"2024-07-05T10:33:56.982033Z","iopub.status.idle":"2024-07-05T10:33:56.988558Z","shell.execute_reply.started":"2024-07-05T10:33:56.981992Z","shell.execute_reply":"2024-07-05T10:33:56.987342Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"from pprint import pprint\n\npprint(results)","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:56.990445Z","iopub.execute_input":"2024-07-05T10:33:56.990881Z","iopub.status.idle":"2024-07-05T10:33:57.000730Z","shell.execute_reply.started":"2024-07-05T10:33:56.990841Z","shell.execute_reply":"2024-07-05T10:33:56.999458Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stdout","text":"{'pandas': [1.708278326000027,\n 1.8040552429999934,\n 1.8417796100000032,\n 1.600905629999943,\n 1.6415783779998492,\n 1.5647700059998897,\n 1.493057884000109],\n 'pandas[pyarrow]': [1.6380957989999843,\n 1.5802785819998917,\n 1.5376337459999831,\n 1.7884727590001148,\n 1.7397616020000441,\n 1.7496962650000114,\n 1.6605698180001127],\n 'polars[eager]': [0.9160442119998606,\n 0.8955544509999527,\n 0.8863846530000501,\n 0.8829364579999037,\n 0.8918134509999618,\n 0.8924379529998987,\n 0.8672452630000862],\n 'polars[lazy]': [1.1258213609999075,\n 1.4064464999999018,\n 1.046419743999877,\n 1.0376091739999538,\n 1.043019643999969,\n 1.0296597439999005,\n 1.0383537459999843]}\n","output_type":"stream"}]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q11(\n", + " partsupp_ds_raw: Any,\n", + " nation_ds_raw: Any,\n", + " supplier_ds_raw: Any,\n", + ") -> Any:\n", + "\n", + " nation_ds = nw.from_native(nation_ds_raw)\n", + " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", + " supplier_ds = nw.from_native(supplier_ds_raw)\n", + "\n", + " \n", + " var1 = \"GERMANY\"\n", + " var2 = 0.0001\n", + "\n", + " q1 = (\n", + " partsupp_ds.join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", + " .join(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " .filter(nw.col(\"n_name\") == var1)\n", + " )\n", + " q2 = q1.select(\n", + " (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).sum().round(2).alias(\"tmp\")\n", + " * var2\n", + " )\n", + "\n", + " q_final = (\n", + " q1.with_columns(\n", + " (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\"))\n", + " .alias(\"value\")\n", + " )\n", + " .group_by(\"ps_partkey\")\n", + " .agg(\n", + " nw.sum(\"value\")\n", + " )\n", + " .join(q2, how=\"cross\")\n", + " .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n", + " .select(\"ps_partkey\", \"value\")\n", + " .sort(\"value\", descending=True)\n", + " )\n", + "\n", + " return nw.to_native(q_final)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "nation = dir_ + 'nation.parquet'\n", + "supplier = dir_ + 'supplier.parquet'\n", + "partsupp = dir_ + 'partsupp.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(results)" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q11/kernel-metadata.json b/tpch/notebooks/q11/kernel-metadata.json index 335130dbb..2aa47c6c9 100644 --- a/tpch/notebooks/q11/kernel-metadata.json +++ b/tpch/notebooks/q11/kernel-metadata.json @@ -1,5 +1,5 @@ { - "id": "uchennaugoh/narwhals-tpch-q11-s2", + "id": "marcogorelli/narwhals-tpch-q11-s2", "title": "Narwhals TPCH Q11 S2", "code_file": "execute.ipynb", "language": "python", diff --git a/tpch/notebooks/q15/execute.ipynb b/tpch/notebooks/q15/execute.ipynb index edf3dce83..b487a9bf3 100644 --- a/tpch/notebooks/q15/execute.ipynb +++ b/tpch/notebooks/q15/execute.ipynb @@ -1 +1,361 @@ -{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:10.979470Z","iopub.execute_input":"2024-07-04T22:19:10.979901Z","iopub.status.idle":"2024-07-04T22:19:28.653765Z","shell.execute_reply.started":"2024-07-04T22:19:10.979868Z","shell.execute_reply":"2024-07-04T22:19:28.652407Z"},"trusted":true},"execution_count":26,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.0.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (0.9.28)\nRequirement already satisfied: numpy>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:28.656542Z","iopub.execute_input":"2024-07-04T22:19:28.656941Z","iopub.status.idle":"2024-07-04T22:19:28.663347Z","shell.execute_reply.started":"2024-07-04T22:19:28.656902Z","shell.execute_reply":"2024-07-04T22:19:28.662193Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\nfrom datetime import datetime\n\ndef q15(\n lineitem_ds_raw: Any,\n supplier_ds_raw: Any,\n) -> Any:\n\n lineitem_ds = nw.from_native(lineitem_ds_raw)\n supplier_ds = nw.from_native(supplier_ds_raw)\n \n var1 = datetime(1996, 1, 1)\n var2 = datetime(1996, 4, 1)\n\n revenue = (\n lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n .with_columns(\n (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n .alias(\"total_revenue\")\n )\n .group_by(\"l_suppkey\")\n .agg(nw.sum(\"total_revenue\"))\n .select(nw.col(\"l_suppkey\").alias(\"supplier_no\"), nw.col(\"total_revenue\"))\n )\n \n result = (\n supplier_ds.join(revenue, left_on=\"s_suppkey\", right_on=\"supplier_no\")\n .filter(nw.col(\"total_revenue\") == nw.col(\"total_revenue\").max())\n .with_columns(nw.col(\"total_revenue\").round(2))\n .select(\"s_suppkey\", \"s_name\", \"s_address\", \"s_phone\", \"total_revenue\")\n .sort(\"s_suppkey\")\n )\n\n return nw.to_native(result)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:28.664478Z","iopub.execute_input":"2024-07-04T22:19:28.664792Z","iopub.status.idle":"2024-07-04T22:19:28.678504Z","shell.execute_reply.started":"2024-07-04T22:19:28.664758Z","shell.execute_reply":"2024-07-04T22:19:28.677368Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nlineitem = dir_ + 'lineitem.parquet'\nsupplier = dir_ + 'supplier.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:28.680103Z","iopub.execute_input":"2024-07-04T22:19:28.680493Z","iopub.status.idle":"2024-07-04T22:19:28.689584Z","shell.execute_reply.started":"2024-07-04T22:19:28.680455Z","shell.execute_reply":"2024-07-04T22:19:28.688449Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:28.692439Z","iopub.execute_input":"2024-07-04T22:19:28.692808Z","iopub.status.idle":"2024-07-04T22:19:28.703549Z","shell.execute_reply.started":"2024-07-04T22:19:28.692777Z","shell.execute_reply":"2024-07-04T22:19:28.702283Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-04T22:19:28.704842Z","iopub.execute_input":"2024-07-04T22:19:28.705222Z","iopub.status.idle":"2024-07-04T22:19:28.717212Z","shell.execute_reply.started":"2024-07-04T22:19:28.705181Z","shell.execute_reply":"2024-07-04T22:19:28.715763Z"},"trusted":true},"execution_count":31,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q15(fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:19:28.718919Z","iopub.execute_input":"2024-07-04T22:19:28.719300Z","iopub.status.idle":"2024-07-04T22:20:10.260509Z","shell.execute_reply.started":"2024-07-04T22:19:28.719268Z","shell.execute_reply":"2024-07-04T22:20:10.259016Z"},"trusted":true},"execution_count":32,"outputs":[{"name":"stdout","text":"5.14 s ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q15(fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:20:10.262406Z","iopub.execute_input":"2024-07-04T22:20:10.262791Z","iopub.status.idle":"2024-07-04T22:20:38.551136Z","shell.execute_reply.started":"2024-07-04T22:20:10.262748Z","shell.execute_reply":"2024-07-04T22:20:38.549609Z"},"trusted":true},"execution_count":33,"outputs":[{"name":"stdout","text":"3.52 s ± 74.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q15(fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:20:38.553251Z","iopub.execute_input":"2024-07-04T22:20:38.553671Z","iopub.status.idle":"2024-07-04T22:21:00.080140Z","shell.execute_reply.started":"2024-07-04T22:20:38.553639Z","shell.execute_reply":"2024-07-04T22:21:00.077382Z"},"trusted":true},"execution_count":34,"outputs":[{"name":"stdout","text":"2.7 s ± 673 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q15(fn(lineitem), fn(supplier)).collect()\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T22:21:00.083547Z","iopub.execute_input":"2024-07-04T22:21:00.084644Z","iopub.status.idle":"2024-07-04T22:21:03.792153Z","shell.execute_reply.started":"2024-07-04T22:21:00.084593Z","shell.execute_reply":"2024-07-04T22:21:03.790604Z"},"trusted":true},"execution_count":35,"outputs":[{"name":"stdout","text":"446 ms ± 7.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-04T22:21:03.793988Z","iopub.execute_input":"2024-07-04T22:21:03.794422Z","iopub.status.idle":"2024-07-04T22:21:03.801314Z","shell.execute_reply.started":"2024-07-04T22:21:03.794390Z","shell.execute_reply":"2024-07-04T22:21:03.799616Z"},"trusted":true},"execution_count":36,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "from datetime import datetime\n", + "\n", + "def q15(\n", + " lineitem_ds_raw: Any,\n", + " supplier_ds_raw: Any,\n", + ") -> Any:\n", + "\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " supplier_ds = nw.from_native(supplier_ds_raw)\n", + " \n", + " var1 = datetime(1996, 1, 1)\n", + " var2 = datetime(1996, 4, 1)\n", + "\n", + " revenue = (\n", + " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", + " .with_columns(\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", + " .alias(\"total_revenue\")\n", + " )\n", + " .group_by(\"l_suppkey\")\n", + " .agg(nw.sum(\"total_revenue\"))\n", + " .select(nw.col(\"l_suppkey\").alias(\"supplier_no\"), nw.col(\"total_revenue\"))\n", + " )\n", + " \n", + " result = (\n", + " supplier_ds.join(revenue, left_on=\"s_suppkey\", right_on=\"supplier_no\")\n", + " .filter(nw.col(\"total_revenue\") == nw.col(\"total_revenue\").max())\n", + " .with_columns(nw.col(\"total_revenue\").round(2))\n", + " .select(\"s_suppkey\", \"s_name\", \"s_address\", \"s_phone\", \"total_revenue\")\n", + " .sort(\"s_suppkey\")\n", + " )\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "supplier = dir_ + 'supplier.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q15(fn(lineitem), fn(supplier)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q15/kernel-metadata.json b/tpch/notebooks/q15/kernel-metadata.json index 2acefff8c..e552c9477 100644 --- a/tpch/notebooks/q15/kernel-metadata.json +++ b/tpch/notebooks/q15/kernel-metadata.json @@ -1,5 +1,5 @@ { - "id": "uchennaugoh/narwhals-tpch-q15-s2", + "id": "marcogorelli/narwhals-tpch-q15-s2", "title": "Narwhals TPCH Q15 S2", "code_file": "execute.ipynb", "language": "python", diff --git a/tpch/notebooks/q19/execute.ipynb b/tpch/notebooks/q19/execute.ipynb index d13e4d16f..a8cd3fea3 100644 --- a/tpch/notebooks/q19/execute.ipynb +++ b/tpch/notebooks/q19/execute.ipynb @@ -1 +1,378 @@ -{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:21.831296Z","iopub.execute_input":"2024-07-04T23:37:21.833130Z","iopub.status.idle":"2024-07-04T23:37:39.755616Z","shell.execute_reply.started":"2024-07-04T23:37:21.833082Z","shell.execute_reply":"2024-07-04T23:37:39.753974Z"},"trusted":true},"execution_count":12,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.0.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (0.9.28)\nRequirement already satisfied: numpy>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:39.758625Z","iopub.execute_input":"2024-07-04T23:37:39.759066Z","iopub.status.idle":"2024-07-04T23:37:39.766415Z","shell.execute_reply.started":"2024-07-04T23:37:39.759026Z","shell.execute_reply":"2024-07-04T23:37:39.765113Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\n\ndef q19(\n lineitem_ds_raw: Any,\n part_ds_raw: Any\n \n) -> Any:\n\n lineitem_ds = nw.from_native(lineitem_ds_raw)\n part_ds = nw.from_native(part_ds_raw)\n\n result = (\n part_ds.join(lineitem_ds, left_on=\"p_partkey\", right_on=\"l_partkey\")\n .filter(nw.col(\"l_shipmode\").is_in([\"AIR\", \"AIR REG\"]))\n .filter(nw.col(\"l_shipinstruct\") == \"DELIVER IN PERSON\")\n .filter(\n (\n (nw.col(\"p_brand\") == \"Brand#12\")\n & nw.col(\"p_container\").is_in(\n [\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"]\n )\n & (nw.col(\"l_quantity\").is_between(1, 11))\n & (nw.col(\"p_size\").is_between(1, 5))\n )\n | (\n (nw.col(\"p_brand\") == \"Brand#23\")\n & nw.col(\"p_container\").is_in(\n [\"MED BAG\", \"MED BOX\", \"MED PKG\", \"MED PACK\"]\n )\n & (nw.col(\"l_quantity\").is_between(10, 20))\n & (nw.col(\"p_size\").is_between(1, 10))\n )\n | (\n (nw.col(\"p_brand\") == \"Brand#34\")\n & nw.col(\"p_container\").is_in(\n [\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"]\n )\n & (nw.col(\"l_quantity\").is_between(20, 30))\n & (nw.col(\"p_size\").is_between(1, 15))\n )\n )\n .select(\n (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n .sum()\n .round(2)\n .alias(\"revenue\")\n )\n )\n\n\n return nw.to_native(result)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:39.768688Z","iopub.execute_input":"2024-07-04T23:37:39.769390Z","iopub.status.idle":"2024-07-04T23:37:39.789082Z","shell.execute_reply.started":"2024-07-04T23:37:39.769343Z","shell.execute_reply":"2024-07-04T23:37:39.787404Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nlineitem = dir_ + 'lineitem.parquet'\npart = dir_ + 'part.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:39.790960Z","iopub.execute_input":"2024-07-04T23:37:39.791461Z","iopub.status.idle":"2024-07-04T23:37:39.811195Z","shell.execute_reply.started":"2024-07-04T23:37:39.791424Z","shell.execute_reply":"2024-07-04T23:37:39.809234Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:39.815180Z","iopub.execute_input":"2024-07-04T23:37:39.815633Z","iopub.status.idle":"2024-07-04T23:37:39.827489Z","shell.execute_reply.started":"2024-07-04T23:37:39.815595Z","shell.execute_reply":"2024-07-04T23:37:39.826215Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-04T23:37:39.829338Z","iopub.execute_input":"2024-07-04T23:37:39.829794Z","iopub.status.idle":"2024-07-04T23:37:39.840194Z","shell.execute_reply.started":"2024-07-04T23:37:39.829749Z","shell.execute_reply":"2024-07-04T23:37:39.838941Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q19(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:37:39.841695Z","iopub.execute_input":"2024-07-04T23:37:39.842100Z","iopub.status.idle":"2024-07-04T23:40:48.730278Z","shell.execute_reply.started":"2024-07-04T23:37:39.842069Z","shell.execute_reply":"2024-07-04T23:40:48.729021Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"23.1 s ± 441 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q19(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:40:48.732014Z","iopub.execute_input":"2024-07-04T23:40:48.732460Z","iopub.status.idle":"2024-07-04T23:43:41.750616Z","shell.execute_reply.started":"2024-07-04T23:40:48.732418Z","shell.execute_reply":"2024-07-04T23:43:41.749119Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"21.6 s ± 57.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q19(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:43:41.752635Z","iopub.execute_input":"2024-07-04T23:43:41.753069Z","iopub.status.idle":"2024-07-04T23:44:25.134771Z","shell.execute_reply.started":"2024-07-04T23:43:41.753033Z","shell.execute_reply":"2024-07-04T23:44:25.133230Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"5.38 s ± 156 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q19(fn(lineitem), fn(part)).collect()\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-04T23:44:25.136164Z","iopub.execute_input":"2024-07-04T23:44:25.136610Z","iopub.status.idle":"2024-07-04T23:44:32.779716Z","shell.execute_reply.started":"2024-07-04T23:44:25.136567Z","shell.execute_reply":"2024-07-04T23:44:32.778469Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stdout","text":"956 ms ± 89.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-04T23:44:32.781294Z","iopub.execute_input":"2024-07-04T23:44:32.781622Z","iopub.status.idle":"2024-07-04T23:44:32.788263Z","shell.execute_reply.started":"2024-07-04T23:44:32.781594Z","shell.execute_reply":"2024-07-04T23:44:32.787046Z"},"trusted":true},"execution_count":22,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q19(\n", + " lineitem_ds_raw: Any,\n", + " part_ds_raw: Any\n", + " \n", + ") -> Any:\n", + "\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " part_ds = nw.from_native(part_ds_raw)\n", + "\n", + " result = (\n", + " part_ds.join(lineitem_ds, left_on=\"p_partkey\", right_on=\"l_partkey\")\n", + " .filter(nw.col(\"l_shipmode\").is_in([\"AIR\", \"AIR REG\"]))\n", + " .filter(nw.col(\"l_shipinstruct\") == \"DELIVER IN PERSON\")\n", + " .filter(\n", + " (\n", + " (nw.col(\"p_brand\") == \"Brand#12\")\n", + " & nw.col(\"p_container\").is_in(\n", + " [\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"]\n", + " )\n", + " & (nw.col(\"l_quantity\").is_between(1, 11))\n", + " & (nw.col(\"p_size\").is_between(1, 5))\n", + " )\n", + " | (\n", + " (nw.col(\"p_brand\") == \"Brand#23\")\n", + " & nw.col(\"p_container\").is_in(\n", + " [\"MED BAG\", \"MED BOX\", \"MED PKG\", \"MED PACK\"]\n", + " )\n", + " & (nw.col(\"l_quantity\").is_between(10, 20))\n", + " & (nw.col(\"p_size\").is_between(1, 10))\n", + " )\n", + " | (\n", + " (nw.col(\"p_brand\") == \"Brand#34\")\n", + " & nw.col(\"p_container\").is_in(\n", + " [\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"]\n", + " )\n", + " & (nw.col(\"l_quantity\").is_between(20, 30))\n", + " & (nw.col(\"p_size\").is_between(1, 15))\n", + " )\n", + " )\n", + " .select(\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", + " .sum()\n", + " .round(2)\n", + " .alias(\"revenue\")\n", + " )\n", + " )\n", + "\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "part = dir_ + 'part.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q19(fn(lineitem), fn(part)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q19/kernel-metadata.json b/tpch/notebooks/q19/kernel-metadata.json index 703e595fd..b250ac0a4 100644 --- a/tpch/notebooks/q19/kernel-metadata.json +++ b/tpch/notebooks/q19/kernel-metadata.json @@ -1,5 +1,5 @@ { - "id": "uchennaugoh/narwhals-tpch-q19-s2", + "id": "marcogorelli/narwhals-tpch-q19-s2", "title": "Narwhals TPCH Q19 S2", "code_file": "execute.ipynb", "language": "python", diff --git a/tpch/notebooks/q20/execute.ipynb b/tpch/notebooks/q20/execute.ipynb index 2a67eecfa..f0719f317 100644 --- a/tpch/notebooks/q20/execute.ipynb +++ b/tpch/notebooks/q20/execute.ipynb @@ -1 +1,379 @@ -{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:20.650914Z","iopub.execute_input":"2024-07-05T00:36:20.651341Z","iopub.status.idle":"2024-07-05T00:36:34.683759Z","shell.execute_reply.started":"2024-07-05T00:36:20.651308Z","shell.execute_reply":"2024-07-05T00:36:34.682116Z"},"trusted":true},"execution_count":8,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.0.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (0.9.28)\nRequirement already satisfied: numpy>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:34.686513Z","iopub.execute_input":"2024-07-05T00:36:34.686898Z","iopub.status.idle":"2024-07-05T00:36:34.693024Z","shell.execute_reply.started":"2024-07-05T00:36:34.686862Z","shell.execute_reply":"2024-07-05T00:36:34.691756Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\nfrom datetime import datetime\n\ndef q20(\n part_ds_raw: Any,\n partsupp_ds_raw: Any,\n nation_ds_raw: Any,\n lineitem_ds_raw: Any,\n supplier_ds_raw: Any\n) -> Any:\n\n part_ds = nw.from_native(part_ds_raw)\n nation_ds = nw.from_native(nation_ds_raw)\n partsupp_ds = nw.from_native(partsupp_ds_raw)\n lineitem_ds = nw.from_native(lineitem_ds_raw)\n supplier_ds = nw.from_native(supplier_ds_raw)\n \n var1 = datetime(1994, 1, 1)\n var2 = datetime(1995, 1, 1)\n var3 = \"CANADA\"\n var4 = \"forest\"\n\n query1 = (\n lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n .group_by(\"l_partkey\", \"l_suppkey\")\n .agg((nw.col(\"l_quantity\").sum()).alias(\"sum_quantity\"))\n .with_columns(sum_quantity = nw.col(\"sum_quantity\") * 0.5)\n )\n query2 = nation_ds.filter(nw.col(\"n_name\") == var3)\n query3 = supplier_ds.join(query2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n\n result = (\n part_ds.filter(nw.col(\"p_name\").str.starts_with(var4))\n .select(nw.col(\"p_partkey\").unique())\n .join(partsupp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n .join(\n query1,\n left_on=[\"ps_suppkey\", \"p_partkey\"],\n right_on=[\"l_suppkey\", \"l_partkey\"],\n )\n .filter(nw.col(\"ps_availqty\") > nw.col(\"sum_quantity\"))\n .select(nw.col(\"ps_suppkey\").unique())\n .join(query3, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n .select(\"s_name\", \"s_address\")\n .sort(\"s_name\")\n )\n\n\n return nw.to_native(result)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:34.694907Z","iopub.execute_input":"2024-07-05T00:36:34.695324Z","iopub.status.idle":"2024-07-05T00:36:34.708778Z","shell.execute_reply.started":"2024-07-05T00:36:34.695283Z","shell.execute_reply":"2024-07-05T00:36:34.707486Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nnation = dir_ + 'nation.parquet'\nlineitem = dir_ + 'lineitem.parquet'\nsupplier = dir_ + 'supplier.parquet'\npart = dir_ + 'part.parquet'\npartsupp = dir_ + 'partsupp.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:34.710598Z","iopub.execute_input":"2024-07-05T00:36:34.710988Z","iopub.status.idle":"2024-07-05T00:36:34.726761Z","shell.execute_reply.started":"2024-07-05T00:36:34.710948Z","shell.execute_reply":"2024-07-05T00:36:34.725611Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:34.729573Z","iopub.execute_input":"2024-07-05T00:36:34.730646Z","iopub.status.idle":"2024-07-05T00:36:34.739908Z","shell.execute_reply.started":"2024-07-05T00:36:34.730608Z","shell.execute_reply":"2024-07-05T00:36:34.738827Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-05T00:36:34.741225Z","iopub.execute_input":"2024-07-05T00:36:34.741579Z","iopub.status.idle":"2024-07-05T00:36:34.759379Z","shell.execute_reply.started":"2024-07-05T00:36:34.741550Z","shell.execute_reply":"2024-07-05T00:36:34.758120Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:36:34.761065Z","iopub.execute_input":"2024-07-05T00:36:34.761853Z","iopub.status.idle":"2024-07-05T00:37:30.558394Z","shell.execute_reply.started":"2024-07-05T00:36:34.761808Z","shell.execute_reply":"2024-07-05T00:37:30.557287Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"6.44 s ± 148 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:37:30.559744Z","iopub.execute_input":"2024-07-05T00:37:30.560076Z","iopub.status.idle":"2024-07-05T00:38:11.450176Z","shell.execute_reply.started":"2024-07-05T00:37:30.560047Z","shell.execute_reply":"2024-07-05T00:38:11.448938Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"5.12 s ± 169 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:38:11.451622Z","iopub.execute_input":"2024-07-05T00:38:11.451941Z","iopub.status.idle":"2024-07-05T00:38:41.218576Z","shell.execute_reply.started":"2024-07-05T00:38:11.451913Z","shell.execute_reply":"2024-07-05T00:38:41.217210Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stdout","text":"3.69 s ± 83.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T00:38:41.220118Z","iopub.execute_input":"2024-07-05T00:38:41.220589Z","iopub.status.idle":"2024-07-05T00:38:47.222198Z","shell.execute_reply.started":"2024-07-05T00:38:41.220538Z","shell.execute_reply":"2024-07-05T00:38:47.220814Z"},"trusted":true},"execution_count":17,"outputs":[{"name":"stdout","text":"750 ms ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-05T00:38:47.223794Z","iopub.execute_input":"2024-07-05T00:38:47.224198Z","iopub.status.idle":"2024-07-05T00:38:47.230577Z","shell.execute_reply.started":"2024-07-05T00:38:47.224162Z","shell.execute_reply":"2024-07-05T00:38:47.229437Z"},"trusted":true},"execution_count":18,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "from datetime import datetime\n", + "\n", + "def q20(\n", + " part_ds_raw: Any,\n", + " partsupp_ds_raw: Any,\n", + " nation_ds_raw: Any,\n", + " lineitem_ds_raw: Any,\n", + " supplier_ds_raw: Any\n", + ") -> Any:\n", + "\n", + " part_ds = nw.from_native(part_ds_raw)\n", + " nation_ds = nw.from_native(nation_ds_raw)\n", + " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " supplier_ds = nw.from_native(supplier_ds_raw)\n", + " \n", + " var1 = datetime(1994, 1, 1)\n", + " var2 = datetime(1995, 1, 1)\n", + " var3 = \"CANADA\"\n", + " var4 = \"forest\"\n", + "\n", + " query1 = (\n", + " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", + " .group_by(\"l_partkey\", \"l_suppkey\")\n", + " .agg((nw.col(\"l_quantity\").sum()).alias(\"sum_quantity\"))\n", + " .with_columns(sum_quantity = nw.col(\"sum_quantity\") * 0.5)\n", + " )\n", + " query2 = nation_ds.filter(nw.col(\"n_name\") == var3)\n", + " query3 = supplier_ds.join(query2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + "\n", + " result = (\n", + " part_ds.filter(nw.col(\"p_name\").str.starts_with(var4))\n", + " .select(nw.col(\"p_partkey\").unique())\n", + " .join(partsupp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", + " .join(\n", + " query1,\n", + " left_on=[\"ps_suppkey\", \"p_partkey\"],\n", + " right_on=[\"l_suppkey\", \"l_partkey\"],\n", + " )\n", + " .filter(nw.col(\"ps_availqty\") > nw.col(\"sum_quantity\"))\n", + " .select(nw.col(\"ps_suppkey\").unique())\n", + " .join(query3, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", + " .select(\"s_name\", \"s_address\")\n", + " .sort(\"s_name\")\n", + " )\n", + "\n", + "\n", + " return nw.to_native(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "nation = dir_ + 'nation.parquet'\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "supplier = dir_ + 'supplier.parquet'\n", + "part = dir_ + 'part.parquet'\n", + "partsupp = dir_ + 'partsupp.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tpch/notebooks/q20/kernel-metadata.json b/tpch/notebooks/q20/kernel-metadata.json index 7ecc350f0..e6733375e 100644 --- a/tpch/notebooks/q20/kernel-metadata.json +++ b/tpch/notebooks/q20/kernel-metadata.json @@ -1,5 +1,5 @@ { - "id": "uchennaugoh/narwhals-tpch-q20-s2", + "id": "marcogorelli/narwhals-tpch-q20-s2", "title": "Narwhals TPCH Q20 S2", "code_file": "execute.ipynb", "language": "python",