From 4b1b17243cc44b993b99ab79de0ca56578ca18c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:08:05 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tpch/notebooks/q17/execute.ipynb | 358 ++++++++++++++++++++++++++++++- 1 file changed, 357 insertions(+), 1 deletion(-) diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb index 3e35f3dba..958c7f5be 100644 --- a/tpch/notebooks/q17/execute.ipynb +++ b/tpch/notebooks/q17/execute.ipynb @@ -1 +1,357 @@ -{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:02:46.287233Z","iopub.execute_input":"2024-07-10T18:02:46.288273Z","iopub.status.idle":"2024-07-10T18:03:03.887347Z","shell.execute_reply.started":"2024-07-10T18:02:46.288225Z","shell.execute_reply":"2024-07-10T18:03:03.885718Z"},"trusted":true},"execution_count":33,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.1.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (1.0.4)\nRequirement already satisfied: numpy>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:03:03.890254Z","iopub.execute_input":"2024-07-10T18:03:03.890672Z","iopub.status.idle":"2024-07-10T18:03:03.897591Z","shell.execute_reply.started":"2024-07-10T18:03:03.890635Z","shell.execute_reply":"2024-07-10T18:03:03.896100Z"},"trusted":true},"execution_count":34,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\n\ndef q17(\n lineitem_ds_raw: Any,\n part_ds_raw: Any\n) -> Any:\n\n lineitem_ds = nw.from_native(lineitem_ds_raw)\n part_ds = nw.from_native(part_ds_raw)\n \n var1 = \"Brand#23\"\n var2 = \"MED BOX\"\n \n query1 = (\n part_ds.filter(nw.col(\"p_brand\") == var1)\n .filter(nw.col(\"p_container\") == var2)\n .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n )\n \n final_query = (\n query1.group_by(\"p_partkey\")\n .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n .select(nw.col(\"p_partkey\").alias(\"key\"), nw.col(\"avg_quantity\"))\n .join(query1, left_on=\"key\", right_on=\"p_partkey\")\n .filter(nw.col(\"l_quantity\") < nw.col(\"avg_quantity\"))\n .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n )\n\n\n return nw.to_native(final_query)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:03:03.899127Z","iopub.execute_input":"2024-07-10T18:03:03.899507Z","iopub.status.idle":"2024-07-10T18:03:03.915289Z","shell.execute_reply.started":"2024-07-10T18:03:03.899466Z","shell.execute_reply":"2024-07-10T18:03:03.914027Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nlineitem = dir_ + 'lineitem.parquet'\npart = dir_ + 'part.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:03:03.918173Z","iopub.execute_input":"2024-07-10T18:03:03.918661Z","iopub.status.idle":"2024-07-10T18:03:03.939508Z","shell.execute_reply.started":"2024-07-10T18:03:03.918616Z","shell.execute_reply":"2024-07-10T18:03:03.937980Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:03:03.941014Z","iopub.execute_input":"2024-07-10T18:03:03.941408Z","iopub.status.idle":"2024-07-10T18:03:03.957630Z","shell.execute_reply.started":"2024-07-10T18:03:03.941377Z","shell.execute_reply":"2024-07-10T18:03:03.956359Z"},"trusted":true},"execution_count":37,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-10T18:03:03.959976Z","iopub.execute_input":"2024-07-10T18:03:03.960426Z","iopub.status.idle":"2024-07-10T18:03:03.973898Z","shell.execute_reply.started":"2024-07-10T18:03:03.960386Z","shell.execute_reply":"2024-07-10T18:03:03.972472Z"},"trusted":true},"execution_count":38,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q17(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:03:03.975534Z","iopub.execute_input":"2024-07-10T18:03:03.975965Z","iopub.status.idle":"2024-07-10T18:04:00.559360Z","shell.execute_reply.started":"2024-07-10T18:03:03.975931Z","shell.execute_reply":"2024-07-10T18:04:00.558223Z"},"trusted":true},"execution_count":39,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n","output_type":"stream"},{"name":"stdout","text":"6.91 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q17(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:04:00.561065Z","iopub.execute_input":"2024-07-10T18:04:00.561520Z","iopub.status.idle":"2024-07-10T18:04:43.896752Z","shell.execute_reply.started":"2024-07-10T18:04:00.561479Z","shell.execute_reply":"2024-07-10T18:04:43.895321Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n/opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/group_by.py:59: UserWarning: Found complex group-by expression, which can't be expressed efficiently with the pandas API. If you can, please rewrite your query such that group-by aggregations are simple (e.g. mean, std, min, max, ...).\n return agg_pandas(\n","output_type":"stream"},{"name":"stdout","text":"5.39 s ± 99.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q17(fn(lineitem), fn(part))\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:04:43.898285Z","iopub.execute_input":"2024-07-10T18:04:43.898782Z","iopub.status.idle":"2024-07-10T18:05:08.470102Z","shell.execute_reply.started":"2024-07-10T18:04:43.898625Z","shell.execute_reply":"2024-07-10T18:05:08.468618Z"},"trusted":true},"execution_count":41,"outputs":[{"name":"stdout","text":"3.06 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q17(fn(lineitem), fn(part)).collect()\nresults[tool] = timings.all_runs","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-10T18:05:08.473576Z","iopub.execute_input":"2024-07-10T18:05:08.473961Z","iopub.status.idle":"2024-07-10T18:05:19.591631Z","shell.execute_reply.started":"2024-07-10T18:05:08.473931Z","shell.execute_reply":"2024-07-10T18:05:19.590241Z"},"trusted":true},"execution_count":42,"outputs":[{"name":"stdout","text":"1.39 s ± 33.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-10T18:05:19.592958Z","iopub.execute_input":"2024-07-10T18:05:19.593321Z","iopub.status.idle":"2024-07-10T18:05:19.601429Z","shell.execute_reply.started":"2024-07-10T18:05:19.593283Z","shell.execute_reply":"2024-07-10T18:05:19.599960Z"},"trusted":true},"execution_count":43,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 33.390992, + "end_time": "2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "import narwhals as nw\n", + "\n", + "def q17(\n", + " lineitem_ds_raw: Any,\n", + " part_ds_raw: Any\n", + ") -> Any:\n", + "\n", + " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", + " part_ds = nw.from_native(part_ds_raw)\n", + " \n", + " var1 = \"Brand#23\"\n", + " var2 = \"MED BOX\"\n", + " \n", + " query1 = (\n", + " part_ds.filter(nw.col(\"p_brand\") == var1)\n", + " .filter(nw.col(\"p_container\") == var2)\n", + " .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n", + " )\n", + " \n", + " final_query = (\n", + " query1.group_by(\"p_partkey\")\n", + " .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n", + " .select(nw.col(\"p_partkey\").alias(\"key\"), nw.col(\"avg_quantity\"))\n", + " .join(query1, left_on=\"key\", right_on=\"p_partkey\")\n", + " .filter(nw.col(\"l_quantity\") < nw.col(\"avg_quantity\"))\n", + " .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n", + " )\n", + "\n", + "\n", + " return nw.to_native(final_query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "part = dir_ + 'part.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.7003", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.70635", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q17(fn(lineitem), fn(part)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + } + ], + "dockerImageVersionId": 30673, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}