diff --git a/tpch/notebooks/q13/execute.ipynb b/tpch/notebooks/q13/execute.ipynb deleted file mode 100644 index 70df514f2..000000000 --- a/tpch/notebooks/q13/execute.ipynb +++ /dev/null @@ -1,396 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 46.481932, - "end_time": "2024-07-25T23:10:36.288698", - "exception": false, - "start_time": "2024-07-25T23:09:49.806766", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.522877, - "end_time": "2024-07-25T23:10:36.819167", - "exception": false, - "start_time": "2024-07-25T23:10:36.296290", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "papermill": { - "duration": 0.02756, - "end_time": "2024-07-25T23:10:36.854110", - "exception": false, - "start_time": "2024-07-25T23:10:36.826550", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "import narwhals as nw\n", - "\n", - "def q13(\n", - " customer_ds_raw: Any,\n", - " orders_ds_raw: Any\n", - ") -> Any:\n", - "\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - " \n", - " var1 = \"special\"\n", - " var2 = \"requests\"\n", - "\n", - " orders_ds = orders_ds.filter(~nw.col(\"o_comment\").str.contains(f\"{var1}.*{var2}\"))\n", - " \n", - " result = (\n", - " customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\", how=\"left\")\n", - " .group_by(\"c_custkey\")\n", - " .agg(nw.col(\"o_orderkey\").len().alias(\"c_count\"))\n", - " .group_by(\"c_count\")\n", - " .agg(nw.col(\"c_count\").len().alias(\"len\"))\n", - " .select(nw.col(\"c_count\"), nw.col(\"len\").alias(\"custdist\"))\n", - " .sort(by=[\"custdist\", \"c_count\"], descending=[True, True])\n", - " )\n", - "\n", - " return nw.to_native(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.016452, - "end_time": "2024-07-25T23:10:36.878001", - "exception": false, - "start_time": "2024-07-25T23:10:36.861549", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "customer = dir_ + 'customer.parquet'\n", - "orders = dir_ + 'orders.parquet'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": { - "papermill": { - "duration": 0.016664, - "end_time": "2024-07-25T23:10:36.902043", - "exception": false, - "start_time": "2024-07-25T23:10:36.885379", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.015614, - "end_time": "2024-07-25T23:10:36.924894", - "exception": false, - "start_time": "2024-07-25T23:10:36.909280", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": { - "papermill": { - "duration": 0.006996, - "end_time": "2024-07-25T23:10:36.939302", - "exception": false, - "start_time": "2024-07-25T23:10:36.932306", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": { - "papermill": { - "duration": 44.926739, - "end_time": "2024-07-25T23:11:21.873211", - "exception": false, - "start_time": "2024-07-25T23:10:36.946472", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'pandas'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q13(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": { - "papermill": { - "duration": 0.006853, - "end_time": "2024-07-25T23:11:21.887504", - "exception": false, - "start_time": "2024-07-25T23:11:21.880651", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": { - "papermill": { - "duration": 44.733391, - "end_time": "2024-07-25T23:12:06.628137", - "exception": false, - "start_time": "2024-07-25T23:11:21.894746", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'pandas[pyarrow]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q13(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": { - "papermill": { - "duration": 0.006896, - "end_time": "2024-07-25T23:12:06.642200", - "exception": false, - "start_time": "2024-07-25T23:12:06.635304", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": { - "papermill": { - "duration": 13.901571, - "end_time": "2024-07-25T23:12:20.550910", - "exception": false, - "start_time": "2024-07-25T23:12:06.649339", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'polars[eager]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q13(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.0075, - "end_time": "2024-07-25T23:12:20.566105", - "exception": false, - "start_time": "2024-07-25T23:12:20.558605", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 10.393542, - "end_time": "2024-07-25T23:12:30.967063", - "exception": false, - "start_time": "2024-07-25T23:12:20.573521", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'polars[lazy]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q13(fn(customer), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.007808, - "end_time": "2024-07-25T23:12:30.982613", - "exception": false, - "start_time": "2024-07-25T23:12:30.974805", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 0.017013, - "end_time": "2024-07-25T23:12:31.007464", - "exception": false, - "start_time": "2024-07-25T23:12:30.990451", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 164.526043, - "end_time": "2024-07-25T23:12:31.536428", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-07-25T23:09:47.010385", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q13/kernel-metadata.json b/tpch/notebooks/q13/kernel-metadata.json deleted file mode 100644 index 7626c20d6..000000000 --- a/tpch/notebooks/q13/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q22-s2", - "title": "Narwhals TPCH Q22 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file diff --git a/tpch/notebooks/q22/execute.ipynb b/tpch/notebooks/q22/execute.ipynb deleted file mode 100644 index 01866066e..000000000 --- a/tpch/notebooks/q22/execute.ipynb +++ /dev/null @@ -1,409 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": { - "papermill": { - "duration": 49.860489, - "end_time": "2024-07-26T12:50:30.931456", - "exception": false, - "start_time": "2024-07-26T12:49:41.070967", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": { - "papermill": { - "duration": 0.563589, - "end_time": "2024-07-26T12:50:31.502451", - "exception": false, - "start_time": "2024-07-26T12:50:30.938862", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "papermill": { - "duration": 0.031352, - "end_time": "2024-07-26T12:50:31.541220", - "exception": false, - "start_time": "2024-07-26T12:50:31.509868", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from typing import Any\n", - "import narwhals as nw\n", - "\n", - "def q22(\n", - " customer_ds_raw: Any,\n", - " orders_ds_raw: Any,\n", - ") -> Any:\n", - "\n", - " customer_ds = nw.from_native(customer_ds_raw)\n", - " orders_ds = nw.from_native(orders_ds_raw)\n", - " \n", - " \n", - " query1 = (\n", - " customer_ds.with_columns(nw.col(\"c_phone\").str.slice(0, 2).alias(\"cntrycode\"))\n", - " .filter(nw.col(\"cntrycode\").str.contains(\"13|31|23|29|30|18|17\"))\n", - " .select(\"c_acctbal\", \"c_custkey\", \"cntrycode\")\n", - " )\n", - "\n", - " query2 = query1.filter(nw.col(\"c_acctbal\") > 0.0).select(\n", - " nw.col(\"c_acctbal\").mean().alias(\"avg_acctbal\")\n", - " )\n", - "\n", - " query3 = orders_ds.select(nw.col(\"o_custkey\").unique()).with_columns(\n", - " nw.col(\"o_custkey\").alias(\"c_custkey\")\n", - " )\n", - "\n", - " final_query = (\n", - " query1.join(query3, left_on=\"c_custkey\", right_on=\"c_custkey\", how=\"left\")\n", - " .filter(nw.col(\"o_custkey\").is_null())\n", - " .join(query2, how=\"cross\")\n", - " .filter(nw.col(\"c_acctbal\") > nw.col(\"avg_acctbal\"))\n", - " .group_by(\"cntrycode\")\n", - " .agg(\n", - " nw.col(\"c_acctbal\").count().alias(\"numcust\"),\n", - " nw.col(\"c_acctbal\").sum().round(2).alias(\"totacctbal\"),\n", - " )\n", - " .sort(\"cntrycode\")\n", - " )\n", - "\n", - " return nw.to_native(final_query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": { - "papermill": { - "duration": 0.016247, - "end_time": "2024-07-26T12:50:31.564892", - "exception": false, - "start_time": "2024-07-26T12:50:31.548645", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "customer = dir_ + 'customer.parquet'\n", - "orders = dir_ + 'orders.parquet'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": { - "papermill": { - "duration": 0.017416, - "end_time": "2024-07-26T12:50:31.589507", - "exception": false, - "start_time": "2024-07-26T12:50:31.572091", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": { - "papermill": { - "duration": 0.015402, - "end_time": "2024-07-26T12:50:31.612243", - "exception": false, - "start_time": "2024-07-26T12:50:31.596841", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "results = {}" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": { - "papermill": { - "duration": 0.006963, - "end_time": "2024-07-26T12:50:31.626579", - "exception": false, - "start_time": "2024-07-26T12:50:31.619616", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": { - "papermill": { - "duration": 10.559142, - "end_time": "2024-07-26T12:50:42.192870", - "exception": false, - "start_time": "2024-07-26T12:50:31.633728", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'pandas'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q22(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": { - "papermill": { - "duration": 0.008478, - "end_time": "2024-07-26T12:50:42.209200", - "exception": false, - "start_time": "2024-07-26T12:50:42.200722", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## pandas, pyarrow dtypes, via Narwhals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": { - "papermill": { - "duration": 7.07373, - "end_time": "2024-07-26T12:50:49.292349", - "exception": false, - "start_time": "2024-07-26T12:50:42.218619", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'pandas[pyarrow]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q22(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": { - "papermill": { - "duration": 0.00828, - "end_time": "2024-07-26T12:50:49.309227", - "exception": false, - "start_time": "2024-07-26T12:50:49.300947", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars read_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": { - "papermill": { - "duration": 6.202911, - "end_time": "2024-07-26T12:50:55.520546", - "exception": false, - "start_time": "2024-07-26T12:50:49.317635", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'polars[eager]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q22(fn(customer), fn(orders))\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": { - "papermill": { - "duration": 0.008436, - "end_time": "2024-07-26T12:50:55.537904", - "exception": false, - "start_time": "2024-07-26T12:50:55.529468", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Polars scan_parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": { - "papermill": { - "duration": 12.591074, - "end_time": "2024-07-26T12:51:08.137474", - "exception": false, - "start_time": "2024-07-26T12:50:55.546400", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "tool = 'polars[lazy]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q22(fn(customer), fn(orders)).collect()\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": { - "papermill": { - "duration": 0.008232, - "end_time": "2024-07-26T12:51:08.154223", - "exception": false, - "start_time": "2024-07-26T12:51:08.145991", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": { - "papermill": { - "duration": 0.017523, - "end_time": "2024-07-26T12:51:08.180173", - "exception": false, - "start_time": "2024-07-26T12:51:08.162650", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" - ] - } - ], - "metadata": { - "kaggle": { - "accelerator": "none", - "dataSources": [ - { - "sourceId": 167796934, - "sourceType": "kernelVersion" - } - ], - "dockerImageVersionId": 30673, - "isGpuEnabled": false, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "papermill": { - "default_parameters": {}, - "duration": 90.628471, - "end_time": "2024-07-26T12:51:08.710737", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2024-07-26T12:49:38.082266", - "version": "2.5.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tpch/notebooks/q22/kernel-metadata.json b/tpch/notebooks/q22/kernel-metadata.json deleted file mode 100644 index 7626c20d6..000000000 --- a/tpch/notebooks/q22/kernel-metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "marcogorelli/narwhals-tpch-q22-s2", - "title": "Narwhals TPCH Q22 S2", - "code_file": "execute.ipynb", - "language": "python", - "kernel_type": "notebook", - "is_private": "false", - "enable_gpu": "false", - "enable_tpu": "false", - "enable_internet": "true", - "dataset_sources": [], - "competition_sources": [], - "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], - "model_sources": [] -} \ No newline at end of file