Skip to content

Commit

Permalink
TPCH Query 11
Browse files Browse the repository at this point in the history
  • Loading branch information
ugohuche authored and MarcoGorelli committed Jul 5, 2024
1 parent d1e752f commit 3af2d14
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
1 change: 1 addition & 0 deletions tpch/notebooks/q11/execute.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"sourceId":167796934,"sourceType":"kernelVersion"}],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"},"papermill":{"default_parameters":{},"duration":458.423327,"end_time":"2024-03-22T17:31:18.077306","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2024-03-22T17:23:39.653979","version":"2.5.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ","metadata":{"papermill":{"duration":33.390992,"end_time":"2024-03-22T17:24:15.601719","exception":false,"start_time":"2024-03-22T17:23:42.210727","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:32:59.927716Z","iopub.execute_input":"2024-07-05T10:32:59.928639Z","iopub.status.idle":"2024-07-05T10:33:14.029566Z","shell.execute_reply.started":"2024-07-05T10:32:59.928593Z","shell.execute_reply":"2024-07-05T10:33:14.028082Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"\u001b[33mWARNING: Skipping apache-beam as it is not installed.\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.2)\nRequirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (1.0.0)\nRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (16.1.0)\nRequirement already satisfied: narwhals in /opt/conda/lib/python3.10/site-packages (0.9.28)\nRequirement already satisfied: numpy>=1.22.4 in 
/opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n","output_type":"stream"}]},{"cell_type":"code","source":"import pandas as pd\nimport polars as pl\n\npd.options.mode.copy_on_write = True\npd.options.future.infer_string = True","metadata":{"papermill":{"duration":0.907754,"end_time":"2024-03-22T17:24:39.053873","exception":false,"start_time":"2024-03-22T17:24:38.146119","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.032036Z","iopub.execute_input":"2024-07-05T10:33:14.032455Z","iopub.status.idle":"2024-07-05T10:33:14.039074Z","shell.execute_reply.started":"2024-07-05T10:33:14.032409Z","shell.execute_reply":"2024-07-05T10:33:14.037818Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"from typing import Any\nimport narwhals as nw\n\ndef q11(\n    partsupp_ds_raw: Any,\n    nation_ds_raw: Any,\n    supplier_ds_raw: Any,\n) -> Any:\n    \"\"\"Run TPC-H Q11: parts whose stock value exceeds a share of the overall total.\n\n    Joins partsupp with supplier and nation, keeps rows where n_name is GERMANY,\n    sums ps_supplycost * ps_availqty per ps_partkey, and keeps the parts whose sum\n    is greater than the overall sum (rounded to 2 decimals) times 0.0001.\n    Returns the native frame with ps_partkey and value, sorted by value descending.\n    \"\"\"\n    target_nation = \"GERMANY\"\n    fraction = 0.0001\n\n    partsupp_frame = nw.from_native(partsupp_ds_raw)\n    supplier_frame = nw.from_native(supplier_ds_raw)\n    nation_frame = nw.from_native(nation_ds_raw)\n\n    # Stock value of one partsupp row; shared by the threshold and the per-part sum.\n    stock_value = nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")\n\n    german_stock = (\n        partsupp_frame.join(supplier_frame, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n        .join(nation_frame, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n        .filter(nw.col(\"n_name\") == 
target_nation)\n    )\n\n    # Cut-off that each part's value is compared against, held in column tmp.\n    threshold = german_stock.select(\n        stock_value.sum().round(2).alias(\"tmp\") * fraction\n    )\n\n    value_per_part = (\n        german_stock.with_columns(stock_value.alias(\"value\"))\n        .group_by(\"ps_partkey\")\n        .agg(nw.sum(\"value\"))\n    )\n    above_threshold = (\n        value_per_part.join(threshold, how=\"cross\")\n        .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n        .select(\"ps_partkey\", \"value\")\n        .sort(\"value\", descending=True)\n    )\n\n    return nw.to_native(above_threshold)","metadata":{"papermill":{"duration":0.021725,"end_time":"2024-03-22T17:24:39.080999","exception":false,"start_time":"2024-03-22T17:24:39.059274","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.040616Z","iopub.execute_input":"2024-07-05T10:33:14.040982Z","iopub.status.idle":"2024-07-05T10:33:14.056075Z","shell.execute_reply.started":"2024-07-05T10:33:14.040950Z","shell.execute_reply":"2024-07-05T10:33:14.054409Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\nnation = dir_ + 'nation.parquet'\nsupplier = dir_ + 'supplier.parquet'\npartsupp = dir_ + 'partsupp.parquet'","metadata":{"papermill":{"duration":0.013325,"end_time":"2024-03-22T17:24:39.099766","exception":false,"start_time":"2024-03-22T17:24:39.086441","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.059211Z","iopub.execute_input":"2024-07-05T10:33:14.059609Z","iopub.status.idle":"2024-07-05T10:33:14.068385Z","shell.execute_reply.started":"2024-07-05T10:33:14.059578Z","shell.execute_reply":"2024-07-05T10:33:14.067130Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"code","source":"IO_FUNCS = {\n 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n 'polars[eager]': lambda x: pl.read_parquet(x),\n 'polars[lazy]': lambda x: 
pl.scan_parquet(x),\n}","metadata":{"papermill":{"duration":0.014284,"end_time":"2024-03-22T17:24:39.119737","exception":false,"start_time":"2024-03-22T17:24:39.105453","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.069813Z","iopub.execute_input":"2024-07-05T10:33:14.070174Z","iopub.status.idle":"2024-07-05T10:33:14.085661Z","shell.execute_reply.started":"2024-07-05T10:33:14.070142Z","shell.execute_reply":"2024-07-05T10:33:14.084444Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"results = {}","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:14.086962Z","iopub.execute_input":"2024-07-05T10:33:14.087330Z","iopub.status.idle":"2024-07-05T10:33:14.096595Z","shell.execute_reply.started":"2024-07-05T10:33:14.087298Z","shell.execute_reply":"2024-07-05T10:33:14.095430Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"markdown","source":"## pandas via Narwhals","metadata":{"papermill":{"duration":0.005113,"end_time":"2024-03-22T17:24:39.130472","exception":false,"start_time":"2024-03-22T17:24:39.125359","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n# Store per-loop times: all_runs holds per-run totals, which scale with the loop count %timeit picks.\nresults[tool] = timings.timings","metadata":{"papermill":{"duration":196.786925,"end_time":"2024-03-22T17:27:55.922832","exception":false,"start_time":"2024-03-22T17:24:39.135907","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:14.098215Z","iopub.execute_input":"2024-07-05T10:33:14.098662Z","iopub.status.idle":"2024-07-05T10:33:27.624855Z","shell.execute_reply.started":"2024-07-05T10:33:14.098622Z","shell.execute_reply":"2024-07-05T10:33:27.623747Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"1.66 s ± 118 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## pandas, pyarrow dtypes, via Narwhals","metadata":{"papermill":{"duration":0.005184,"end_time":"2024-03-22T17:27:55.933407","exception":false,"start_time":"2024-03-22T17:27:55.928223","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'pandas[pyarrow]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n# Store per-loop times: all_runs holds per-run totals, which scale with the loop count %timeit picks.\nresults[tool] = timings.timings","metadata":{"papermill":{"duration":158.748353,"end_time":"2024-03-22T17:30:34.688289","exception":false,"start_time":"2024-03-22T17:27:55.939936","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:27.626162Z","iopub.execute_input":"2024-07-05T10:33:27.626489Z","iopub.status.idle":"2024-07-05T10:33:40.866730Z","shell.execute_reply.started":"2024-07-05T10:33:27.626463Z","shell.execute_reply":"2024-07-05T10:33:40.865296Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"1.67 s ± 86.1 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars read_parquet","metadata":{"papermill":{"duration":0.005773,"end_time":"2024-03-22T17:30:34.7003","exception":false,"start_time":"2024-03-22T17:30:34.694527","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[eager]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n# Store per-loop times: all_runs holds per-run totals, which scale with the loop count %timeit picks.\nresults[tool] = timings.timings","metadata":{"papermill":{"duration":37.821116,"end_time":"2024-03-22T17:31:12.527466","exception":false,"start_time":"2024-03-22T17:30:34.70635","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:40.868234Z","iopub.execute_input":"2024-07-05T10:33:40.868584Z","iopub.status.idle":"2024-07-05T10:33:48.072165Z","shell.execute_reply.started":"2024-07-05T10:33:40.868552Z","shell.execute_reply":"2024-07-05T10:33:48.071018Z"},"trusted":true},"execution_count":21,"outputs":[{"name":"stdout","text":"890 ms ± 13.6 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Polars scan_parquet","metadata":{"papermill":{"duration":0.005515,"end_time":"2024-03-22T17:31:12.539068","exception":false,"start_time":"2024-03-22T17:31:12.533553","status":"completed"},"tags":[]}},{"cell_type":"code","source":"tool = 'polars[lazy]'\nfn = IO_FUNCS[tool]\ntimings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n# Store per-loop times: %timeit runs several loops per run for fast queries, and\n# all_runs holds the total per run, which would overstate the time by the loop count.\nresults[tool] = timings.timings","metadata":{"papermill":{"duration":4.800698,"end_time":"2024-03-22T17:31:17.346813","exception":false,"start_time":"2024-03-22T17:31:12.546115","status":"completed"},"tags":[],"execution":{"iopub.status.busy":"2024-07-05T10:33:48.075257Z","iopub.execute_input":"2024-07-05T10:33:48.075613Z","iopub.status.idle":"2024-07-05T10:33:56.980125Z","shell.execute_reply.started":"2024-07-05T10:33:48.075582Z","shell.execute_reply":"2024-07-05T10:33:56.979088Z"},"trusted":true},"execution_count":22,"outputs":[{"name":"stdout","text":"110 ms ± 12.7 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Save","metadata":{}},{"cell_type":"code","source":"import json\nwith open('results.json', 'w') as fd:\n json.dump(results, fd)\n","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:56.981599Z","iopub.execute_input":"2024-07-05T10:33:56.982033Z","iopub.status.idle":"2024-07-05T10:33:56.988558Z","shell.execute_reply.started":"2024-07-05T10:33:56.981992Z","shell.execute_reply":"2024-07-05T10:33:56.987342Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"from pprint import pprint\n\npprint(results)","metadata":{"execution":{"iopub.status.busy":"2024-07-05T10:33:56.990445Z","iopub.execute_input":"2024-07-05T10:33:56.990881Z","iopub.status.idle":"2024-07-05T10:33:57.000730Z","shell.execute_reply.started":"2024-07-05T10:33:56.990841Z","shell.execute_reply":"2024-07-05T10:33:56.999458Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stdout","text":"{'pandas': [1.708278326000027,\n 1.8040552429999934,\n 1.8417796100000032,\n 1.600905629999943,\n 1.6415783779998492,\n 1.5647700059998897,\n 1.493057884000109],\n 'pandas[pyarrow]': [1.6380957989999843,\n 1.5802785819998917,\n 1.5376337459999831,\n 1.7884727590001148,\n 1.7397616020000441,\n 1.7496962650000114,\n 1.6605698180001127],\n 'polars[eager]': [0.9160442119998606,\n 0.8955544509999527,\n 0.8863846530000501,\n 0.8829364579999037,\n 0.8918134509999618,\n 0.8924379529998987,\n 0.8672452630000862],\n 'polars[lazy]': [1.1258213609999075,\n 1.4064464999999018,\n 1.046419743999877,\n 1.0376091739999538,\n 1.043019643999969,\n 1.0296597439999005,\n 1.0383537459999843]}\n","output_type":"stream"}]}]}
15 changes: 15 additions & 0 deletions tpch/notebooks/q11/kernel-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"id": "uchennaugoh/narwhals-tpch-q11-s2",
"title": "Narwhals TPCH Q11 S2",
"code_file": "execute.ipynb",
"language": "python",
"kernel_type": "notebook",
"is_private": "false",
"enable_gpu": "false",
"enable_tpu": "false",
"enable_internet": "true",
"dataset_sources": [],
"competition_sources": [],
"kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"],
"model_sources": []
}

0 comments on commit 3af2d14

Please sign in to comment.