Skip to content

Commit

Permalink
Added TPC-H Q13 and Q22
Browse files Browse the repository at this point in the history
  • Loading branch information
ugohuche authored Jul 26, 2024
1 parent 74e65e9 commit 7b78c1d
Show file tree
Hide file tree
Showing 4 changed files with 835 additions and 0 deletions.
396 changes: 396 additions & 0 deletions tpch/notebooks/q13/execute.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,396 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0",
"metadata": {
"papermill": {
"duration": 46.481932,
"end_time": "2024-07-25T23:10:36.288698",
"exception": false,
"start_time": "2024-07-25T23:09:49.806766",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Use the %pip magic so the packages are managed in the running kernel's environment.\n",
"%pip uninstall apache-beam -y\n",
"%pip install -U pandas polars pyarrow narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {
"papermill": {
"duration": 0.522877,
"end_time": "2024-07-25T23:10:36.819167",
"exception": false,
"start_time": "2024-07-25T23:10:36.296290",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import polars as pl\n",
"\n",
"# Enable pandas Copy-on-Write mode.\n",
"pd.options.mode.copy_on_write = True\n",
"# Opt in to pandas' future string inference (presumably yields pyarrow-backed strings; confirm in pandas docs).\n",
"pd.options.future.infer_string = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {
"papermill": {
"duration": 0.02756,
"end_time": "2024-07-25T23:10:36.854110",
"exception": false,
"start_time": "2024-07-25T23:10:36.826550",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"from typing import Any\n",
"import narwhals as nw\n",
"\n",
"def q13(\n",
"    customer_ds_raw: Any,\n",
"    orders_ds_raw: Any\n",
") -> Any:\n",
"    \"\"\"Run TPC-H Q13 (customer distribution) through Narwhals.\n",
"\n",
"    For every customer, count the orders whose comment does not match\n",
"    ``special.*requests``; then count how many customers share each\n",
"    order count.\n",
"\n",
"    Args:\n",
"        customer_ds_raw: Native customer frame; must expose ``c_custkey``.\n",
"        orders_ds_raw: Native orders frame; must expose ``o_custkey``,\n",
"            ``o_orderkey`` and ``o_comment``.\n",
"\n",
"    Returns:\n",
"        A native frame with columns ``c_count`` and ``custdist``, sorted by\n",
"        ``custdist`` and then ``c_count``, both descending.\n",
"    \"\"\"\n",
"    customer_ds = nw.from_native(customer_ds_raw)\n",
"    orders_ds = nw.from_native(orders_ds_raw)\n",
"\n",
"    var1 = \"special\"\n",
"    var2 = \"requests\"\n",
"\n",
"    # Drop orders whose comment matches the excluded pattern before joining.\n",
"    orders_ds = orders_ds.filter(~nw.col(\"o_comment\").str.contains(f\"{var1}.*{var2}\"))\n",
"\n",
"    result = (\n",
"        customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\", how=\"left\")\n",
"        .group_by(\"c_custkey\")\n",
"        # count() skips nulls: a customer with no matching order keeps one\n",
"        # null-filled row after the left join and must get c_count == 0,\n",
"        # whereas len() would report 1.\n",
"        .agg(nw.col(\"o_orderkey\").count().alias(\"c_count\"))\n",
"        .group_by(\"c_count\")\n",
"        .agg(nw.col(\"c_count\").len().alias(\"len\"))\n",
"        .select(nw.col(\"c_count\"), nw.col(\"len\").alias(\"custdist\"))\n",
"        .sort(by=[\"custdist\", \"c_count\"], descending=[True, True])\n",
"    )\n",
"\n",
"    return nw.to_native(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3",
"metadata": {
"papermill": {
"duration": 0.016452,
"end_time": "2024-07-25T23:10:36.878001",
"exception": false,
"start_time": "2024-07-25T23:10:36.861549",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Parquet inputs read by the benchmark cells.\n",
"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n",
"customer = f\"{dir_}customer.parquet\"\n",
"orders = f\"{dir_}orders.parquet\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {
"papermill": {
"duration": 0.016664,
"end_time": "2024-07-25T23:10:36.902043",
"exception": false,
"start_time": "2024-07-25T23:10:36.885379",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Parquet readers keyed by backend label; each one takes a file path.\n",
"IO_FUNCS = {\n",
"    \"pandas\": lambda path: pd.read_parquet(path, engine=\"pyarrow\"),\n",
"    \"pandas[pyarrow]\": lambda path: pd.read_parquet(\n",
"        path, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n",
"    ),\n",
"    \"polars[eager]\": lambda path: pl.read_parquet(path),\n",
"    \"polars[lazy]\": lambda path: pl.scan_parquet(path),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {
"papermill": {
"duration": 0.015614,
"end_time": "2024-07-25T23:10:36.924894",
"exception": false,
"start_time": "2024-07-25T23:10:36.909280",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Timings collected per backend label; filled in by the benchmark cells and dumped to JSON at the end.\n",
"results = {}"
]
},
{
"cell_type": "markdown",
"id": "6",
"metadata": {
"papermill": {
"duration": 0.006996,
"end_time": "2024-07-25T23:10:36.939302",
"exception": false,
"start_time": "2024-07-25T23:10:36.932306",
"status": "completed"
},
"tags": []
},
"source": [
"## pandas via Narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7",
"metadata": {
"papermill": {
"duration": 44.926739,
"end_time": "2024-07-25T23:11:21.873211",
"exception": false,
"start_time": "2024-07-25T23:10:36.946472",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Time parquet read + query for the pandas backend; -o makes %timeit return its result object.\n",
"tool = 'pandas'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q13(fn(customer), fn(orders))\n",
"results[tool] = timings.all_runs"
]
},
{
"cell_type": "markdown",
"id": "8",
"metadata": {
"papermill": {
"duration": 0.006853,
"end_time": "2024-07-25T23:11:21.887504",
"exception": false,
"start_time": "2024-07-25T23:11:21.880651",
"status": "completed"
},
"tags": []
},
"source": [
"## pandas, pyarrow dtypes, via Narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {
"papermill": {
"duration": 44.733391,
"end_time": "2024-07-25T23:12:06.628137",
"exception": false,
"start_time": "2024-07-25T23:11:21.894746",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Time parquet read + query for pandas with pyarrow dtypes; -o makes %timeit return its result object.\n",
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q13(fn(customer), fn(orders))\n",
"results[tool] = timings.all_runs"
]
},
{
"cell_type": "markdown",
"id": "10",
"metadata": {
"papermill": {
"duration": 0.006896,
"end_time": "2024-07-25T23:12:06.642200",
"exception": false,
"start_time": "2024-07-25T23:12:06.635304",
"status": "completed"
},
"tags": []
},
"source": [
"## Polars read_parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {
"papermill": {
"duration": 13.901571,
"end_time": "2024-07-25T23:12:20.550910",
"exception": false,
"start_time": "2024-07-25T23:12:06.649339",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Time parquet read + query for eagerly read Polars frames; -o makes %timeit return its result object.\n",
"tool = 'polars[eager]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q13(fn(customer), fn(orders))\n",
"results[tool] = timings.all_runs"
]
},
{
"cell_type": "markdown",
"id": "12",
"metadata": {
"papermill": {
"duration": 0.0075,
"end_time": "2024-07-25T23:12:20.566105",
"exception": false,
"start_time": "2024-07-25T23:12:20.558605",
"status": "completed"
},
"tags": []
},
"source": [
"## Polars scan_parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13",
"metadata": {
"papermill": {
"duration": 10.393542,
"end_time": "2024-07-25T23:12:30.967063",
"exception": false,
"start_time": "2024-07-25T23:12:20.573521",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Time scan + query for lazily scanned Polars frames; .collect() is part of the timed statement.\n",
"tool = 'polars[lazy]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q13(fn(customer), fn(orders)).collect()\n",
"results[tool] = timings.all_runs"
]
},
{
"cell_type": "markdown",
"id": "14",
"metadata": {
"papermill": {
"duration": 0.007808,
"end_time": "2024-07-25T23:12:30.982613",
"exception": false,
"start_time": "2024-07-25T23:12:30.974805",
"status": "completed"
},
"tags": []
},
"source": [
"## Save"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15",
"metadata": {
"papermill": {
"duration": 0.017013,
"end_time": "2024-07-25T23:12:31.007464",
"exception": false,
"start_time": "2024-07-25T23:12:30.990451",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Write the collected timings to results.json in the working directory.\n",
"with open('results.json', 'w') as out_file:\n",
"    out_file.write(json.dumps(results))\n"
]
}
],
"metadata": {
"kaggle": {
"accelerator": "none",
"dataSources": [
{
"sourceId": 167796934,
"sourceType": "kernelVersion"
}
],
"dockerImageVersionId": 30673,
"isGpuEnabled": false,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
},
"papermill": {
"default_parameters": {},
"duration": 164.526043,
"end_time": "2024-07-25T23:12:31.536428",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2024-07-25T23:09:47.010385",
"version": "2.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 7b78c1d

Please sign in to comment.