Skip to content

Commit

Permalink
TPCH Queries 9 and 10 (#407)
Browse files Browse the repository at this point in the history
* TPCH Queries 9 and 10

* update id

* strip notebook output

---------

Co-authored-by: Marco Gorelli <[email protected]>
  • Loading branch information
ugohuche and MarcoGorelli authored Jul 5, 2024
1 parent f406390 commit c997006
Show file tree
Hide file tree
Showing 4 changed files with 786 additions and 0 deletions.
382 changes: 382 additions & 0 deletions tpch/notebooks/q10/execute.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 33.390992,
"end_time": "2024-03-22T17:24:15.601719",
"exception": false,
"start_time": "2024-03-22T17:23:42.210727",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip uninstall -y apache-beam\n",
"%pip install -U pandas polars pyarrow narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 0.907754,
"end_time": "2024-03-22T17:24:39.053873",
"exception": false,
"start_time": "2024-03-22T17:24:38.146119",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import polars as pl\n",
"\n",
"pd.options.mode.copy_on_write = True\n",
"pd.options.future.infer_string = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 0.021725,
"end_time": "2024-03-22T17:24:39.080999",
"exception": false,
"start_time": "2024-03-22T17:24:39.059274",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"from typing import Any\n",
"from datetime import datetime\n",
"import narwhals as nw\n",
"\n",
"def q10(\n",
"    customer_ds_raw: Any,\n",
"    nation_ds_raw: Any,\n",
"    lineitem_ds_raw: Any,\n",
"    orders_ds_raw: Any,\n",
") -> Any:\n",
"    \"\"\"Run TPC-H query 10 (returned item reporting) through Narwhals.\n",
"\n",
"    Each argument is a native frame for the corresponding TPC-H table. It is\n",
"    wrapped with `nw.from_native`, and the final result is unwrapped again with\n",
"    `nw.to_native` before being returned.\n",
"\n",
"    The query keeps line items flagged \"R\" on orders placed in\n",
"    [1993-10-01, 1994-01-01), sums `l_extendedprice * (1 - l_discount)` per\n",
"    customer, and returns the 20 rows with the highest revenue.\n",
"    \"\"\"\n",
"    customer_frame = nw.from_native(customer_ds_raw)\n",
"    orders_frame = nw.from_native(orders_ds_raw)\n",
"    lineitem_frame = nw.from_native(lineitem_ds_raw)\n",
"    nation_frame = nw.from_native(nation_ds_raw)\n",
"\n",
"    # Half-open order-date window: start included, end excluded (closed=\"left\").\n",
"    period_start = datetime(1993, 10, 1)\n",
"    period_end = datetime(1994, 1, 1)\n",
"\n",
"    group_keys = (\n",
"        \"c_custkey\",\n",
"        \"c_name\",\n",
"        \"c_acctbal\",\n",
"        \"c_phone\",\n",
"        \"n_name\",\n",
"        \"c_address\",\n",
"        \"c_comment\",\n",
"    )\n",
"    output_columns = (\n",
"        \"c_custkey\",\n",
"        \"c_name\",\n",
"        \"revenue\",\n",
"        \"c_acctbal\",\n",
"        \"n_name\",\n",
"        \"c_address\",\n",
"        \"c_phone\",\n",
"        \"c_comment\",\n",
"    )\n",
"    revenue = (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n",
"\n",
"    joined = (\n",
"        customer_frame.join(orders_frame, left_on=\"c_custkey\", right_on=\"o_custkey\")\n",
"        .join(lineitem_frame, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n",
"        .join(nation_frame, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n",
"    )\n",
"    returned_items = joined.filter(\n",
"        nw.col(\"o_orderdate\").is_between(period_start, period_end, closed=\"left\")\n",
"    ).filter(nw.col(\"l_returnflag\") == \"R\")\n",
"    revenue_per_customer = (\n",
"        returned_items.with_columns(revenue)\n",
"        .group_by(*group_keys)\n",
"        .agg(nw.sum(\"revenue\"))\n",
"    )\n",
"    top_customers = (\n",
"        revenue_per_customer.select(*output_columns)\n",
"        .sort(by=\"revenue\", descending=True)\n",
"        .head(20)\n",
"    )\n",
"\n",
"    return nw.to_native(top_customers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 0.013325,
"end_time": "2024-03-22T17:24:39.099766",
"exception": false,
"start_time": "2024-03-22T17:24:39.086441",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Parquet files for the TPC-H tables used by q10, read from the Kaggle input directory.\n",
"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n",
"nation = f\"{dir_}nation.parquet\"\n",
"lineitem = f\"{dir_}lineitem.parquet\"\n",
"orders = f\"{dir_}orders.parquet\"\n",
"customer = f\"{dir_}customer.parquet\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 0.014284,
"end_time": "2024-03-22T17:24:39.119737",
"exception": false,
"start_time": "2024-03-22T17:24:39.105453",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# One parquet loader per benchmarked tool; each takes a file path and returns a native frame.\n",
"IO_FUNCS = {\n",
"    \"pandas\": lambda path: pd.read_parquet(path, engine=\"pyarrow\"),\n",
"    \"pandas[pyarrow]\": lambda path: pd.read_parquet(\n",
"        path, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n",
"    ),\n",
"    \"polars[eager]\": lambda path: pl.read_parquet(path),\n",
"    \"polars[lazy]\": lambda path: pl.scan_parquet(path),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collected timings keyed by tool name; written to results.json by the final cell.\n",
"results = {}"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.005113,
"end_time": "2024-03-22T17:24:39.130472",
"exception": false,
"start_time": "2024-03-22T17:24:39.125359",
"status": "completed"
},
"tags": []
},
"source": [
"## pandas via Narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 196.786925,
"end_time": "2024-03-22T17:27:55.922832",
"exception": false,
"start_time": "2024-03-22T17:24:39.135907",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Benchmark q10 on plain pandas; reading the parquet files is part of the timed expression.\n",
"tool = 'pandas'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n",
"# `.timings` is `.all_runs` divided by the loop count, i.e. seconds per single execution,\n",
"# so the stored numbers stay comparable even if %timeit picks more than one loop per run.\n",
"results[tool] = timings.timings"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.005184,
"end_time": "2024-03-22T17:27:55.933407",
"exception": false,
"start_time": "2024-03-22T17:27:55.928223",
"status": "completed"
},
"tags": []
},
"source": [
"## pandas, pyarrow dtypes, via Narwhals"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 158.748353,
"end_time": "2024-03-22T17:30:34.688289",
"exception": false,
"start_time": "2024-03-22T17:27:55.939936",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Benchmark q10 on pandas with pyarrow dtypes; parquet reading is part of the timed expression.\n",
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n",
"# `.timings` is `.all_runs` divided by the loop count, i.e. seconds per single execution,\n",
"# so the stored numbers stay comparable even if %timeit picks more than one loop per run.\n",
"results[tool] = timings.timings"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.005773,
"end_time": "2024-03-22T17:30:34.7003",
"exception": false,
"start_time": "2024-03-22T17:30:34.694527",
"status": "completed"
},
"tags": []
},
"source": [
"## Polars read_parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 37.821116,
"end_time": "2024-03-22T17:31:12.527466",
"exception": false,
"start_time": "2024-03-22T17:30:34.70635",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Benchmark q10 on eager Polars; reading the parquet files is part of the timed expression.\n",
"tool = 'polars[eager]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n",
"# `.timings` is `.all_runs` divided by the loop count, i.e. seconds per single execution,\n",
"# so the stored numbers stay comparable even if %timeit picks more than one loop per run.\n",
"results[tool] = timings.timings"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.005515,
"end_time": "2024-03-22T17:31:12.539068",
"exception": false,
"start_time": "2024-03-22T17:31:12.533553",
"status": "completed"
},
"tags": []
},
"source": [
"## Polars scan_parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"papermill": {
"duration": 4.800698,
"end_time": "2024-03-22T17:31:17.346813",
"exception": false,
"start_time": "2024-03-22T17:31:12.546115",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Benchmark q10 on lazy Polars; `.collect()` is inside the timed expression so the\n",
"# scan and the query are actually executed rather than only planned.\n",
"tool = 'polars[lazy]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n",
"# `.timings` is `.all_runs` divided by the loop count, i.e. seconds per single execution,\n",
"# so the stored numbers stay comparable even if %timeit picks more than one loop per run.\n",
"results[tool] = timings.timings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Write every tool's collected timings to results.json in the working directory.\n",
"with open('results.json', 'w') as results_file:\n",
"    json.dump(results, results_file)\n"
]
}
],
"metadata": {
"kaggle": {
"accelerator": "none",
"dataSources": [
{
"sourceId": 167796934,
"sourceType": "kernelVersion"
}
],
"dockerImageVersionId": 30673,
"isGpuEnabled": false,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
},
"papermill": {
"default_parameters": {},
"duration": 458.423327,
"end_time": "2024-03-22T17:31:18.077306",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2024-03-22T17:23:39.653979",
"version": "2.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
15 changes: 15 additions & 0 deletions tpch/notebooks/q10/kernel-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"id": "marcogorelli/narwhals-tpch-q10-s2",
"title": "Narwhals TPCH Q10 S2",
"code_file": "execute.ipynb",
"language": "python",
"kernel_type": "notebook",
"is_private": "false",
"enable_gpu": "false",
"enable_tpu": "false",
"enable_internet": "true",
"dataset_sources": [],
"competition_sources": [],
"kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"],
"model_sources": []
}
Loading

0 comments on commit c997006

Please sign in to comment.