forked from narwhals-dev/narwhals
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
835 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,396 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 46.481932, | ||
"end_time": "2024-07-25T23:10:36.288698", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:09:49.806766", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.522877, | ||
"end_time": "2024-07-25T23:10:36.819167", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.296290", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import polars as pl\n", | ||
"\n", | ||
"pd.options.mode.copy_on_write = True\n", | ||
"pd.options.future.infer_string = True" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.02756, | ||
"end_time": "2024-07-25T23:10:36.854110", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.826550", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from typing import Any\n", | ||
"import narwhals as nw\n", | ||
"\n", | ||
"def q13(\n", | ||
" customer_ds_raw: Any,\n", | ||
" orders_ds_raw: Any\n", | ||
") -> Any:\n", | ||
"\n", | ||
" customer_ds = nw.from_native(customer_ds_raw)\n", | ||
" orders_ds = nw.from_native(orders_ds_raw)\n", | ||
" \n", | ||
" var1 = \"special\"\n", | ||
" var2 = \"requests\"\n", | ||
"\n", | ||
" orders_ds = orders_ds.filter(~nw.col(\"o_comment\").str.contains(f\"{var1}.*{var2}\"))\n", | ||
" \n", | ||
" result = (\n", | ||
" customer_ds.join(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\", how=\"left\")\n", | ||
" .group_by(\"c_custkey\")\n", | ||
" .agg(nw.col(\"o_orderkey\").len().alias(\"c_count\"))\n", | ||
" .group_by(\"c_count\")\n", | ||
" .agg(nw.col(\"c_count\").len().alias(\"len\"))\n", | ||
" .select(nw.col(\"c_count\"), nw.col(\"len\").alias(\"custdist\"))\n", | ||
" .sort(by=[\"custdist\", \"c_count\"], descending=[True, True])\n", | ||
" )\n", | ||
"\n", | ||
" return nw.to_native(result)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "3", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.016452, | ||
"end_time": "2024-07-25T23:10:36.878001", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.861549", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", | ||
"customer = dir_ + 'customer.parquet'\n", | ||
"orders = dir_ + 'orders.parquet'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.016664, | ||
"end_time": "2024-07-25T23:10:36.902043", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.885379", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"IO_FUNCS = {\n", | ||
" 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", | ||
" 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", | ||
" 'polars[eager]': lambda x: pl.read_parquet(x),\n", | ||
" 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "5", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.015614, | ||
"end_time": "2024-07-25T23:10:36.924894", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.909280", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"results = {}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "6", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.006996, | ||
"end_time": "2024-07-25T23:10:36.939302", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.932306", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"## pandas via Narwhals" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 44.926739, | ||
"end_time": "2024-07-25T23:11:21.873211", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:10:36.946472", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"tool = 'pandas'\n", | ||
"fn = IO_FUNCS[tool]\n", | ||
"timings = %timeit -o q13(fn(customer), fn(orders))\n", | ||
"results[tool] = timings.all_runs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.006853, | ||
"end_time": "2024-07-25T23:11:21.887504", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:11:21.880651", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"## pandas, pyarrow dtypes, via Narwhals" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 44.733391, | ||
"end_time": "2024-07-25T23:12:06.628137", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:11:21.894746", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"tool = 'pandas[pyarrow]'\n", | ||
"fn = IO_FUNCS[tool]\n", | ||
"timings = %timeit -o q13(fn(customer), fn(orders))\n", | ||
"results[tool] = timings.all_runs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "10", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.006896, | ||
"end_time": "2024-07-25T23:12:06.642200", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:06.635304", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"## Polars read_parquet (eager), via Narwhals" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "11", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 13.901571, | ||
"end_time": "2024-07-25T23:12:20.550910", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:06.649339", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"tool = 'polars[eager]'\n", | ||
"fn = IO_FUNCS[tool]\n", | ||
"timings = %timeit -o q13(fn(customer), fn(orders))\n", | ||
"results[tool] = timings.all_runs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "12", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.0075, | ||
"end_time": "2024-07-25T23:12:20.566105", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:20.558605", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"## Polars scan_parquet (lazy), via Narwhals" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "13", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 10.393542, | ||
"end_time": "2024-07-25T23:12:30.967063", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:20.573521", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"tool = 'polars[lazy]'\n", | ||
"fn = IO_FUNCS[tool]\n", | ||
"timings = %timeit -o q13(fn(customer), fn(orders)).collect()\n", | ||
"results[tool] = timings.all_runs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "14", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.007808, | ||
"end_time": "2024-07-25T23:12:30.982613", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:30.974805", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"## Save" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "15", | ||
"metadata": { | ||
"papermill": { | ||
"duration": 0.017013, | ||
"end_time": "2024-07-25T23:12:31.007464", | ||
"exception": false, | ||
"start_time": "2024-07-25T23:12:30.990451", | ||
"status": "completed" | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"with open('results.json', 'w') as fd:\n", | ||
" json.dump(results, fd)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kaggle": { | ||
"accelerator": "none", | ||
"dataSources": [ | ||
{ | ||
"sourceId": 167796934, | ||
"sourceType": "kernelVersion" | ||
} | ||
], | ||
"dockerImageVersionId": 30673, | ||
"isGpuEnabled": false, | ||
"isInternetEnabled": true, | ||
"language": "python", | ||
"sourceType": "notebook" | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
}, | ||
"papermill": { | ||
"default_parameters": {}, | ||
"duration": 164.526043, | ||
"end_time": "2024-07-25T23:12:31.536428", | ||
"environment_variables": {}, | ||
"exception": null, | ||
"input_path": "__notebook__.ipynb", | ||
"output_path": "__notebook__.ipynb", | ||
"parameters": {}, | ||
"start_time": "2024-07-25T23:09:47.010385", | ||
"version": "2.5.0" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.