diff --git a/README.md b/README.md index c5504c908..37765250e 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Like Ibis, Narwhals aims to enable dataframe-agnostic code. However, Narwhals co is about as lightweight as it gets, and is aimed at library developers rather than at end users. It also does not aim to support as many backends, instead preferring to focus on dataframes. So, which should you use? -- If you need a SQL frontend: Ibis! +- If you need a SQL frontend in Python: Ibis! - If you're a library maintainer and want a lightweight and minimal-overhead layer to get cross-dataframe library support: Narwhals! Here is the package size increase which would result from installing each tool in a non-pandas diff --git a/docs/overhead.md b/docs/overhead.md index d3f535957..7edfb1f95 100644 --- a/docs/overhead.md +++ b/docs/overhead.md @@ -18,8 +18,3 @@ the data sources. On some runs, the Narwhals code makes things marginally faster, on others marginally slower. The overall picture is clear: with Narwhals, you can support both Polars and pandas APIs with little to no impact on either. - -A fairly common question we receive is "why not just use Ibis". We believe -that Ibis works well as a SQL frontend, but find [its overhead when translating -dataframe APIs](https://github.com/ibis-project/ibis/issues/9345) to be unacceptably high - -that's why we created something new. diff --git a/tpch/notebooks/q1/execute.ipynb b/tpch/notebooks/q1/execute.ipynb index e7b81c9b3..a110f504b 100755 --- a/tpch/notebooks/q1/execute.ipynb +++ b/tpch/notebooks/q1/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework" ] }, { @@ -179,6 +179,46 @@ " )" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aeb714e", + "metadata": {}, + "outputs": [], + "source": [ + "def q1_ibis(lineitem: Any, *, tool):\n", + " var1 = datetime(1998, 9, 2)\n", + " lineitem = lineitem.filter(lineitem[\"l_shipdate\"] <= var1)\n", + " lineitem = lineitem.mutate(\n", + " disc_price=lineitem[\"l_extendedprice\"] * (1 - lineitem[\"l_discount\"]),\n", + " charge=(\n", + " lineitem[\"l_extendedprice\"]\n", + " * (1.0 - lineitem[\"l_discount\"])\n", + " * (1.0 + lineitem[\"l_tax\"])\n", + " ),\n", + " )\n", + " q_final = (\n", + " lineitem\n", + " .group_by([\"l_returnflag\", \"l_linestatus\"])\n", + " .aggregate(\n", + " sum_qty=lineitem[\"l_quantity\"].sum(),\n", + " sum_base_price=lineitem[\"l_extendedprice\"].sum(),\n", + " sum_disc_price=(lineitem['disc_price'].sum()),\n", + " sum_charge=(lineitem['charge'].sum()),\n", + " avg_qty=lineitem[\"l_quantity\"].mean(),\n", + " avg_price=lineitem[\"l_extendedprice\"].mean(),\n", + " avg_disc=lineitem[\"l_discount\"].mean(),\n", + " count_order=lambda lineitem: lineitem.count(),\n", + " )\n", + " .order_by([\"l_returnflag\", \"l_linestatus\"])\n", + " )\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -234,11 +274,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -252,6 +299,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "b2dc4c17", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d342ce97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q1_ibis(fn(lineitem), tool='pandas')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "64b20949", @@ -542,6 +627,44 @@ "results[tool] = timings.all_runs" ] }, + { + "cell_type": "markdown", + "id": "aa0a2882", + "metadata": {}, + "source": [ + "## Polars scan_parquet ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c00e7434", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q1_ibis(fn(lineitem), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "37ce6bf3", diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb index d260638f7..2ad26a274 100755 --- a/tpch/notebooks/q2/execute.ipynb +++ b/tpch/notebooks/q2/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework " ] }, { @@ -222,6 +222,64 @@ " return nw.to_native(q_final)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "5823bdfe", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import ibis\n", + "\n", + "def q2_ibis(\n", + " region: Any,\n", + " nation: Any,\n", + " supplier: Any,\n", + " part: Any,\n", + " partsupp: Any,\n", + " *,\n", + " tool: str,\n", + ") -> Any:\n", + " var1 = 15\n", + " var2 = \"BRASS\"\n", + " var3 = \"EUROPE\"\n", + "\n", + " q2 = (\n", + " part.join(partsupp, part[\"p_partkey\"] == partsupp[\"ps_partkey\"])\n", + " .join(supplier, partsupp[\"ps_suppkey\"] == supplier[\"s_suppkey\"])\n", + " .join(nation, supplier[\"s_nationkey\"] == nation[\"n_nationkey\"])\n", + " .join(region, nation[\"n_regionkey\"] == region[\"r_regionkey\"])\n", + " .filter(ibis._[\"p_size\"] == var1)\n", + " .filter(ibis._[\"p_type\"].endswith(var2))\n", + " .filter(ibis._[\"r_name\"] == var3)\n", + " )\n", + "\n", + " q_final = (\n", + " q2.group_by(\"p_partkey\")\n", + " .agg(ps_supplycost=ibis._[\"ps_supplycost\"].min())\n", + " .join(q2, [\"p_partkey\"])\n", + " .select(\n", + " \"s_acctbal\",\n", + " \"s_name\",\n", + " \"n_name\",\n", + " \"p_partkey\",\n", + " \"p_mfgr\",\n", + " \"s_address\",\n", + " \"s_phone\",\n", + " \"s_comment\",\n", + " )\n", + " .order_by(ibis.desc(\"s_acctbal\"), \"n_name\", \"s_name\", \"p_partkey\")\n", + " .limit(100)\n", + " )\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -277,11 +335,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -295,6 +360,82 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "526a038b", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f8b42fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='pandas')\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "13c5e9be", + "metadata": {}, + "source": [ + "## Polars scan_parquet via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d15d742", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "595 ms ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.5674880569999914" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "eb2f8fd9", diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb index bfdb06b3c..d7c2ca709 100755 --- a/tpch/notebooks/q3/execute.ipynb +++ b/tpch/notebooks/q3/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework " ] }, { @@ -185,6 +185,57 @@ " return nw.to_native(q_final)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb9e593a", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import narwhals as nw\n", + "import ibis\n", + "\n", + "def q3_ibis(\n", + " customer: Any,\n", + " lineitem: Any,\n", + " orders: Any,\n", + " *,\n", + " tool,\n", + ") -> Any:\n", + " var1 = \"BUILDING\"\n", + " var2 = date(1995, 3, 15)\n", + "\n", + " q_final = (\n", + " customer.filter(customer[\"c_mktsegment\"] == var1)\n", + " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", + " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", + " .filter(ibis._[\"o_orderdate\"] < var2)\n", + " .filter(ibis._[\"l_shipdate\"] > var2)\n", + " .mutate(revenue=(lineitem[\"l_extendedprice\"] * (1 - lineitem[\"l_discount\"])))\n", + " .group_by(\n", + " \"o_orderkey\",\n", + " \"o_orderdate\",\n", + " \"o_shippriority\",\n", + " )\n", + " .agg(revenue=ibis._[\"revenue\"].sum())\n", + " .select(\n", + " ibis._[\"o_orderkey\"].name(\"o_orderkey\"),\n", + " \"revenue\",\n", + " \"o_orderdate\",\n", + " \"o_shippriority\",\n", + " )\n", + " .order_by(ibis.desc(\"revenue\"), \"o_orderdate\")\n", + " .limit(10)\n", + " )\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -240,11 +291,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -258,6 +316,82 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "01c2358d", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6327c5ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "ed569d39", + "metadata": {}, + "source": [ + "## Polars, lazy, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc7fe959", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "6bfedb9e", diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb index 2b0348eb5..355295480 100755 --- a/tpch/notebooks/q4/execute.ipynb +++ b/tpch/notebooks/q4/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework " ] }, { @@ -165,6 +165,42 @@ " return nw.to_native(result)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0626d0e", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import ibis\n", + "\n", + "def q4_ibis(\n", + " lineitem: Any,\n", + " orders: Any,\n", + " *,\n", + " tool: str\n", + ") -> Any:\n", + " var1 = datetime(1993, 7, 1)\n", + " var2 = datetime(1993, 10, 1)\n", + "\n", + " q_final = (\n", + " lineitem.join(orders, lineitem[\"l_orderkey\"] == orders[\"o_orderkey\"])\n", + " .filter((orders[\"o_orderdate\"] >= var1) & (orders[\"o_orderdate\"] < var2))\n", + " .filter(lineitem[\"l_commitdate\"] < lineitem[\"l_receiptdate\"])\n", + " .distinct(on=[\"o_orderpriority\", \"l_orderkey\"])\n", + " .group_by(\"o_orderpriority\")\n", + " .agg(order_count=ibis._.count())\n", + " .order_by(\"o_orderpriority\")\n", + " )\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -220,11 +256,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -238,6 +281,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "fdb2c22c", + "metadata": {}, + "source": [ + "## polars, lazy, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d9e5e85", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "956e9675", diff --git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb index ecf991d59..42a56b78b 100755 --- a/tpch/notebooks/q5/execute.ipynb +++ b/tpch/notebooks/q5/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework " ] }, { @@ -123,24 +123,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "42e7f0e2", - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-22T17:24:39.066341Z", - "iopub.status.busy": "2024-03-22T17:24:39.065881Z", - "iopub.status.idle": "2024-03-22T17:24:39.078875Z", - "shell.execute_reply": "2024-03-22T17:24:39.077655Z" - }, - "papermill": { - "duration": 0.021725, - "end_time": "2024-03-22T17:24:39.080999", - "exception": false, - "start_time": "2024-03-22T17:24:39.059274", - "status": "completed" - }, - "tags": [] - }, + "execution_count": null, + "id": "d54f49c4", + "metadata": {}, "outputs": [], "source": [ "from typing import Any\n", @@ -191,6 +176,57 @@ " return nw.to_native(result)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e09ad394", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import narwhals as nw\n", + "import ibis\n", + "\n", + "def q5_ibis(\n", + " region: Any,\n", + " nation: Any,\n", + " customer: Any,\n", + " lineitem: Any,\n", + " orders: Any,\n", + " supplier: Any,\n", + " *,\n", + " tool: str,\n", + ") -> Any:\n", + " var1 = \"ASIA\"\n", + " var2 = datetime(1994, 1, 1)\n", + " var3 = datetime(1995, 1, 1)\n", + "\n", + " q_final = (\n", + " region.join(nation, region[\"r_regionkey\"] == nation[\"n_regionkey\"])\n", + " .join(customer, ibis._[\"n_nationkey\"] == customer[\"c_nationkey\"])\n", + " .join(orders, ibis._[\"c_custkey\"] == orders[\"o_custkey\"])\n", + " .join(lineitem, ibis._[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", + " .join(\n", + " supplier,\n", + " (ibis._[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", + " & (ibis._[\"n_nationkey\"] == supplier[\"s_nationkey\"]),\n", + " )\n", + " .filter(ibis._[\"r_name\"] == var1)\n", + " .filter((ibis._[\"o_orderdate\"] >= var2) & (ibis._[\"o_orderdate\"] < var3))\n", + " .mutate(revenue=(lineitem[\"l_extendedprice\"] * (1 - lineitem[\"l_discount\"])))\n", + " .group_by(\"n_name\")\n", + " .agg(revenue=ibis._[\"revenue\"].sum())\n", + " .order_by(ibis.desc(\"revenue\"))\n", + " )\n", + "\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -246,11 +282,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -264,6 +307,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "e18f721f", + "metadata": {}, + "source": [ + "## Polars, lazy, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "817409fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "3d76284a", diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb index 7e7189782..ef3342104 100755 --- a/tpch/notebooks/q6/execute.ipynb +++ b/tpch/notebooks/q6/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework " ] }, { @@ -154,6 +154,41 @@ " return nw.to_native(result)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa317199", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import narwhals as nw\n", + "\n", + "def q6_ibis(lineitem, *, tool: str) -> None:\n", + " var1 = datetime(1994, 1, 1)\n", + " var2 = datetime(1995, 1, 1)\n", + " var3 = 0.05\n", + " var4 = 0.07\n", + " var5 = 24\n", + "\n", + " q_final = (\n", + " lineitem.filter(\n", + " (lineitem[\"l_shipdate\"] >= var1) & (lineitem[\"l_shipdate\"] < var2)\n", + " )\n", + " .filter((lineitem[\"l_discount\"] >= var3) & (lineitem[\"l_discount\"] <= var4))\n", + " .filter(lineitem[\"l_quantity\"] < var5)\n", + " .mutate(revenue=ibis._[\"l_extendedprice\"] * (ibis._[\"l_discount\"]))\n", + " .agg(revenue=ibis._[\"revenue\"].sum())\n", + " )\n", + " \n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")\n" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -209,11 +244,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -227,6 +269,82 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "a90892d8", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "253f08c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q6_ibis(fn(lineitem), tool='pandas')\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "dc4d46c0", + "metadata": {}, + "source": [ + "## polars, lazy, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4048e2a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q6_ibis(fn(lineitem), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "56b73231", diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb index 71b116c85..5132a3129 100755 --- a/tpch/notebooks/q7/execute.ipynb +++ b/tpch/notebooks/q7/execute.ipynb @@ -47,7 +47,7 @@ } ], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5" + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals>=0.9.5 ibis-framework" ] }, { @@ -217,6 +217,73 @@ " return nw.to_native(result)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a54f8603", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import ibis\n", + "\n", + "def q7_ibis(\n", + " nation: Any,\n", + " customer: Any,\n", + " lineitem: Any,\n", + " orders: Any,\n", + " supplier: Any,\n", + " *,\n", + " tool: str\n", + ") -> None:\n", + " var1 = \"FRANCE\"\n", + " var2 = \"GERMANY\"\n", + " var3 = datetime(1995, 1, 1)\n", + " var4 = datetime(1996, 12, 31)\n", + "\n", + " n1 = nation.filter(nation[\"n_name\"] == var1)\n", + " n2 = nation.filter(nation[\"n_name\"] == var2)\n", + "\n", + " q1 = (\n", + " customer.join(n1, customer[\"c_nationkey\"] == n1[\"n_nationkey\"])\n", + " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", + " .rename({\"cust_nation\": \"n_name\"})\n", + " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", + " .join(supplier, lineitem[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", + " .join(n2, supplier[\"s_nationkey\"] == n2[\"n_nationkey\"])\n", + " .rename({\"supp_nation\": \"n_name\"})\n", + " )\n", + "\n", + " q2 = (\n", + " customer.join(n2, customer[\"c_nationkey\"] == n2[\"n_nationkey\"])\n", + " .join(orders, customer[\"c_custkey\"] == orders[\"o_custkey\"])\n", + " .rename({\"cust_nation\": \"n_name\"})\n", + " .join(lineitem, orders[\"o_orderkey\"] == lineitem[\"l_orderkey\"])\n", + " .join(supplier, lineitem[\"l_suppkey\"] == supplier[\"s_suppkey\"])\n", + " .join(n1, supplier[\"s_nationkey\"] == n1[\"n_nationkey\"])\n", + " .rename({\"supp_nation\": \"n_name\"})\n", + " )\n", + "\n", + " q_final = (\n", + " q1.union(q2)\n", + " .filter((ibis._[\"l_shipdate\"] >= var3) & (ibis._[\"l_shipdate\"] <= var4))\n", + " .mutate(\n", + " volume=(ibis._[\"l_extendedprice\"] * (1 - ibis._[\"l_discount\"])),\n", + " l_year=ibis._[\"l_shipdate\"].year(),\n", + " )\n", + " .group_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", + " .agg(revenue=ibis._[\"volume\"].sum())\n", + " .order_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", + " )\n", + "\n", + " if tool == 'pandas':\n", + " return q_final.to_pandas()\n", + " if tool == 'polars':\n", + " return q_final.to_polars()\n", + " raise ValueError(\"expected pandas or polars\")" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -272,11 +339,18 @@ }, "outputs": [], "source": [ + "import ibis\n", + "\n", + "con_pd = ibis.pandas.connect()\n", + "con_pl = ibis.polars.connect()\n", + "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -290,6 +364,82 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "a88be78f", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "168faa16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.2 s ± 5.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "16.42582530300001" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "b2e0f762", + "metadata": {}, + "source": [ + "## polars, lazy, via ibis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21477ece", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.2 s ± 5.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "16.42582530300001" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'polars[lazy][ibis]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", + "results[tool] = timings.all_runs" + ] + }, { "cell_type": "markdown", "id": "12824d5d",