diff --git a/docs/data/debugging_with_pipes_1.csv b/docs/data/debugging_with_pipes_1.csv
new file mode 100644
index 000000000000..ad6596cf34f2
--- /dev/null
+++ b/docs/data/debugging_with_pipes_1.csv
@@ -0,0 +1,7 @@
+foo,bar
+1,5
+1,6
+2,7
+2,8
+3,9
+3,0
diff --git a/docs/data/debugging_with_pipes_2.csv b/docs/data/debugging_with_pipes_2.csv
new file mode 100644
index 000000000000..bee33793c9c4
--- /dev/null
+++ b/docs/data/debugging_with_pipes_2.csv
@@ -0,0 +1,7 @@
+foo,baz
+1,4
+1,5
+2,0
+2,7
+3,0
+3,9
diff --git a/docs/src/python/user-guide/misc/debugging_with_pipes.py b/docs/src/python/user-guide/misc/debugging_with_pipes.py
new file mode 100644
index 000000000000..57ed29759328
--- /dev/null
+++ b/docs/src/python/user-guide/misc/debugging_with_pipes.py
@@ -0,0 +1,73 @@
+# --8<-- [start:setup]
+from inspect import currentframe
+
+import polars as pl
+
+bar_table = "docs/data/debugging_with_pipes_1.csv"
+baz_table = "docs/data/debugging_with_pipes_2.csv"
+# --8<-- [end:setup]
+
+# --8<-- [start:pipeline1]
+df = (
+    pl.scan_csv(bar_table)
+    .filter(pl.col("bar") > 0)
+    .join(pl.scan_csv(baz_table), on="foo")
+    .select("bar", "baz")
+    .group_by("bar")
+    .agg(pl.count("baz"))
+    .collect()
+)
+# --8<-- [end:pipeline1]
+
+
+# --8<-- [start:assert_schema]
+def assert_schema(
+    lf: pl.LazyFrame,
+    schema: dict[str, pl.PolarsDataType],
+) -> pl.LazyFrame:
+    """Assert that the schema conforms to expectations."""
+    if lf.schema != schema:
+        msg = (
+            "Wrong LazyFrame schema:\n"
+            f"• expected: '{schema}',\n"
+            f"• observed: '{dict(lf.schema)}'."
+        )
+        raise AssertionError(msg)
+    return lf
+# --8<-- [end:assert_schema]
+
+
+# --8<-- [start:print_expr]
+def print_expr(
+    lf: pl.LazyFrame,
+    expr: pl.Expr,
+) -> pl.LazyFrame:
+    """Evaluate and print an expression."""
+    # When called through `.pipe()`, the immediate caller is the `pipe`
+    # method itself; step past it to report the line of the calling code.
+    frame = currentframe().f_back
+    if frame.f_code.co_name == "pipe" and frame.f_back is not None:
+        frame = frame.f_back
+    df = lf.collect()  # switch to eager mode
+    print(f"[line {frame.f_lineno}]")
+    print(df.select(expr))
+    return df.lazy()  # proceed in lazy mode
+# --8<-- [end:print_expr]
+
+# --8<-- [start:pipeline2]
+schema = {"bar": pl.Int64, "baz": pl.Int64}
+
+expr = pl.col("bar").unique().count()
+
+df = (
+    pl.scan_csv(bar_table)
+    .pipe(print_expr, expr)  # ⇐ PRINT
+    .filter(pl.col("bar") > 0)
+    .join(pl.scan_csv(baz_table), on="foo")
+    .select("bar", "baz")
+    .pipe(assert_schema, schema)  # ⇐ ASSERT
+    .group_by("bar")
+    .agg(pl.count("baz"))
+    .collect()
+)
+# --8<-- [end:pipeline2]
diff --git a/docs/user-guide/misc/debugging_with_pipes.md b/docs/user-guide/misc/debugging_with_pipes.md
new file mode 100644
index 000000000000..b6de06ae6d2e
--- /dev/null
+++ b/docs/user-guide/misc/debugging_with_pipes.md
@@ -0,0 +1,43 @@
+# Debugging with pipes
+
+Suppose that you write a long chain of transformations:
+
+{{code_block('user-guide/misc/debugging_with_pipes','pipeline1',[])}}
+
+```python exec="on" session="user-guide/misc/debugging_with_pipes"
+--8<-- "python/user-guide/misc/debugging_with_pipes.py:setup"
+```
+
+```python exec="on" session="user-guide/misc/debugging_with_pipes"
+--8<-- "python/user-guide/misc/debugging_with_pipes.py:pipeline1"
+```
+
+... and in the middle of the chain something breaks.
+
+How do you insert `print` and `assert` statements into the middle of the chain?
+
+Consider writing your own helper functions and saving them
+(as you might need them multiple times in the future).
+For example:
+
+{{code_block('user-guide/misc/debugging_with_pipes','assert_schema',[])}}
+
+```python exec="on" session="user-guide/misc/debugging_with_pipes"
+--8<-- "python/user-guide/misc/debugging_with_pipes.py:assert_schema"
+```
+
+{{code_block('user-guide/misc/debugging_with_pipes','print_expr',[])}}
+
+```python exec="on" session="user-guide/misc/debugging_with_pipes"
+--8<-- "python/user-guide/misc/debugging_with_pipes.py:print_expr"
+```
+
+Now you can insert a couple of lines into the chain:
+
+{{code_block('user-guide/misc/debugging_with_pipes','pipeline2',[])}}
+
+```python exec="on" result="text" session="user-guide/misc/debugging_with_pipes"
+--8<-- "python/user-guide/misc/debugging_with_pipes.py:pipeline2"
+```
+
+When your debugging session is over, you can remove those lines.
diff --git a/mkdocs.yml b/mkdocs.yml
index 6673d17741ce..53f89ec61f94 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -81,6 +81,7 @@ nav:
     - Misc:
       - user-guide/misc/multiprocessing.md
       - user-guide/misc/visualization.md
+      - user-guide/misc/debugging_with_pipes.md
       - user-guide/misc/comparison.md
 
   - API reference: api/index.md