diff --git a/docs/requirements.txt b/docs/requirements.txt
index 0922ff44d8b5..072c07aad41e 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,6 +5,7 @@ matplotlib
 seaborn
 plotly
 altair
+numba
 # Unpin NumPy when support is implemented in numpy crate:
 # https://github.com/pola-rs/polars/issues/16998
 numpy<2
diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py
index ee034a362bc6..01e21cca25b5 100644
--- a/docs/src/python/user-guide/expressions/structs.py
+++ b/docs/src/python/user-guide/expressions/structs.py
@@ -64,3 +64,17 @@
 ).filter(pl.struct("Movie", "Theatre").is_duplicated())
 print(out)
 # --8<-- [end:struct_ranking]
+
+# --8<-- [start:multi_column_apply]
+df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]})
+
+out = df.select(
+    # Pack both columns into a struct so the function receives one dict per row:
+    pl.struct(["keys", "values"])
+    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
+    .alias("solution_map_elements"),
+    # The same computation using only native expressions:
+    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+)
+print(out)
+# --8<-- [end:multi_column_apply]
diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py
index 6c248691e1a9..a436a6d8241e 100644
--- a/docs/src/python/user-guide/expressions/user-defined-functions.py
+++ b/docs/src/python/user-guide/expressions/user-defined-functions.py
@@ -7,59 +7,108 @@
 # --8<-- [start:dataframe]
 df = pl.DataFrame(
     {
-        "keys": ["a", "a", "b"],
-        "values": [10, 7, 1],
+        "keys": ["a", "a", "b", "b"],
+        "values": [10, 7, 1, 23],
     }
 )
 print(df)
 # --8<-- [end:dataframe]
 
-# --8<-- [start:shift_map_batches]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_batches(lambda s: s.shift(), is_elementwise=True)
-    .alias("shift_map_batches"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:individual_log]
+import math
+
+
+def my_log(value):
+    return math.log(value)
+
+
+out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64))
 print(out)
-# --8<-- [end:shift_map_batches]
+# --8<-- [end:individual_log]
 
 
-# --8<-- [start:map_elements]
-out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values")
-    .map_elements(lambda s: s.shift(), return_dtype=pl.List(int))
-    .alias("shift_map_elements"),
-    pl.col("values").shift().alias("shift_expression"),
-)
+# --8<-- [start:diff_from_mean]
+def diff_from_mean(series):
+    # This will be very slow for non-trivial Series, since it's all Python
+    # code:
+    total = 0
+    for value in series:
+        total += value
+    mean = total / len(series)
+    return pl.Series([value - mean for value in series])
+
+
+# Apply our custom function to a full Series with map_batches():
+out = df.select(pl.col("values").map_batches(diff_from_mean))
+print("== select() with UDF ==")
+print(out)
+
+# Apply our custom function per group (maintain_order keeps output stable):
+print("== group_by() with UDF ==")
+out = df.group_by("keys", maintain_order=True).agg(
+    pl.col("values").map_batches(diff_from_mean)
+)
 print(out)
-# --8<-- [end:map_elements]
+# --8<-- [end:diff_from_mean]
 
-# --8<-- [start:counter]
-counter = 0
+# --8<-- [start:np_log]
+import numpy as np
 
+out = df.select(pl.col("values").map_batches(np.log))
+print(out)
+# --8<-- [end:np_log]
 
-def add_counter(val: int) -> int:
-    global counter
-    counter += 1
-    return counter + val
+# --8<-- [start:diff_from_mean_numba]
+from numba import guvectorize, int64, float64
 
 
-out = df.select(
-    pl.col("values")
-    .map_elements(add_counter, return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"),
-)
+# This will be compiled to machine code, so it will be fast. The Series is
+# converted to a NumPy array before being passed to the function. See the
+# Numba documentation for more details:
+# https://numba.readthedocs.io/en/stable/user/vectorize.html
+@guvectorize([(int64[:], float64[:])], "(n)->(n)")
+def diff_from_mean_numba(arr, result):
+    total = 0
+    for value in arr:
+        total += value
+    mean = total / len(arr)
+    for i, value in enumerate(arr):
+        result[i] = value - mean
+
+
+out = df.select(pl.col("values").map_batches(diff_from_mean_numba))
+print("== select() with UDF ==")
+print(out)
+
+out = df.group_by("keys", maintain_order=True).agg(
+    pl.col("values").map_batches(diff_from_mean_numba)
+)
+print("== group_by() with UDF ==")
 print(out)
-# --8<-- [end:counter]
+# --8<-- [end:diff_from_mean_numba]
+
 
 # --8<-- [start:combine]
-out = df.select(
-    pl.struct("keys", "values")
-    .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64)
-    .alias("solution_map_elements"),
-    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+# Add two arrays together:
+@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
+def add(arr, arr2, result):
+    for i in range(len(arr)):
+        result[i] = arr[i] + arr2[i]
+
+
+df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})
+
+out = df3.select(
+    # Create a struct that has two columns in it:
+    pl.struct(["values1", "values2"])
+    # Pass the struct to a lambda that then passes the individual columns to
+    # the add() function:
+    .map_batches(
+        lambda combined: add(
+            combined.struct.field("values1"), combined.struct.field("values2")
+        )
+    )
+    .alias("add_columns")
 )
 print(out)
 # --8<-- [end:combine]
diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs
index abb05e99ad5e..b064d2c41665 100644
--- a/docs/src/rust/user-guide/expressions/structs.rs
+++ b/docs/src/rust/user-guide/expressions/structs.rs
@@ -95,5 +95,54 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("{}", &out);
     // --8<-- [end:struct_ranking]
 
+    // --8<-- [start:multi_column_apply]
+    let df = df!(
+        "keys" => &["a", "a", "b"],
+        "values" => &[10, 7, 1],
+    )?;
+
+    let out = df
+        .lazy()
+        .select([
+            // pack to struct to get access to multiple fields in a custom `apply/map`
+            as_struct(vec![col("keys"), col("values")])
+                // we will compute the len(a) + b
+                .apply(
+                    |s| {
+                        // downcast to struct
+                        let ca = s.struct_()?;
+
+                        // get the fields as Series
+                        let s_a = &ca.fields()[0];
+                        let s_b = &ca.fields()[1];
+
+                        // downcast the `Series` to their known type
+                        let ca_a = s_a.str()?;
+                        let ca_b = s_b.i32()?;
+
+                        // iterate both `ChunkedArrays`
+                        let out: Int32Chunked = ca_a
+                            .into_iter()
+                            .zip(ca_b)
+                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
+                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
+                                _ => None,
+                            })
+                            .collect();
+
+                        Ok(Some(out.into_series()))
+                    },
+                    GetOutput::from_type(DataType::Int32),
+                )
+                // note: the `'solution_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("solution_map_elements"),
+            (col("keys").str().count_matches(lit("."), false) + col("values"))
+                .alias("solution_expr"),
+        ])
+        .collect()?;
+    println!("{}", out);
+
+    // --8<-- [end:multi_column_apply]
     Ok(())
 }
diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs
index 56661fcabc84..b83898ef6c7c 100644
--- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs
+++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs
@@ -3,93 +3,25 @@ use polars::prelude::*;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [start:dataframe]
     let df = df!(
-        "keys" => &["a", "a", "b"],
-        "values" => &[10, 7, 1],
+        "keys" => &["a", "a", "b", "b"],
+        "values" => &[10, 7, 1, 23],
     )?;
     println!("{}", df);
     // --8<-- [end:dataframe]
 
-    // --8<-- [start:shift_map_batches]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by(["keys"])
-        .agg([
-            col("values")
-                .map(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_batches'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_batches"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
+    // --8<-- [start:individual_log]
+    // --8<-- [end:individual_log]
 
-    println!("{}", out);
-    // --8<-- [end:shift_map_batches]
+    // --8<-- [start:diff_from_mean]
+    // --8<-- [end:diff_from_mean]
 
-    // --8<-- [start:map_elements]
-    let out = df
-        .clone()
-        .lazy()
-        .group_by([col("keys")])
-        .agg([
-            col("values")
-                .apply(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                // note: the `'shift_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("shift_map_elements"),
-            col("values").shift(lit(1)).alias("shift_expression"),
-        ])
-        .collect()?;
-    println!("{}", out);
-    // --8<-- [end:map_elements]
+    // --8<-- [start:np_log]
+    // --8<-- [end:np_log]
 
-    // --8<-- [start:counter]
-
-    // --8<-- [end:counter]
+    // --8<-- [start:diff_from_mean_numba]
+    // --8<-- [end:diff_from_mean_numba]
 
     // --8<-- [start:combine]
-    let out = df
-        .lazy()
-        .select([
-            // pack to struct to get access to multiple fields in a custom `apply/map`
-            as_struct(vec![col("keys"), col("values")])
-                // we will compute the len(a) + b
-                .apply(
-                    |s| {
-                        // downcast to struct
-                        let ca = s.struct_()?;
-
-                        // get the fields as Series
-                        let s_a = &ca.fields()[0];
-                        let s_b = &ca.fields()[1];
-
-                        // downcast the `Series` to their known type
-                        let ca_a = s_a.str()?;
-                        let ca_b = s_b.i32()?;
-
-                        // iterate both `ChunkedArrays`
-                        let out: Int32Chunked = ca_a
-                            .into_iter()
-                            .zip(ca_b)
-                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
-                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
-                                _ => None,
-                            })
-                            .collect();
-
-                        Ok(Some(out.into_series()))
-                    },
-                    GetOutput::from_type(DataType::Int32),
-                )
-                // note: the `'solution_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("solution_map_elements"),
-            (col("keys").str().count_matches(lit("."), true) + col("values"))
-                .alias("solution_expr"),
-        ])
-        .collect()?;
-    println!("{}", out);
     // --8<-- [end:combine]
     Ok(())
 }
diff --git a/docs/user-guide/expressions/numpy.md b/docs/user-guide/expressions/numpy.md
index 6500e87b5207..4a5a46978b57 100644
--- a/docs/user-guide/expressions/numpy.md
+++ b/docs/user-guide/expressions/numpy.md
@@ -15,8 +15,8 @@ This means that if a function is not provided by Polars, we can use NumPy and we
 
 ### Interoperability
 
-Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead.
+Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.divide()`, etc. all work with almost zero overhead.
 
-However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results.
+However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc.
 
 Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion.
diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md
index 056c1b2e21b7..d692c05ad0a1 100644
--- a/docs/user-guide/expressions/structs.md
+++ b/docs/user-guide/expressions/structs.md
@@ -96,4 +96,11 @@ That's a pretty complex set of requirements done very elegantly in Polars!
 
 ### Using multi-column apply
 
-This was discussed in the previous section on _User Defined Functions_.
+This was discussed in the previous section on _User Defined Functions_ for the Python case.
+Here's an example of doing so with both Python and Rust:
+
+{{code_block('user-guide/expressions/structs','multi_column_apply',[])}}
+
+```python exec="on" result="text" session="user-guide/structs"
+--8<-- "python/user-guide/expressions/structs.py:multi_column_apply"
+```
diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md
index 67c618c220f5..dc994148c63b 100644
--- a/docs/user-guide/expressions/user-defined-functions.md
+++ b/docs/user-guide/expressions/user-defined-functions.md
@@ -1,47 +1,17 @@
 # User-defined functions (Python)
 
-You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions
-than in other libraries.
+Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries.
+Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars.
 
-Still, you need to have the power to be able to pass an expression's state to a third party library or apply your black box function
-over data in Polars.
+In this part of the documentation we'll be using two APIs that allow you to do this:
 
-For this we provide the following expressions:
+- [:material-api: `map_elements`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html): Call a function separately on each value in the `Series`.
+- [:material-api: `map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function.
 
-- `map_batches`
-- `map_elements`
+## Processing individual values with `map_elements()`
 
-## To `map_batches` or to `map_elements`.
-
-These functions have an important distinction in how they operate and consequently what data they will pass to the user.
-
-A `map_batches` passes the `Series` backed by the `expression` as is.
-
-`map_batches` follows the same rules in both the `select` and the `group_by` context, this will
-mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet
-aggregated!
-
-Use cases for `map_batches` are for instance passing the `Series` in an expression to a third party library. Below we show how
-we could use `map_batches` to pass an expression column to a neural network model.
-
-=== ":fontawesome-brands-python: Python"
-[:material-api: `map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html)
-
-```python
-df.with_columns([
-    pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations")
-])
-```
-
-=== ":fontawesome-brands-rust: Rust"
-
-```rust
-df.with_columns([
-    col("features").map(|s| Ok(my_nn.forward(s))).alias("activations")
-])
-```
-
-Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why.
+Let's start with the simplest case: we want to process each value in a `Series` individually.
+Here is our data:
 
 {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}}
 
@@ -50,94 +20,92 @@ Use cases for `map_batches` in the `group_by` context are slim. They are only us
 --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe"
 ```
 
-In the snippet above we group by the `"keys"` column. That means we have the following groups:
-
-```c
-"a" -> [10, 7]
-"b" -> [1]
-```
+We'll call `math.log()` on each individual value:
 
-If we would then apply a `shift` operation to the right, we'd expect:
+{{code_block('user-guide/expressions/user-defined-functions','individual_log',[])}}
 
-```c
-"a" -> [null, 10]
-"b" -> [null]
+```python exec="on" result="text" session="user-guide/udf"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:individual_log"
 ```
 
-Let's try that out and see what we get:
+While this works, `map_elements()` has two problems:
 
-{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',[])}}
+1. **Limited to individual items:** Often you'll want to have a calculation that needs to operate on the whole `Series`, rather than individual items one by one.
+2. **Performance overhead:** Even if you do want to process each item individually, calling a function for each individual item is slow; all those extra function calls add a lot of overhead.
 
-```python exec="on" result="text" session="user-guide/udf"
---8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches"
-```
+Let's start by solving the first problem, and then we'll see how to solve the second problem.
 
-Ouch.. we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵.
+## Processing a whole `Series` with `map_batches()`
 
-This went horribly wrong because `map_batches` applied the function before aggregation, due to the `is_elementwise=True` parameter being provided. So that means the whole column `[10, 7, 1]` got shifted to `[null, 10, 7]` and was then aggregated.
+We want to run a custom function on the contents of a whole `Series`.
+For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value.
 
-So my advice is to never use `map_batches` in the `group_by` context unless you know you need it and know what you are doing.
+We can use the `map_batches()` API to run this function on either the full `Series` or individual groups in a `group_by()`:
 
-## To `map_elements`
+{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean',[])}}
 
-Luckily we can fix previous example with `map_elements`. `map_elements` works on the smallest logical elements for that operation.
+```python exec="on" result="text" session="user-guide/udf"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean"
+```
 
-That is:
+## Fast operations with user-defined functions
 
-- `select context` -> single elements
-- `group by context` -> single groups
+The problem with a pure-Python implementation is that it's slow.
+In general, you want to minimize how much Python code you call if you want fast results.
 
-So with `map_elements` we should be able to fix our example:
+To maximize speed, you'll want to make sure that you're using a function written in a compiled language.
+For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html).
+The former runs on each item individually, and the latter accepts a whole NumPy array, which allows for more flexible operations.
 
-=== ":fontawesome-brands-python: Python"
-[:material-api: `map_elements`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html)
+[NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like [SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with pre-written ufuncs you can use with Polars.
+For example:
 
-{{code_block('user-guide/expressions/user-defined-functions','map_elements',[])}}
+{{code_block('user-guide/expressions/user-defined-functions','np_log',[])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
---8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:np_log"
 ```
 
-And observe, a valid result! 🎉
-
-## `map_elements` in the `select` context
+Notice that we can use `map_batches()`, because `numpy.log()` is able to run on both individual items and on whole NumPy arrays.
+This means it will run much faster than our original example, since we only have a single Python call and then all processing happens in a fast low-level language.
 
-In the `select` context, the `map_elements` expression passes elements of the column to the Python function.
+## Example: A fast custom function using Numba
 
-_Note that you are now running Python, this will be slow._
+The pre-written functions NumPy provides are helpful, but our goal is to write our own functions.
+For example, let's say we want a fast version of our `diff_from_mean()` example above.
+The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in a subset of Python while still getting the benefit of compiled code.
 
-Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of
-this section and show an example with the `map_elements` function and a counter example where we use the expression API to
-achieve the same goals.
+In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator).
+This creates a generalized ufunc by compiling a Python function to fast machine code, in a way that allows it to be used by Polars.
 
-### Adding a counter
+In the following example the `diff_from_mean_numba()` function will be compiled to fast machine code at import time, which will take a little time.
+After that all calls to the function will run quickly.
+The `Series` will be converted to a NumPy array before being passed to the function:
 
-In this example we create a global `counter` and then add the integer `1` to the global state at every element processed.
-Every iteration the result of the increment will be added to the element value.
-
-> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees.
-
-{{code_block('user-guide/expressions/user-defined-functions','counter',[])}}
+{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean_numba',[])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
---8<-- "python/user-guide/expressions/user-defined-functions.py:counter"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean_numba"
 ```
 
-### Combining multiple column values
+## Missing data is not allowed when calling generalized ufuncs
 
-If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data
-type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns
-`"keys"` and `"values"`, we would get the following struct elements:
+Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be converted to a NumPy array.
+Unfortunately, NumPy arrays don't have a concept of missing data.
+If there is missing data in the original `Series`, this means the resulting array won't actually match the `Series`.
 
-```python
-[
-    {"keys": "a", "values": 10},
-    {"keys": "a", "values": 7},
-    {"keys": "b", "values": 1},
-]
-```
+If you're calculating results item by item, this doesn't matter.
+For example, `numpy.log()` gets called on each individual value separately, so those missing values don't change the calculation.
+But if the result of a user-defined function depends on multiple values in the `Series`, it's not clear what exactly should happen with the missing values.
 
-In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast.
+Therefore, when calling generalized ufuncs such as Numba functions decorated with `@guvectorize`, Polars will raise an error if you try to pass in a `Series` with missing data.
+How do you get rid of missing data?
+Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling your custom function.
+
+## Combining multiple column values
+
+If you want to pass multiple columns to a user-defined function, you can use `Struct`s, which are [covered in detail in a different section](structs.md).
+The basic idea is to combine multiple columns into a `Struct`, and then the function can extract the columns back out:
 
 {{code_block('user-guide/expressions/user-defined-functions','combine',[])}}
 
@@ -145,17 +113,22 @@ In Python, those would be passed as `dict` to the calling Python function and ca
 --8<-- "python/user-guide/expressions/user-defined-functions.py:combine"
 ```
 
-`Structs` are covered in detail in the next section.
+## Streaming calculations
 
-### Return types?
+Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as its contents are copied into a NumPy array.
+You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once.
 
-Custom Python functions are black boxes for Polars. We really don't know what kind of black arts you are doing, so we have
-to infer and try our best to understand what you meant.
+!!! note
+    The `is_elementwise` argument can lead to incorrect results if set incorrectly.
+    If you set `is_elementwise=True`, make sure that your function actually operates
+    element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`,
+    for instance, does not.
 
-As a user it helps to understand what we do to better utilize custom functions.
+## Return types
 
-The data type is automatically inferred. We do that by waiting for the first non-null value. That value will then be used
-to determine the type of the `Series`.
+Custom Python functions are often black boxes; Polars doesn't know what your function is doing or what it will return.
+The return data type is therefore automatically inferred. We do that by waiting for the first non-null value. That value will then be used
+to determine the type of the resulting `Series`.
 
 The mapping of Python types to Polars data types is as follows:
 
@@ -174,3 +147,5 @@ Rust types map as follows:
 - `bool` -> `Boolean`
 - `String` or `str` -> `String`
 - `Vec<tp>` -> `List[tp]` (where the inner type is inferred with the same rules)
+
+You can pass a `return_dtype` argument to [:material-api: `map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html) if you want to override the inferred type.
diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt
index b4e4ac4ee634..49e661031b81 100644
--- a/py-polars/requirements-dev.txt
+++ b/py-polars/requirements-dev.txt
@@ -23,6 +23,7 @@ numba; python_version < '3.13'  # Numba can lag Python releases
 pandas
 pyarrow
 pydantic>=2.0.0
+numba; python_version < '3.13'  # Numba can lag Python releases
 # Datetime / time zones
 backports.zoneinfo; python_version < '3.9'
 tzdata; platform_system == 'Windows'
diff --git a/py-polars/tests/docs/test_user_guide.py b/py-polars/tests/docs/test_user_guide.py
index 08be6fe9dfbf..a513f4b5f0c1 100644
--- a/py-polars/tests/docs/test_user_guide.py
+++ b/py-polars/tests/docs/test_user_guide.py
@@ -32,5 +32,8 @@ def _change_test_dir() -> Iterator[None]:
 @pytest.mark.docs()
 @pytest.mark.parametrize("path", snippet_paths)
 @pytest.mark.usefixtures("_change_test_dir")
+@pytest.mark.filterwarnings(
+    r"ignore:\nExpr\.map_elements:polars.exceptions.PolarsInefficientMapWarning"
+)
 def test_run_python_snippets(path: Path) -> None:
     runpy.run_path(str(path))