diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 2150a9117..5725f244f 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -69,6 +69,7 @@ ADD_HEYOKA_BENCHMARK(bandf)
 ADD_HEYOKA_BENCHMARK(bandf_odeint)
 ADD_HEYOKA_BENCHMARK(penc_comparison)
 ADD_HEYOKA_BENCHMARK(large_cfunc)
+ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)
 
 if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
   ADD_HEYOKA_BENCHMARK(pendulum_mp)
diff --git a/benchmark/sims_flanagan_jac.cpp b/benchmark/sims_flanagan_jac.cpp
new file mode 100644
index 000000000..ae4bb033f
--- /dev/null
+++ b/benchmark/sims_flanagan_jac.cpp
@@ -0,0 +1,157 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <iterator>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <spdlog/spdlog.h>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/llvm_state.hpp>
+#include <heyoka/logging.hpp>
+#include <heyoka/math/cos.hpp>
+#include <heyoka/math/kepDE.hpp>
+#include <heyoka/math/sin.hpp>
+#include <heyoka/math/sqrt.hpp>
+
+#include <fmt/ranges.h>
+
+using namespace heyoka;
+
+using v_ex_t = std::array<expression, 3>;
+
+const auto mu = make_vars("mu")[0];
+const auto tm = make_vars("t")[0];
+
+std::pair<v_ex_t, v_ex_t> make_lp(const v_ex_t &pos_0, const v_ex_t &vel_0)
+{
+    const auto &[x0, y0, z0] = pos_0;
+    const auto &[vx0, vy0, vz0] = vel_0;
+
+    auto v02 = vx0 * vx0 + vy0 * vy0 + vz0 * vz0;
+    auto r0 = sqrt(x0 * x0 + y0 * y0 + z0 * z0);
+    auto eps = v02 * 0.5 - mu / r0;
+    auto a = -mu / (2. * eps);
+
+    auto sigma0 = (x0 * vx0 + y0 * vy0 + z0 * vz0) / sqrt(mu);
+    auto s0 = sigma0 / sqrt(a);
+    auto c0 = 1. - r0 / a;
+
+    auto n = sqrt(mu / (a * a * a));
+    auto DM = n * tm;
+
+    auto DE = kepDE(s0, c0, DM);
+
+    auto cDE = cos(DE);
+    auto sDE = sin(DE);
+
+    auto r = a + (r0 - a) * cDE + sigma0 * sqrt(a) * sDE;
+
+    auto F = 1. - a / r0 * (1. - cDE);
+    auto G = a * sigma0 / sqrt(mu) * (1. - cDE) + r0 * sqrt(a / mu) * sDE;
+    auto Ft = -sqrt(mu * a) / (r * r0) * sDE;
+    auto Gt = 1. - a / r * (1. - cDE);
+
+    return {{{F * x0 + G * vx0, F * y0 + G * vy0, F * z0 + G * vz0}},
+            {{Ft * x0 + Gt * vx0, Ft * y0 + Gt * vy0, Ft * z0 + Gt * vz0}}};
+}
+
+int main(int argc, char *argv[])
+{
+    namespace po = boost::program_options;
+
+    create_logger();
+    auto logger = spdlog::get("heyoka");
+
+    set_logger_level_trace();
+
+    unsigned N{};
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("N", po::value<unsigned>(&N)->default_value(20u),
+                                                       "number of segments");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help") != 0u) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (N == 0u) {
+        throw std::invalid_argument("The number of segments cannot be zero");
+    }
+
+    auto [x0, y0, z0] = make_vars("x0", "y0", "z0");
+    auto [vx0, vy0, vz0] = make_vars("vx0", "vy0", "vz0");
+
+    std::vector<v_ex_t> Dvs;
+    for (auto i = 0u; i < N; ++i) {
+        auto [Dvx, Dvy, Dvz]
+            = make_vars(fmt::format("Dvx{}", i + 1u), fmt::format("Dvy{}", i + 1u), fmt::format("Dvz{}", i + 1u));
+        Dvs.push_back(v_ex_t{Dvx, Dvy, Dvz});
+    }
+
+    auto [pos_f, vel_f] = make_lp({x0, y0, z0}, {vx0, vy0, vz0});
+
+    for (auto i = 0u; i < N; ++i) {
+        vel_f[0] += Dvs[i][0];
+        vel_f[1] += Dvs[i][1];
+        vel_f[2] += Dvs[i][2];
+
+        std::tie(pos_f, vel_f) = make_lp(pos_f, vel_f);
+    }
+
+    std::vector<expression> Dvs_list;
+    for (const auto &Dv : Dvs) {
+        Dvs_list.push_back(Dv[0]);
+        Dvs_list.push_back(Dv[1]);
+        Dvs_list.push_back(Dv[2]);
+    }
+
+    std::vector<expression> diff_vars_list = {x0, y0, z0, vx0, vy0, vz0};
+    diff_vars_list.insert(diff_vars_list.end(), Dvs_list.begin(), Dvs_list.end());
+
+    auto dt
+        = diff_tensors({pos_f[0], pos_f[1], pos_f[2], vel_f[0], vel_f[1], vel_f[2]}, kw::diff_args = diff_vars_list);
+
+    std::vector<expression> jac;
+    auto jac_sr = dt.get_derivatives(1);
+    std::transform(jac_sr.begin(), jac_sr.end(), std::back_inserter(jac), [](const auto &p) { return p.second; });
+
+    llvm_state s;
+
+    std::vector<expression> vars_list = {x0, y0, z0, vx0, vy0, vz0, mu, tm};
+    vars_list.insert(vars_list.end(), Dvs_list.begin(), Dvs_list.end());
+
+    logger->trace("Adding cfunc");
+
+    add_cfunc<double>(s, "func", jac, kw::vars = vars_list);
+
+    logger->trace("Compiling cfunc");
+
+    s.compile();
+
+    logger->trace("Looking up cfunc");
+
+    auto *fptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("func"));
+
+    logger->trace("All done");
+}
diff --git a/doc/changelog.rst b/doc/changelog.rst
index 42be02d6f..a56af916d 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -13,6 +13,14 @@ New
   system (`#352 <https://github.com/bluescarni/heyoka/pull/352>`__).
   Taylor derivatives are not implemented yet.
 
+Changes
+~~~~~~~
+
+- Substantial performance improvements in the computation of
+  derivative tensors of large expressions with a high degree
+  of internal redundancy
+  (`#354 <https://github.com/bluescarni/heyoka/pull/354>`__).
+
 Fix
 ~~~
 
diff --git a/doc/conf.py.in b/doc/conf.py.in
index ce2e28f78..5127f35fd 100644
--- a/doc/conf.py.in
+++ b/doc/conf.py.in
@@ -63,6 +63,8 @@ html_theme_options = {
     "repository_url": "https://github.com/bluescarni/heyoka",
     "use_repository_button": True,
     "use_issues_button": True,
+    # See: https://github.com/pydata/pydata-sphinx-theme/issues/1492
+    "navigation_with_keys": False,
 }
 
 
diff --git a/include/heyoka/detail/fast_unordered.hpp b/include/heyoka/detail/fast_unordered.hpp
new file mode 100644
index 000000000..dd0fbcaad
--- /dev/null
+++ b/include/heyoka/detail/fast_unordered.hpp
@@ -0,0 +1,67 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef HEYOKA_DETAIL_FAST_UNORDERED_HPP
+#define HEYOKA_DETAIL_FAST_UNORDERED_HPP
+
+#include <heyoka/config.hpp>
+
+#include <boost/version.hpp>
+
+#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)
+
+#define HEYOKA_HAVE_BOOST_UNORDERED_FLAT
+
+#endif
+
+#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
+
+#include <boost/unordered/unordered_flat_map.hpp>
+#include <boost/unordered/unordered_flat_set.hpp>
+
+#else
+
+#include <unordered_map>
+#include <unordered_set>
+
+#endif
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+template <typename... Args>
+using fast_uset =
+#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
+    boost::unordered_flat_set<Args...>
+#else
+    std::unordered_set<Args...>
+#endif
+    ;
+
+template <typename... Args>
+using fast_umap =
+#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
+    boost::unordered_flat_map<Args...>
+#else
+    std::unordered_map<Args...>
+#endif
+    ;
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE
+
+#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
+
+#undef HEYOKA_HAVE_BOOST_UNORDERED_FLAT
+
+#endif
+
+#endif
diff --git a/include/heyoka/detail/func_cache.hpp b/include/heyoka/detail/func_cache.hpp
index ad4c756c9..b6a5b6896 100644
--- a/include/heyoka/detail/func_cache.hpp
+++ b/include/heyoka/detail/func_cache.hpp
@@ -16,47 +16,20 @@
 #define HEYOKA_DETAIL_FUNC_CACHE_HPP
 
 #include <heyoka/config.hpp>
-
-#include <boost/version.hpp>
-
-#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)
-
-#include <boost/unordered/unordered_flat_map.hpp>
-#include <boost/unordered/unordered_flat_set.hpp>
+#include <heyoka/detail/fast_unordered.hpp>
 
 HEYOKA_BEGIN_NAMESPACE
 
 namespace detail
 {
 
-using funcptr_set = boost::unordered_flat_set<const void *>;
+using funcptr_set = fast_uset<const void *>;
 
 template <typename T>
-using funcptr_map = boost::unordered_flat_map<const void *, T>;
+using funcptr_map = fast_umap<const void *, T>;
 
 } // namespace detail
 
 HEYOKA_END_NAMESPACE
 
-#else
-
-#include <unordered_map>
-#include <unordered_set>
-
-HEYOKA_BEGIN_NAMESPACE
-
-namespace detail
-{
-
-using funcptr_set = std::unordered_set<const void *>;
-
-template <typename T>
-using funcptr_map = std::unordered_map<const void *, T>;
-
-} // namespace detail
-
-HEYOKA_END_NAMESPACE
-
-#endif
-
 #endif
diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index 7859de82c..8a781cfd8 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -501,6 +501,8 @@ class HEYOKA_DLL_PUBLIC dtens
 
     [[nodiscard]] subrange get_derivatives(std::uint32_t, std::uint32_t) const;
     [[nodiscard]] subrange get_derivatives(std::uint32_t) const;
+    [[nodiscard]] std::vector<expression> get_gradient() const;
+    [[nodiscard]] std::vector<expression> get_jacobian() const;
 };
 
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const dtens &);
diff --git a/src/expression_basic.cpp b/src/expression_basic.cpp
index c10a5296c..9a81a5a3f 100644
--- a/src/expression_basic.cpp
+++ b/src/expression_basic.cpp
@@ -27,6 +27,7 @@
 #include <variant>
 #include <vector>
 
+#include <boost/container_hash/hash.hpp>
 #include <boost/safe_numerics/safe_integer.hpp>
 
 #include <llvm/IR/Function.h>
@@ -494,12 +495,60 @@ std::size_t hash(funcptr_map<std::size_t> &func_map, const expression &ex) noexc
         ex.value());
 }
 
-// NOLINTNEXTLINE(bugprone-exception-escape)
+// NOLINTNEXTLINE(bugprone-exception-escape,misc-no-recursion)
 std::size_t hash(const expression &ex) noexcept
 {
-    detail::funcptr_map<std::size_t> func_map;
+    // NOTE: we implement an optimisation here: if either the expression is **not** a function,
+    // or if it is a function and none of its arguments are functions (i.e., it is a non-recursive
+    // expression), then we do not recurse into detail::hash() and we do not create/update the
+    // funcptr_map cache. This allows us to avoid memory allocations, in an effort to optimise the
+    // computation of hashes in decompositions, where the elementary subexpressions are
+    // non-recursive by definition.
+    // NOTE: we must be very careful with this optimisation: we need to make sure the hash computation
+    // result is the same whether the optimisation is enabled or not. That is, the hash
+    // **must** be the same regardless of whether a non-recursive expression is standalone
+    // or it is part of a larger expression.
+    return std::visit(
+        // NOLINTNEXTLINE(misc-no-recursion)
+        [&ex](const auto &v) {
+            using type = detail::uncvref_t<decltype(v)>;
+
+            if constexpr (std::is_same_v<type, func>) {
+                const auto no_func_args = std::none_of(v.args().begin(), v.args().end(), [](const auto &x) {
+                    return std::holds_alternative<func>(x.value());
+                });
+
+                if (no_func_args) {
+                    // NOTE: it is very important that here we replicate *exactly* the logic
+                    // of func::hash(), so that we produce the same hash value that would
+                    // be produced if ex were part of a larger expression.
+                    std::size_t seed = std::hash<std::string>{}(v.get_name());
+
+                    for (const auto &arg : v.args()) {
+                        // NOTE: for non-function arguments, calling hash(arg)
+                        // is identical to calling detail::hash(func_map, arg)
+                        // (as we do in func::hash()):
+                        // both ultimately end up using directly the std::hash
+                        // specialisation on the arg.
+                        boost::hash_combine(seed, hash(arg));
+                    }
 
-    return hash(func_map, ex);
+                    return seed;
+                } else {
+                    // The regular, non-optimised path.
+                    detail::funcptr_map<std::size_t> func_map;
+
+                    return hash(func_map, ex);
+                }
+            } else {
+                // ex is not a function, compute its hash directly
+                // via a std::hash specialisation.
+                // NOTE: this is the same logic implemented in detail::hash(),
+                // except we avoid the unnecessary creation of a funcptr_map.
+                return std::hash<type>{}(v);
+            }
+        },
+        ex.value());
 }
 
 } // namespace detail
diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 23f9facf2..768b44749 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -62,6 +62,7 @@
 #endif
 
 #include <heyoka/detail/cm_utils.hpp>
+#include <heyoka/detail/fast_unordered.hpp>
 #include <heyoka/detail/func_cache.hpp>
 #include <heyoka/detail/llvm_func_create.hpp>
 #include <heyoka/detail/llvm_fwd.hpp>
@@ -217,7 +218,7 @@ std::vector<expression> function_decompose_cse(std::vector<expression> &v_ex,
     // all the unique expressions from v_ex, and it will
     // map them to their indices in retval (which will
     // in general differ from their indices in v_ex).
-    std::unordered_map<expression, idx_t> ex_map;
+    fast_umap<expression, idx_t, std::hash<expression>> ex_map;
 
     // Map for the renaming of u variables
     // in the expressions.
@@ -1005,7 +1006,7 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
         // will contain {f : [[a, b, c], [d, e, f]]}.
         // After construction, we have verified that for each function
         // in the map the sets of arguments have all the same size.
-        std::unordered_map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
+        fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
 
         for (const auto &ex : seg) {
             // Get the evaluation function.
@@ -1041,7 +1042,7 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
 
         // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
         // to {f : [[a, d], [b, e], [c, f]]}.
-        std::unordered_map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
+        fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
             tmp_map_transpose;
         for (const auto &[func, vv] : tmp_map) {
             assert(!vv.empty()); // LCOV_EXCL_LINE
diff --git a/src/expression_diff.cpp b/src/expression_diff.cpp
index d4c13102c..e96f519b6 100644
--- a/src/expression_diff.cpp
+++ b/src/expression_diff.cpp
@@ -12,6 +12,7 @@
 #include <cstdint>
 #include <deque>
 #include <functional>
+#include <iterator>
 #include <map>
 #include <memory>
 #include <ostream>
@@ -450,7 +451,7 @@ auto diff_make_adj_dep(const std::vector<expression> &dc, std::vector<expression
 }
 
 // Forward-mode implementation of diff_tensors().
-template <typename DiffMap, typename Dep, typename Adj, typename SubsMap>
+template <typename DiffMap, typename Dep, typename Adj>
 void diff_tensors_forward_impl(
     // The map of derivatives. It wil be updated as new derivatives
     // are computed.
@@ -473,10 +474,7 @@ void diff_tensors_forward_impl(
     const std::vector<expression> &args,
     // Iterator in diff_map pointing to the first
     // derivative for the previous order.
-    typename DiffMap::iterator prev_begin,
-    // The substitution map to recover the original
-    // variables when constructing the derivatives.
-    const SubsMap &subs_map)
+    typename DiffMap::iterator prev_begin)
 {
     assert(dc.size() > nvars);
 
@@ -640,7 +638,7 @@ void diff_tensors_forward_impl(
             // Check if we already computed this derivative.
             if (const auto it = diff_map.find(tmp_v_idx); it == diff_map.end()) {
                 // The derivative is new.
-                auto cur_der = subs(diffs[diffs.size() - cur_nouts + out_idx], subs_map);
+                auto cur_der = diffs[diffs.size() - cur_nouts + out_idx];
 
                 [[maybe_unused]] const auto [_, flag] = diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
                 assert(flag);
@@ -650,7 +648,7 @@ void diff_tensors_forward_impl(
 }
 
 // Reverse-mode implementation of diff_tensors().
-template <typename DiffMap, typename Dep, typename Adj, typename SubsMap>
+template <typename DiffMap, typename Dep, typename Adj>
 void diff_tensors_reverse_impl(
     // The map of derivatives. It wil be updated as new derivatives
     // are computed.
@@ -673,10 +671,7 @@ void diff_tensors_reverse_impl(
     const std::vector<expression> &args,
     // Iterator in diff_map pointing to the first
     // derivative for the previous order.
-    typename DiffMap::iterator prev_begin,
-    // The substitution map to recover the original
-    // variables when constructing the derivatives.
-    const SubsMap &subs_map)
+    typename DiffMap::iterator prev_begin)
 {
     assert(dc.size() > nvars);
 
@@ -836,7 +831,7 @@ void diff_tensors_reverse_impl(
                 expression cur_der = 0_dbl;
 
                 if (const auto it_dmap = dmap.find(args[j]); it_dmap != dmap.end()) {
-                    cur_der = subs(it_dmap->second, subs_map);
+                    cur_der = it_dmap->second;
                 }
 
                 [[maybe_unused]] const auto [_, flag] = diff_map.try_emplace(tmp_v_idx, std::move(cur_der));
@@ -1059,13 +1054,41 @@ auto diff_tensors_impl(const std::vector<expression> &v_ex, const std::vector<ex
         spdlog::stopwatch sw_inner;
 
         // NOTE: in order to choose between forward and reverse mode, we adopt the standard approach
-        // of comparing the number of inputs and outputs. A more accurate (yet more exepensive) approach
+        // of comparing the number of inputs and outputs. A more accurate (yet more expensive) approach
         // would be to do the computation in both modes (e.g., in parallel) and pick the mode which
         // results in the shortest decomposition. Perhaps we can consider this for a future extension.
         if (cur_nouts >= args.size()) {
-            diff_tensors_forward_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin, subs_map);
+            diff_tensors_forward_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin);
         } else {
-            diff_tensors_reverse_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin, subs_map);
+            diff_tensors_reverse_impl(diff_map, cur_nouts, dc, dep, revdep, adj, nvars, args, prev_begin);
+        }
+
+        // NOTE: the derivatives we just added to diff_map are still expressed in terms of u variables.
+        // We need to apply the substitution map subs_map in order to recover the expressions in terms
+        // of the original variables. It is important that we do this now (rather then when constructing
+        // the derivatives in diff_tensors_*_impl()) because now we can do the substitution in a vectorised
+        // fashion, which greatly reduces the internal redundancy of the resulting expressions.
+
+        // Locate in diff_map the iterator to where we began adding derivatives for the current order.
+        tmp_v_idx[0] = 0;
+        tmp_v_idx[1] = cur_order + 1u;
+        std::fill(tmp_v_idx.begin() + 2, tmp_v_idx.end(), static_cast<std::uint32_t>(0));
+        const auto new_begin = diff_map.find(tmp_v_idx);
+        assert(new_begin != diff_map.end());
+
+        // Create the vector of expressions for the substitution.
+        std::vector<expression> subs_ret;
+        for (auto it = new_begin; it != diff_map.end(); ++it) {
+            subs_ret.push_back(it->second);
+        }
+
+        // Do the substitution.
+        subs_ret = subs(subs_ret, subs_map);
+
+        // Replace the original expressions in diff_map.
+        decltype(subs_ret.size()) i = 0;
+        for (auto it = new_begin; i < subs_ret.size(); ++i, ++it) {
+            it->second = subs_ret[i];
         }
 
         get_logger()->trace("dtens diff runtime for order {}: {}", cur_order + 1u, sw_inner);
@@ -1401,16 +1424,67 @@ dtens::subrange dtens::get_derivatives(std::uint32_t component, std::uint32_t or
     return subrange{b, e};
 }
 
-std::uint32_t dtens::get_nvars() const
+std::vector<expression> dtens::get_gradient() const
 {
-    if (p_impl->m_map.empty()) {
-        return 0;
+    if (get_nouts() != 1u) {
+        throw std::invalid_argument(fmt::format("The gradient can be requested only for a function with a single "
+                                                "output, but the number of outputs is instead {}",
+                                                get_nouts()));
+    }
+
+    if (get_order() == 0u) {
+        throw std::invalid_argument("First-order derivatives are not available");
+    }
+
+    const auto sr = get_derivatives(0, 1);
+    std::vector<expression> retval;
+    retval.reserve(get_nvars());
+    std::transform(sr.begin(), sr.end(), std::back_inserter(retval), [](const auto &p) { return p.second; });
+
+    assert(retval.size() == get_nvars());
+
+    return retval;
+}
+
+std::vector<expression> dtens::get_jacobian() const
+{
+    if (get_nouts() == 0u) {
+        throw std::invalid_argument("Cannot return the Jacobian of a function with no outputs");
     }
 
+    if (get_order() == 0u) {
+        throw std::invalid_argument("First-order derivatives are not available");
+    }
+
+    const auto sr = get_derivatives(1);
+    std::vector<expression> retval;
+    retval.reserve(boost::safe_numerics::safe<decltype(retval.size())>(get_nvars()) * get_nouts());
+    std::transform(sr.begin(), sr.end(), std::back_inserter(retval), [](const auto &p) { return p.second; });
+
+    assert(retval.size() == boost::safe_numerics::safe<decltype(retval.size())>(get_nvars()) * get_nouts());
+
+    return retval;
+}
+
+std::uint32_t dtens::get_nvars() const
+{
     // NOTE: we ensure in the diff_tensors() implementation
     // that the number of diff variables is representable
     // by std::uint32_t.
-    return static_cast<std::uint32_t>(begin()->first.size() - 1u);
+    auto ret = static_cast<std::uint32_t>(get_args().size());
+
+#if !defined(NDEBUG)
+
+    if (p_impl->m_map.empty()) {
+        assert(ret == 0u);
+    } else {
+        assert(!begin()->first.empty());
+        assert(ret == begin()->first.size() - 1u);
+    }
+
+#endif
+
+    return ret;
 }
 
 std::uint32_t dtens::get_nouts() const
diff --git a/src/func.cpp b/src/func.cpp
index 82d13143b..3c8304af4 100644
--- a/src/func.cpp
+++ b/src/func.cpp
@@ -598,6 +598,9 @@ void swap(func &a, func &b) noexcept
     std::swap(a.m_ptr, b.m_ptr);
 }
 
+// NOTE: it is very important that the implementation here is kept exactly in sync
+// with the implementation of expression hashing for non-recursive expressions. If one
+// changes, the other must as well.
 std::size_t func::hash(detail::funcptr_map<std::size_t> &func_map) const
 {
     // NOTE: the hash value is computed by combining the hash values of:
diff --git a/src/math/kepDE.cpp b/src/math/kepDE.cpp
index 3580f8c30..49540c1bb 100644
--- a/src/math/kepDE.cpp
+++ b/src/math/kepDE.cpp
@@ -85,7 +85,7 @@ expression kepDE_impl::diff_impl(funcptr_map<expression> &func_map, const T &s)
 
     const expression DE{func{*this}};
 
-    return (detail::diff(func_map, s0, s) * (cos(DE) - 1_dbl) - detail::diff(func_map, c0, s) * sin(DE)
+    return (detail::diff(func_map, s0, s) * (cos(DE) - 1_dbl) + detail::diff(func_map, c0, s) * sin(DE)
             + detail::diff(func_map, DM, s))
            / (1_dbl + s0 * sin(DE) - c0 * cos(DE));
 }
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 61ae7c790..367dc9aad 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -20,7 +20,6 @@
 #include <stdexcept>
 #include <string>
 #include <type_traits>
-#include <unordered_map>
 #include <utility>
 #include <variant>
 #include <vector>
@@ -58,6 +57,7 @@
 #endif
 
 #include <heyoka/detail/cm_utils.hpp>
+#include <heyoka/detail/fast_unordered.hpp>
 #include <heyoka/detail/fwd_decl.hpp>
 #include <heyoka/detail/llvm_func_create.hpp>
 #include <heyoka/detail/llvm_fwd.hpp>
@@ -749,7 +749,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
         // will contain {f : [[a, b, c], [d, e, f]]}.
         // After construction, we have verified that for each function
         // in the map the sets of arguments have all the same size.
-        std::unordered_map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
+        fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
 
         for (const auto &ex : seg) {
             // Get the function for the computation of the derivative.
@@ -783,7 +783,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
 
         // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
         // to {f : [[a, d], [b, e], [c, f]]}.
-        std::unordered_map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
+        fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
             tmp_map_transpose;
         for (const auto &[func, vv] : tmp_map) {
             assert(!vv.empty()); // LCOV_EXCL_LINE
diff --git a/test/expression.cpp b/test/expression.cpp
index beb94a7a6..bd39f7722 100644
--- a/test/expression.cpp
+++ b/test/expression.cpp
@@ -22,6 +22,7 @@
 #include <variant>
 #include <vector>
 
+#include <boost/container_hash/hash.hpp>
 #include <boost/numeric/conversion/cast.hpp>
 
 #include <fmt/format.h>
@@ -1897,3 +1898,86 @@ TEST_CASE("cfunc sum_to_sub")
                 == 1);
     }
 }
+
+// Test to check that the optimisation for hashing non-recursive
+// expressions is correct.
+TEST_CASE("hash opt")
+{
+    auto [x, y, z] = make_vars("x", "y", "z");
+
+    // Non-recursive function.
+    {
+        auto ex = x + y;
+
+        const auto nr_hash = std::hash<expression>{}(ex);
+
+        auto ex2 = z * ex;
+
+        const auto full_hash = std::hash<expression>{}(ex2);
+
+        // Compute manually the hash of ex2.
+        std::size_t seed = std::hash<std::string>{}(std::get<func>(ex2.value()).get_name());
+
+        boost::hash_combine(seed, std::hash<variable>{}(std::get<variable>(z.value())));
+        boost::hash_combine(seed, nr_hash);
+
+        REQUIRE(seed == full_hash);
+    }
+
+    // Variable.
+    {
+        auto ex = x;
+
+        const auto nr_hash = std::hash<expression>{}(ex);
+
+        auto ex2 = ex + 2. * y;
+
+        const auto full_hash = std::hash<expression>{}(ex2);
+
+        // Compute manually the hash of ex2.
+        std::size_t seed = std::hash<std::string>{}(std::get<func>(ex2.value()).get_name());
+
+        boost::hash_combine(seed, nr_hash);
+        boost::hash_combine(seed, std::hash<expression>{}(2. * y));
+
+        REQUIRE(seed == full_hash);
+    }
+
+    // Parameter.
+    {
+        auto ex = par[0];
+
+        const auto nr_hash = std::hash<expression>{}(ex);
+
+        auto ex2 = ex + 2. * y;
+
+        const auto full_hash = std::hash<expression>{}(ex2);
+
+        // Compute manually the hash of ex2.
+        std::size_t seed = std::hash<std::string>{}(std::get<func>(ex2.value()).get_name());
+
+        boost::hash_combine(seed, nr_hash);
+        boost::hash_combine(seed, std::hash<expression>{}(2. * y));
+
+        REQUIRE(seed == full_hash);
+    }
+
+    // Number.
+    {
+        auto ex = 1.1_dbl;
+
+        const auto nr_hash = std::hash<expression>{}(ex);
+
+        auto ex2 = ex + 2. * y;
+
+        const auto full_hash = std::hash<expression>{}(ex2);
+
+        // Compute manually the hash of ex2.
+        std::size_t seed = std::hash<std::string>{}(std::get<func>(ex2.value()).get_name());
+
+        boost::hash_combine(seed, nr_hash);
+        boost::hash_combine(seed, std::hash<expression>{}(2. * y));
+
+        REQUIRE(seed == full_hash);
+    }
+}
diff --git a/test/expression_diff_tensors.cpp b/test/expression_diff_tensors.cpp
index eb4be7f6f..aa04923c4 100644
--- a/test/expression_diff_tensors.cpp
+++ b/test/expression_diff_tensors.cpp
@@ -109,6 +109,7 @@ TEST_CASE("diff_tensors basic")
     };
 
     REQUIRE(dt.size() == 2u);
+    REQUIRE(dt.get_nvars() == 1u);
 
     assign_sr(dt.get_derivatives(0, 0));
     REQUIRE(diff_vec.size() == 1u);
@@ -127,6 +128,7 @@ TEST_CASE("diff_tensors basic")
         Message(fmt::format("Cannot locate the derivative corresponding to the indices vector {}", std::vector{0, 2})));
 
     dt = diff_tensors({1_dbl}, kw::diff_order = 2, kw::diff_args = {par[0]});
+    REQUIRE(dt.get_nvars() == 1u);
     REQUIRE(dt.size() == 3u);
     assign_sr(dt.get_derivatives(0, 0));
     REQUIRE(diff_vec.size() == 1u);
@@ -149,6 +151,7 @@ TEST_CASE("diff_tensors basic")
         Message(fmt::format("Cannot locate the derivative corresponding to the indices vector {}", std::vector{0, 3})));
 
     dt = diff_tensors({1_dbl}, kw::diff_order = 3, kw::diff_args = {par[0]});
+    REQUIRE(dt.get_nvars() == 1u);
     REQUIRE(dt.size() == 4u);
     assign_sr(dt.get_derivatives(0, 0));
     REQUIRE(diff_vec.size() == 1u);
@@ -176,6 +179,7 @@ TEST_CASE("diff_tensors basic")
 
     // Automatically deduced diff variables.
     dt = diff_tensors({x + y, x * y * y}, kw::diff_order = 2);
+    REQUIRE(dt.get_nvars() == 2u);
     REQUIRE(dt.size() == 12u);
     assign_sr(dt.get_derivatives(0));
     REQUIRE(diff_vec == std::vector{x + y, x * y * y});
@@ -186,6 +190,7 @@ TEST_CASE("diff_tensors basic")
 
     // Diff wrt all variables.
     dt = diff_tensors({x + y, x * y * y}, kw::diff_order = 2, kw::diff_args = diff_args::vars);
+    REQUIRE(dt.get_nvars() == 2u);
     REQUIRE(dt.size() == 12u);
     assign_sr(dt.get_derivatives(0));
     REQUIRE(diff_vec == std::vector{x + y, x * y * y});
@@ -196,6 +201,7 @@ TEST_CASE("diff_tensors basic")
 
     // Diff wrt some variables.
     dt = diff_tensors({x + y, x * y * y}, kw::diff_order = 2, kw::diff_args = {x});
+    REQUIRE(dt.get_nvars() == 1u);
     REQUIRE(dt.size() == 6u);
     assign_sr(dt.get_derivatives(0));
     REQUIRE(diff_vec == std::vector{x + y, x * y * y});
@@ -206,6 +212,7 @@ TEST_CASE("diff_tensors basic")
 
     // Diff wrt all params.
     dt = diff_tensors({par[0] + y, x * y * par[1]}, kw::diff_order = 2, kw::diff_args = diff_args::params);
+    REQUIRE(dt.get_nvars() == 2u);
     REQUIRE(dt.size() == 12u);
     assign_sr(dt.get_derivatives(0));
     REQUIRE(diff_vec == std::vector{par[0] + y, x * y * par[1]});
@@ -216,6 +223,7 @@ TEST_CASE("diff_tensors basic")
 
     // Diff wrt some param.
     dt = diff_tensors({par[0] + y, x * y * par[1]}, kw::diff_order = 2, kw::diff_args = {par[1]});
+    REQUIRE(dt.get_nvars() == 1u);
     REQUIRE(dt.size() == 6u);
     assign_sr(dt.get_derivatives(0));
     REQUIRE(diff_vec == std::vector{par[0] + y, x * y * par[1]});
@@ -628,3 +636,74 @@ TEST_CASE("speelpenning complexity")
         s.compile();
     }
 }
+
+TEST_CASE("gradient")
+{
+    using Catch::Matchers::Message;
+
+    auto [x, y] = make_vars("x", "y");
+
+    // Error modes first.
+    {
+        dtens dt;
+        REQUIRE_THROWS_MATCHES(dt.get_gradient(), std::invalid_argument,
+                               Message("The gradient can be requested only for a function with a single "
+                                       "output, but the number of outputs is instead 0"));
+    }
+
+    {
+        auto dt = diff_tensors({x + y, x - y});
+        REQUIRE_THROWS_MATCHES(dt.get_gradient(), std::invalid_argument,
+                               Message("The gradient can be requested only for a function with a single "
+                                       "output, but the number of outputs is instead 2"));
+    }
+
+    {
+        auto dt = diff_tensors({x + y}, kw::diff_order = 0u);
+        REQUIRE_THROWS_MATCHES(dt.get_gradient(), std::invalid_argument,
+                               Message("First-order derivatives are not available"));
+    }
+
+    auto dt = diff_tensors({x + y}, kw::diff_order = 1u);
+    auto grad = dt.get_gradient();
+    REQUIRE(grad == std::vector{1_dbl, 1_dbl});
+}
+
+TEST_CASE("jacobian")
+{
+    using Catch::Matchers::Message;
+
+    auto [x, y] = make_vars("x", "y");
+
+    // Error modes first.
+    {
+        dtens dt;
+
+        REQUIRE_THROWS_MATCHES(dt.get_jacobian(), std::invalid_argument,
+                               Message("Cannot return the Jacobian of a function with no outputs"));
+    }
+
+    {
+        auto dt = diff_tensors({x + y}, kw::diff_order = 0u);
+        REQUIRE_THROWS_MATCHES(dt.get_jacobian(), std::invalid_argument,
+                               Message("First-order derivatives are not available"));
+    }
+
+    {
+        auto dt = diff_tensors({x + y}, kw::diff_order = 1u);
+        auto jac = dt.get_jacobian();
+        REQUIRE(jac == std::vector{1_dbl, 1_dbl});
+    }
+
+    {
+        auto dt = diff_tensors({x + y, x - y}, kw::diff_order = 1u);
+        auto jac = dt.get_jacobian();
+        REQUIRE(jac == std::vector{1_dbl, 1_dbl, 1_dbl, -1_dbl});
+    }
+
+    {
+        auto dt = diff_tensors({x + y, x - y, -x - y}, kw::diff_order = 1u);
+        auto jac = dt.get_jacobian();
+        REQUIRE(jac == std::vector{1_dbl, 1_dbl, 1_dbl, -1_dbl, -1_dbl, -1_dbl});
+    }
+}
diff --git a/test/kepDE.cpp b/test/kepDE.cpp
index 42ec85b3f..5ba8f0d17 100644
--- a/test/kepDE.cpp
+++ b/test/kepDE.cpp
@@ -8,6 +8,7 @@
 
 #include <heyoka/config.hpp>
 
+#include <array>
 #include <cmath>
 #include <initializer_list>
 #include <limits>
@@ -84,12 +85,12 @@ TEST_CASE("kepDE diff")
         REQUIRE(diff(kepDE(x, y, z), x)
                 == (cos(kepDE(x, y, z)) - 1_dbl) / (1_dbl + x * sin(kepDE(x, y, z)) - y * cos(kepDE(x, y, z))));
         REQUIRE(diff(kepDE(x, y, z), y)
-                == -sin(kepDE(x, y, z)) / (1_dbl + x * sin(kepDE(x, y, z)) - y * cos(kepDE(x, y, z))));
+                == sin(kepDE(x, y, z)) / (1_dbl + x * sin(kepDE(x, y, z)) - y * cos(kepDE(x, y, z))));
         REQUIRE(diff(kepDE(x, y, z), z) == 1_dbl / (1_dbl + x * sin(kepDE(x, y, z)) - y * cos(kepDE(x, y, z))));
         auto DE = kepDE(x * x, x * y, x * z);
         REQUIRE(diff(DE, x)
-                == (2_dbl * x * (cos(DE) - 1_dbl) - y * sin(DE) + z) / (1_dbl + x * x * sin(DE) - x * y * cos(DE)));
-        REQUIRE(diff(DE, y) == (-x * sin(DE)) / (1_dbl + x * x * sin(DE) - x * y * cos(DE)));
+                == (2_dbl * x * (cos(DE) - 1_dbl) + y * sin(DE) + z) / (1_dbl + x * x * sin(DE) - x * y * cos(DE)));
+        REQUIRE(diff(DE, y) == (x * sin(DE)) / (1_dbl + x * x * sin(DE) - x * y * cos(DE)));
     }
 
     {
@@ -97,16 +98,98 @@ TEST_CASE("kepDE diff")
                 == (cos(kepDE(par[0], y, z)) - 1_dbl)
                        / (1_dbl + par[0] * sin(kepDE(par[0], y, z)) - y * cos(kepDE(par[0], y, z))));
         REQUIRE(diff(kepDE(x, par[1], z), par[1])
-                == -sin(kepDE(x, par[1], z))
+                == sin(kepDE(x, par[1], z))
                        / (1_dbl + x * sin(kepDE(x, par[1], z)) - par[1] * cos(kepDE(x, par[1], z))));
         REQUIRE(diff(kepDE(x, y, par[2]), par[2])
                 == 1_dbl / (1_dbl + x * sin(kepDE(x, y, par[2])) - y * cos(kepDE(x, y, par[2]))));
         auto DE = kepDE(par[0] * par[0], par[0] * par[1], par[0] * par[2]);
         REQUIRE(diff(DE, par[0])
-                == (2_dbl * par[0] * (cos(DE) - 1_dbl) - par[1] * sin(DE) + par[2])
+                == (2_dbl * par[0] * (cos(DE) - 1_dbl) + par[1] * sin(DE) + par[2])
                        / (1_dbl + par[0] * par[0] * sin(DE) - par[0] * par[1] * cos(DE)));
         REQUIRE(diff(DE, par[1])
-                == (-par[0] * sin(DE)) / (1_dbl + par[0] * par[0] * sin(DE) - par[0] * par[1] * cos(DE)));
+                == (par[0] * sin(DE)) / (1_dbl + par[0] * par[0] * sin(DE) - par[0] * par[1] * cos(DE)));
+    }
+
+    // Use numerical differencing as a further check.
+    {
+        llvm_state s;
+
+        auto der = diff(kepDE(x, y, z), x);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepDE(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[0] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
+    }
+
+    {
+        llvm_state s;
+
+        auto der = diff(kepDE(x, y, z), y);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepDE(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[1] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
+    }
+
+    {
+        llvm_state s;
+
+        auto der = diff(kepDE(x, y, z), z);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepDE(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[2] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
     }
 }
 
diff --git a/test/kepF.cpp b/test/kepF.cpp
index 0845738e9..668faba42 100644
--- a/test/kepF.cpp
+++ b/test/kepF.cpp
@@ -8,6 +8,7 @@
 
 #include <heyoka/config.hpp>
 
+#include <array>
 #include <cmath>
 #include <initializer_list>
 #include <limits>
@@ -105,6 +106,88 @@ TEST_CASE("kepF diff")
                        / (1_dbl - par[0] * par[0] * sin(F) - par[0] * par[1] * cos(F)));
         REQUIRE(diff(F, par[1]) == (par[0] * sin(F)) / (1_dbl - par[0] * par[0] * sin(F) - par[0] * par[1] * cos(F)));
     }
+
+    // Use numerical differencing as a further check.
+    {
+        llvm_state s;
+
+        auto der = diff(kepF(x, y, z), x);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepF(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[0] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
+    }
+
+    {
+        llvm_state s;
+
+        auto der = diff(kepF(x, y, z), y);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepF(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[1] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
+    }
+
+    {
+        llvm_state s;
+
+        auto der = diff(kepF(x, y, z), z);
+        add_cfunc<double>(s, "der", {der});
+        add_cfunc<double>(s, "f", {kepF(x, y, z)});
+        s.compile();
+
+        auto *der_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("der"));
+        auto *f_ptr
+            = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f"));
+
+        double out_der{};
+        const std::array in_der = {.1, .1, 1.1};
+        der_ptr(&out_der, in_der.data(), nullptr, nullptr);
+
+        double out_f[2] = {};
+        const auto in_f1 = in_der;
+        auto in_f2 = in_der;
+        in_f2[2] += 1e-8;
+        f_ptr(out_f, in_f1.data(), nullptr, nullptr);
+        f_ptr(out_f + 1, in_f2.data(), nullptr, nullptr);
+
+        REQUIRE(out_der == approximately((out_f[1] - out_f[0]) / 1e-8, 1e9));
+    }
 }
 
 #define HEYOKA_TEST_KEPF_OVERLOAD(type)                                                                                \