Merge pull request #359 from bluescarni/pr/perf_fixes

Performance and cache fixes
bluescarni · Nov 9, 2023 · 04c38cb · 04c38cb
2 parents abd3c0c + dfd0bc3
commit 04c38cb
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 35 deletions.
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -29,6 +29,10 @@ Changes
 Fix
 ~~~
 
+- Fix global constants in an LLVM module being generated in unordered fashion
+  when compact mode is active. This would result in two logically-identical
+  modules being considered different by the in-memory cache
+  (`#359 <https://github.com/bluescarni/heyoka/pull/359>`__).
 - Fix compiler warning when building without SLEEF support
   (`#356 <https://github.com/bluescarni/heyoka/pull/356>`__).
 - Improve the numerical stability of the VSOP2013 model

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
@@ -632,11 +632,13 @@ HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func(llvm_state &, llvm::Type *,
                                                      std::uint32_t, bool);
 
 HEYOKA_DLL_PUBLIC std::uint32_t get_param_size(const expression &);
+HEYOKA_DLL_PUBLIC std::uint32_t get_param_size(const std::vector<expression> &);
 
 HEYOKA_DLL_PUBLIC std::vector<expression> get_params(const expression &);
 HEYOKA_DLL_PUBLIC std::vector<expression> get_params(const std::vector<expression> &);
 
 HEYOKA_DLL_PUBLIC bool is_time_dependent(const expression &);
+HEYOKA_DLL_PUBLIC bool is_time_dependent(const std::vector<expression> &);
 
 namespace detail
 {

diff --git a/src/expression_basic.cpp b/src/expression_basic.cpp
@@ -1081,11 +1081,9 @@ std::uint32_t get_param_size(detail::funcptr_set &func_set, const expression &ex
             using type = uncvref_t<decltype(v)>;
 
             if constexpr (std::is_same_v<type, param>) {
-                if (v.idx() == std::numeric_limits<std::uint32_t>::max()) {
-                    throw std::overflow_error("Overflow dected in get_n_param()");
-                }
+                using safe_uint32_t = boost::safe_numerics::safe<std::uint32_t>;
 
-                retval = std::max(static_cast<std::uint32_t>(v.idx() + 1u), retval);
+                retval = std::max(static_cast<std::uint32_t>(v.idx() + safe_uint32_t(1)), retval);
             } else if constexpr (std::is_same_v<type, func>) {
                 const auto f_id = v.get_ptr();
 
@@ -1124,6 +1122,19 @@ std::uint32_t get_param_size(const expression &ex)
     return detail::get_param_size(func_set, ex);
 }
 
+std::uint32_t get_param_size(const std::vector<expression> &v_ex)
+{
+    std::uint32_t retval = 0;
+
+    detail::funcptr_set func_set;
+
+    for (const auto &ex : v_ex) {
+        retval = std::max(retval, detail::get_param_size(func_set, ex));
+    }
+
+    return retval;
+}
+
 namespace detail
 {
 
@@ -1221,47 +1232,45 @@ namespace
 {
 
 // NOLINTNEXTLINE(misc-no-recursion)
-bool is_time_dependent(funcptr_map<bool> &func_map, const expression &ex)
+bool is_time_dependent(funcptr_set &func_set, const expression &ex)
 {
     // - If ex is a function, check if it is time-dependent, or
     //   if any of its arguments is time-dependent,
     // - otherwise, return false.
     return std::visit(
         // NOLINTNEXTLINE(misc-no-recursion)
-        [&func_map](const auto &v) {
+        [&func_set](const auto &v) {
             using type = uncvref_t<decltype(v)>;
 
             if constexpr (std::is_same_v<type, func>) {
                 const auto f_id = v.get_ptr();
 
-                // Did we already determine if v is time-dependent?
-                if (const auto it = func_map.find(f_id); it != func_map.end()) {
-                    return it->second;
+                // Did we already determine that v is *not* time-dependent?
+                if (const auto it = func_set.find(f_id); it != func_set.end()) {
+                    return false;
                 }
 
                 // Check if the function is intrinsically time-dependent.
-                bool is_tm_dep = v.is_time_dependent();
-                if (!is_tm_dep) {
-                    // The function does **not** intrinsically depend on time.
-                    // Check its arguments.
-                    for (const auto &a : v.args()) {
-                        if (is_time_dependent(func_map, a)) {
-                            // A time-dependent argument was found. Update
-                            // is_tm_dep and break out, no point in checking
-                            // the other arguments.
-                            is_tm_dep = true;
-                            break;
-                        }
+                if (v.is_time_dependent()) {
+                    return true;
+                }
+
+                // The function does *not* intrinsically depend on time.
+                // Check its arguments.
+                for (const auto &a : v.args()) {
+                    if (is_time_dependent(func_set, a)) {
+                        // A time-dependent argument was found, return true.
+                        return true;
                     }
                 }
 
                 // Update the cache.
-                [[maybe_unused]] const auto [_, flag] = func_map.emplace(f_id, is_tm_dep);
+                [[maybe_unused]] const auto [_, flag] = func_set.emplace(f_id);
 
                 // An expression cannot contain itself.
                 assert(flag);
 
-                return is_tm_dep;
+                return false;
             } else {
                 return false;
             }
@@ -1276,9 +1285,24 @@ bool is_time_dependent(funcptr_map<bool> &func_map, const expression &ex)
 // Determine if an expression is time-dependent.
 bool is_time_dependent(const expression &ex)
 {
-    detail::funcptr_map<bool> func_map;
+    // NOTE: this will contain pointers to (sub)expressions
+    // which are *not* time-dependent.
+    detail::funcptr_set func_set;
+
+    return detail::is_time_dependent(func_set, ex);
+}
+
+bool is_time_dependent(const std::vector<expression> &v_ex)
+{
+    detail::funcptr_set func_set;
+
+    for (const auto &ex : v_ex) {
+        if (detail::is_time_dependent(func_set, ex)) {
+            return true;
+        }
+    }
 
-    return detail::is_time_dependent(func_map, ex);
+    return false;
 }
 
 namespace detail

diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
@@ -1006,7 +1006,16 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
         // will contain {f : [[a, b, c], [d, e, f]]}.
         // After construction, we have verified that for each function
         // in the map the sets of arguments have all the same size.
-        fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
+        // NOTE: again, here and below we use name-based ordered maps for the functions.
+        // This ensures that the invocations of cm_make_arg_gen_*(), which create several
+        // global variables, always happen in a well-defined order. If we used an unordered map instead,
+        // the variables would be created in a "random" order, which would result in a
+        // unnecessary miss for the in-memory cache machinery when two logically-identical
+        // LLVM modules are considered different because of the difference in the order
+        // of declaration of global variables.
+        std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>,
+                 llvm_func_name_compare>
+            tmp_map;
 
         for (const auto &ex : seg) {
             // Get the evaluation function.
@@ -1021,14 +1030,14 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
             // element into a set of indices/constants.
             const auto c_args = udef_to_variants(ex, {});
 
+            // LCOV_EXCL_START
             if (!is_new_func && it->second.back().size() - 1u != c_args.size()) {
-                // LCOV_EXCL_START
                 throw std::invalid_argument(
                     fmt::format("Inconsistent arity detected in a compiled function in compact "
                                 "mode: the same function is being called with both {} and {} arguments",
                                 it->second.back().size() - 1u, c_args.size()));
-                // LCOV_EXCL_STOP
             }
+            // LCOV_EXCL_STOP
 
             // Add the new set of arguments.
             it->second.emplace_back();
@@ -1042,7 +1051,8 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
 
         // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
         // to {f : [[a, d], [b, e], [c, f]]}.
-        fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
+        std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
+                 llvm_func_name_compare>
             tmp_map_transpose;
         for (const auto &[func, vv] : tmp_map) {
             assert(!vv.empty()); // LCOV_EXCL_LINE

diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
@@ -57,7 +57,6 @@
 #endif
 
 #include <heyoka/detail/cm_utils.hpp>
-#include <heyoka/detail/fast_unordered.hpp>
 #include <heyoka/detail/fwd_decl.hpp>
 #include <heyoka/detail/llvm_func_create.hpp>
 #include <heyoka/detail/llvm_fwd.hpp>
@@ -749,7 +748,16 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
         // will contain {f : [[a, b, c], [d, e, f]]}.
         // After construction, we have verified that for each function
         // in the map the sets of arguments have all the same size.
-        fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
+        // NOTE: again, here and below we use name-based ordered maps for the functions.
+        // This ensures that the invocations of cm_make_arg_gen_*(), which create several
+        // global variables, always happen in a well-defined order. If we used an unordered map instead,
+        // the variables would be created in a "random" order, which would result in a
+        // unnecessary miss for the in-memory cache machinery when two logically-identical
+        // LLVM modules are considered different because of the difference in the order
+        // of declaration of global variables.
+        std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>,
+                 llvm_func_name_compare>
+            tmp_map;
 
         for (const auto &ex : seg) {
             // Get the function for the computation of the derivative.
@@ -764,12 +772,14 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
             // element into a set of indices/constants.
             const auto cdiff_args = udef_to_variants(ex.first, ex.second);
 
+            // LCOV_EXCL_START
             if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
                 throw std::invalid_argument(
                     fmt::format("Inconsistent arity detected in a Taylor derivative function in compact "
                                 "mode: the same function is being called with both {} and {} arguments",
                                 it->second.back().size() - 1u, cdiff_args.size()));
             }
+            // LCOV_EXCL_STOP
 
             // Add the new set of arguments.
             it->second.emplace_back();
@@ -783,7 +793,8 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
 
         // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
         // to {f : [[a, d], [b, e], [c, f]]}.
-        fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
+        std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
+                 llvm_func_name_compare>
             tmp_map_transpose;
         for (const auto &[func, vv] : tmp_map) {
             assert(!vv.empty()); // LCOV_EXCL_LINE

diff --git a/test/expression.cpp b/test/expression.cpp
@@ -16,6 +16,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <system_error>
 #include <type_traits>
 #include <unordered_map>
 #include <utility>
@@ -331,8 +332,7 @@ TEST_CASE("get_param_size")
 
     REQUIRE(get_param_size(par[0]) == 1u);
     REQUIRE(get_param_size(par[123]) == 124u);
-    REQUIRE_THROWS_MATCHES(get_param_size(par[std::numeric_limits<std::uint32_t>::max()]), std::overflow_error,
-                           Message("Overflow dected in get_n_param()"));
+    REQUIRE_THROWS_AS(get_param_size(par[std::numeric_limits<std::uint32_t>::max()]), std::system_error);
     REQUIRE(get_param_size(par[123] + "x"_var) == 124u);
     REQUIRE(get_param_size("x"_var + par[123]) == 124u);
     REQUIRE(get_param_size(par[123] + 1_dbl) == 124u);
@@ -362,6 +362,15 @@ TEST_CASE("get_param_size")
     bar = par[23] + (foo - x) / (2. * foo) + par[92];
 
     REQUIRE(get_param_size(bar) == 93u);
+
+    // Tests with vectorisation.
+    REQUIRE(get_param_size({par[0], par[123]}) == 124u);
+    REQUIRE(get_param_size({par[123], par[0]}) == 124u);
+    REQUIRE(get_param_size({bar, bar + par[86]}) == 93u);
+    REQUIRE(get_param_size({bar, bar + par[186]}) == 187u);
+    REQUIRE(get_param_size({bar + par[86], bar}) == 93u);
+    REQUIRE(get_param_size({bar + par[186], bar}) == 187u);
+    REQUIRE(get_param_size({"x"_var, 2_dbl}) == 0u);
 }
 
 TEST_CASE("binary simpls")
@@ -557,6 +566,19 @@ TEST_CASE("is_time_dependent")
     bar = hy::time + (foo - x) / (2. * foo) + hy::time;
 
     REQUIRE(is_time_dependent(bar));
+
+    // Tests with vectorization.
+    REQUIRE(is_time_dependent({bar}));
+    REQUIRE(is_time_dependent({x, bar}));
+    REQUIRE(is_time_dependent({bar, x}));
+    REQUIRE(!is_time_dependent({2_dbl - par[0], x}));
+
+    foo = ((x + y) * (z + x)) * ((z - x) * (y + x));
+    bar = (foo - x) / (2. * foo);
+    REQUIRE(!is_time_dependent({x, bar}));
+    REQUIRE(is_time_dependent({x, bar, hy::time}));
+    REQUIRE(is_time_dependent({x, hy::time, bar}));
+    REQUIRE(is_time_dependent({hy::time, x, bar}));
 }
 
 TEST_CASE("s11n")

diff --git a/test/llvm_state_mem_cache.cpp b/test/llvm_state_mem_cache.cpp
@@ -6,6 +6,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <heyoka/kw.hpp>
 #include <heyoka/llvm_state.hpp>
 #include <heyoka/model/pendulum.hpp>
 #include <heyoka/taylor.hpp>
@@ -140,3 +141,48 @@ TEST_CASE("slp_vectorize test")
         model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::slp_vectorize = true, kw::force_avx512 = true};
     REQUIRE(llvm_state::get_memcache_size() > new_size);
 }
+
+// Bug: in compact mode, global variables used to be created in random
+// order, which would lead to logically-identical modules considered
+// different by the cache machinery due to the different declaration order.
+TEST_CASE("bug cache miss compact mode")
+{
+    {
+        llvm_state::clear_memcache();
+        llvm_state::set_memcache_limit(2048ull * 1024u * 1024u);
+
+        auto ta = taylor_adaptive<double>{model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::compact_mode = true};
+        const auto orig_size = llvm_state::get_memcache_size();
+
+        // Re-create the same ta several times and then check the cache size has not changed.
+        for (auto i = 0; i < 100; ++i) {
+            ta = taylor_adaptive<double>{model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::compact_mode = true};
+        }
+
+        REQUIRE(llvm_state::get_memcache_size() == orig_size);
+    }
+
+    {
+        llvm_state::clear_memcache();
+        llvm_state::set_memcache_limit(2048ull * 1024u * 1024u);
+
+        {
+            llvm_state s;
+            add_cfunc<double>(s, "func", {model::pendulum_energy()}, kw::compact_mode = true);
+            s.compile();
+            (void)s.jit_lookup("func");
+        }
+
+        const auto orig_size = llvm_state::get_memcache_size();
+
+        // Re-create the same cfunc several times and then check the cache size has not changed.
+        for (auto i = 0; i < 100; ++i) {
+            llvm_state s;
+            add_cfunc<double>(s, "func", {model::pendulum_energy()}, kw::compact_mode = true);
+            s.compile();
+            (void)s.jit_lookup("func");
+        }
+
+        REQUIRE(llvm_state::get_memcache_size() == orig_size);
+    }
+}
diff --git a/test/model_ffnn.cpp b/test/model_ffnn.cpp
@@ -10,6 +10,7 @@
 #include <functional>
 #include <limits>
 #include <stdexcept>
+#include <system_error>
 
 #include <fmt/core.h>
 #include <fmt/ranges.h>
@@ -19,7 +20,6 @@
 #include <heyoka/kw.hpp>
 #include <heyoka/math.hpp>
 #include <heyoka/model/ffnn.hpp>
-#include <system_error>
 
 #include "catch.hpp"