Skip to content

Commit

Permalink
Merge pull request #359 from bluescarni/pr/perf_fixes
Browse files Browse the repository at this point in the history
Performance and cache fixes
  • Loading branch information
bluescarni authored Nov 9, 2023
2 parents abd3c0c + dfd0bc3 commit 04c38cb
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 35 deletions.
4 changes: 4 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ Changes
Fix
~~~

- Fix global constants in an LLVM module being generated in unordered fashion
when compact mode is active. This would result in two logically-identical
modules being considered different by the in-memory cache
(`#359 <https://github.com/bluescarni/heyoka/pull/359>`__).
- Fix compiler warning when building without SLEEF support
(`#356 <https://github.com/bluescarni/heyoka/pull/356>`__).
- Improve the numerical stability of the VSOP2013 model
Expand Down
2 changes: 2 additions & 0 deletions include/heyoka/expression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,11 +632,13 @@ HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func(llvm_state &, llvm::Type *,
std::uint32_t, bool);

HEYOKA_DLL_PUBLIC std::uint32_t get_param_size(const expression &);
HEYOKA_DLL_PUBLIC std::uint32_t get_param_size(const std::vector<expression> &);

HEYOKA_DLL_PUBLIC std::vector<expression> get_params(const expression &);
HEYOKA_DLL_PUBLIC std::vector<expression> get_params(const std::vector<expression> &);

HEYOKA_DLL_PUBLIC bool is_time_dependent(const expression &);
HEYOKA_DLL_PUBLIC bool is_time_dependent(const std::vector<expression> &);

namespace detail
{
Expand Down
74 changes: 49 additions & 25 deletions src/expression_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,11 +1081,9 @@ std::uint32_t get_param_size(detail::funcptr_set &func_set, const expression &ex
using type = uncvref_t<decltype(v)>;

if constexpr (std::is_same_v<type, param>) {
if (v.idx() == std::numeric_limits<std::uint32_t>::max()) {
throw std::overflow_error("Overflow dected in get_n_param()");
}
using safe_uint32_t = boost::safe_numerics::safe<std::uint32_t>;

retval = std::max(static_cast<std::uint32_t>(v.idx() + 1u), retval);
retval = std::max(static_cast<std::uint32_t>(v.idx() + safe_uint32_t(1)), retval);
} else if constexpr (std::is_same_v<type, func>) {
const auto f_id = v.get_ptr();

Expand Down Expand Up @@ -1124,6 +1122,19 @@ std::uint32_t get_param_size(const expression &ex)
return detail::get_param_size(func_set, ex);
}

std::uint32_t get_param_size(const std::vector<expression> &v_ex)
{
std::uint32_t retval = 0;

detail::funcptr_set func_set;

for (const auto &ex : v_ex) {
retval = std::max(retval, detail::get_param_size(func_set, ex));
}

return retval;
}

namespace detail
{

Expand Down Expand Up @@ -1221,47 +1232,45 @@ namespace
{

// NOLINTNEXTLINE(misc-no-recursion)
bool is_time_dependent(funcptr_map<bool> &func_map, const expression &ex)
bool is_time_dependent(funcptr_set &func_set, const expression &ex)
{
// - If ex is a function, check if it is time-dependent, or
// if any of its arguments is time-dependent,
// - otherwise, return false.
return std::visit(
// NOLINTNEXTLINE(misc-no-recursion)
[&func_map](const auto &v) {
[&func_set](const auto &v) {
using type = uncvref_t<decltype(v)>;

if constexpr (std::is_same_v<type, func>) {
const auto f_id = v.get_ptr();

// Did we already determine if v is time-dependent?
if (const auto it = func_map.find(f_id); it != func_map.end()) {
return it->second;
// Did we already determine that v is *not* time-dependent?
if (const auto it = func_set.find(f_id); it != func_set.end()) {
return false;
}

// Check if the function is intrinsically time-dependent.
bool is_tm_dep = v.is_time_dependent();
if (!is_tm_dep) {
// The function does **not** intrinsically depend on time.
// Check its arguments.
for (const auto &a : v.args()) {
if (is_time_dependent(func_map, a)) {
// A time-dependent argument was found. Update
// is_tm_dep and break out, no point in checking
// the other arguments.
is_tm_dep = true;
break;
}
if (v.is_time_dependent()) {
return true;
}

// The function does *not* intrinsically depend on time.
// Check its arguments.
for (const auto &a : v.args()) {
if (is_time_dependent(func_set, a)) {
// A time-dependent argument was found, return true.
return true;
}
}

// Update the cache.
[[maybe_unused]] const auto [_, flag] = func_map.emplace(f_id, is_tm_dep);
[[maybe_unused]] const auto [_, flag] = func_set.emplace(f_id);

// An expression cannot contain itself.
assert(flag);

return is_tm_dep;
return false;
} else {
return false;
}
Expand All @@ -1276,9 +1285,24 @@ bool is_time_dependent(funcptr_map<bool> &func_map, const expression &ex)
// Determine if an expression is time-dependent.
bool is_time_dependent(const expression &ex)
{
detail::funcptr_map<bool> func_map;
// NOTE: this will contain pointers to (sub)expressions
// which are *not* time-dependent.
detail::funcptr_set func_set;

return detail::is_time_dependent(func_set, ex);
}

bool is_time_dependent(const std::vector<expression> &v_ex)
{
detail::funcptr_set func_set;

for (const auto &ex : v_ex) {
if (detail::is_time_dependent(func_set, ex)) {
return true;
}
}

return detail::is_time_dependent(func_map, ex);
return false;
}

namespace detail
Expand Down
18 changes: 14 additions & 4 deletions src/expression_cfunc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,16 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
// will contain {f : [[a, b, c], [d, e, f]]}.
// After construction, we have verified that for each function
// in the map the sets of arguments have all the same size.
fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
// NOTE: again, here and below we use name-based ordered maps for the functions.
// This ensures that the invocations of cm_make_arg_gen_*(), which create several
// global variables, always happen in a well-defined order. If we used an unordered map instead,
// the variables would be created in a "random" order, which would result in a
// unnecessary miss for the in-memory cache machinery when two logically-identical
// LLVM modules are considered different because of the difference in the order
// of declaration of global variables.
std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>,
llvm_func_name_compare>
tmp_map;

for (const auto &ex : seg) {
// Get the evaluation function.
Expand All @@ -1021,14 +1030,14 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto
// element into a set of indices/constants.
const auto c_args = udef_to_variants(ex, {});

// LCOV_EXCL_START
if (!is_new_func && it->second.back().size() - 1u != c_args.size()) {
// LCOV_EXCL_START
throw std::invalid_argument(
fmt::format("Inconsistent arity detected in a compiled function in compact "
"mode: the same function is being called with both {} and {} arguments",
it->second.back().size() - 1u, c_args.size()));
// LCOV_EXCL_STOP
}
// LCOV_EXCL_STOP

// Add the new set of arguments.
it->second.emplace_back();
Expand All @@ -1042,7 +1051,8 @@ auto cfunc_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vecto

// Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
// to {f : [[a, d], [b, e], [c, f]]}.
fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
llvm_func_name_compare>
tmp_map_transpose;
for (const auto &[func, vv] : tmp_map) {
assert(!vv.empty()); // LCOV_EXCL_LINE
Expand Down
17 changes: 14 additions & 3 deletions src/taylor_02.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@
#endif

#include <heyoka/detail/cm_utils.hpp>
#include <heyoka/detail/fast_unordered.hpp>
#include <heyoka/detail/fwd_decl.hpp>
#include <heyoka/detail/llvm_func_create.hpp>
#include <heyoka/detail/llvm_fwd.hpp>
Expand Down Expand Up @@ -749,7 +748,16 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
// will contain {f : [[a, b, c], [d, e, f]]}.
// After construction, we have verified that for each function
// in the map the sets of arguments have all the same size.
fast_umap<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
// NOTE: again, here and below we use name-based ordered maps for the functions.
// This ensures that the invocations of cm_make_arg_gen_*(), which create several
// global variables, always happen in a well-defined order. If we used an unordered map instead,
// the variables would be created in a "random" order, which would result in a
// unnecessary miss for the in-memory cache machinery when two logically-identical
// LLVM modules are considered different because of the difference in the order
// of declaration of global variables.
std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>,
llvm_func_name_compare>
tmp_map;

for (const auto &ex : seg) {
// Get the function for the computation of the derivative.
Expand All @@ -764,12 +772,14 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
// element into a set of indices/constants.
const auto cdiff_args = udef_to_variants(ex.first, ex.second);

// LCOV_EXCL_START
if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
throw std::invalid_argument(
fmt::format("Inconsistent arity detected in a Taylor derivative function in compact "
"mode: the same function is being called with both {} and {} arguments",
it->second.back().size() - 1u, cdiff_args.size()));
}
// LCOV_EXCL_STOP

// Add the new set of arguments.
it->second.emplace_back();
Expand All @@ -783,7 +793,8 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect

// Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
// to {f : [[a, d], [b, e], [c, f]]}.
fast_umap<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
llvm_func_name_compare>
tmp_map_transpose;
for (const auto &[func, vv] : tmp_map) {
assert(!vv.empty()); // LCOV_EXCL_LINE
Expand Down
26 changes: 24 additions & 2 deletions test/expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <sstream>
#include <stdexcept>
#include <string>
#include <system_error>
#include <type_traits>
#include <unordered_map>
#include <utility>
Expand Down Expand Up @@ -331,8 +332,7 @@ TEST_CASE("get_param_size")

REQUIRE(get_param_size(par[0]) == 1u);
REQUIRE(get_param_size(par[123]) == 124u);
REQUIRE_THROWS_MATCHES(get_param_size(par[std::numeric_limits<std::uint32_t>::max()]), std::overflow_error,
Message("Overflow dected in get_n_param()"));
REQUIRE_THROWS_AS(get_param_size(par[std::numeric_limits<std::uint32_t>::max()]), std::system_error);
REQUIRE(get_param_size(par[123] + "x"_var) == 124u);
REQUIRE(get_param_size("x"_var + par[123]) == 124u);
REQUIRE(get_param_size(par[123] + 1_dbl) == 124u);
Expand Down Expand Up @@ -362,6 +362,15 @@ TEST_CASE("get_param_size")
bar = par[23] + (foo - x) / (2. * foo) + par[92];

REQUIRE(get_param_size(bar) == 93u);

// Tests with vectorisation.
REQUIRE(get_param_size({par[0], par[123]}) == 124u);
REQUIRE(get_param_size({par[123], par[0]}) == 124u);
REQUIRE(get_param_size({bar, bar + par[86]}) == 93u);
REQUIRE(get_param_size({bar, bar + par[186]}) == 187u);
REQUIRE(get_param_size({bar + par[86], bar}) == 93u);
REQUIRE(get_param_size({bar + par[186], bar}) == 187u);
REQUIRE(get_param_size({"x"_var, 2_dbl}) == 0u);
}

TEST_CASE("binary simpls")
Expand Down Expand Up @@ -557,6 +566,19 @@ TEST_CASE("is_time_dependent")
bar = hy::time + (foo - x) / (2. * foo) + hy::time;

REQUIRE(is_time_dependent(bar));

// Tests with vectorization.
REQUIRE(is_time_dependent({bar}));
REQUIRE(is_time_dependent({x, bar}));
REQUIRE(is_time_dependent({bar, x}));
REQUIRE(!is_time_dependent({2_dbl - par[0], x}));

foo = ((x + y) * (z + x)) * ((z - x) * (y + x));
bar = (foo - x) / (2. * foo);
REQUIRE(!is_time_dependent({x, bar}));
REQUIRE(is_time_dependent({x, bar, hy::time}));
REQUIRE(is_time_dependent({x, hy::time, bar}));
REQUIRE(is_time_dependent({hy::time, x, bar}));
}

TEST_CASE("s11n")
Expand Down
46 changes: 46 additions & 0 deletions test/llvm_state_mem_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <heyoka/kw.hpp>
#include <heyoka/llvm_state.hpp>
#include <heyoka/model/pendulum.hpp>
#include <heyoka/taylor.hpp>
Expand Down Expand Up @@ -140,3 +141,48 @@ TEST_CASE("slp_vectorize test")
model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::slp_vectorize = true, kw::force_avx512 = true};
REQUIRE(llvm_state::get_memcache_size() > new_size);
}

// Bug: in compact mode, global variables used to be created in random
// order, which would lead to logically-identical modules considered
// different by the cache machinery due to the different declaration order.
TEST_CASE("bug cache miss compact mode")
{
{
llvm_state::clear_memcache();
llvm_state::set_memcache_limit(2048ull * 1024u * 1024u);

auto ta = taylor_adaptive<double>{model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::compact_mode = true};
const auto orig_size = llvm_state::get_memcache_size();

// Re-create the same ta several times and then check the cache size has not changed.
for (auto i = 0; i < 100; ++i) {
ta = taylor_adaptive<double>{model::pendulum(), {1., 0.}, kw::tol = 1e-11, kw::compact_mode = true};
}

REQUIRE(llvm_state::get_memcache_size() == orig_size);
}

{
llvm_state::clear_memcache();
llvm_state::set_memcache_limit(2048ull * 1024u * 1024u);

{
llvm_state s;
add_cfunc<double>(s, "func", {model::pendulum_energy()}, kw::compact_mode = true);
s.compile();
(void)s.jit_lookup("func");
}

const auto orig_size = llvm_state::get_memcache_size();

// Re-create the same cfunc several times and then check the cache size has not changed.
for (auto i = 0; i < 100; ++i) {
llvm_state s;
add_cfunc<double>(s, "func", {model::pendulum_energy()}, kw::compact_mode = true);
s.compile();
(void)s.jit_lookup("func");
}

REQUIRE(llvm_state::get_memcache_size() == orig_size);
}
}
2 changes: 1 addition & 1 deletion test/model_ffnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <functional>
#include <limits>
#include <stdexcept>
#include <system_error>

#include <fmt/core.h>
#include <fmt/ranges.h>
Expand All @@ -19,7 +20,6 @@
#include <heyoka/kw.hpp>
#include <heyoka/math.hpp>
#include <heyoka/model/ffnn.hpp>
#include <system_error>

#include "catch.hpp"

Expand Down

0 comments on commit 04c38cb

Please sign in to comment.