Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance optimisations #354

Merged
merged 19 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ ADD_HEYOKA_BENCHMARK(bandf)
ADD_HEYOKA_BENCHMARK(bandf_odeint)
ADD_HEYOKA_BENCHMARK(penc_comparison)
ADD_HEYOKA_BENCHMARK(large_cfunc)
ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)

if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
ADD_HEYOKA_BENCHMARK(pendulum_mp)
Expand Down
157 changes: 157 additions & 0 deletions benchmark/sims_flanagan_jac.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Copyright 2020, 2021, 2022, 2023 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <algorithm>
#include <array>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>

#include <boost/program_options.hpp>

#include <spdlog/spdlog.h>

#include <heyoka/expression.hpp>
#include <heyoka/llvm_state.hpp>
#include <heyoka/logging.hpp>
#include <heyoka/math/cos.hpp>
#include <heyoka/math/kepDE.hpp>
#include <heyoka/math/sin.hpp>
#include <heyoka/math/sqrt.hpp>

#include <fmt/ranges.h>

using namespace heyoka;

using v_ex_t = std::array<expression, 3>;

const auto mu = make_vars("mu")[0];
const auto tm = make_vars("t")[0];

std::pair<v_ex_t, v_ex_t> make_lp(const v_ex_t &pos_0, const v_ex_t &vel_0)
{
const auto &[x0, y0, z0] = pos_0;
const auto &[vx0, vy0, vz0] = vel_0;

auto v02 = vx0 * vx0 + vy0 * vy0 + vz0 * vz0;
auto r0 = sqrt(x0 * x0 + y0 * y0 + z0 * z0);
auto eps = v02 * 0.5 - mu / r0;
auto a = -mu / (2. * eps);

auto sigma0 = (x0 * vx0 + y0 * vy0 + z0 * vz0) / sqrt(mu);
auto s0 = sigma0 / sqrt(a);
auto c0 = 1. - r0 / a;

auto n = sqrt(mu / (a * a * a));
auto DM = n * tm;

auto DE = kepDE(s0, c0, DM);

auto cDE = cos(DE);
auto sDE = sin(DE);

auto r = a + (r0 - a) * cDE + sigma0 * sqrt(a) * sDE;

auto F = 1. - a / r0 * (1. - cDE);
auto G = a * sigma0 / sqrt(mu) * (1. - cDE) + r0 * sqrt(a / mu) * sDE;
auto Ft = -sqrt(mu * a) / (r * r0) * sDE;
auto Gt = 1. - a / r * (1. - cDE);

return {{{F * x0 + G * vx0, F * y0 + G * vy0, F * z0 + G * vz0}},
{{Ft * x0 + Gt * vx0, Ft * y0 + Gt * vy0, Ft * z0 + Gt * vz0}}};
}

int main(int argc, char *argv[])
{
namespace po = boost::program_options;

create_logger();
auto logger = spdlog::get("heyoka");

set_logger_level_trace();

unsigned N{};

po::options_description desc("Options");

desc.add_options()("help", "produce help message")("N", po::value<unsigned>(&N)->default_value(20u),
"number of segments");

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help") != 0u) {
std::cout << desc << "\n";
return 0;
}

if (N == 0u) {
throw std::invalid_argument("The number of segments cannot be zero");
}

auto [x0, y0, z0] = make_vars("x0", "y0", "z0");
auto [vx0, vy0, vz0] = make_vars("vx0", "vy0", "vz0");

std::vector<v_ex_t> Dvs;
for (auto i = 0u; i < N; ++i) {
auto [Dvx, Dvy, Dvz]
= make_vars(fmt::format("Dvx{}", i + 1u), fmt::format("Dvy{}", i + 1u), fmt::format("Dvz{}", i + 1u));
Dvs.push_back(v_ex_t{Dvx, Dvy, Dvz});
}

auto [pos_f, vel_f] = make_lp({x0, y0, z0}, {vx0, vy0, vz0});

for (auto i = 0u; i < N; ++i) {
vel_f[0] += Dvs[i][0];
vel_f[1] += Dvs[i][1];
vel_f[2] += Dvs[i][2];

std::tie(pos_f, vel_f) = make_lp(pos_f, vel_f);
}

std::vector<expression> Dvs_list;
for (const auto &Dv : Dvs) {
Dvs_list.push_back(Dv[0]);
Dvs_list.push_back(Dv[1]);
Dvs_list.push_back(Dv[2]);
}

std::vector<expression> diff_vars_list = {x0, y0, z0, vx0, vy0, vz0};
diff_vars_list.insert(diff_vars_list.end(), Dvs_list.begin(), Dvs_list.end());

auto dt
= diff_tensors({pos_f[0], pos_f[1], pos_f[2], vel_f[0], vel_f[1], vel_f[2]}, kw::diff_args = diff_vars_list);

std::vector<expression> jac;
auto jac_sr = dt.get_derivatives(1);
std::transform(jac_sr.begin(), jac_sr.end(), std::back_inserter(jac), [](const auto &p) { return p.second; });

llvm_state s;

std::vector<expression> vars_list = {x0, y0, z0, vx0, vy0, vz0, mu, tm};
vars_list.insert(vars_list.end(), Dvs_list.begin(), Dvs_list.end());

logger->trace("Adding cfunc");

add_cfunc<double>(s, "func", jac, kw::vars = vars_list);

logger->trace("Compiling cfunc");

s.compile();

logger->trace("Looking up cfunc");

auto *fptr
= reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("func"));

logger->trace("All done");
}
8 changes: 8 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ New
system (`#352 <https://github.com/bluescarni/heyoka/pull/352>`__).
Taylor derivatives are not implemented yet.

Changes
~~~~~~~

- Substantial performance improvements in the computation of
derivative tensors of large expressions with a high degree
of internal redundancy
(`#354 <https://github.com/bluescarni/heyoka/pull/354>`__).

Fix
~~~

Expand Down
2 changes: 2 additions & 0 deletions doc/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ html_theme_options = {
"repository_url": "https://github.com/bluescarni/heyoka",
"use_repository_button": True,
"use_issues_button": True,
# See: https://github.com/pydata/pydata-sphinx-theme/issues/1492
"navigation_with_keys": False,
}


Expand Down
67 changes: 67 additions & 0 deletions include/heyoka/detail/fast_unordered.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2020, 2021, 2022, 2023 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef HEYOKA_DETAIL_FAST_UNORDERED_HPP
#define HEYOKA_DETAIL_FAST_UNORDERED_HPP

#include <heyoka/config.hpp>

#include <boost/version.hpp>

#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)

#define HEYOKA_HAVE_BOOST_UNORDERED_FLAT

#endif

#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)

#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/unordered_flat_set.hpp>

#else

#include <unordered_map>
#include <unordered_set>

#endif

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

template <typename... Args>
using fast_uset =
#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
boost::unordered_flat_set<Args...>
#else
std::unordered_set<Args...>
#endif
;

template <typename... Args>
using fast_umap =
#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
boost::unordered_flat_map<Args...>
#else
std::unordered_map<Args...>
#endif
;

} // namespace detail

HEYOKA_END_NAMESPACE

#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)

#undef HEYOKA_HAVE_BOOST_UNORDERED_FLAT

#endif

#endif
33 changes: 3 additions & 30 deletions include/heyoka/detail/func_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,20 @@
#define HEYOKA_DETAIL_FUNC_CACHE_HPP

#include <heyoka/config.hpp>

#include <boost/version.hpp>

#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)

#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/unordered_flat_set.hpp>
#include <heyoka/detail/fast_unordered.hpp>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

using funcptr_set = boost::unordered_flat_set<const void *>;
using funcptr_set = fast_uset<const void *>;

template <typename T>
using funcptr_map = boost::unordered_flat_map<const void *, T>;
using funcptr_map = fast_umap<const void *, T>;

} // namespace detail

HEYOKA_END_NAMESPACE

#else

#include <unordered_map>
#include <unordered_set>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

using funcptr_set = std::unordered_set<const void *>;

template <typename T>
using funcptr_map = std::unordered_map<const void *, T>;

} // namespace detail

HEYOKA_END_NAMESPACE

#endif

#endif
2 changes: 2 additions & 0 deletions include/heyoka/expression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,8 @@ class HEYOKA_DLL_PUBLIC dtens

[[nodiscard]] subrange get_derivatives(std::uint32_t, std::uint32_t) const;
[[nodiscard]] subrange get_derivatives(std::uint32_t) const;
[[nodiscard]] std::vector<expression> get_gradient() const;
[[nodiscard]] std::vector<expression> get_jacobian() const;
};

HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const dtens &);
Expand Down
55 changes: 52 additions & 3 deletions src/expression_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <variant>
#include <vector>

#include <boost/container_hash/hash.hpp>
#include <boost/safe_numerics/safe_integer.hpp>

#include <llvm/IR/Function.h>
Expand Down Expand Up @@ -494,12 +495,60 @@ std::size_t hash(funcptr_map<std::size_t> &func_map, const expression &ex) noexc
ex.value());
}

// NOLINTNEXTLINE(bugprone-exception-escape)
// NOLINTNEXTLINE(bugprone-exception-escape,misc-no-recursion)
std::size_t hash(const expression &ex) noexcept
{
detail::funcptr_map<std::size_t> func_map;
// NOTE: we implement an optimisation here: if either the expression is **not** a function,
// or if it is a function and none of its arguments are functions (i.e., it is a non-recursive
// expression), then we do not recurse into detail::hash() and we do not create/update the
// funcptr_map cache. This allows us to avoid memory allocations, in an effort to optimise the
// computation of hashes in decompositions, where the elementary subexpressions are
// non-recursive by definition.
// NOTE: we must be very careful with this optimisation: we need to make sure the hash computation
// result is the same whether the optimisation is enabled or not. That is, the hash
// **must** be the same regardless of whether a non-recursive expression is standalone
// or it is part of a larger expression.
return std::visit(
// NOLINTNEXTLINE(misc-no-recursion)
[&ex](const auto &v) {
using type = detail::uncvref_t<decltype(v)>;

if constexpr (std::is_same_v<type, func>) {
const auto no_func_args = std::none_of(v.args().begin(), v.args().end(), [](const auto &x) {
return std::holds_alternative<func>(x.value());
});

if (no_func_args) {
// NOTE: it is very important that here we replicate *exactly* the logic
// of func::hash(), so that we produce the same hash value that would
// be produced if ex were part of a larger expression.
std::size_t seed = std::hash<std::string>{}(v.get_name());

for (const auto &arg : v.args()) {
// NOTE: for non-function arguments, calling hash(arg)
// is identical to calling detail::hash(func_map, arg)
// (as we do in func::hash()):
// both ultimately end up using directly the std::hash
// specialisation on the arg.
boost::hash_combine(seed, hash(arg));
}

return hash(func_map, ex);
return seed;
} else {
// The regular, non-optimised path.
detail::funcptr_map<std::size_t> func_map;

return hash(func_map, ex);
}
} else {
// ex is not a function, compute its hash directly
// via a std::hash specialisation.
// NOTE: this is the same logic implemented in detail::hash(),
// except we avoid the unnecessary creation of a funcptr_map.
return std::hash<type>{}(v);
}
},
ex.value());
}

} // namespace detail
Expand Down
Loading