feat: benchmark CLI tool #904

Open · wants to merge 2 commits into base: main
4 changes: 3 additions & 1 deletion tests/benchmarks/cpu/propagation.cpp
@@ -40,6 +40,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -92,11 +93,12 @@ int main(int argc, char** argv) {
n_tracks);

// Specific configuration for the random track generation
trk_cfg.seed(42u);
trk_cfg.seed(detail::random_numbers<scalar>::default_seed());

// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
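For reference, the warm-up sizing above adds roughly 10% of the generated sample as extra propagations. A minimal sketch of that arithmetic (the track count below is illustrative, not taken from this PR):

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
    // Illustrative sample size, not a value from this PR
    const std::size_t n_tracks = 25000u;

    // Same rounding as the diff: warm up with ceil(0.1 * n_tracks) tracks
    const int n_warmup =
        static_cast<int>(std::ceil(0.1f * static_cast<float>(n_tracks)));

    std::cout << "n_warmup = " << n_warmup << '\n';  // prints 2500
    return 0;
}
```
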
22 changes: 6 additions & 16 deletions tests/benchmarks/cuda/CMakeLists.txt
@@ -15,20 +15,17 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Look for openMP, which is used for the CPU benchmark
find_package(OpenMP)

# make unit tests for multiple algebras
# Currently vc and smatrix is not supported
set(algebras "array")
# Build benchmarks for multiple algebra plugins
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebras "eigen")
list(APPEND algebra_plugins "eigen")
endif()

foreach(algebra ${algebras})
foreach(algebra ${algebra_plugins})
detray_add_executable(benchmark_cuda_propagation_${algebra}
"propagation.cpp"
LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
LINK_LIBRARIES detray::benchmark_cuda_${algebra} detray::core_${algebra} vecmem::cuda detray::test_utils
)

target_compile_definitions(
@@ -40,11 +37,4 @@ foreach(algebra ${algebras})
detray_benchmark_cuda_propagation_${algebra}
PRIVATE "-march=native" "-ftree-vectorize"
)

if(OpenMP_CXX_FOUND)
target_link_libraries(
detray_benchmark_cuda_propagation_${algebra}
PRIVATE OpenMP::OpenMP_CXX
)
endif()
endforeach()
4 changes: 3 additions & 1 deletion tests/benchmarks/cuda/propagation.cpp
@@ -41,6 +41,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -87,11 +88,12 @@ int main(int argc, char** argv) {
n_tracks);

// Specific configuration for the random track generation
trk_cfg.seed(42u);
trk_cfg.seed(detail::random_numbers<scalar>::default_seed());

// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2023-2024 CERN for the benefit of the ACTS project
* (c) 2023-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -52,12 +52,12 @@ struct host_propagation_bm : public benchmark_base {
configuration &config() { return m_cfg; }

/// Prepare data and run benchmark loop
inline void operator()(::benchmark::State &state,
dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det,
const bfield_t *bfield,
typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
inline void operator()(
::benchmark::State &state,
const dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det, const bfield_t *bfield,
const typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
using actor_chain_t = typename propagator_t::actor_chain_type;
using actor_states_t = typename actor_chain_t::state_tuple;

@@ -76,7 +76,8 @@

// Call the host propagation
auto run_propagation = [&p, det, bfield, input_actor_states](
free_track_parameters<algebra_t> &track) {
const free_track_parameters<algebra_t>
&track) {
// Fresh copy of actor states
actor_states_t actor_states(*input_actor_states);
// Tuple of references to pass to the propagator
@@ -103,17 +104,26 @@
// Warm-up
if (m_cfg.benchmark().do_warmup()) {
assert(n_warmup > 0);
auto stride{n_samples / n_warmup};
int stride{n_samples / n_warmup};
stride = (stride == 0) ? 10 : stride;
assert(stride > 0);

#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < n_samples; i += stride) {
// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
}
} else {
std::cout << "WARNING: Running host benchmarks without warmup"
<< std::endl;
}

// Run the benchmark

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
#pragma omp parallel for schedule(dynamic)
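The propagation-rate comment above refers to Google Benchmark's custom counters. As a rough, self-contained sketch of that mechanism (benchmark name, counter name and batch size are invented, not from this PR), a tracks-per-second rate can be reported like this:

```cpp
#include <benchmark/benchmark.h>

#include <cstddef>

static void BM_propagation_rate_sketch(benchmark::State& state) {
    std::size_t total_tracks = 0u;
    for (auto _ : state) {
        // ... propagate one batch of tracks here ...
        total_tracks += 1000u;  // pretend 1000 tracks were processed
    }
    // kIsRate makes Google Benchmark divide the count by the total run time,
    // so the reported value is tracks per second
    state.counters["TracksPropagated"] = benchmark::Counter(
        static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
}
BENCHMARK(BM_propagation_rate_sketch);
BENCHMARK_MAIN();
```
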
@@ -13,17 +13,34 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Build benchmark library for multiple algebra plugins to create correct
# template instantiations
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebra_plugins "eigen")
endif()

# Set up a benchmark library for CUDA
add_library(
detray_benchmark_cuda
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)
foreach(algebra ${algebra_plugins})
add_library(
detray_benchmark_cuda_${algebra}
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)

add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
add_library(
detray::benchmark_cuda_${algebra}
ALIAS detray_benchmark_cuda_${algebra}
)

target_link_libraries(
detray_benchmark_cuda
PUBLIC vecmem::cuda detray::benchmarks detray::test_utils detray::core_array
)
target_link_libraries(
detray_benchmark_cuda_${algebra}
PUBLIC
vecmem::cuda
detray::benchmarks
detray::test_utils
detray::core_${algebra}
)
endforeach()
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2022-2024 CERN for the benefit of the ACTS project
* (c) 2022-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -15,7 +15,7 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagation::config cfg,
typename propagator_t::detector_type::view_type det_view,
typename propagator_t::stepper_type::magnetic_field_type field_view,
typename propagator_t::actor_chain_type::state_tuple
const typename propagator_t::actor_chain_type::state_tuple
*device_actor_state_ptr,
vecmem::data::vector_view<
free_track_parameters<typename propagator_t::algebra_type>>
@@ -30,8 +30,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagator<typename propagator_t::stepper_type,
navigator<detector_device_t>, actor_chain_t>;

detector_device_t det(det_view);
vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_view);
const detector_device_t det(det_view);
const vecmem::device_vector<free_track_parameters<algebra_t>> tracks(
tracks_view);

int gid = threadIdx.x + blockIdx.x * blockDim.x;
if (gid >= tracks.size()) {
@@ -46,6 +47,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
auto actor_state_refs = actor_chain_t::setup_actor_states(actor_states);

// Create the propagator state

// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
typename propagator_device_t::state p_state(tracks.at(gid), field_view,
det);

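For context on the kernel above: in __launch_bounds__(256, 4) the first argument caps the threads per block and the second requests a minimum number of resident blocks per multiprocessor, while the gid check guards the last, partially filled block. A stand-alone CUDA sketch of the same launch pattern (kernel and sizes are illustrative, not the detray code):

```cuda
#include <cuda_runtime.h>

#include <cstdio>

__global__ void __launch_bounds__(256, 4) scale_kernel(const float* in,
                                                       float* out, int n) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= n) {
        return;  // guard threads of the last, partially filled block
    }
    out[gid] = 2.f * in[gid];
}

int main() {
    constexpr int n = 1000;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, n * sizeof(float));
    cudaMalloc(&out, n * sizeof(float));

    // Block size must not exceed the first __launch_bounds__ argument (256)
    constexpr int block_dim = 256;
    const int grid_dim = (n + block_dim - 1) / block_dim;  // 4 blocks for n=1000
    scale_kernel<<<grid_dim, block_dim>>>(in, out, n);
    cudaDeviceSynchronize();

    std::printf("launched %d blocks of %d threads\n", grid_dim, block_dim);
    cudaFree(in);
    cudaFree(out);
    return 0;
}
```
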
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
* (c) 2024-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -40,6 +40,7 @@
// System include(s)
#include <algorithm>
#include <cassert>
#include <iostream>
#include <random>
#include <string>

@@ -85,6 +86,7 @@ void run_propagation_kernel(
const int);

/// Allocate actor state blueprint on device
/// @note This only works if each actor state in the tuple is essentially POD
template <typename propagator_t>
typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
typename propagator_t::actor_chain_type::state_tuple *);
@@ -155,14 +157,22 @@ struct cuda_propagation_bm : public benchmark_base {
setup_actor_states<propagator_t>(input_actor_states);

// Do a small warm up run
{
if (m_cfg.benchmark().do_warmup()) {
auto warmup_track_buffer = detray::get_buffer(
vecmem::get_data(*tracks), *dev_mr, cuda_cpy);

run_propagation_kernel<propagator_t, kOPT>(
m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
warmup_track_buffer, math::min(n_warmup, n_samples));
} else {
std::cout << "WARNING: Running CUDA benchmarks without warmup is "
"not recommended"
<< std::endl;
}

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
// Launch the propagator test for GPU device
@@ -171,19 +171,19 @@ inline void register_benchmark(

std::cout << bench_name << "\n" << bench_cfg;

// Cpu benchmark
if constexpr (std::is_invocable_v<
decltype(prop_benchmark), ::benchmark::State &,
dvector<free_track_parameters<algebra_t>> *,
const detector_t *, const bfield_bknd_t *,
typename propagator_t::actor_chain_type::state_tuple
*>) {
// Cpu benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
&tracks, &det, &bfield,
actor_states);
//->MeasureProcessCPUTime();
} else {

// Device benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
dev_mr, &tracks, &det, &bfield,
actor_states);
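The registration helper above distinguishes host from device benchmarks purely by call signature, using if constexpr with std::is_invocable_v. A simplified, self-contained illustration of that dispatch pattern (functor types and arguments are invented for the example):

```cpp
#include <iostream>
#include <type_traits>

struct host_bm {
    void operator()(int n_tracks) const {
        std::cout << "host benchmark: " << n_tracks << " tracks\n";
    }
};

struct device_bm {
    void operator()(int n_tracks, void* dev_mr) const {
        std::cout << "device benchmark: " << n_tracks << " tracks\n";
    }
};

// Pick the correct argument list at compile time, as the diff does when
// calling ::benchmark::RegisterBenchmark for the CPU vs. device cases
template <typename benchmark_t>
void register_bm(const benchmark_t& bm) {
    if constexpr (std::is_invocable_v<benchmark_t, int>) {
        bm(1000);           // host-style signature
    } else {
        bm(1000, nullptr);  // device-style signature takes a memory resource
    }
}

int main() {
    register_bm(host_bm{});
    register_bm(device_bm{});
    return 0;
}
```
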
1 change: 0 additions & 1 deletion tests/tools/CMakeLists.txt
@@ -21,7 +21,6 @@ target_link_libraries(
INTERFACE
Boost::program_options
vecmem::core
detray::core_array
detray::test_common
detray::io
detray::csv_io
3 changes: 2 additions & 1 deletion tests/tools/include/detray/options/propagation_options.hpp
@@ -42,7 +42,7 @@ void add_options<detray::navigation::config>(
"mask_tolerance_scalor",
boost::program_options::value<float>()->default_value(
cfg.mask_tolerance_scalor),
"Mask tolerance scaling")(
"Mask tolerance scale factor")(
"overstep_tolerance",
boost::program_options::value<float>()->default_value(
cfg.overstep_tolerance / unit<float>::um),
@@ -172,6 +172,7 @@

cfg.path_limit = path_limit * unit<float>::m;
}
cfg.do_covariance_transport = false;
if (vm.count("covariance_transport")) {
cfg.do_covariance_transport = true;
}
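The explicit cfg.do_covariance_transport = false; default above pairs with a value-less Boost.Program_options flag that is detected via vm.count(). A small stand-alone sketch of that pattern (only the option name is taken from the diff; the rest is illustrative):

```cpp
#include <boost/program_options.hpp>

#include <iostream>

int main(int argc, char** argv) {
    namespace po = boost::program_options;

    po::options_description desc("Propagation options");
    // A flag without a value: its mere presence on the command line enables it
    desc.add_options()("covariance_transport",
                       "Enable covariance transport during stepping");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    bool do_covariance_transport = false;  // off unless the flag was passed
    if (vm.count("covariance_transport")) {
        do_covariance_transport = true;
    }

    std::cout << std::boolalpha << "covariance transport: "
              << do_covariance_transport << '\n';
    return 0;
}
```
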