Skip to content

Commit

Permalink
Add CUDA benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
niermann999 committed Jan 8, 2025
1 parent 39c0749 commit d3b6d27
Show file tree
Hide file tree
Showing 15 changed files with 279 additions and 68 deletions.
6 changes: 0 additions & 6 deletions core/include/detray/propagator/propagator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,6 @@ struct propagator {
/// Construct the propagator from a propagation configuration.
///
/// @param cfg the propagation configuration used to initialize @c m_cfg
explicit constexpr propagator(const propagation::config &cfg)
: m_cfg{cfg} {}

/// Read-only access to the actor chain held by this propagator.
///
/// @returns a const reference to the @c run_actors member
// NOTE(review): DETRAY_HOST_DEVICE presumably makes this callable from both
// host and device code - confirm against the macro definition.
DETRAY_HOST_DEVICE
constexpr const actor_chain_t &get_actor_chain() const {
return run_actors;
}

/// Propagation state that aggregates a stepping and a navigation state. It
/// also keeps references to the actor states.
struct state {
Expand Down
2 changes: 2 additions & 0 deletions tests/benchmarks/cpu/propagation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
Expand Down Expand Up @@ -97,6 +98,7 @@ int main(int argc, char** argv) {
// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
Expand Down
22 changes: 6 additions & 16 deletions tests/benchmarks/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,17 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Look for openMP, which is used for the CPU benchmark
find_package(OpenMP)

# make unit tests for multiple algebras
# Currently vc and smatrix are not supported
set(algebras "array")
# Build benchmarks for multiple algebra plugins
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebras "eigen")
list(APPEND algebra_plugins "eigen")
endif()

foreach(algebra ${algebras})
foreach(algebra ${algebra_plugins})
detray_add_executable(benchmark_cuda_propagation_${algebra}
"propagation.cpp"
LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
LINK_LIBRARIES detray::benchmark_cuda_${algebra} detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
)

target_compile_definitions(
Expand All @@ -40,11 +37,4 @@ foreach(algebra ${algebras})
detray_benchmark_cuda_propagation_${algebra}
PRIVATE "-march=native" "-ftree-vectorize"
)

if(OpenMP_CXX_FOUND)
target_link_libraries(
detray_benchmark_cuda_propagation_${algebra}
PRIVATE OpenMP::OpenMP_CXX
)
endif()
endforeach()
4 changes: 3 additions & 1 deletion tests/benchmarks/cuda/propagation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
Expand Down Expand Up @@ -87,11 +88,12 @@ int main(int argc, char** argv) {
n_tracks);

// Specific configuration for the random track generation
trk_cfg.seed(42u);
trk_cfg.seed(detail::random_numbers<scalar>::default_seed());

// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2023-2024 CERN for the benefit of the ACTS project
* (c) 2023-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
Expand Down Expand Up @@ -52,12 +52,12 @@ struct host_propagation_bm : public benchmark_base {
configuration &config() { return m_cfg; }

/// Prepare data and run benchmark loop
inline void operator()(::benchmark::State &state,
dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det,
const bfield_t *bfield,
typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
inline void operator()(
::benchmark::State &state,
const dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det, const bfield_t *bfield,
const typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
using actor_chain_t = typename propagator_t::actor_chain_type;
using actor_states_t = typename actor_chain_t::state_tuple;

Expand All @@ -76,7 +76,8 @@ struct host_propagation_bm : public benchmark_base {

// Call the host propagation
auto run_propagation = [&p, det, bfield, input_actor_states](
free_track_parameters<algebra_t> &track) {
const free_track_parameters<algebra_t>
&track) {
// Fresh copy of actor states
actor_states_t actor_states(*input_actor_states);
// Tuple of references to pass to the propagator
Expand All @@ -103,17 +104,26 @@ struct host_propagation_bm : public benchmark_base {
// Warm-up
if (m_cfg.benchmark().do_warmup()) {
assert(n_warmup > 0);
auto stride{n_samples / n_warmup};
int stride{n_samples / n_warmup};
stride = (stride == 0) ? 10 : stride;
assert(stride > 0);

#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < n_samples; i += stride) {
// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
}
} else {
std::cout << "WARNING: Running host benchmarks without warmup"
<< std::endl;
}

// Run the benchmark

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
#pragma omp parallel for schedule(dynamic)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,34 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Build benchmark library for multiple algebra plugins to create correct
# template instantiations
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebra_plugins "eigen")
endif()

# Set up a benchmark library for CUDA
add_library(
detray_benchmark_cuda
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)
foreach(algebra ${algebra_plugins})
add_library(
detray_benchmark_cuda_${algebra}
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)

add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
add_library(
detray::benchmark_cuda_${algebra}
ALIAS detray_benchmark_cuda_${algebra}
)

target_link_libraries(
detray_benchmark_cuda
PUBLIC vecmem::cuda detray::benchmarks detray::test_utils detray::core_array
)
target_link_libraries(
detray_benchmark_cuda_${algebra}
PUBLIC
vecmem::cuda
detray::benchmarks
detray::test_utils
detray::core_${algebra}
)
endforeach()
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2022-2024 CERN for the benefit of the ACTS project
* (c) 2022-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
Expand All @@ -15,7 +15,7 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagation::config cfg,
typename propagator_t::detector_type::view_type det_view,
typename propagator_t::stepper_type::magnetic_field_type field_view,
typename propagator_t::actor_chain_type::state_tuple
const typename propagator_t::actor_chain_type::state_tuple
*device_actor_state_ptr,
vecmem::data::vector_view<
free_track_parameters<typename propagator_t::algebra_type>>
Expand All @@ -30,8 +30,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagator<typename propagator_t::stepper_type,
navigator<detector_device_t>, actor_chain_t>;

detector_device_t det(det_view);
vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_view);
const detector_device_t det(det_view);
const vecmem::device_vector<free_track_parameters<algebra_t>> tracks(
tracks_view);

int gid = threadIdx.x + blockIdx.x * blockDim.x;
if (gid >= tracks.size()) {
Expand All @@ -46,6 +47,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
auto actor_state_refs = actor_chain_t::setup_actor_states(actor_states);

// Create the propagator state

// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
typename propagator_device_t::state p_state(tracks.at(gid), field_view,
det);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
* (c) 2024-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
Expand Down Expand Up @@ -40,6 +40,7 @@
// System include(s)
#include <algorithm>
#include <cassert>
#include <iostream>
#include <random>
#include <string>

Expand Down Expand Up @@ -85,6 +86,7 @@ void run_propagation_kernel(
const int);

/// Allocate actor state blueprint on device
/// @note This only works if each actor state in the tuple is essentially POD
template <typename propagator_t>
typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
typename propagator_t::actor_chain_type::state_tuple *);
Expand Down Expand Up @@ -155,14 +157,22 @@ struct cuda_propagation_bm : public benchmark_base {
setup_actor_states<propagator_t>(input_actor_states);

// Do a small warm up run
{
if (m_cfg.benchmark().do_warmup()) {
auto warmup_track_buffer = detray::get_buffer(
vecmem::get_data(*tracks), *dev_mr, cuda_cpy);

run_propagation_kernel<propagator_t, kOPT>(
m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
warmup_track_buffer, math::min(n_warmup, n_samples));
} else {
std::cout << "WARNING: Running CUDA benchmarks without warmup is "
"not recommended"
<< std::endl;
}

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
// Launch the propagator test for GPU device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,19 @@ inline void register_benchmark(

std::cout << bench_name << "\n" << bench_cfg;

// Cpu benchmark
if constexpr (std::is_invocable_v<
decltype(prop_benchmark), ::benchmark::State &,
dvector<free_track_parameters<algebra_t>> *,
const detector_t *, const bfield_bknd_t *,
typename propagator_t::actor_chain_type::state_tuple
*>) {
// Cpu benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
&tracks, &det, &bfield,
actor_states);
//->MeasureProcessCPUTime();
} else {

// Device benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
dev_mr, &tracks, &det, &bfield,
actor_states);
Expand Down
2 changes: 1 addition & 1 deletion tests/tools/include/detray/options/propagation_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ void add_options<detray::navigation::config>(
"mask_tolerance_scalor",
boost::program_options::value<float>()->default_value(
cfg.mask_tolerance_scalor),
"Mask tolerance scaling")(
"Mask tolerance scale factor")(
"overstep_tolerance",
boost::program_options::value<float>()->default_value(
cfg.overstep_tolerance / unit<float>::um),
Expand Down
12 changes: 6 additions & 6 deletions tests/tools/include/detray/options/track_generator_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ void add_uniform_track_gen_options(
"Coordintates for particle gun origin position [mm]")(
"p_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Total momentum [range] of the test particle [GeV]")(
"Total momentum [range] of the test particles [GeV]")(
"pT_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Transverse momentum [range] of the test particle [GeV]");
"Transverse momentum [range] of the test particles [GeV]");
}

/// Add options for detray event generation
Expand Down Expand Up @@ -149,20 +149,20 @@ void add_rnd_track_gen_options(
"Seed for the random number generator")(
"theta_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Min, Max range of theta values for particle gun")(
"Min, Max range of theta values for particle gun. Interval in [0, pi)")(
"eta_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Min, Max range of eta values for particle gun")(
"randomize_charge", "Randomly flip charge sign per track")(
"origin",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Coordintates for particle gun origin position")(
"Coordintates for particle gun origin position [mm]")(
"p_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Total momentum [range] of the test particle [GeV]")(
"Total momentum [range] of the test particles [GeV]")(
"pT_range",
boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
"Transverse momentum [range] of the test particle [GeV]");
"Transverse momentum [range] of the test particles [GeV]");
}

/// Add options for detray event generation
Expand Down
8 changes: 4 additions & 4 deletions tests/tools/src/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,19 @@ if(DETRAY_BUILD_BENCHMARKS)

# Build the propagation benchmark executable.
macro(detray_add_propagation_benchmark algebra)
detray_add_executable(propagation_benchmark_${algebra}
detray_add_executable(propagation_benchmark_cpu_${algebra}
"propagation_benchmark.cpp"
LINK_LIBRARIES Boost::program_options benchmark::benchmark benchmark::benchmark_main vecmem::core detray::core_${algebra} detray::benchmarks detray::benchmark_cpu detray::tools detray::detectors
LINK_LIBRARIES Boost::program_options benchmark::benchmark benchmark::benchmark_main vecmem::core detray::benchmark_cpu detray::core_${algebra} detray::tools detray::detectors
)

target_compile_options(
detray_propagation_benchmark_${algebra}
detray_propagation_benchmark_cpu_${algebra}
PRIVATE "-march=native" "-ftree-vectorize"
)

if(OpenMP_CXX_FOUND)
target_link_libraries(
detray_propagation_benchmark_${algebra}
detray_propagation_benchmark_cpu_${algebra}
PRIVATE OpenMP::OpenMP_CXX
)
endif()
Expand Down
Loading

0 comments on commit d3b6d27

Please sign in to comment.