Add CUDA benchmarks

acts-project · Jan 8, 2025 · d3b6d27 · d3b6d27
1 parent 39c0749
commit d3b6d27
Show file tree

Hide file tree

Showing 15 changed files with 279 additions and 68 deletions.
diff --git a/core/include/detray/propagator/propagator.hpp b/core/include/detray/propagator/propagator.hpp
@@ -55,12 +55,6 @@ struct propagator {
     explicit constexpr propagator(const propagation::config &cfg)
         : m_cfg{cfg} {}
 
-    /// @returns the actor chain
-    DETRAY_HOST_DEVICE
-    constexpr const actor_chain_t &get_actor_chain() const {
-        return run_actors;
-    }
-
     /// Propagation that state aggregates a stepping and a navigation state. It
     /// also keeps references to the actor states.
     struct state {

diff --git a/tests/benchmarks/cpu/propagation.cpp b/tests/benchmarks/cpu/propagation.cpp
@@ -40,6 +40,7 @@ int main(int argc, char** argv) {
     using test_algebra = typename toy_detector_t::algebra_type;
     using scalar = dscalar<test_algebra>;
     using vector3 = dvector3D<test_algebra>;
+
     using free_track_parameters_t = free_track_parameters<test_algebra>;
     using uniform_gen_t =
         detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -97,6 +98,7 @@ int main(int argc, char** argv) {
     // Add additional tracks for warmup
     bench_cfg.n_warmup(static_cast<int>(
         std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
+    bench_cfg.do_warmup(true);
 
     //
     // Prepare data

diff --git a/tests/benchmarks/cuda/CMakeLists.txt b/tests/benchmarks/cuda/CMakeLists.txt
@@ -15,20 +15,17 @@ enable_language(CUDA)
 # Set the CUDA build flags.
 include(detray-compiler-options-cuda)
 
-# Look for openMP, which is used for the CPU benchmark
-find_package(OpenMP)
-
-# make unit tests for multiple algebras
-# Currently vc and smatrix is not supported
-set(algebras "array")
+# Build benchmarks for multiple algebra plugins
+# Currently vc and smatrix are not supported on device
+set(algebra_plugins "array")
 if(DETRAY_EIGEN_PLUGIN)
-    list(APPEND algebras "eigen")
+    list(APPEND algebra_plugins "eigen")
 endif()
 
-foreach(algebra ${algebras})
+foreach(algebra ${algebra_plugins})
     detray_add_executable(benchmark_cuda_propagation_${algebra}
       "propagation.cpp"
-       LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
+       LINK_LIBRARIES detray::benchmark_cuda_${algebra} detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
     )
 
     target_compile_definitions(
@@ -40,11 +37,4 @@ foreach(algebra ${algebras})
         detray_benchmark_cuda_propagation_${algebra}
         PRIVATE "-march=native" "-ftree-vectorize"
     )
-
-    if(OpenMP_CXX_FOUND)
-        target_link_libraries(
-            detray_benchmark_cuda_propagation_${algebra}
-            PRIVATE OpenMP::OpenMP_CXX
-        )
-    endif()
 endforeach()
diff --git a/tests/benchmarks/cuda/propagation.cpp b/tests/benchmarks/cuda/propagation.cpp
@@ -41,6 +41,7 @@ int main(int argc, char** argv) {
     using test_algebra = typename toy_detector_t::algebra_type;
     using scalar = dscalar<test_algebra>;
     using vector3 = dvector3D<test_algebra>;
+
     using free_track_parameters_t = free_track_parameters<test_algebra>;
     using uniform_gen_t =
         detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -87,11 +88,12 @@ int main(int argc, char** argv) {
             n_tracks);
 
     // Specific configuration for the random track generation
-    trk_cfg.seed(42u);
+    trk_cfg.seed(detail::random_numbers<scalar>::default_seed());
 
     // Add additional tracks for warmup
     bench_cfg.n_warmup(static_cast<int>(
         std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
+    bench_cfg.do_warmup(true);
 
     //
     // Prepare data

diff --git a/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
@@ -1,6 +1,6 @@
 /** Detray library, part of the ACTS project (R&D line)
  *
- * (c) 2023-2024 CERN for the benefit of the ACTS project
+ * (c) 2023-2025 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -52,12 +52,12 @@ struct host_propagation_bm : public benchmark_base {
     configuration &config() { return m_cfg; }
 
     /// Prepare data and run benchmark loop
-    inline void operator()(::benchmark::State &state,
-                           dvector<free_track_parameters<algebra_t>> *tracks,
-                           const typename propagator_t::detector_type *det,
-                           const bfield_t *bfield,
-                           typename propagator_t::actor_chain_type::state_tuple
-                               *input_actor_states) const {
+    inline void operator()(
+        ::benchmark::State &state,
+        const dvector<free_track_parameters<algebra_t>> *tracks,
+        const typename propagator_t::detector_type *det, const bfield_t *bfield,
+        const typename propagator_t::actor_chain_type::state_tuple
+            *input_actor_states) const {
         using actor_chain_t = typename propagator_t::actor_chain_type;
         using actor_states_t = typename actor_chain_t::state_tuple;
 
@@ -76,7 +76,8 @@ struct host_propagation_bm : public benchmark_base {
 
         // Call the host propagation
         auto run_propagation = [&p, det, bfield, input_actor_states](
-                                   free_track_parameters<algebra_t> &track) {
+                                   const free_track_parameters<algebra_t>
+                                       &track) {
             // Fresh copy of actor states
             actor_states_t actor_states(*input_actor_states);
             // Tuple of references to pass to the propagator
@@ -103,17 +104,26 @@ struct host_propagation_bm : public benchmark_base {
         // Warm-up
         if (m_cfg.benchmark().do_warmup()) {
             assert(n_warmup > 0);
-            auto stride{n_samples / n_warmup};
+            int stride{n_samples / n_warmup};
             stride = (stride == 0) ? 10 : stride;
             assert(stride > 0);
 
 #pragma omp parallel for schedule(dynamic)
             for (int i = 0; i < n_samples; i += stride) {
+                // The track gets copied into the stepper state, so that the
+                // original track sample vector remains unchanged
                 run_propagation((*tracks)[static_cast<std::size_t>(i)]);
             }
+        } else {
+            std::cout << "WARNING: Running host benchmarks without warmup"
+                      << std::endl;
         }
 
         // Run the benchmark
+
+        // Calculate the propagation rate
+        // @see
+        // https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
         std::size_t total_tracks = 0u;
         for (auto _ : state) {
 #pragma omp parallel for schedule(dynamic)

diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
@@ -13,17 +13,34 @@ enable_language(CUDA)
 # Set the CUDA build flags.
 include(detray-compiler-options-cuda)
 
+# Build benchmark library for multiple algebra plugins to create correct
+# template instantiations
+# Currently vc and smatrix are not supported on device
+set(algebra_plugins "array")
+if(DETRAY_EIGEN_PLUGIN)
+    list(APPEND algebra_plugins "eigen")
+endif()
+
 # Set up a benchamrk library for CUDA
-add_library(
-    detray_benchmark_cuda
-    STATIC
-    "propagation_benchmark.hpp"
-    "propagation_benchmark.cu"
-)
+foreach(algebra ${algebra_plugins})
+    add_library(
+        detray_benchmark_cuda_${algebra}
+        STATIC
+        "propagation_benchmark.hpp"
+        "propagation_benchmark.cu"
+    )
 
-add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
+    add_library(
+        detray::benchmark_cuda_${algebra}
+        ALIAS detray_benchmark_cuda_${algebra}
+    )
 
-target_link_libraries(
-    detray_benchmark_cuda
-    PUBLIC vecmem::cuda detray::benchmarks detray::test_utils detray::core_array
-)
+    target_link_libraries(
+        detray_benchmark_cuda_${algebra}
+        PUBLIC
+            vecmem::cuda
+            detray::benchmarks
+            detray::test_utils
+            detray::core_${algebra}
+    )
+endforeach()
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu
@@ -1,6 +1,6 @@
 /** Detray library, part of the ACTS project (R&D line)
  *
- * (c) 2022-2024 CERN for the benefit of the ACTS project
+ * (c) 2022-2025 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -15,7 +15,7 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
     propagation::config cfg,
     typename propagator_t::detector_type::view_type det_view,
     typename propagator_t::stepper_type::magnetic_field_type field_view,
-    typename propagator_t::actor_chain_type::state_tuple
+    const typename propagator_t::actor_chain_type::state_tuple
         *device_actor_state_ptr,
     vecmem::data::vector_view<
         free_track_parameters<typename propagator_t::algebra_type>>
@@ -30,8 +30,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
         propagator<typename propagator_t::stepper_type,
                    navigator<detector_device_t>, actor_chain_t>;
 
-    detector_device_t det(det_view);
-    vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_view);
+    const detector_device_t det(det_view);
+    const vecmem::device_vector<free_track_parameters<algebra_t>> tracks(
+        tracks_view);
 
     int gid = threadIdx.x + blockIdx.x * blockDim.x;
     if (gid >= tracks.size()) {
@@ -46,6 +47,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
     auto actor_state_refs = actor_chain_t::setup_actor_states(actor_states);
 
     // Create the propagator state
+
+    // The track gets copied into the stepper state, so that the
+    // original track sample vector remains unchanged
     typename propagator_device_t::state p_state(tracks.at(gid), field_view,
                                                 det);
 

diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
@@ -1,6 +1,6 @@
 /** Detray library, part of the ACTS project (R&D line)
  *
- * (c) 2024 CERN for the benefit of the ACTS project
+ * (c) 2024-2025 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -40,6 +40,7 @@
 // System include(s)
 #include <algorithm>
 #include <cassert>
+#include <iostream>
 #include <random>
 #include <string>
 
@@ -85,6 +86,7 @@ void run_propagation_kernel(
     const int);
 
 /// Allocate actor state blueprint on device
+/// @note This only works if each actor state in the tuple is essentially POD
 template <typename propagator_t>
 typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
     typename propagator_t::actor_chain_type::state_tuple *);
@@ -155,14 +157,22 @@ struct cuda_propagation_bm : public benchmark_base {
             setup_actor_states<propagator_t>(input_actor_states);
 
         // Do a small warm up run
-        {
+        if (m_cfg.benchmark().do_warmup()) {
             auto warmup_track_buffer = detray::get_buffer(
                 vecmem::get_data(*tracks), *dev_mr, cuda_cpy);
+
             run_propagation_kernel<propagator_t, kOPT>(
                 m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
                 warmup_track_buffer, math::min(n_warmup, n_samples));
+        } else {
+            std::cout << "WARNING: Running CUDA benchmarks without warmup is "
+                         "not recommended"
+                      << std::endl;
         }
 
+        // Calculate the propagation rate
+        // @see
+        // https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
         std::size_t total_tracks = 0u;
         for (auto _ : state) {
             // Launch the propagator test for GPU device

diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
@@ -171,19 +171,19 @@ inline void register_benchmark(
 
         std::cout << bench_name << "\n" << bench_cfg;
 
-        // Cpu benchmark
         if constexpr (std::is_invocable_v<
                           decltype(prop_benchmark), ::benchmark::State &,
                           dvector<free_track_parameters<algebra_t>> *,
                           const detector_t *, const bfield_bknd_t *,
                           typename propagator_t::actor_chain_type::state_tuple
                               *>) {
+            // Cpu benchmark
             ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
                                            &tracks, &det, &bfield,
                                            actor_states);
             //->MeasureProcessCPUTime();
         } else {
-
+            // Device benchmark
             ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
                                            dev_mr, &tracks, &det, &bfield,
                                            actor_states);

diff --git a/tests/tools/include/detray/options/propagation_options.hpp b/tests/tools/include/detray/options/propagation_options.hpp
@@ -42,7 +42,7 @@ void add_options<detray::navigation::config>(
         "mask_tolerance_scalor",
         boost::program_options::value<float>()->default_value(
             cfg.mask_tolerance_scalor),
-        "Mask tolerance scaling")(
+        "Mask tolerance scale factor")(
         "overstep_tolerance",
         boost::program_options::value<float>()->default_value(
             cfg.overstep_tolerance / unit<float>::um),

diff --git a/tests/tools/include/detray/options/track_generator_options.hpp b/tests/tools/include/detray/options/track_generator_options.hpp
@@ -53,10 +53,10 @@ void add_uniform_track_gen_options(
         "Coordintates for particle gun origin position [mm]")(
         "p_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Total momentum [range] of the test particle [GeV]")(
+        "Total momentum [range] of the test particles [GeV]")(
         "pT_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Transverse momentum [range] of the test particle [GeV]");
+        "Transverse momentum [range] of the test particles [GeV]");
 }
 
 /// Add options for detray event generation
@@ -149,20 +149,20 @@ void add_rnd_track_gen_options(
         "Seed for the random number generator")(
         "theta_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Min, Max range of theta values for particle gun")(
+        "Min, Max range of theta values for particle gun. Interval in [0, pi)")(
         "eta_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
         "Min, Max range of eta values for particle gun")(
         "randomize_charge", "Randomly flip charge sign per track")(
         "origin",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Coordintates for particle gun origin position")(
+        "Coordintates for particle gun origin position [mm]")(
         "p_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Total momentum [range] of the test particle [GeV]")(
+        "Total momentum [range] of the test particles [GeV]")(
         "pT_range",
         boost::program_options::value<std::vector<scalar_t>>()->multitoken(),
-        "Transverse momentum [range] of the test particle [GeV]");
+        "Transverse momentum [range] of the test particles [GeV]");
 }
 
 /// Add options for detray event generation

diff --git a/tests/tools/src/cpu/CMakeLists.txt b/tests/tools/src/cpu/CMakeLists.txt
@@ -58,19 +58,19 @@ if(DETRAY_BUILD_BENCHMARKS)
 
     # Build the propagation benchmark executable.
     macro(detray_add_propagation_benchmark algebra)
-        detray_add_executable(propagation_benchmark_${algebra}
+        detray_add_executable(propagation_benchmark_cpu_${algebra}
                             "propagation_benchmark.cpp"
-                            LINK_LIBRARIES Boost::program_options benchmark::benchmark benchmark::benchmark_main vecmem::core detray::core_${algebra} detray::benchmarks detray::benchmark_cpu detray::tools detray::detectors
+                            LINK_LIBRARIES Boost::program_options benchmark::benchmark benchmark::benchmark_main vecmem::core detray::benchmark_cpu detray::core_${algebra} detray::tools detray::detectors
         )
 
         target_compile_options(
-            detray_propagation_benchmark_${algebra}
+            detray_propagation_benchmark_cpu_${algebra}
             PRIVATE "-march=native" "-ftree-vectorize"
         )
 
         if(OpenMP_CXX_FOUND)
             target_link_libraries(
-                detray_propagation_benchmark_${algebra}
+                detray_propagation_benchmark_cpu_${algebra}
                 PRIVATE OpenMP::OpenMP_CXX
             )
         endif()