feat: benchmark CLI tool #904

Open · wants to merge 2 commits into base: main
4 changes: 3 additions & 1 deletion tests/benchmarks/cpu/propagation.cpp
@@ -40,6 +40,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -92,11 +93,12 @@ int main(int argc, char** argv) {
n_tracks);

// Specific configuration for the random track generation
trk_cfg.seed(42u);
trk_cfg.seed(detail::random_numbers<scalar>::default_seed());

// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
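For reference, the warm-up sizing above adds roughly 10% of the generated sample as extra propagations. A minimal sketch of that arithmetic (the track count below is illustrative, not taken from this PR):

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
    // Illustrative sample size, not a value from this PR
    const std::size_t n_tracks = 25000u;

    // Same rounding as the diff: warm up with ceil(0.1 * n_tracks) tracks
    const int n_warmup =
        static_cast<int>(std::ceil(0.1f * static_cast<float>(n_tracks)));

    std::cout << "n_warmup = " << n_warmup << '\n';  // prints 2500
    return 0;
}
```
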
22 changes: 6 additions & 16 deletions tests/benchmarks/cuda/CMakeLists.txt
@@ -15,20 +15,17 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Look for openMP, which is used for the CPU benchmark
find_package(OpenMP)

# make unit tests for multiple algebras
# Currently vc and smatrix is not supported
set(algebras "array")
# Build benchmarks for multiple algebra plugins
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebras "eigen")
list(APPEND algebra_plugins "eigen")
endif()

foreach(algebra ${algebras})
foreach(algebra ${algebra_plugins})
detray_add_executable(benchmark_cuda_propagation_${algebra}
"propagation.cpp"
LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
LINK_LIBRARIES detray::benchmark_cuda_${algebra} detray::core_${algebra} vecmem::cuda detray::test_utils
)

target_compile_definitions(
@@ -40,11 +37,4 @@ foreach(algebra ${algebras})
detray_benchmark_cuda_propagation_${algebra}
PRIVATE "-march=native" "-ftree-vectorize"
)

if(OpenMP_CXX_FOUND)
target_link_libraries(
detray_benchmark_cuda_propagation_${algebra}
PRIVATE OpenMP::OpenMP_CXX
)
endif()
endforeach()
4 changes: 3 additions & 1 deletion tests/benchmarks/cuda/propagation.cpp
@@ -41,6 +41,7 @@ int main(int argc, char** argv) {
using test_algebra = typename toy_detector_t::algebra_type;
using scalar = dscalar<test_algebra>;
using vector3 = dvector3D<test_algebra>;

using free_track_parameters_t = free_track_parameters<test_algebra>;
using uniform_gen_t =
detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
@@ -87,11 +88,12 @@ int main(int argc, char** argv) {
n_tracks);

// Specific configuration for the random track generation
trk_cfg.seed(42u);
trk_cfg.seed(detail::random_numbers<scalar>::default_seed());

// Add additional tracks for warmup
bench_cfg.n_warmup(static_cast<int>(
std::ceil(0.1f * static_cast<float>(trk_cfg.n_tracks()))));
bench_cfg.do_warmup(true);

//
// Prepare data
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2023-2024 CERN for the benefit of the ACTS project
* (c) 2023-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -52,12 +52,12 @@ struct host_propagation_bm : public benchmark_base {
configuration &config() { return m_cfg; }

/// Prepare data and run benchmark loop
inline void operator()(::benchmark::State &state,
dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det,
const bfield_t *bfield,
typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
inline void operator()(
::benchmark::State &state,
const dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det, const bfield_t *bfield,
const typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
using actor_chain_t = typename propagator_t::actor_chain_type;
using actor_states_t = typename actor_chain_t::state_tuple;

@@ -76,7 +76,8 @@

// Call the host propagation
auto run_propagation = [&p, det, bfield, input_actor_states](
free_track_parameters<algebra_t> &track) {
const free_track_parameters<algebra_t>
&track) {
// Fresh copy of actor states
actor_states_t actor_states(*input_actor_states);
// Tuple of references to pass to the propagator
@@ -103,17 +104,26 @@
// Warm-up
if (m_cfg.benchmark().do_warmup()) {
assert(n_warmup > 0);
auto stride{n_samples / n_warmup};
int stride{n_samples / n_warmup};
stride = (stride == 0) ? 10 : stride;
assert(stride > 0);

#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < n_samples; i += stride) {
// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
}
} else {
std::cout << "WARNING: Running host benchmarks without warmup"
<< std::endl;
}

// Run the benchmark

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
#pragma omp parallel for schedule(dynamic)
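The propagation-rate comment above refers to Google Benchmark's custom counters. As a rough, self-contained sketch of that mechanism (benchmark name, counter name and batch size are invented, not from this PR), a tracks-per-second rate can be reported like this:

```cpp
#include <benchmark/benchmark.h>

#include <cstddef>

static void BM_propagation_rate_sketch(benchmark::State& state) {
    std::size_t total_tracks = 0u;
    for (auto _ : state) {
        // ... propagate one batch of tracks here ...
        total_tracks += 1000u;  // pretend 1000 tracks were processed
    }
    // kIsRate makes Google Benchmark divide the count by the total run time,
    // so the reported value is tracks per second
    state.counters["TracksPropagated"] = benchmark::Counter(
        static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
}
BENCHMARK(BM_propagation_rate_sketch);
BENCHMARK_MAIN();
```
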
@@ -13,17 +13,34 @@ enable_language(CUDA)
# Set the CUDA build flags.
include(detray-compiler-options-cuda)

# Build benchmark library for multiple algebra plugins to create correct
# template instantiations
# Currently vc and smatrix are not supported on device
set(algebra_plugins "array")
if(DETRAY_EIGEN_PLUGIN)
list(APPEND algebra_plugins "eigen")
endif()

# Set up a benchmark library for CUDA
add_library(
detray_benchmark_cuda
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)
foreach(algebra ${algebra_plugins})
add_library(
detray_benchmark_cuda_${algebra}
STATIC
"propagation_benchmark.hpp"
"propagation_benchmark.cu"
)

add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
add_library(
detray::benchmark_cuda_${algebra}
ALIAS detray_benchmark_cuda_${algebra}
)

target_link_libraries(
detray_benchmark_cuda
PUBLIC vecmem::cuda detray::benchmarks detray::test_utils detray::core_array
)
target_link_libraries(
detray_benchmark_cuda_${algebra}
PUBLIC
vecmem::cuda
detray::benchmarks
detray::test_utils
detray::core_${algebra}
)
endforeach()
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2022-2024 CERN for the benefit of the ACTS project
* (c) 2022-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -15,7 +15,7 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagation::config cfg,
typename propagator_t::detector_type::view_type det_view,
typename propagator_t::stepper_type::magnetic_field_type field_view,
typename propagator_t::actor_chain_type::state_tuple
const typename propagator_t::actor_chain_type::state_tuple
*device_actor_state_ptr,
vecmem::data::vector_view<
free_track_parameters<typename propagator_t::algebra_type>>
@@ -30,8 +30,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
propagator<typename propagator_t::stepper_type,
navigator<detector_device_t>, actor_chain_t>;

detector_device_t det(det_view);
vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_view);
const detector_device_t det(det_view);
const vecmem::device_vector<free_track_parameters<algebra_t>> tracks(
tracks_view);

int gid = threadIdx.x + blockIdx.x * blockDim.x;
if (gid >= tracks.size()) {
@@ -46,6 +47,9 @@ __global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
auto actor_state_refs = actor_chain_t::setup_actor_states(actor_states);

// Create the propagator state

// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
typename propagator_device_t::state p_state(tracks.at(gid), field_view,
det);

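For context on the kernel above: in __launch_bounds__(256, 4) the first argument caps the threads per block and the second requests a minimum number of resident blocks per multiprocessor, while the gid check guards the last, partially filled block. A stand-alone CUDA sketch of the same launch pattern (kernel and sizes are illustrative, not the detray code):

```cuda
#include <cuda_runtime.h>

#include <cstdio>

__global__ void __launch_bounds__(256, 4) scale_kernel(const float* in,
                                                       float* out, int n) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= n) {
        return;  // guard threads of the last, partially filled block
    }
    out[gid] = 2.f * in[gid];
}

int main() {
    constexpr int n = 1000;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, n * sizeof(float));
    cudaMalloc(&out, n * sizeof(float));

    // Block size must not exceed the first __launch_bounds__ argument (256)
    constexpr int block_dim = 256;
    const int grid_dim = (n + block_dim - 1) / block_dim;  // 4 blocks for n=1000
    scale_kernel<<<grid_dim, block_dim>>>(in, out, n);
    cudaDeviceSynchronize();

    std::printf("launched %d blocks of %d threads\n", grid_dim, block_dim);
    cudaFree(in);
    cudaFree(out);
    return 0;
}
```
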
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
* (c) 2024-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -40,6 +40,7 @@
// System include(s)
#include <algorithm>
#include <cassert>
#include <iostream>
#include <random>
#include <string>

@@ -85,6 +86,7 @@ void run_propagation_kernel(
const int);

/// Allocate actor state blueprint on device
/// @note This only works if each actor state in the tuple is essentially POD
template <typename propagator_t>
typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
typename propagator_t::actor_chain_type::state_tuple *);
@@ -155,14 +157,22 @@ struct cuda_propagation_bm : public benchmark_base {
setup_actor_states<propagator_t>(input_actor_states);

// Do a small warm up run
{
if (m_cfg.benchmark().do_warmup()) {
auto warmup_track_buffer = detray::get_buffer(
vecmem::get_data(*tracks), *dev_mr, cuda_cpy);

run_propagation_kernel<propagator_t, kOPT>(
m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
warmup_track_buffer, math::min(n_warmup, n_samples));
} else {
std::cout << "WARNING: Running CUDA benchmarks without warmup is "
"not recommended"
<< std::endl;
}

// Calculate the propagation rate
// @see
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
// Launch the propagator test for GPU device
@@ -171,19 +171,19 @@ inline void register_benchmark(

std::cout << bench_name << "\n" << bench_cfg;

// Cpu benchmark
if constexpr (std::is_invocable_v<
decltype(prop_benchmark), ::benchmark::State &,
dvector<free_track_parameters<algebra_t>> *,
const detector_t *, const bfield_bknd_t *,
typename propagator_t::actor_chain_type::state_tuple
*>) {
// Cpu benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
&tracks, &det, &bfield,
actor_states);
//->MeasureProcessCPUTime();
} else {

// Device benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
dev_mr, &tracks, &det, &bfield,
actor_states);
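The registration helper above distinguishes host from device benchmarks purely by call signature, using if constexpr with std::is_invocable_v. A simplified, self-contained illustration of that dispatch pattern (functor types and arguments are invented for the example):

```cpp
#include <iostream>
#include <type_traits>

struct host_bm {
    void operator()(int n_tracks) const {
        std::cout << "host benchmark: " << n_tracks << " tracks\n";
    }
};

struct device_bm {
    void operator()(int n_tracks, void* dev_mr) const {
        std::cout << "device benchmark: " << n_tracks << " tracks\n";
    }
};

// Pick the correct argument list at compile time, as the diff does when
// calling ::benchmark::RegisterBenchmark for the CPU vs. device cases
template <typename benchmark_t>
void register_bm(const benchmark_t& bm) {
    if constexpr (std::is_invocable_v<benchmark_t, int>) {
        bm(1000);           // host-style signature
    } else {
        bm(1000, nullptr);  // device-style signature takes a memory resource
    }
}

int main() {
    register_bm(host_bm{});
    register_bm(device_bm{});
    return 0;
}
```
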
1 change: 0 additions & 1 deletion tests/tools/CMakeLists.txt
@@ -21,7 +21,6 @@ target_link_libraries(
INTERFACE
Boost::program_options
vecmem::core
detray::core_array
detray::test_common
detray::io
detray::csv_io
3 changes: 2 additions & 1 deletion tests/tools/include/detray/options/propagation_options.hpp
@@ -42,7 +42,7 @@ void add_options<detray::navigation::config>(
"mask_tolerance_scalor",
boost::program_options::value<float>()->default_value(
cfg.mask_tolerance_scalor),
"Mask tolerance scaling")(
"Mask tolerance scale factor")(
"overstep_tolerance",
boost::program_options::value<float>()->default_value(
cfg.overstep_tolerance / unit<float>::um),
@@ -172,6 +172,7 @@

cfg.path_limit = path_limit * unit<float>::m;
}
cfg.do_covariance_transport = false;
if (vm.count("covariance_transport")) {
cfg.do_covariance_transport = true;
}
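The explicit cfg.do_covariance_transport = false; default above pairs with a value-less Boost.Program_options flag that is detected via vm.count(). A small stand-alone sketch of that pattern (only the option name is taken from the diff; the rest is illustrative):

```cpp
#include <boost/program_options.hpp>

#include <iostream>

int main(int argc, char** argv) {
    namespace po = boost::program_options;

    po::options_description desc("Propagation options");
    // A flag without a value: its mere presence on the command line enables it
    desc.add_options()("covariance_transport",
                       "Enable covariance transport during stepping");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    bool do_covariance_transport = false;  // off unless the flag was passed
    if (vm.count("covariance_transport")) {
        do_covariance_transport = true;
    }

    std::cout << std::boolalpha << "covariance transport: "
              << do_covariance_transport << '\n';
    return 0;
}
```
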