From c1352223e42265ccfd9d42ee0393b045cc677326 Mon Sep 17 00:00:00 2001 From: Attila Krasznahorkay Date: Fri, 27 Sep 2024 10:24:41 +0200 Subject: [PATCH 1/4] Introduced some simple AoS / SoA copy benchmarks. --- benchmarks/CMakeLists.txt | 4 +- benchmarks/common/simple_aos_container.hpp | 51 +++++++++ benchmarks/common/simple_soa_container.hpp | 63 +++++++++++ benchmarks/core/CMakeLists.txt | 3 +- benchmarks/core/benchmark_edm_copy.cpp | 124 +++++++++++++++++++++ 5 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 benchmarks/common/simple_aos_container.hpp create mode 100644 benchmarks/common/simple_soa_container.hpp create mode 100644 benchmarks/core/benchmark_edm_copy.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 67215e9f..3f586543 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -31,7 +31,9 @@ add_library( vecmem_benchmark_common STATIC "common/make_jagged_sizes.hpp" "common/make_jagged_sizes.cpp" "common/make_jagged_vector.hpp" - "common/make_jagged_vector.cpp" ) + "common/make_jagged_vector.cpp" + "common/simple_aos_container.hpp" + "common/simple_soa_container.hpp" ) target_link_libraries( vecmem_benchmark_common PUBLIC vecmem::core ) set_target_properties( vecmem_benchmark_common PROPERTIES diff --git a/benchmarks/common/simple_aos_container.hpp b/benchmarks/common/simple_aos_container.hpp new file mode 100644 index 00000000..a0cfcccd --- /dev/null +++ b/benchmarks/common/simple_aos_container.hpp @@ -0,0 +1,51 @@ +/* VecMem project, part of the ACTS project (R&D line) + * + * (c) 2023-2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ +#pragma once + +// Local include(s). +#include "vecmem/containers/data/vector_buffer.hpp" +#include "vecmem/containers/data/vector_view.hpp" +#include "vecmem/containers/device_vector.hpp" +#include "vecmem/containers/vector.hpp" + +namespace vecmem { +namespace benchmark { + +/// Simple AoS struct used for benchmarking. +struct simple_aos { + + int count; + float measurement; + float average; + int index; + +}; // class simple_aos + +/// "Simple" container for the benchmarks +struct simple_aos_container { + +#if __cplusplus >= 201700L + /// Host container + using host = vector; + /// Host buffer + using buffer = data::vector_buffer; +#endif // __cplusplus >= 201700L + + /// Non-const device collection for @c simple_aos + using device = device_vector; + /// Constant device collection for @c simple_aos + using const_device = device_vector; + + /// Non-constant view of an @c simple_aos collection + using view = data::vector_view; + /// Constant view of an @c simple_aos collection + using const_view = data::vector_view; + +}; // struct simple_aos_container + +} // namespace benchmark +} // namespace vecmem diff --git a/benchmarks/common/simple_soa_container.hpp b/benchmarks/common/simple_soa_container.hpp new file mode 100644 index 00000000..ffede186 --- /dev/null +++ b/benchmarks/common/simple_soa_container.hpp @@ -0,0 +1,63 @@ +/* VecMem project, part of the ACTS project (R&D line) + * + * (c) 2023-2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ +#pragma once + +// Local include(s). +#include "vecmem/edm/container.hpp" +#include "vecmem/utils/types.hpp" + +namespace vecmem { +namespace benchmark { + +/// Interface to a "simple container" used for benchmarking. +template +class simple_soa : public BASE { + +public: + /// Inherit the base class's constructor(s) + using BASE::BASE; + + /// "Count" of something (non-const) + VECMEM_HOST_AND_DEVICE + auto& count() { return BASE::template get<0>(); } + /// "Count" of something (const) + VECMEM_HOST_AND_DEVICE + const auto& count() const { return BASE::template get<0>(); } + + /// "Measurement" of something (non-const) + VECMEM_HOST_AND_DEVICE + auto& measurement() { return BASE::template get<1>(); } + /// "Measurement" of something (const) + VECMEM_HOST_AND_DEVICE + const auto& measurement() const { return BASE::template get<1>(); } + + /// "Average" of something (non-const) + VECMEM_HOST_AND_DEVICE + auto& average() { return BASE::template get<2>(); } + /// "Average" of something (const) + VECMEM_HOST_AND_DEVICE + const auto& average() const { return BASE::template get<2>(); } + + /// "Index" of something (non-const) + VECMEM_HOST_AND_DEVICE + auto& index() { return BASE::template get<3>(); } + /// "Index" of something (const) + VECMEM_HOST_AND_DEVICE + const auto& index() const { return BASE::template get<3>(); } + +}; // class simple_soa + +/// "Simple" container for the tests +/// +/// Meaning that it would not have any jagged vector variables in it... +/// +using simple_soa_container = + edm::container, edm::type::vector, + edm::type::vector, edm::type::vector >; + +} // namespace benchmark +} // namespace vecmem diff --git a/benchmarks/core/CMakeLists.txt b/benchmarks/core/CMakeLists.txt index 542995ad..8dd5a9eb 100644 --- a/benchmarks/core/CMakeLists.txt +++ b/benchmarks/core/CMakeLists.txt @@ -10,7 +10,8 @@ include( vecmem-compiler-options-cpp ) # Set up the benchmark(s) for the core library. add_executable( vecmem_benchmark_core "benchmark_core.cpp" - "benchmark_copy.cpp" ) + "benchmark_copy.cpp" + "benchmark_edm_copy.cpp" ) target_link_libraries( vecmem_benchmark_core diff --git a/benchmarks/core/benchmark_edm_copy.cpp b/benchmarks/core/benchmark_edm_copy.cpp new file mode 100644 index 00000000..d3ec6537 --- /dev/null +++ b/benchmarks/core/benchmark_edm_copy.cpp @@ -0,0 +1,124 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include + +// Common benchmark include(s). +#include "../common/simple_aos_container.hpp" +#include "../common/simple_soa_container.hpp" + +// Google benchmark include(s). +#include + +namespace vecmem::benchmark { + +/// The (host) memory resource to use in the benchmark(s). +static host_memory_resource host_mr; +/// The copy object to use in the benchmark(s). +static copy host_copy; + +void simpleSoADirectHostToFixedBufferCopy(::benchmark::State& state) { + + // Get the size of the host container to create. + const std::size_t size = static_cast(state.range(0)); + static constexpr std::size_t element_size = + 2 * sizeof(int) + 2 * sizeof(float); + const double bytes = static_cast(size * element_size); + + // Set custom "counters" for the benchmark. + state.counters["Bytes"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); + state.counters["Rate"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the source host container. + simple_soa_container::host source{host_mr}; + source.resize(size); + + // Create the destination (host) buffer. + simple_soa_container::buffer dest{ + static_cast(size), host_mr}; + host_copy.setup(dest)->ignore(); + + // Perform the copy benchmark. + for (auto _ : state) { + host_copy(get_data(source), dest)->ignore(); + } +} +BENCHMARK(simpleSoADirectHostToFixedBufferCopy)->Range(1UL, 1UL << 26); + +void simpleSoAOptimalHostToFixedBufferCopy(::benchmark::State& state) { + + // Get the size of the host container to create. + const std::size_t size = static_cast(state.range(0)); + static constexpr std::size_t element_size = + 2 * sizeof(int) + 2 * sizeof(float); + const double bytes = static_cast(size * element_size); + + // Set custom "counters" for the benchmark. + state.counters["Bytes"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); + state.counters["Rate"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the source host container. + simple_soa_container::host host{host_mr}; + host.resize(size); + simple_soa_container::buffer source{ + static_cast(size), host_mr}; + host_copy.setup(source)->ignore(); + host_copy(get_data(host), source)->ignore(); + + // Create the destination (host) buffer. + simple_soa_container::buffer dest{ + static_cast(size), host_mr}; + host_copy.setup(dest)->ignore(); + + // Perform the copy benchmark. + for (auto _ : state) { + host_copy(source, dest)->ignore(); + } +} +BENCHMARK(simpleSoAOptimalHostToFixedBufferCopy)->Range(1UL, 1UL << 26); + +void simpleAoSHostToFixedBufferCopy(::benchmark::State& state) { + + // Get the size of the host container to create. + const std::size_t size = static_cast(state.range(0)); + static constexpr std::size_t element_size = + 2 * sizeof(int) + 2 * sizeof(float); + const double bytes = static_cast(size * element_size); + + // Set custom "counters" for the benchmark. + state.counters["Bytes"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); + state.counters["Rate"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the source host container. + simple_aos_container::host source{&host_mr}; + source.resize(size); + + // Create the destination (host) buffer. + simple_aos_container::buffer dest{ + static_cast(size), host_mr}; + host_copy.setup(dest)->ignore(); + + // Perform the copy benchmark. + for (auto _ : state) { + host_copy(get_data(source), dest)->ignore(); + } +} +BENCHMARK(simpleAoSHostToFixedBufferCopy)->Range(1UL, 1UL << 26); + +} // namespace vecmem::benchmark From ae38b3b8188bfd1897ba0b0a695296f7ed14da18 Mon Sep 17 00:00:00 2001 From: Attila Krasznahorkay Date: Fri, 27 Sep 2024 16:33:05 +0200 Subject: [PATCH 2/4] Refactored the AoS and SoA copy tests. So that it would be easier to set up the CUDA, HIP and SYCL tests as a next step. --- benchmarks/CMakeLists.txt | 10 +- .../common/setup_simple_copy_counters.cpp | 34 +++++ .../common/setup_simple_copy_counters.hpp | 22 +++ .../common/simple_aos_copy_benchmarks.cpp | 70 ++++++++++ .../common/simple_aos_copy_benchmarks.hpp | 35 +++++ .../common/simple_soa_copy_benchmarks.cpp | 125 +++++++++++++++++ .../common/simple_soa_copy_benchmarks.hpp | 50 +++++++ benchmarks/core/benchmark_edm_copy.cpp | 132 +++++------------- 8 files changed, 378 insertions(+), 100 deletions(-) create mode 100644 benchmarks/common/setup_simple_copy_counters.cpp create mode 100644 benchmarks/common/setup_simple_copy_counters.hpp create mode 100644 benchmarks/common/simple_aos_copy_benchmarks.cpp create mode 100644 benchmarks/common/simple_aos_copy_benchmarks.hpp create mode 100644 benchmarks/common/simple_soa_copy_benchmarks.cpp create mode 100644 benchmarks/common/simple_soa_copy_benchmarks.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3f586543..97626d54 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -32,10 +32,16 @@ add_library( vecmem_benchmark_common STATIC "common/make_jagged_sizes.cpp" "common/make_jagged_vector.hpp" "common/make_jagged_vector.cpp" + "common/setup_simple_copy_counters.hpp" + "common/setup_simple_copy_counters.cpp" "common/simple_aos_container.hpp" - "common/simple_soa_container.hpp" ) + "common/simple_aos_copy_benchmarks.hpp" + "common/simple_aos_copy_benchmarks.cpp" + "common/simple_soa_container.hpp" + "common/simple_soa_copy_benchmarks.hpp" + "common/simple_soa_copy_benchmarks.cpp" ) target_link_libraries( vecmem_benchmark_common - PUBLIC vecmem::core ) + PUBLIC benchmark::benchmark vecmem::core ) set_target_properties( vecmem_benchmark_common PROPERTIES FOLDER "vecmem/benchmarks" ) diff --git a/benchmarks/common/setup_simple_copy_counters.cpp b/benchmarks/common/setup_simple_copy_counters.cpp new file mode 100644 index 00000000..70dc4b14 --- /dev/null +++ b/benchmarks/common/setup_simple_copy_counters.cpp @@ -0,0 +1,34 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Local include(s). +#include "setup_simple_copy_counters.hpp" + +namespace vecmem::benchmark { + +/// Set up the custom "counters" for the "simple copy" benchmarks. +std::size_t setup_simple_copy_counters(::benchmark::State& state) { + + // Get the size of the host container to create. + const std::size_t size = static_cast(state.range(0)); + static constexpr std::size_t element_size = + 2 * sizeof(int) + 2 * sizeof(float); + const double bytes = static_cast(size * element_size); + + // Set custom "counters" for the benchmark. + state.counters["Bytes"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); + state.counters["Rate"] = ::benchmark::Counter( + bytes, ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Return the size of the benchmarked container. + return size; +} + +} // namespace vecmem::benchmark diff --git a/benchmarks/common/setup_simple_copy_counters.hpp b/benchmarks/common/setup_simple_copy_counters.hpp new file mode 100644 index 00000000..3288a77f --- /dev/null +++ b/benchmarks/common/setup_simple_copy_counters.hpp @@ -0,0 +1,22 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ +#pragma once + +// Google include(s). +#include + +namespace vecmem::benchmark { + +/// Set up the custom "counters" for the "simple copy" benchmarks. +/// +/// @param state The benchmark state to set up the counters for. +/// @return The size of the benchmarked container. +/// +std::size_t setup_simple_copy_counters(::benchmark::State& state); + +} // namespace vecmem::benchmark diff --git a/benchmarks/common/simple_aos_copy_benchmarks.cpp b/benchmarks/common/simple_aos_copy_benchmarks.cpp new file mode 100644 index 00000000..607c2db6 --- /dev/null +++ b/benchmarks/common/simple_aos_copy_benchmarks.cpp @@ -0,0 +1,70 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Local include(s). +#include "simple_aos_copy_benchmarks.hpp" + +#include "setup_simple_copy_counters.hpp" +#include "simple_aos_container.hpp" + +namespace vecmem::benchmark { + +void simple_aos_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy, + data::buffer_type buffer_type) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source (host) container. + simple_aos_container::host source{&host_mr}; + source.resize(size); + + // Create the destination buffer. + simple_aos_container::buffer dest{ + static_cast(size), device_mr, + buffer_type}; + device_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + device_copy(get_data(source), dest, vecmem::copy::type::host_to_device) + ->wait(); + } +} + +void simple_aos_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source buffer. + simple_aos_container::buffer source{ + static_cast(size), device_mr}; + device_copy.setup(source)->wait(); + + // Create the destination (host) container. + simple_aos_container::host dest{&host_mr}; + + // Perform the copy benchmark. + for (auto _ : state) { + state.PauseTiming(); + dest.clear(); + state.ResumeTiming(); + device_copy(source, dest, vecmem::copy::type::device_to_host)->wait(); + } +} + +} // namespace vecmem::benchmark diff --git a/benchmarks/common/simple_aos_copy_benchmarks.hpp b/benchmarks/common/simple_aos_copy_benchmarks.hpp new file mode 100644 index 00000000..a5854e0f --- /dev/null +++ b/benchmarks/common/simple_aos_copy_benchmarks.hpp @@ -0,0 +1,35 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ +#pragma once + +// Project include(s). +#include "vecmem/containers/data/buffer_type.hpp" +#include "vecmem/memory/memory_resource.hpp" +#include "vecmem/utils/copy.hpp" + +// Google include(s). +#include + +namespace vecmem::benchmark { + +/// Benchmark copying @c simple_aos_container::host to +/// @c simple_aos_container::buffer +void simple_aos_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy, + data::buffer_type buffer_type); + +/// Benchmark copying @c simple_aos_container::buffer to +/// @c simple_aos_container::host +void simple_aos_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy); + +} // namespace vecmem::benchmark diff --git a/benchmarks/common/simple_soa_copy_benchmarks.cpp b/benchmarks/common/simple_soa_copy_benchmarks.cpp new file mode 100644 index 00000000..b9fe1793 --- /dev/null +++ b/benchmarks/common/simple_soa_copy_benchmarks.cpp @@ -0,0 +1,125 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Local include(s). +#include "simple_soa_copy_benchmarks.hpp" + +#include "setup_simple_copy_counters.hpp" +#include "simple_soa_container.hpp" + +namespace vecmem::benchmark { + +void simple_soa_direct_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy, + data::buffer_type buffer_type) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source host container. + simple_soa_container::host source{host_mr}; + source.resize(size); + + // Create the destination (host) buffer. + simple_soa_container::buffer dest{ + static_cast(size), device_mr, + buffer_type}; + device_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + device_copy(get_data(source), dest)->wait(); + } +} + +void simple_soa_optimal_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& host_copy, copy& device_copy, + data::buffer_type buffer_type) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source host container and buffer. + simple_soa_container::host host{host_mr}; + host.resize(size); + simple_soa_container::buffer source{ + static_cast(size), host_mr, + buffer_type}; + host_copy.setup(source)->wait(); + host_copy(get_data(host), source)->wait(); + + // Create the destination (host) buffer. + simple_soa_container::buffer dest{ + static_cast(size), device_mr, + buffer_type}; + device_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + device_copy(source, dest)->wait(); + } +} + +void simple_soa_direct_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source buffer. + simple_soa_container::buffer source{ + static_cast(size), device_mr}; + device_copy.setup(source)->wait(); + + // Create the destination (host) container. + simple_soa_container::host dest{host_mr}; + + // Perform the copy benchmark. + for (auto _ : state) { + state.PauseTiming(); + dest.resize(0u); + state.ResumeTiming(); + device_copy(source, dest)->wait(); + } +} + +void simple_soa_optimal_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& host_copy, copy& device_copy) { + + // Get the size of the host container to create, while setting up the + // counters used by the benchmark. + const std::size_t size = setup_simple_copy_counters(state); + + // Create the source buffer. + simple_soa_container::buffer source{ + static_cast(size), device_mr}; + device_copy.setup(source)->wait(); + + // Create the destination buffer. + simple_soa_container::buffer dest{ + static_cast(size), host_mr}; + host_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + device_copy(source, dest)->wait(); + } +} + +} // namespace vecmem::benchmark diff --git a/benchmarks/common/simple_soa_copy_benchmarks.hpp b/benchmarks/common/simple_soa_copy_benchmarks.hpp new file mode 100644 index 00000000..a648ad47 --- /dev/null +++ b/benchmarks/common/simple_soa_copy_benchmarks.hpp @@ -0,0 +1,50 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ +#pragma once + +// Project include(s). +#include "vecmem/containers/data/buffer_type.hpp" +#include "vecmem/memory/memory_resource.hpp" +#include "vecmem/utils/copy.hpp" + +// Google include(s). +#include + +namespace vecmem::benchmark { + +/// Benchmark copying @c simple_soa_container::host to +/// @c simple_soa_container::buffer directly +void simple_soa_direct_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy, + data::buffer_type buffer_type); + +/// Benchmark copying @c simple_soa_container::host to +/// @c simple_soa_container::buffer through an intermediate buffer +void simple_soa_optimal_h2d_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& host_copy, copy& device_copy, + data::buffer_type buffer_type); + +/// Benchmark copying @c simple_soa_container::buffer to +/// @c simple_soa_container::host directly +void simple_soa_direct_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& device_copy); + +/// Benchmark copying @c simple_soa_container::buffer to +/// @c simple_soa_container::host through an intermediate buffer +void simple_soa_optimal_d2h_copy_benchmark(::benchmark::State& state, + memory_resource& host_mr, + memory_resource& device_mr, + copy& host_copy, copy& device_copy); + +} // namespace vecmem::benchmark diff --git a/benchmarks/core/benchmark_edm_copy.cpp b/benchmarks/core/benchmark_edm_copy.cpp index d3ec6537..98ce11ba 100644 --- a/benchmarks/core/benchmark_edm_copy.cpp +++ b/benchmarks/core/benchmark_edm_copy.cpp @@ -11,8 +11,8 @@ #include // Common benchmark include(s). -#include "../common/simple_aos_container.hpp" -#include "../common/simple_soa_container.hpp" +#include "../common/simple_aos_copy_benchmarks.hpp" +#include "../common/simple_soa_copy_benchmarks.hpp" // Google benchmark include(s). #include @@ -24,101 +24,37 @@ static host_memory_resource host_mr; /// The copy object to use in the benchmark(s). static copy host_copy; -void simpleSoADirectHostToFixedBufferCopy(::benchmark::State& state) { - - // Get the size of the host container to create. - const std::size_t size = static_cast(state.range(0)); - static constexpr std::size_t element_size = - 2 * sizeof(int) + 2 * sizeof(float); - const double bytes = static_cast(size * element_size); - - // Set custom "counters" for the benchmark. - state.counters["Bytes"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); - state.counters["Rate"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kIsIterationInvariantRate, - ::benchmark::Counter::kIs1024); - - // Create the source host container. - simple_soa_container::host source{host_mr}; - source.resize(size); - - // Create the destination (host) buffer. - simple_soa_container::buffer dest{ - static_cast(size), host_mr}; - host_copy.setup(dest)->ignore(); - - // Perform the copy benchmark. - for (auto _ : state) { - host_copy(get_data(source), dest)->ignore(); - } -} -BENCHMARK(simpleSoADirectHostToFixedBufferCopy)->Range(1UL, 1UL << 26); - -void simpleSoAOptimalHostToFixedBufferCopy(::benchmark::State& state) { - - // Get the size of the host container to create. - const std::size_t size = static_cast(state.range(0)); - static constexpr std::size_t element_size = - 2 * sizeof(int) + 2 * sizeof(float); - const double bytes = static_cast(size * element_size); - - // Set custom "counters" for the benchmark. - state.counters["Bytes"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); - state.counters["Rate"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kIsIterationInvariantRate, - ::benchmark::Counter::kIs1024); - - // Create the source host container. - simple_soa_container::host host{host_mr}; - host.resize(size); - simple_soa_container::buffer source{ - static_cast(size), host_mr}; - host_copy.setup(source)->ignore(); - host_copy(get_data(host), source)->ignore(); - - // Create the destination (host) buffer. - simple_soa_container::buffer dest{ - static_cast(size), host_mr}; - host_copy.setup(dest)->ignore(); - - // Perform the copy benchmark. - for (auto _ : state) { - host_copy(source, dest)->ignore(); - } -} -BENCHMARK(simpleSoAOptimalHostToFixedBufferCopy)->Range(1UL, 1UL << 26); - -void simpleAoSHostToFixedBufferCopy(::benchmark::State& state) { - - // Get the size of the host container to create. - const std::size_t size = static_cast(state.range(0)); - static constexpr std::size_t element_size = - 2 * sizeof(int) + 2 * sizeof(float); - const double bytes = static_cast(size * element_size); - - // Set custom "counters" for the benchmark. - state.counters["Bytes"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kDefaults, ::benchmark::Counter::kIs1024); - state.counters["Rate"] = ::benchmark::Counter( - bytes, ::benchmark::Counter::kIsIterationInvariantRate, - ::benchmark::Counter::kIs1024); - - // Create the source host container. - simple_aos_container::host source{&host_mr}; - source.resize(size); - - // Create the destination (host) buffer. - simple_aos_container::buffer dest{ - static_cast(size), host_mr}; - host_copy.setup(dest)->ignore(); - - // Perform the copy benchmark. - for (auto _ : state) { - host_copy(get_data(source), dest)->ignore(); - } -} -BENCHMARK(simpleAoSHostToFixedBufferCopy)->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, host_fixed_buffer, + host_mr, host_mr, host_copy, data::buffer_type::fixed_size) + ->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, host_resizable_buffer, + host_mr, host_mr, host_copy, data::buffer_type::resizable) + ->Range(1UL, 1UL << 26); + +BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, host_fixed_buffer, + host_mr, host_mr, host_copy, host_copy, + data::buffer_type::fixed_size) + ->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, host_resizable_buffer, + host_mr, host_mr, host_copy, host_copy, + data::buffer_type::resizable) + ->Range(1UL, 1UL << 26); + +BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, host, host_mr, host_mr, + host_copy) + ->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, host, host_mr, host_mr, + host_copy, host_copy) + ->Range(1UL, 1UL << 26); + +BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, host_fixed_buffer, host_mr, + host_mr, host_copy, data::buffer_type::fixed_size) + ->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, host_resizable_buffer, host_mr, + host_mr, host_copy, data::buffer_type::resizable) + ->Range(1UL, 1UL << 26); +BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, host, host_mr, host_mr, + host_copy) + ->Range(1UL, 1UL << 26); } // namespace vecmem::benchmark From 7b60f23e5c09a02411c25a341dcfab53387bc695 Mon Sep 17 00:00:00 2001 From: Attila Krasznahorkay Date: Fri, 27 Sep 2024 17:49:13 +0200 Subject: [PATCH 3/4] Introduced EDM copy benchmarks for CUDA and SYCL as well. --- benchmarks/core/benchmark_edm_copy.cpp | 73 ++++++++++++--------- benchmarks/cuda/CMakeLists.txt | 3 +- benchmarks/cuda/benchmark_edm_copy.cpp | 91 ++++++++++++++++++++++++++ benchmarks/sycl/CMakeLists.txt | 3 +- benchmarks/sycl/benchmark_edm_copy.cpp | 91 ++++++++++++++++++++++++++ 5 files changed, 227 insertions(+), 34 deletions(-) create mode 100644 benchmarks/cuda/benchmark_edm_copy.cpp create mode 100644 benchmarks/sycl/benchmark_edm_copy.cpp diff --git a/benchmarks/core/benchmark_edm_copy.cpp b/benchmarks/core/benchmark_edm_copy.cpp index 98ce11ba..8aa38459 100644 --- a/benchmarks/core/benchmark_edm_copy.cpp +++ b/benchmarks/core/benchmark_edm_copy.cpp @@ -24,37 +24,46 @@ static host_memory_resource host_mr; /// The copy object to use in the benchmark(s). static copy host_copy; -BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, host_fixed_buffer, - host_mr, host_mr, host_copy, data::buffer_type::fixed_size) - ->Range(1UL, 1UL << 26); -BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, host_resizable_buffer, - host_mr, host_mr, host_copy, data::buffer_type::resizable) - ->Range(1UL, 1UL << 26); - -BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, host_fixed_buffer, - host_mr, host_mr, host_copy, host_copy, - data::buffer_type::fixed_size) - ->Range(1UL, 1UL << 26); -BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, host_resizable_buffer, - host_mr, host_mr, host_copy, host_copy, - data::buffer_type::resizable) - ->Range(1UL, 1UL << 26); - -BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, host, host_mr, host_mr, - host_copy) - ->Range(1UL, 1UL << 26); -BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, host, host_mr, host_mr, - host_copy, host_copy) - ->Range(1UL, 1UL << 26); - -BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, host_fixed_buffer, host_mr, - host_mr, host_copy, data::buffer_type::fixed_size) - ->Range(1UL, 1UL << 26); -BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, host_resizable_buffer, host_mr, - host_mr, host_copy, data::buffer_type::resizable) - ->Range(1UL, 1UL << 26); -BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, host, host_mr, host_mr, - host_copy) - ->Range(1UL, 1UL << 26); +// +// Helper macro(s) for setting up all the different benchmarks. +// +#define CONFIGURE_BENCHMARK(BM) BM->Range(1UL, 1UL << 26) + +// +// Set up all the different benchmarks. +// + +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, + host_fixed_buffer, host_mr, host_mr, + host_copy, + data::buffer_type::fixed_size)); +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_direct_h2d_copy_benchmark, + host_resizable_buffer, host_mr, host_mr, + host_copy, data::buffer_type::resizable)); + +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, + host_fixed_buffer, host_mr, host_mr, + host_copy, host_copy, + data::buffer_type::fixed_size)); +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_optimal_h2d_copy_benchmark, + host_resizable_buffer, host_mr, host_mr, + host_copy, host_copy, + data::buffer_type::resizable)); + +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, + host, host_mr, host_mr, host_copy)); +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, + host, host_mr, host_mr, host_copy, + host_copy)); + +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, + host_fixed_buffer, host_mr, host_mr, + host_copy, + data::buffer_type::fixed_size)); +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_h2d_copy_benchmark, + host_resizable_buffer, host_mr, host_mr, + host_copy, data::buffer_type::resizable)); +CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, host, + host_mr, host_mr, host_copy)); } // namespace vecmem::benchmark diff --git a/benchmarks/cuda/CMakeLists.txt b/benchmarks/cuda/CMakeLists.txt index ab2c4977..a962baaa 100644 --- a/benchmarks/cuda/CMakeLists.txt +++ b/benchmarks/cuda/CMakeLists.txt @@ -11,7 +11,8 @@ include( vecmem-compiler-options-cuda ) # Set up the benchmark(s) for the CUDA library. add_executable( vecmem_benchmark_cuda "benchmark_cuda.cpp" - "benchmark_copy.cpp" ) + "benchmark_copy.cpp" + "benchmark_edm_copy.cpp" ) target_link_libraries( vecmem_benchmark_cuda diff --git a/benchmarks/cuda/benchmark_edm_copy.cpp b/benchmarks/cuda/benchmark_edm_copy.cpp new file mode 100644 index 00000000..57c65d4f --- /dev/null +++ b/benchmarks/cuda/benchmark_edm_copy.cpp @@ -0,0 +1,91 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include +#include +#include +#include +#include + +// Common benchmark include(s). +#include "../common/simple_aos_copy_benchmarks.hpp" +#include "../common/simple_soa_copy_benchmarks.hpp" + +// Google benchmark include(s). +#include + +namespace vecmem::benchmark { + +/// CUDA stream to use in the benchmark(s). +static cuda::stream_wrapper stream; + +/// Non-pinned host memory resource to use in the benchmark(s). +static host_memory_resource host_mr; +/// Pinned host memory resource to use in the benchmark(s). +static cuda::host_memory_resource cuda_host_mr; + +/// Device memory resource to use in the benchmark(s). +static cuda::device_memory_resource device_mr; + +/// The host copy object to use in the benchmark(s). +static copy host_copy; +/// The synchronous device copy object to use in the benchmark(s). +static cuda::copy device_copy; +/// The asynchronous device copy object to use in the benchmark(s). +static cuda::async_copy async_device_copy{stream}; + +// +// Helper macro(s) for setting up all the different benchmarks. +// +#define CONFIGURE_BENCHMARK(BM) BM->Range(1UL, 1UL << 26) + +#define EDM_COPY_BENCHMARKS(TITLE, HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY) \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, HOST_COPY, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY, \ + data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_resizable_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, \ + TITLE, HOST_MR, DEVICE_MR, \ + DEVICE_COPY)) + +// +// Set up all the different benchmarks. +// +EDM_COPY_BENCHMARKS(cuda_pageable_sync, host_mr, device_mr, host_copy, + device_copy); +EDM_COPY_BENCHMARKS(cuda_pageable_async, host_mr, device_mr, host_copy, + async_device_copy); +EDM_COPY_BENCHMARKS(cuda_pinned_sync, cuda_host_mr, device_mr, host_copy, + device_copy); +EDM_COPY_BENCHMARKS(cuda_pinned_async, cuda_host_mr, device_mr, host_copy, + async_device_copy); + +} // namespace vecmem::benchmark diff --git a/benchmarks/sycl/CMakeLists.txt b/benchmarks/sycl/CMakeLists.txt index 6abb44f1..382d2da4 100644 --- a/benchmarks/sycl/CMakeLists.txt +++ b/benchmarks/sycl/CMakeLists.txt @@ -11,7 +11,8 @@ include( vecmem-compiler-options-sycl ) # Set up the benchmark(s) for the SYCL library. add_executable( vecmem_benchmark_sycl "benchmark_sycl.cpp" - "benchmark_copy.cpp" ) + "benchmark_copy.cpp" + "benchmark_edm_copy.cpp") target_link_libraries( vecmem_benchmark_sycl PRIVATE vecmem::sycl vecmem_benchmark_common benchmark::benchmark benchmark::benchmark_main diff --git a/benchmarks/sycl/benchmark_edm_copy.cpp b/benchmarks/sycl/benchmark_edm_copy.cpp new file mode 100644 index 00000000..908a8605 --- /dev/null +++ b/benchmarks/sycl/benchmark_edm_copy.cpp @@ -0,0 +1,91 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include +#include +#include +#include +#include + +// Common benchmark include(s). +#include "../common/simple_aos_copy_benchmarks.hpp" +#include "../common/simple_soa_copy_benchmarks.hpp" + +// Google benchmark include(s). +#include + +namespace vecmem::benchmark { + +/// SYCL queue to use in the benchmark(s). +static sycl::queue_wrapper queue; + +/// Non-pinned host memory resource to use in the benchmark(s). +static host_memory_resource host_mr; +/// Pinned host memory resource to use in the benchmark(s). +static sycl::host_memory_resource sycl_host_mr{queue}; + +/// Device memory resource to use in the benchmark(s). +static sycl::device_memory_resource device_mr{queue}; + +/// The host copy object to use in the benchmark(s). +static copy host_copy; +/// The synchronous device copy object to use in the benchmark(s). +static sycl::copy device_copy{queue}; +/// The asynchronous device copy object to use in the benchmark(s). +static sycl::async_copy async_device_copy{queue}; + +// +// Helper macro(s) for setting up all the different benchmarks. +// +#define CONFIGURE_BENCHMARK(BM) BM->Range(1UL, 1UL << 26) + +#define EDM_COPY_BENCHMARKS(TITLE, HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY) \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, HOST_COPY, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY, \ + data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_resizable_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, \ + TITLE, HOST_MR, DEVICE_MR, \ + DEVICE_COPY)) + +// +// Set up all the different benchmarks. +// +EDM_COPY_BENCHMARKS(sycl_pageable_sync, host_mr, device_mr, host_copy, + device_copy); +EDM_COPY_BENCHMARKS(sycl_pageable_async, host_mr, device_mr, host_copy, + async_device_copy); +EDM_COPY_BENCHMARKS(sycl_pinned_sync, sycl_host_mr, device_mr, host_copy, + device_copy); +EDM_COPY_BENCHMARKS(sycl_pinned_async, sycl_host_mr, device_mr, host_copy, + async_device_copy); + +} // namespace vecmem::benchmark From 288f9fcb77a3d59d99afa752fd50f29b228ffa2d Mon Sep 17 00:00:00 2001 From: Attila Krasznahorkay Date: Tue, 1 Oct 2024 15:51:55 +0200 Subject: [PATCH 4/4] Introduced benchmarks for HIP. Simply copying the current CUDA benchmark code, with all its imperfections. --- benchmarks/CMakeLists.txt | 3 + benchmarks/hip/CMakeLists.txt | 29 +++++ benchmarks/hip/benchmark_copy.cpp | 166 ++++++++++++++++++++++++++ benchmarks/hip/benchmark_edm_copy.cpp | 81 +++++++++++++ benchmarks/hip/benchmark_hip.cpp | 112 +++++++++++++++++ 5 files changed, 391 insertions(+) create mode 100644 benchmarks/hip/CMakeLists.txt create mode 100644 benchmarks/hip/benchmark_copy.cpp create mode 100644 benchmarks/hip/benchmark_edm_copy.cpp create mode 100644 benchmarks/hip/benchmark_hip.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 97626d54..17bd628d 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -50,6 +50,9 @@ add_subdirectory(core) if(VECMEM_BUILD_CUDA_LIBRARY) add_subdirectory(cuda) endif() +if(VECMEM_BUILD_HIP_LIBRARY) + add_subdirectory(hip) +endif() if(VECMEM_BUILD_SYCL_LIBRARY) add_subdirectory(sycl) endif() diff --git a/benchmarks/hip/CMakeLists.txt b/benchmarks/hip/CMakeLists.txt new file mode 100644 index 00000000..278e4a3d --- /dev/null +++ b/benchmarks/hip/CMakeLists.txt @@ -0,0 +1,29 @@ +# VecMem project, part of the ACTS project (R&D line) +# +# (c) 2024 CERN for the benefit of the ACTS project +# +# Mozilla Public License Version 2.0 + +# Project include(s). +include( vecmem-compiler-options-cpp ) +include( vecmem-compiler-options-hip ) + +# Set up the benchmark(s) for the HIP library. +add_executable( vecmem_benchmark_hip + "benchmark_hip.cpp" + "benchmark_copy.cpp" + "benchmark_edm_copy.cpp" ) + +target_link_libraries( + vecmem_benchmark_hip + + PRIVATE + vecmem::core + vecmem::hip + vecmem_benchmark_common + benchmark::benchmark + benchmark::benchmark_main +) + +set_target_properties( vecmem_benchmark_hip PROPERTIES + FOLDER "vecmem/benchmarks" ) diff --git a/benchmarks/hip/benchmark_copy.cpp b/benchmarks/hip/benchmark_copy.cpp new file mode 100644 index 00000000..8844a8cc --- /dev/null +++ b/benchmarks/hip/benchmark_copy.cpp @@ -0,0 +1,166 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2022-2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include +#include + +// Common benchmark include(s). +#include "../common/make_jagged_sizes.hpp" +#include "../common/make_jagged_vector.hpp" + +// Google benchmark include(s). +#include + +// System include(s). +#include +#include + +namespace vecmem::hip::benchmark { + +/// The (host) memory resource to use in the benchmark(s). +static vecmem::host_memory_resource host_mr; +/// The (device) memory resource to use in the benchmark(s). +static device_memory_resource device_mr; +/// The copy object to use in the benchmark(s). +static copy hip_copy; + +/// Function benchmarking "unknown" host-to-device jagged vector copies +void jaggedVectorUnknownHtoDCopy(::benchmark::State& state) { + + // Generate the sizes of the jagged vector/buffer for the test. + const std::vector sizes = + vecmem::benchmark::make_jagged_sizes(state.range(0), state.range(1)); + + // Set custom "counters" for the benchmark. + const std::size_t bytes = std::accumulate(sizes.begin(), sizes.end(), + static_cast(0u)) * + sizeof(int); + state.counters["Bytes"] = static_cast(bytes); + state.counters["Rate"] = + ::benchmark::Counter(static_cast(bytes), + ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the "source vector". + jagged_vector source = + vecmem::benchmark::make_jagged_vector(sizes, host_mr); + const data::jagged_vector_data source_data = get_data(source); + // Create the "destination buffer". + data::jagged_vector_buffer dest(sizes, device_mr, &host_mr); + hip_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + hip_copy(source_data, dest)->wait(); + } +} +// Set up the benchmark. +BENCHMARK(jaggedVectorUnknownHtoDCopy)->Ranges({{10, 100000}, {50, 5000}}); + +/// Function benchmarking "known" host-to-device jagged vector copies +void jaggedVectorKnownHtoDCopy(::benchmark::State& state) { + + // Generate the sizes of the jagged vector/buffer for the test. + const std::vector sizes = + vecmem::benchmark::make_jagged_sizes(state.range(0), state.range(1)); + + // Set custom "counters" for the benchmark. + const std::size_t bytes = std::accumulate(sizes.begin(), sizes.end(), + static_cast(0u)) * + sizeof(int); + state.counters["Bytes"] = static_cast(bytes); + state.counters["Rate"] = + ::benchmark::Counter(static_cast(bytes), + ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the "source vector". + jagged_vector source = + vecmem::benchmark::make_jagged_vector(sizes, host_mr); + const data::jagged_vector_data source_data = get_data(source); + // Create the "destination buffer". + data::jagged_vector_buffer dest(sizes, device_mr, &host_mr); + hip_copy.setup(dest)->wait(); + + // Perform the copy benchmark. + for (auto _ : state) { + hip_copy(source_data, dest, copy::type::host_to_device)->wait(); + } +} +// Set up the benchmark. +BENCHMARK(jaggedVectorKnownHtoDCopy)->Ranges({{10, 100000}, {50, 5000}}); + +/// Function benchmarking "unknown" device-to-host jagged vector copies +void jaggedVectorUnknownDtoHCopy(::benchmark::State& state) { + + // Generate the sizes of the jagged vector/buffer for the test. + const std::vector sizes = + vecmem::benchmark::make_jagged_sizes(state.range(0), state.range(1)); + + // Set custom "counters" for the benchmark. + const std::size_t bytes = std::accumulate(sizes.begin(), sizes.end(), + static_cast(0u)) * + sizeof(int); + state.counters["Bytes"] = static_cast(bytes); + state.counters["Rate"] = + ::benchmark::Counter(static_cast(bytes), + ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the "source buffer". + data::jagged_vector_buffer source(sizes, device_mr, &host_mr); + hip_copy.setup(source)->wait(); + // Create the "destination vector". + jagged_vector dest = + vecmem::benchmark::make_jagged_vector(sizes, host_mr); + data::jagged_vector_data dest_data = get_data(dest); + + // Perform the copy benchmark. + for (auto _ : state) { + hip_copy(source, dest_data)->wait(); + } +} +// Set up the benchmark. +BENCHMARK(jaggedVectorUnknownDtoHCopy)->Ranges({{10, 100000}, {50, 5000}}); + +/// Function benchmarking "known" device-to-host jagged vector copies +void jaggedVectorKnownDtoHCopy(::benchmark::State& state) { + + // Generate the sizes of the jagged vector/buffer for the test. + const std::vector sizes = + vecmem::benchmark::make_jagged_sizes(state.range(0), state.range(1)); + + // Set custom "counters" for the benchmark. + const std::size_t bytes = std::accumulate(sizes.begin(), sizes.end(), + static_cast(0u)) * + sizeof(int); + state.counters["Bytes"] = static_cast(bytes); + state.counters["Rate"] = + ::benchmark::Counter(static_cast(bytes), + ::benchmark::Counter::kIsIterationInvariantRate, + ::benchmark::Counter::kIs1024); + + // Create the "source buffer". + data::jagged_vector_buffer source(sizes, device_mr, &host_mr); + hip_copy.setup(source)->wait(); + // Create the "destination vector". + jagged_vector dest = + vecmem::benchmark::make_jagged_vector(sizes, host_mr); + data::jagged_vector_data dest_data = get_data(dest); + + // Perform the copy benchmark. + for (auto _ : state) { + hip_copy(source, dest_data, copy::type::device_to_host)->wait(); + } +} +// Set up the benchmark. +BENCHMARK(jaggedVectorKnownDtoHCopy)->Ranges({{10, 100000}, {50, 5000}}); + +} // namespace vecmem::hip::benchmark diff --git a/benchmarks/hip/benchmark_edm_copy.cpp b/benchmarks/hip/benchmark_edm_copy.cpp new file mode 100644 index 00000000..20148b44 --- /dev/null +++ b/benchmarks/hip/benchmark_edm_copy.cpp @@ -0,0 +1,81 @@ +/* + * VecMem project, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include +#include +#include +#include + +// Common benchmark include(s). +#include "../common/simple_aos_copy_benchmarks.hpp" +#include "../common/simple_soa_copy_benchmarks.hpp" + +// Google benchmark include(s). +#include + +namespace vecmem::benchmark { + +/// Non-pinned host memory resource to use in the benchmark(s). +static host_memory_resource host_mr; +/// Pinned host memory resource to use in the benchmark(s). +static hip::host_memory_resource hip_host_mr; + +/// Device memory resource to use in the benchmark(s). +static hip::device_memory_resource device_mr; + +/// The host copy object to use in the benchmark(s). +static copy host_copy; +/// The synchronous device copy object to use in the benchmark(s). +static hip::copy device_copy; + +// +// Helper macro(s) for setting up all the different benchmarks. +// +#define CONFIGURE_BENCHMARK(BM) BM->Range(1UL, 1UL << 26) + +#define EDM_COPY_BENCHMARKS(TITLE, HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY) \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_direct_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, HOST_COPY, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_soa_optimal_h2d_copy_benchmark, TITLE##_resizable_buffer, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY, \ + data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_direct_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK( \ + BENCHMARK_CAPTURE(simple_soa_optimal_d2h_copy_benchmark, TITLE, \ + HOST_MR, DEVICE_MR, HOST_COPY, DEVICE_COPY)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_fixed_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::fixed_size)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE( \ + simple_aos_h2d_copy_benchmark, TITLE##_resizable_buffer, HOST_MR, \ + DEVICE_MR, DEVICE_COPY, data::buffer_type::resizable)); \ + CONFIGURE_BENCHMARK(BENCHMARK_CAPTURE(simple_aos_d2h_copy_benchmark, \ + TITLE, HOST_MR, DEVICE_MR, \ + DEVICE_COPY)) + +// +// Set up all the different benchmarks. +// +EDM_COPY_BENCHMARKS(hip_pageable_sync, host_mr, device_mr, host_copy, + device_copy); +EDM_COPY_BENCHMARKS(hip_pinned_sync, hip_host_mr, device_mr, host_copy, + device_copy); + +} // namespace vecmem::benchmark diff --git a/benchmarks/hip/benchmark_hip.cpp b/benchmarks/hip/benchmark_hip.cpp new file mode 100644 index 00000000..7c96b730 --- /dev/null +++ b/benchmarks/hip/benchmark_hip.cpp @@ -0,0 +1,112 @@ +/* VecMem project, part of the ACTS project (R&D line) + * + * (c) 2021-2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// VecMem include(s). +#include +#include +#include +#include + +// Google benchmark include(s). +#include + +#include + +static vecmem::hip::device_memory_resource device_mr; +void BenchmarkHipDevice(benchmark::State& state) { + const std::size_t size = static_cast(state.range(0)); + for (auto _ : state) { + void* p = device_mr.allocate(size); + device_mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipDevice)->RangeMultiplier(2)->Range(1, 2UL << 31); + +void BenchmarkHipDeviceBinaryPage(benchmark::State& state) { + std::size_t size = static_cast(state.range(0)); + + vecmem::binary_page_memory_resource mr(device_mr); + + for (auto _ : state) { + void* p = mr.allocate(size); + mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipDeviceBinaryPage) + ->RangeMultiplier(2) + ->Range(1, 2UL << 31); + +void BenchmarkHipDeviceBinaryPageMultiple(benchmark::State& state) { + std::size_t size = static_cast(state.range(0)); + std::size_t nallocs = static_cast(state.range(1)); + + std::vector allocs; + + allocs.reserve(nallocs); + + for (auto _ : state) { + vecmem::binary_page_memory_resource mr(device_mr); + for (std::size_t i = 0; i < nallocs; ++i) { + allocs[i] = mr.allocate(size); + } + + for (std::size_t i = 0; i < nallocs; ++i) { + mr.deallocate(allocs[i], size); + } + } +} +BENCHMARK(BenchmarkHipDeviceBinaryPageMultiple) + ->RangeMultiplier(2) + ->Ranges({{1, 2UL << 21}, {1, 1024}}); + +static vecmem::hip::host_memory_resource host_mr; +void BenchmarkHipPinned(benchmark::State& state) { + const std::size_t size = static_cast(state.range(0)); + for (auto _ : state) { + void* p = host_mr.allocate(size); + host_mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipPinned)->RangeMultiplier(2)->Range(1, 2UL << 31); + +void BenchmarkHipPinnedBinaryPage(benchmark::State& state) { + std::size_t size = static_cast(state.range(0)); + + vecmem::binary_page_memory_resource mr(host_mr); + + for (auto _ : state) { + void* p = mr.allocate(size); + mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipPinnedBinaryPage) + ->RangeMultiplier(2) + ->Range(1, 2UL << 31); + +static vecmem::hip::managed_memory_resource managed_mr; +void BenchmarkHipManaged(benchmark::State& state) { + const std::size_t size = static_cast(state.range(0)); + for (auto _ : state) { + void* p = managed_mr.allocate(size); + managed_mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipManaged)->RangeMultiplier(2)->Range(1, 2UL << 31); + +void BenchmarkHipManagedBinaryPage(benchmark::State& state) { + std::size_t size = static_cast(state.range(0)); + + vecmem::binary_page_memory_resource mr(managed_mr); + + for (auto _ : state) { + void* p = mr.allocate(size); + mr.deallocate(p, size); + } +} +BENCHMARK(BenchmarkHipManagedBinaryPage) + ->RangeMultiplier(2) + ->Range(1, 2UL << 31);