diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8a48126e195..a5b248135c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,8 +40,13 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + cudf_benchmark_common OBJECT + "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp + common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp new file mode 100644 index 00000000000..0b9fc17e779 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark_utilities.hpp" + +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) +{ + state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); +} + +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) +{ + state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); +} diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp new file mode 100644 index 00000000000..c5c80e73674 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Sets the number of items processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetItemsProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. + * + * @param state the benchmark state + * @param items_processed_per_iteration number of items processed per iteration + */ +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration); + +/** + * @brief Sets the number of bytes processed during the benchmark. 
+ * + * This function could be used instead of ::benchmark::State.SetBytesProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. + * + * @param state the benchmark state + * @param bytes_processed_per_iteration number of bytes processed per iteration + */ +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration); diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp new file mode 100644 index 00000000000..c740eaa52f4 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nvbench_utilities.hpp" + +#include <nvbench/nvbench.cuh> + +// This function is copied over from +// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. 
+void set_throughputs(nvbench::state& state) +{ + double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + + if (const auto items = state.get_element_count(); items != 0) { + auto& summ = state.add_summary("nv/cold/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cuda_time); + } + + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; + { + auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + summ.set_string("description", + "Number of bytes read/written per second to the CUDA " + "device's global memory"); + summ.set_float64("value", avg_used_gmem_bw); + } + + { + const auto peak_gmem_bw = + static_cast(state.get_device()->get_global_memory_bus_bandwidth()); + + auto& summ = state.add_summary("nv/cold/bw/global/utilization"); + summ.set_string("name", "BWUtil"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Global device memory utilization as a percentage of the " + "device's peak bandwidth"); + summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); + } + } +} diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp new file mode 100644 index 00000000000..98d879efac5 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvbench { +struct state; +} + +/** + * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the + * nvbench results summary. + * + * This function could be used to work around a known issue that the throughput statistics + * should be added before the nvbench::state.exec() call, otherwise they will not be printed + * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. + */ +void set_throughputs(nvbench::state& state); diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp new file mode 100644 index 00000000000..a6fbdac9fb8 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "table_utilities.hpp" + +#include +#include + +#include + +int64_t estimate_size(cudf::column_view const& col) +{ + return estimate_size(cudf::table_view({col})); +} + +int64_t estimate_size(cudf::table_view const& view) +{ + // Compute the size in bits for each row. + auto const row_sizes = cudf::row_bit_count(view); + // Accumulate the row sizes to compute a sum. + auto const agg = cudf::make_sum_aggregation(); + cudf::data_type sum_dtype{cudf::type_id::INT64}; + auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); + auto const total_size_in_bits = + static_cast*>(total_size_scalar.get())->value(); + // Convert the size in bits to the size in bytes. + return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); +} diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp new file mode 100644 index 00000000000..04ee847d397 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Estimates the column size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. 
+ * + * @param view The column view to estimate its size + */ +int64_t estimate_size(cudf::column_view const& view); + +/** + * @brief Estimates the table size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The table view to estimate its size + */ +int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 8b1e71c1585..e9d23881764 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index c1c44c919ac..5095337dbb3 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + + // We don't set the metrics for the size read/written as row_bit_count() doesn't + // support the dictionary type yet (and neither does estimate_size()). + // See https://github.com/rapidsai/cudf/issues/16121 for details. } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 963c26692e7..050f2887221 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -28,14 +30,19 @@ template void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); + auto const dtype_id = cudf::type_to_id(); auto const input_column = - create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); + create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { cuda_event_timer timer(state, true); auto result = cudf::minmax(*input_column); } + + // The benchmark takes a column and produces two scalars. 
+ set_items_processed(state, column_size + 2); + cudf::data_type dtype = cudf::data_type{dtype_id}; + set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index e55f3b9e09f..14876c80d3e 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::inclusive_dense_rank_scan( + result = cudf::detail::inclusive_dense_rank_scan( input, stream_view, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input.size()); + state.add_global_memory_reads(estimate_size(input)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } using data_type = nvbench::type_list; diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 5bd3e2e3bba..63c96f4fe9e 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. 
*/ +#include #include +#include #include #include @@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptrview()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 8c9883ece9c..dc05aad9807 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,11 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); + std::unique_ptr result = nullptr; for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::scan( + result = cudf::scan( *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } + + // The benchmark takes a column and produces a new column of the same size as input. + set_items_processed(state, n_rows * 2); + set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view())); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index ee97b54fbef..a781f75a314 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -45,16 +47,24 @@ static void nvbench_structs_scan(nvbench::state& state) auto [null_mask, null_count] = create_random_null_mask(size, null_probability); auto const input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); + auto input_view = input->view(); auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + std::unique_ptr result = nullptr; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = cudf::detail::scan_inclusive( - *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); }); + + state.add_element_count(input_view.size()); + state.add_global_memory_reads(estimate_size(input_view)); + state.add_global_memory_writes(estimate_size(result->view())); + + set_throughputs(state); } NVBENCH_BENCH(nvbench_structs_scan)