Skip to content

Commit

Permalink
fixed sort benchmarks, shp uses its own specific get_queue in benchma…
Browse files Browse the repository at this point in the history
…rks (#544)
  • Loading branch information
lslusarczyk authored Sep 7, 2023
1 parent 4f55fea commit c618154
Showing 3 changed files with 114 additions and 40 deletions.
16 changes: 13 additions & 3 deletions benchmarks/gbench/common/dr_bench.hpp
Original file line number Diff line number Diff line change
@@ -36,13 +36,23 @@ extern bool weak_scaling;
// Registers benchmark function `x` via BENCHMARK and configures it to measure
// real time and to report in milliseconds.
#define DR_BENCHMARK_BASE(x) \
BENCHMARK(x)->UseRealTime()->Unit(benchmark::kMillisecond)

#ifdef SYCL_LANGUAGE_VERSION
// Registers the fixture benchmark `case` of `fixture` via BENCHMARK_REGISTER_F
// and configures it to measure real time, report in milliseconds, and use a
// minimum warm-up time and a minimum run time of .1 each (seconds, per the
// Google Benchmark API).
#define DR_BENCHMARK_REGISTER_F(fixture, case) \
BENCHMARK_REGISTER_F(fixture, case) \
->UseRealTime() \
->Unit(benchmark::kMillisecond) \
->MinWarmUpTime(.1) \
->MinTime(.1)

#ifdef SYCL_LANGUAGE_VERSION
// Builds a one-line, human-readable description of a SYCL device consisting of
// its name followed by its max_compute_units value.
inline auto device_info(sycl::device device) {
  const auto device_name = device.get_info<sycl::info::device::name>();
  const auto compute_units =
      device.get_info<sycl::info::device::max_compute_units>();
  return fmt::format("{}, max_compute_units: {}", device_name, compute_units);
}
#endif

#ifdef BENCH_MHP
#ifdef SYCL_LANGUAGE_VERSION

inline sycl::queue get_queue() {
std::vector<sycl::device> devices;
@@ -76,8 +86,6 @@ inline sycl::queue get_queue() {

#endif

#ifdef BENCH_MHP

#include "dr/mhp.hpp"

namespace xhp = dr::mhp;
@@ -95,6 +103,8 @@ extern bool check_results;

namespace xhp = dr::shp;

// get_queue() variant used alongside `namespace xhp = dr::shp`: returns a
// reference to the queue obtained from dr::shp::__detail::default_queue().
inline sycl::queue &get_queue() {
  sycl::queue &shp_queue = dr::shp::__detail::default_queue();
  return shp_queue;
}

#endif

class Stats {
128 changes: 92 additions & 36 deletions benchmarks/gbench/common/sort.cpp
Original file line number Diff line number Diff line change
@@ -12,70 +12,126 @@ template <rng::forward_range X> void fill_random(X &&x) {
}
}

static void Sort_DR(benchmark::State &state) {
dr::shp::distributed_vector<T> a(default_vector_size);
fill_random(a);
Stats stats(state, sizeof(T) * a.size());
// Fixture for the distributed sort benchmark. It owns a
// dr::shp::distributed_vector<T> of default_vector_size elements that is
// created and filled by fill_random in SetUp and destroyed in TearDown.
class DRSortFixture : public benchmark::Fixture {
protected:
  // Allocated in SetUp, released in TearDown; null outside that window so a
  // stray TearDown never deletes an indeterminate pointer.
  dr::shp::distributed_vector<T> *a = nullptr;

public:
  // Allocates the vector and fills it with random data.
  void SetUp(::benchmark::State &) override {
    a = new dr::shp::distributed_vector<T>(default_vector_size);
    fill_random(*a);
  }

  // Destroys the vector and clears the pointer so it cannot dangle.
  void TearDown(::benchmark::State &) override {
    delete a;
    a = nullptr;
  }
};

// Benchmark case on DRSortFixture: in every iteration a new distributed_vector
// `vec` is constructed from the fixture's `*a` and then passed to
// dr::shp::sort.
BENCHMARK_DEFINE_F(DRSortFixture, Sort_DR)(benchmark::State &state) {
Stats stats(state, sizeof(T) * a->size());
for (auto _ : state) {
// Timing is paused while `vec` is constructed and stats.rep() is called.
state.PauseTiming();
dr::shp::distributed_vector<T> vec{*a};
stats.rep();
// NOTE(review): `a` is a pointer member of DRSortFixture; this call looks like
// a pre-change line left in by the diff rendering - confirm and drop if so.
dr::shp::sort(a);
state.ResumeTiming();

// The sort of `vec` runs after ResumeTiming, i.e. with timing active.
dr::shp::sort(vec);
}
}

DR_BENCHMARK(Sort_DR);
DR_BENCHMARK_REGISTER_F(DRSortFixture, Sort_DR);

// Fixture for the SYCL sort benchmarks. SetUp fills a host vector with random
// data and allocates a device buffer with the same element count; TearDown
// copies the device buffer back to the host, frees it, and reports an error
// through the benchmark state if the copied data is not sorted.
class SyclSortFixture : public benchmark::Fixture {
protected:
  std::vector<T> local_vec; // host-side data, filled by fill_random in SetUp
  sycl::queue queue;        // reassigned from get_queue() in SetUp
  oneapi::dpl::execution::device_policy<> policy; // rebuilt from `queue` in SetUp
  // Device allocation of default_vector_size elements; null outside the
  // SetUp..TearDown window so it never holds an indeterminate value.
  T *vec = nullptr;

public:
  // Picks the queue, derives the execution policy from it, prepares the random
  // host data and allocates the device buffer.
  void SetUp(::benchmark::State &) override {
    // when using mhp's get_queue() long execution is observed in this test
    // (probably due to JIT), now mhp and shp use their own get_queue-s
    queue = get_queue();
    policy = oneapi::dpl::execution::make_device_policy(queue);
    local_vec = std::vector<T>(default_vector_size);
    fill_random(local_vec);
    vec = sycl::malloc_device<T>(default_vector_size, queue);
  }

  // Verifies the result of the last sort and releases the device buffer.
  void TearDown(::benchmark::State &state) override {
    // copy back to check if last sort really sorted
    queue.memcpy(local_vec.data(), vec, default_vector_size * sizeof(T)).wait();
    sycl::free(vec, queue);
    vec = nullptr; // do not keep a dangling device pointer around

    if (!rng::is_sorted(local_vec)) {
      state.SkipWithError("sycl sort did not sort the vector");
    }
  }
};

#ifdef SYCL_LANGUAGE_VERSION
static void Sort_EXP(benchmark::State &state) {
auto q = get_queue();
auto policy = oneapi::dpl::execution::make_device_policy(q);
std::vector<T> a_local(default_vector_size);
fill_random(a_local);
auto a = sycl::malloc_device<T>(default_vector_size, q);
q.memcpy(a, a_local.data(), default_vector_size * sizeof(T)).wait();
// Benchmark case on SyclSortFixture: in every iteration the device buffer
// `vec` is refilled from `local_vec` and then sorted with std::sort using the
// fixture's `policy`.
BENCHMARK_DEFINE_F(SyclSortFixture, Sort_EXP)(benchmark::State &state) {
Stats stats(state, sizeof(T) * default_vector_size);

for (auto _ : state) {

// Timing is paused while the unsorted host data is copied into `vec` (the
// copy is waited on) and stats.rep() is called.
state.PauseTiming();
queue.memcpy(vec, local_vec.data(), default_vector_size * sizeof(T)).wait();
stats.rep();
// NOTE(review): `a` is not declared in this block or in SyclSortFixture; this
// line looks like a pre-change leftover from the diff rendering - confirm.
std::sort(policy, a, a + default_vector_size);
state.ResumeTiming();

// The sort of `vec` runs after ResumeTiming, i.e. with timing active.
std::sort(policy, vec, vec + default_vector_size);
}
// NOTE(review): `a` and `q` are not declared here, and SyclSortFixture's
// TearDown already frees `vec`; presumably another pre-change leftover - confirm.
sycl::free(a, q);
}

DR_BENCHMARK(Sort_EXP);

static void Sort_DPL(benchmark::State &state) {
auto q = get_queue();
auto policy = oneapi::dpl::execution::make_device_policy(q);
std::vector<T> a_local(default_vector_size);
fill_random(a_local);
auto a = sycl::malloc_device<T>(default_vector_size, q);
q.memcpy(a, a_local.data(), default_vector_size * sizeof(T)).wait();
std::span<T> d_a(a, default_vector_size);
dr::__detail::direct_iterator d_first(d_a.begin());
dr::__detail::direct_iterator d_last(d_a.end());
DR_BENCHMARK_REGISTER_F(SyclSortFixture, Sort_EXP);

// Benchmark case on SyclSortFixture: in every iteration the device buffer
// `vec` is refilled from `local_vec` and then sorted with
// oneapi::dpl::experimental::sort_async, waiting for the sort to finish.
BENCHMARK_DEFINE_F(SyclSortFixture, Sort_DPL)(benchmark::State &state) {
Stats stats(state, sizeof(T) * default_vector_size);

for (auto _ : state) {
// Timing is paused while the unsorted host data is copied into `vec` (the
// copy is waited on) and stats.rep() is called.
state.PauseTiming();
queue.memcpy(vec, local_vec.data(), default_vector_size * sizeof(T)).wait();
stats.rep();
state.ResumeTiming();

// Wrap the device pointer in a span and build direct_iterators over it; these
// iterators are what is handed to sort_async below.
std::span<T> d_a(vec, default_vector_size);
dr::__detail::direct_iterator d_first(d_a.begin());
dr::__detail::direct_iterator d_last(d_a.end());
oneapi::dpl::experimental::sort_async(policy, d_first, d_last,
std::less<>{})
.wait();
}
// NOTE(review): `a` and `q` are not declared here, and SyclSortFixture's
// TearDown already frees `vec`; presumably a pre-change leftover from the diff
// rendering - confirm.
sycl::free(a, q);
}

DR_BENCHMARK(Sort_DPL);
#endif
DR_BENCHMARK_REGISTER_F(SyclSortFixture, Sort_DPL);

// Fixture for the std::sort benchmark. SetUp fills `vec_orig` with random
// data; TearDown reports an error through the benchmark state if `vec` is not
// sorted.
class StdSortFixture : public benchmark::Fixture {
protected:
  std::vector<T> vec_orig; // random input of default_vector_size elements
  std::vector<T> vec;      // vector whose sortedness TearDown verifies

public:
  // Re-creates the input vector and fills it with random data.
  void SetUp(::benchmark::State &) override {
    vec_orig = std::vector<T>(default_vector_size);
    fill_random(vec_orig);
  }

  // Flags the run as failed when `vec` is not sorted.
  void TearDown(::benchmark::State &state) override {
    if (!rng::is_sorted(vec)) {
      state.SkipWithError("std sort did not sort the vector");
    }
  }
};

static void Sort_Std(benchmark::State &state) {
std::vector<T> a(default_vector_size);
fill_random(a);
// Benchmark case on StdSortFixture: in every iteration `vec` is reset to the
// unsorted `vec_orig` and then sorted with std::sort.
BENCHMARK_DEFINE_F(StdSortFixture, Sort_Std)(benchmark::State &state) {
Stats stats(state, sizeof(T) * default_vector_size);

for (auto _ : state) {
// Timing is paused while `vec` is overwritten with the unsorted input and
// stats.rep() is called.
state.PauseTiming();
vec = vec_orig;
stats.rep();
// NOTE(review): `a` is not declared in this block or in StdSortFixture; this
// line looks like a pre-change leftover from the diff rendering - confirm.
std::sort(a.begin(), a.end());
state.ResumeTiming();
// The sort of `vec` runs after ResumeTiming, i.e. with timing active.
std::sort(vec.begin(), vec.end());
}
}

DR_BENCHMARK(Sort_Std);
DR_BENCHMARK_REGISTER_F(StdSortFixture, Sort_Std);
10 changes: 9 additions & 1 deletion benchmarks/gbench/shp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -4,15 +4,23 @@

set(CMAKE_INCLUDE_CURRENT_DIR ON)

if(ENABLE_CUDA)
# because sort.cpp compilation fails with
# dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h warning:
# attribute argument 16 is invalid and will be ignored; CUDA requires
# sub_group size 32
add_compile_options(-Wno-error=cuda-compat)
endif()

# cmake-format: off
add_executable(
shp-bench
shp-bench.cpp
gemm.cpp
../common/sort.cpp
../common/distributed_vector.cpp
../common/dot_product.cpp
../common/inclusive_scan.cpp
../common/sort.cpp
../common/stream.cpp)
# cmake-format: on

0 comments on commit c618154

Please sign in to comment.