Skip to content

Commit

Permalink
enabled sort benchmarks in mhp, fixed slow benchmarks in mhp (#547)
Browse files Browse the repository at this point in the history
  • Loading branch information
lslusarczyk authored Sep 8, 2023
1 parent c618154 commit 705893d
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 9 deletions.
8 changes: 8 additions & 0 deletions benchmarks/gbench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
GIT_TAG v1.8.0)
FetchContent_MakeAvailable(googlebench)

if(ENABLE_CUDA)
# Do not treat the cuda-compat diagnostic as an error in CUDA builds.
# Compiling sort.cpp otherwise fails on this warning, reported from
# dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h:
#   "attribute argument 16 is invalid and will be ignored; CUDA requires
#   sub_group size 32"
add_compile_options(-Wno-error=cuda-compat)
endif()

# mhp is not under ENABLE_SYCL so that benchmark compilation is also checked with gcc
add_subdirectory(mhp)

Expand Down
15 changes: 11 additions & 4 deletions benchmarks/gbench/common/dr_bench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,18 @@ inline auto device_info(sycl::device device) {
#ifdef BENCH_MHP
#ifdef SYCL_LANGUAGE_VERSION

inline sycl::context *mhp_global_context_ = nullptr;
inline std::vector<sycl::device> devices;

inline sycl::queue get_queue() {
std::vector<sycl::device> devices;
if (mhp_global_context_ != nullptr) {
return sycl::queue(*mhp_global_context_, devices[0]);
}

auto root_devices = sycl::platform().get_devices();

for (auto &&root_device : root_devices) {
dr::drlog.debug("Root device: {}\n",
for (auto &&[idx, root_device] : rng::views::enumerate(root_devices)) {
dr::drlog.debug("Root device no {}: {}\n", idx,
root_device.get_info<sycl::info::device::name>());
if (dr::__detail::partitionable(root_device)) {
auto subdevices = root_device.create_sub_devices<
Expand All @@ -81,7 +86,9 @@ inline sycl::queue get_queue() {
}

assert(rng::size(devices) > 0);
return sycl::queue(devices[0]);

mhp_global_context_ = new sycl::context(devices);
return sycl::queue(*mhp_global_context_, devices[0]);
}

#endif
Expand Down
19 changes: 14 additions & 5 deletions benchmarks/gbench/common/sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ template <rng::forward_range X> void fill_random(X &&x) {

class DRSortFixture : public benchmark::Fixture {
protected:
dr::shp::distributed_vector<T> *a;
xhp::distributed_vector<T> *a;

public:
void SetUp(::benchmark::State &) {
a = new dr::shp::distributed_vector<T>(default_vector_size);
a = new xhp::distributed_vector<T>(default_vector_size);
fill_random(*a);
}

Expand All @@ -29,27 +29,34 @@ BENCHMARK_DEFINE_F(DRSortFixture, Sort_DR)(benchmark::State &state) {
Stats stats(state, sizeof(T) * a->size());
for (auto _ : state) {
state.PauseTiming();
dr::shp::distributed_vector<T> vec{*a};
xhp::distributed_vector<T> vec(a->size());
xhp::copy(*a, rng::begin(vec));
stats.rep();
state.ResumeTiming();

dr::shp::sort(vec);
// sort not implemented in mhp yet
#ifdef BENCH_SHP
xhp::sort(vec);
#endif
}
}

DR_BENCHMARK_REGISTER_F(DRSortFixture, Sort_DR);

#ifdef SYCL_LANGUAGE_VERSION
class SyclSortFixture : public benchmark::Fixture {
protected:
std::vector<T> local_vec;

sycl::queue queue;
oneapi::dpl::execution::device_policy<> policy;
T *vec;

public:
void SetUp(::benchmark::State &) {
dr::drlog.debug("setting up SyclSortFixture\n");
// when using mhp's get_queue() long execution is observed in this test
// (probably due to JIT), now shp and shp use their own get_queue-s
// (probably due to JIT), now mhp and shp use their own get_queue-s
queue = get_queue();
policy = oneapi::dpl::execution::make_device_policy(queue);
local_vec = std::vector<T>(default_vector_size);
Expand All @@ -58,6 +65,7 @@ class SyclSortFixture : public benchmark::Fixture {
}

void TearDown(::benchmark::State &state) {
dr::drlog.debug("tearing down SyclSortFixture\n");
// copy back to check if last sort really sorted
queue.memcpy(local_vec.data(), vec, default_vector_size * sizeof(T)).wait();
sycl::free(vec, queue);
Expand Down Expand Up @@ -103,6 +111,7 @@ BENCHMARK_DEFINE_F(SyclSortFixture, Sort_DPL)(benchmark::State &state) {
}

DR_BENCHMARK_REGISTER_F(SyclSortFixture, Sort_DPL);
#endif

class StdSortFixture : public benchmark::Fixture {
protected:
Expand Down
1 change: 1 addition & 0 deletions benchmarks/gbench/mhp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_executable(
../common/distributed_vector.cpp
../common/dot_product.cpp
../common/inclusive_scan.cpp
../common/sort.cpp
../common/stream.cpp
wave_equation.cpp
rooted.cpp
Expand Down

0 comments on commit 705893d

Please sign in to comment.