From cb24f6da0669856f31d9670ab5cf327213e6757f Mon Sep 17 00:00:00 2001
From: PhilipDeegan
Date: Sun, 20 Oct 2024 21:25:20 +0200
Subject: [PATCH] host index function

---
 inc/mkn/gpu/multi_launch.hpp | 30 +++++++++++++++++++++-
 inc/mkn/gpu/rocm.hpp         |  9 ++++---
 test/any/async_streaming.cpp | 48 ++++++++++++++++++++++++++++++++++--
 3 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp
index 933dc04..d6b47ad 100644
--- a/inc/mkn/gpu/multi_launch.hpp
+++ b/inc/mkn/gpu/multi_launch.hpp
@@ -252,7 +252,7 @@ struct StreamGroupBarrierFunction : StreamGroupFunction {
     v.reserve(groups);
     for (std::size_t i = 0; i < groups; ++i) v.emplace_back(std::make_unique(self, i));
-    return std::move(v);
+    return v;
   }
 
   StreamGroupBarrierFunction(std::size_t const& gs, Strat& strat)
@@ -298,6 +298,27 @@ struct StreamHostGroupMutexFunction : StreamGroupFunction {
   std::vector mutices;
 };
 
+template <typename Strat, typename Fn>
+struct StreamHostGroupIndexFunction : StreamGroupFunction<Strat> {
+  using Super = StreamGroupFunction<Strat>;
+  using Super::strat;
+
+  std::string_view constexpr static MOD_GROUP_ERROR =
+      "mkn.gpu error: StreamHostGroupIndexFunction Group size must be a divisor of datas";
+
+  StreamHostGroupIndexFunction(std::size_t const& gs, std::size_t const& gid_, Strat& strat,
+                               Fn&& fn_)
+      : Super{gs, strat, StreamFunctionMode::HOST_WAIT}, fn{fn_}, gid{gid_} {}
+
+  void run(std::uint32_t const i) override {
+    if (i % Super::group_size == gid) fn(i);
+    strat.status[i] = SFS::WAIT;  // done
+  }
+
+  Fn fn;
+  std::size_t const gid;
+};
+
 template 
 struct ThreadedStreamLauncher : public StreamLauncher> {
   using This = ThreadedStreamLauncher;
@@ -343,6 +364,13 @@ struct ThreadedStreamLauncher : public StreamLauncher
+
+  template <typename Fn>
+  This& host_group_idx(std::size_t const& group_size, std::size_t const& group_idx, Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamHostGroupIndexFunction<This, Fn>>(
+        group_size, group_idx, *this, std::forward<Fn>(fn)));
+    return *this;
+  }
+
   void operator()() { join(); }
   Super& super() { return *this; }
   void super(std::size_t const& idx) { return super()(idx); }
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index 9917879..fb4cc3a 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -45,8 +45,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // #define MKN_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
 
-#define MKN_GPU_ASSERT(ans) \
-  { gpuAssert((ans), __FILE__, __LINE__); }
+#define MKN_GPU_ASSERT(ans)                 \
+  {                                         \
+    gpuAssert((ans), __FILE__, __LINE__);   \
+  }
 
 inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
   if (code != hipSuccess) {
     fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
@@ -203,8 +205,9 @@ void alloc_host(T*& p, Size size) {
 
 template <typename T, typename Size>
 void alloc_managed(T*& p, Size size) {
+  auto const bytes = size * sizeof(T);
   KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, size * sizeof(T)));
+  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes));
 }
 
 void inline destroy(void* p) {
diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp
index cfc188c..3cffaa0 100644
--- a/test/any/async_streaming.cpp
+++ b/test/any/async_streaming.cpp
@@ -151,8 +151,52 @@ std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) {
   return 0;
 }
 
+std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
+  using T = double;
+  KUL_DBG_FUNC_ENTER;
+
+  std::size_t constexpr group_size = 3;
+
+  std::vector<ManagedVector<T>> vecs(C + 1, ManagedVector<T>(NUM, 0));
+  for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
+
+  ManagedVector<T*> datas(C + 1);
+  for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
+  auto views = datas.data();
+
+  ThreadedStreamLauncher{vecs, nthreads}
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 1; })
+      .host([&](auto i) mutable {
+        std::this_thread::sleep_for(200ms);
+        for (auto& e : vecs[i]) e += 1;
+      })
+      .host_group_idx(group_size, 0,
+                      [&](auto const i) {
+                        for (auto& e : vecs[i]) e += 1;
+                      })
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })();
+
+  std::size_t val = 5;
+  for (std::size_t i = 0; i < vecs.size(); i++) {
+    if (i % group_size == 0) {
+      for (auto const& e : vecs[i])
+        if (e != val + 1) return 1;
+    } else {
+      for (auto const& e : vecs[i])
+        if (e != val) return 1;
+    }
+    ++val;
+  };
+
+  return 0;
+}
+
 int main() {
   KOUT(NON) << __FILE__;
-  return test() + test_threaded() + test_threaded(6) + test_threaded_group_barrier() +
-         test_threaded_host_group_mutex();
+  return test()                              //
+         + test_threaded()                   //
+         + test_threaded(6)                  //
+         + test_threaded_group_barrier()     //
+         + test_threaded_host_group_mutex()  //
+         + test_threaded_host_group_idx();
 }
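
Usage sketch (not part of the patch): a minimal example of how the new host_group_idx step
is meant to be chained, distilled from the test added above. ThreadedStreamLauncher,
ManagedVector, mkn::gpu::idx(), .dev() and the trailing operator() come from this patch and
its test; the include path, the example() wrapper, the namespace usage, the container sizes
and the thread count are illustrative assumptions only.

    #include "mkn/gpu/multi_launch.hpp"  // header extended by this patch (assumed include path)

    void example() {                       // hypothetical wrapper, for illustration
      using namespace mkn::gpu;            // assumed namespace, as implied by the test file
      using T = double;
      std::size_t constexpr group_size = 3;  // illustrative sizes
      std::size_t constexpr nthreads = 2;

      std::vector<ManagedVector<T>> vecs(6, ManagedVector<T>(1024, 0));
      ManagedVector<T*> datas(vecs.size());
      for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
      auto views = datas.data();

      ThreadedStreamLauncher{vecs, nthreads}
          // device step: runs for every data index i
          .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 1; })
          // host step: runs only for indices where i % group_size == group_idx (here 0)
          .host_group_idx(group_size, 0,
                          [&](auto const i) {
                            for (auto& e : vecs[i]) e += 1;
                          })();  // trailing operator() launches the chain and joins
    }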