From cb24f6da0669856f31d9670ab5cf327213e6757f Mon Sep 17 00:00:00 2001
From: PhilipDeegan
Date: Sun, 20 Oct 2024 21:25:20 +0200
Subject: [PATCH] host index function

---
 inc/mkn/gpu/multi_launch.hpp | 30 +++++++++++++++++++++-
 inc/mkn/gpu/rocm.hpp         |  9 ++++---
 test/any/async_streaming.cpp | 48 ++++++++++++++++++++++++++++++++++--
 3 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp
index 933dc04..d6b47ad 100644
--- a/inc/mkn/gpu/multi_launch.hpp
+++ b/inc/mkn/gpu/multi_launch.hpp
@@ -252,7 +252,7 @@ struct StreamGroupBarrierFunction : StreamGroupFunction {
     v.reserve(groups);
     for (std::size_t i = 0; i < groups; ++i) v.emplace_back(std::make_unique(self, i));
-    return std::move(v);
+    return v;
   }
 
   StreamGroupBarrierFunction(std::size_t const& gs, Strat& strat)
@@ -298,6 +298,27 @@ struct StreamHostGroupMutexFunction : StreamGroupFunction {
   std::vector mutices;
 };
 
+template <typename Strat, typename Fn>
+struct StreamHostGroupIndexFunction : StreamGroupFunction<Strat> {
+  using Super = StreamGroupFunction<Strat>;
+  using Super::strat;
+
+  std::string_view constexpr static MOD_GROUP_ERROR =
+      "mkn.gpu error: StreamHostGroupIndexFunction Group size must be a divisor of datas";
+
+  StreamHostGroupIndexFunction(std::size_t const& gs, std::size_t const& gid_, Strat& strat,
+                               Fn&& fn_)
+      : Super{gs, strat, StreamFunctionMode::HOST_WAIT}, fn{fn_}, gid{gid_} {}
+
+  void run(std::uint32_t const i) override {
+    if (i % Super::group_size == gid) fn(i);
+    strat.status[i] = SFS::WAIT;  // done
+  }
+
+  Fn fn;
+  std::size_t const gid;
+};
+
 template 
 struct ThreadedStreamLauncher : public StreamLauncher> {
   using This = ThreadedStreamLauncher;
@@ -343,6 +364,13 @@ struct ThreadedStreamLauncher : public StreamLauncher
+
+  template <typename Fn>
+  This& host_group_idx(std::size_t const& group_size, std::size_t const& group_idx, Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamHostGroupIndexFunction<This, Fn>>(
+        group_size, group_idx, *this, std::forward<Fn>(fn)));
+    return *this;
+  }
+
   void operator()() { join(); }
   Super& super() { return *this; }
   void super(std::size_t const& idx) { return super()(idx); }
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index 9917879..fb4cc3a 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -45,8 +45,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // #define MKN_GPU_ASSERT(x) (KASSERT((x) == hipSuccess))
 
-#define MKN_GPU_ASSERT(ans) \
-  { gpuAssert((ans), __FILE__, __LINE__); }
+#define MKN_GPU_ASSERT(ans)                 \
+  {                                         \
+    gpuAssert((ans), __FILE__, __LINE__);   \
+  }
 
 inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
   if (code != hipSuccess) {
     fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
@@ -203,8 +205,9 @@ void alloc_host(T*& p, Size size) {
 
 template <typename T, typename Size>
 void alloc_managed(T*& p, Size size) {
+  auto const bytes = size * sizeof(T);
   KLOG(TRC) << "GPU alloced: " << size * sizeof(T);
-  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, size * sizeof(T)));
+  MKN_GPU_ASSERT(hipMallocManaged((void**)&p, bytes));
 }
 
 void inline destroy(void* p) {
diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp
index cfc188c..3cffaa0 100644
--- a/test/any/async_streaming.cpp
+++ b/test/any/async_streaming.cpp
@@ -151,8 +151,52 @@ std::uint32_t test_threaded_host_group_mutex(std::size_t const& nthreads = 2) {
   return 0;
 }
 
+std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
+  using T = double;
+  KUL_DBG_FUNC_ENTER;
+
+  std::size_t constexpr group_size = 3;
+
+  std::vector<ManagedVector<T>> vecs(C + 1, ManagedVector<T>(NUM, 0));
+  for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
+
+  ManagedVector<T*> datas(C + 1);
+  for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
+  auto views = datas.data();
+
+  ThreadedStreamLauncher{vecs, nthreads}
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 1; })
+      .host([&](auto i) mutable {
+        std::this_thread::sleep_for(200ms);
+        for (auto& e : vecs[i]) e += 1;
+      })
+      .host_group_idx(group_size, 0,
+                      [&](auto const i) {
+                        for (auto& e : vecs[i]) e += 1;
+                      })
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })();
+
+  std::size_t val = 5;
+  for (std::size_t i = 0; i < vecs.size(); i++) {
+    if (i % group_size == 0) {
+      for (auto const& e : vecs[i])
+        if (e != val + 1) return 1;
+    } else {
+      for (auto const& e : vecs[i])
+        if (e != val) return 1;
+    }
+    ++val;
+  };
+
+  return 0;
+}
+
 int main() {
   KOUT(NON) << __FILE__;
-  return test() + test_threaded() + test_threaded(6) + test_threaded_group_barrier() +
-         test_threaded_host_group_mutex();
+  return test()                              //
+         + test_threaded()                   //
+         + test_threaded(6)                  //
+         + test_threaded_group_barrier()     //
+         + test_threaded_host_group_mutex()  //
+         + test_threaded_host_group_idx();
 }
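
Usage sketch (not part of the patch): a minimal example of how the new host_group_idx step
is meant to be chained, distilled from the test added above. ThreadedStreamLauncher,
ManagedVector, mkn::gpu::idx(), .dev() and the trailing operator() come from this patch and
its test; the include path, the example() wrapper, the namespace usage, the container sizes
and the thread count are illustrative assumptions only.

    #include "mkn/gpu/multi_launch.hpp"  // header extended by this patch (assumed include path)

    void example() {                       // hypothetical wrapper, for illustration
      using namespace mkn::gpu;            // assumed namespace, as implied by the test file
      using T = double;
      std::size_t constexpr group_size = 3;  // illustrative sizes
      std::size_t constexpr nthreads = 2;

      std::vector<ManagedVector<T>> vecs(6, ManagedVector<T>(1024, 0));
      ManagedVector<T*> datas(vecs.size());
      for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
      auto views = datas.data();

      ThreadedStreamLauncher{vecs, nthreads}
          // device step: runs for every data index i
          .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 1; })
          // host step: runs only for indices where i % group_size == group_idx (here 0)
          .host_group_idx(group_size, 0,
                          [&](auto const i) {
                            for (auto& e : vecs[i]) e += 1;
                          })();  // trailing operator() launches the chain and joins
    }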