diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp index bae675c..6a40948 100644 --- a/inc/mkn/gpu/cli.hpp +++ b/inc/mkn/gpu/cli.hpp @@ -32,33 +32,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_CLI_HPP_ #define _MKN_GPU_CLI_HPP_ -#include -#include - #include "mkn/kul/env.hpp" +#include "mkn/kul/string.hpp" namespace mkn::gpu { template struct Cli { - // + constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS"; auto bx_threads() const { - char const* ENV = "MKN_GPU_BX_THREADS"; - if (mkn::kul::env::EXISTS(ENV)) { - return as(mkn::kul::env::GET(ENV)); - } + if (kul::env::EXISTS(MKN_GPU_BX_THREADS)) + return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS)); return dev.maxThreadsPerBlock; } - template - auto static as(std::string const& from) { - T t; - std::stringstream ss(from); - ss >> t; - return t; - } - Device const& dev; }; diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 5936969..295bce7 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -108,12 +108,13 @@ struct StreamEvent { ~StreamEvent() {} auto& operator()() { return event; }; - void record() { ; } - bool finished() const { return true; } - void reset() {} + void record() { ++stage; } + bool finished() const { return stage == 2; } + void reset() { stage = 0; } Stream stream; std::size_t event = 0; + std::uint16_t stage = 0; }; template diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index 7addcca..9278c6d 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "mkn/gpu/def.hpp" -// #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess)) +// #define MKN_GPU_ASSERT(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } @@ -110,14 +110,7 @@ struct Stream { struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { - if (start) { - MKN_GPU_ASSERT(result = cudaEventDestroy(start)) - } - if (stop) { - MKN_GPU_ASSERT(result = cudaEventDestroy(stop)) - } - } + ~StreamEvent() { clear(); } StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { that.start = nullptr; @@ -128,7 +121,7 @@ struct StreamEvent { StreamEvent& operator=(StreamEvent const&) = delete; auto& operator()() { return stop; }; - void record() { + auto& record() { if (stage == 0) { MKN_GPU_ASSERT(result = cudaEventRecord(start, stream())); ++stage; @@ -136,12 +129,25 @@ struct StreamEvent { MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream())); ++stage; } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); } bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; } void reset() { - if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + clear(); MKN_GPU_ASSERT(result = cudaEventCreate(&start)); - if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); MKN_GPU_ASSERT(result = cudaEventCreate(&stop)); stage = 0; } @@ -356,6 +362,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + cudaMemGetInfo(&free_t, &total_t); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... %f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 57e7b55..c75b820 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -31,20 +31,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_MULTI_LAUNCH_HPP_ #define _MKN_GPU_MULTI_LAUNCH_HPP_ -#include +#include #include -#include -#include -#include #include -#include +#include +#include +#include +#include +#include #include #include "mkn/gpu.hpp" namespace mkn::gpu { -enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT }; +enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER }; enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; template @@ -129,11 +130,14 @@ struct StreamLauncher { } void operator()(std::uint32_t const i) { - auto const& step = data_step[i]; + auto const step = data_step[i]; + assert(step < fns.size()); - fns[step]->run(i); assert(i < events.size()); - if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); + + // if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) events[i].stream.sync(); + fns[step]->run(i); + if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record().wait(); } bool is_finished() const { @@ -145,7 +149,7 @@ struct StreamLauncher { bool is_finished(std::uint32_t idx) const { return data_step[idx] == fns.size(); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { auto const& step = data_step[i]; if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true; @@ -181,6 +185,24 @@ struct AsyncStreamHostFunction : StreamFunction { Fn fn; }; +template +struct StreamBarrierFunction : StreamFunction { + using Super = StreamFunction; + using Super::strat; + + StreamBarrierFunction(Strat& strat) + : Super{strat, StreamFunctionMode::BARRIER}, + sync_point{std::ssize(strat.datas), on_completion} {} + + void run(std::uint32_t const /*i*/) override { [[maybe_unused]] auto ret = sync_point.arrive(); } + + std::function on_completion = [&]() { + for (auto& stat : strat.status) stat = SFS::WAIT; + }; + + std::barrier sync_point; +}; + template struct ThreadedStreamLauncher : public StreamLauncher> { using This = ThreadedStreamLauncher; @@ -192,8 +214,9 @@ struct ThreadedStreamLauncher : public StreamLauncher>(*this)); + return *this; + } + void operator()() { join(); } Super& super() { return *this; } void super(std::size_t const& idx) { return super()(idx); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { if (fns[step[i]]->mode == StreamFunctionMode::HOST_WAIT) return status[i] == SFS::WAIT; return events[i].finished(); @@ -222,8 +250,9 @@ struct ThreadedStreamLauncher : public StreamLauncher= wait_max_ms ? wait_max_ms : waitms + 10; - if (check_finished()) done = 1; + continue; } + + std::this_thread::sleep_for(std::chrono::milliseconds(waitms)); + waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10; } } @@ -247,12 +275,12 @@ struct ThreadedStreamLauncher : public StreamLauncher get_work(std::size_t const& start = 0) { - std::unique_lock lk(work_); + std::scoped_lock lk(work_); for (std::size_t i = start; i < datas.size(); ++i) { if (status[i] == SFS::BUSY) { if (is_fn_finished(i)) status[i] = SFS::WAIT; - - } else if (status[i] == SFS::WAIT) { + } + if (status[i] == SFS::WAIT) { ++step[i]; if (Super::is_finished(i)) { @@ -268,7 +296,7 @@ struct ThreadedStreamLauncher : public StreamLauncher threads; std::mutex work_; diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 7059d5a..9917879 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -104,22 +104,56 @@ struct Stream { hipStream_t stream; }; +// + struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { /*MKN_GPU_ASSERT(result = hipEventDestroy(event));*/ + ~StreamEvent() { clear(); } + + StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { + that.start = nullptr; + that.stop = nullptr; } - auto& operator()() { return event; }; - void record() { MKN_GPU_ASSERT(result = hipEventRecord(event, stream())); } - bool finished() const { return hipEventQuery(event) == hipSuccess; } + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()() { return stop; }; + auto& record() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipEventRecord(start, stream())); + ++stage; + } else { + MKN_GPU_ASSERT(result = hipEventRecord(stop, stream())); + ++stage; + } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = hipEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = hipEventDestroy(stop)); + } + bool finished() const { return stage == 2 and hipEventQuery(stop) == hipSuccess; } void reset() { - if (event) MKN_GPU_ASSERT(result = hipEventDestroy(event)); - MKN_GPU_ASSERT(result = hipEventCreate(&event)); + clear(); + MKN_GPU_ASSERT(result = hipEventCreate(&start)); + MKN_GPU_ASSERT(result = hipEventCreate(&stop)); + stage = 0; } Stream& stream; hipError_t result; - hipEvent_t event = nullptr; + hipEvent_t start = nullptr, stop = nullptr; + std::uint16_t stage = 0; }; // https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 @@ -250,7 +284,7 @@ void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... arg if constexpr (_coop) { auto address_of = [](auto& a) { return (void*)&a; }; void* kernelArgs[] = {(address_of(params), ...)}; - hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s); + MKN_GPU_ASSERT(hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s)); } else { hipLaunchKernelGGL(f, g, b, ds, s, params...); } @@ -322,7 +356,7 @@ void fill(Container& c, T val) { // https://rocm-developer-tools.github.io/HIP/group__Device.html void inline prinfo(size_t dev = 0) { hipDeviceProp_t devProp; - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); KOUT(NON) << " System version " << devProp.major << "." << devProp.minor; KOUT(NON) << " agent name " << devProp.name; KOUT(NON) << " cores " << devProp.multiProcessorCount; @@ -333,6 +367,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + MKN_GPU_ASSERT(hipMemGetInfo(&free_t, &total_t)); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... %f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/mkn.yaml b/mkn.yaml index 2d7b461..0ad5f68 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -1,4 +1,4 @@ -#! clean build test run -Op rocm -x res/mkn/hipcc -W +#! clean build test run -tOqp rocm -x res/mkn/hipcc -W name: mkn.gpu parent: headers @@ -11,8 +11,8 @@ profile: - name: rocm parent: headers # arg: -DMKN_GPU_ROCM=1 - test: test/any/(\w).cpp - test/hip/(\w).cpp + # test: test/any/(\w).cpp + # test/hip/(\w).cpp - name: cuda parent: headers diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 7f0aedd..5654326 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -12,7 +12,7 @@ # property: - cxx_flags: --std=c++17 -fPIC -fsized-deallocation -Wno-unknown-cuda-version + cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version cxx_cuda: -x cuda --cuda-gpu-arch="sm_61" -Xclang -fcuda-allow-variadic-functions # local: diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index 3cb9a0a..e123216 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -18,6 +18,6 @@ env: file: - type: cpp:cxx:cc archiver: ar -cr - compiler: hipcc -std=c++17 -fPIC + compiler: hipcc -std=c++20 -fPIC linker: hipcc diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index b79a700..fb0b825 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -39,10 +39,7 @@ std::uint32_t test() { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -54,7 +51,7 @@ std::uint32_t test() { return 0; } -std::uint32_t test_threaded() { +std::uint32_t test_threaded(std::size_t const& nthreads = 2) { using namespace mkn::gpu; using T = double; @@ -69,16 +66,14 @@ std::uint32_t test_threaded() { using namespace std::chrono_literals; - ThreadedStreamLauncher{vecs, 6} + ThreadedStreamLauncher{vecs, nthreads} .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) .host([&](auto i) mutable { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .barrier() + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -92,5 +87,5 @@ std::uint32_t test_threaded() { int main() { KOUT(NON) << __FILE__; - return test() + test_threaded(); + return test() + test_threaded() + test_threaded(6); }