From f3d35665cb255e6fea60f6fa7b89def871d0133e Mon Sep 17 00:00:00 2001
From: dekken
Date: Fri, 12 Jul 2024 22:28:32 +0200
Subject: [PATCH 1/7] betterment

---
 .github/workflows/build.yml  |   4 +-
 .sublime-project             |   2 +-
 LICENSE.md                   |   2 +-
 inc/mkn/gpu.hpp              |  22 ++---
 inc/mkn/gpu/alloc.hpp        |  97 ++++++++++++++++++---
 inc/mkn/gpu/asio.hpp         |   2 +-
 inc/mkn/gpu/cli.hpp          |  67 +++++++++++++++
 inc/mkn/gpu/cpu.hpp          |  39 ++++++++-
 inc/mkn/gpu/cuda.hpp         | 127 ++++++++++++++++++++-------
 inc/mkn/gpu/cuda/def.hpp     |  24 ------
 inc/mkn/gpu/def.hpp          |  12 ---
 inc/mkn/gpu/defines.hpp      |  53 ++++++++++++
 inc/mkn/gpu/device.hpp       |   5 +-
 inc/mkn/gpu/launchers.hpp    |  18 ++--
 inc/mkn/gpu/multi_launch.hpp | 160 +++++++++++++++++++++++++++++++++++
 inc/mkn/gpu/rocm.hpp         | 122 ++++++++++++++++++++++----
 inc/mkn/gpu/rocm/def.hpp     |  24 ------
 inc/mkn/gpu/tuple.hpp        |   2 +-
 mkn.yaml                     |   6 +-
 res/mkn/clang_cuda.yaml      |   5 ++
 res/mkn/hipcc.yaml           |   6 +-
 test/any/add.cpp             |  10 +++
 test/any/async_streaming.cpp |  45 ++++++++++
 test/any/construct.cpp       |  59 +++++++++++++
 test/any/coop.cpp            |  41 +++++++++
 test/any/managed.cpp         |   4 +
 26 files changed, 804 insertions(+), 154 deletions(-)
 create mode 100644 inc/mkn/gpu/cli.hpp
 delete mode 100644 inc/mkn/gpu/cuda/def.hpp
 create mode 100644 inc/mkn/gpu/defines.hpp
 create mode 100644 inc/mkn/gpu/multi_launch.hpp
 delete mode 100644 inc/mkn/gpu/rocm/def.hpp
 create mode 100644 test/any/async_streaming.cpp
 create mode 100644 test/any/construct.cpp
 create mode 100644 test/any/coop.cpp

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c5912d3..810b748 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: test
       run: |
         curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix
         chmod +x mkn
-        KLOG=3 ./mkn clean build run -dtKOp cpu -a "-std=c++17" -g 0 test -W 9
+        KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++17" test -W 9

diff --git a/.sublime-project b/.sublime-project
index 2fda01d..45091a2 100644
--- a/.sublime-project
+++ b/.sublime-project
@@ -9,7 +9,7 @@
     {
       "ClangFormat": {
-        "binary": "clang-format-15",
+        "binary": "clang-format",
         "format_on_save": true,
         "style": "file"
       },

diff --git a/LICENSE.md b/LICENSE.md
index baaa780..97bfc8f 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,4 +1,4 @@
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu.hpp b/inc/mkn/gpu.hpp
index 148c7f6..1f6c813 100644
--- a/inc/mkn/gpu.hpp
+++ b/inc/mkn/gpu.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,27 +31,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_HPP_
 #define _MKN_GPU_HPP_
 
-#if defined(MKN_GPU_ROCM)
-#include "mkn/gpu/rocm.hpp"
-#elif defined(MKN_GPU_CUDA)
-#include "mkn/gpu/cuda.hpp"
-#elif defined(MKN_GPU_CPU)
-#include "mkn/gpu/cpu.hpp"
-#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
-#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
-#endif
+#include "mkn/gpu/defines.hpp"
 
 namespace mkn::gpu {
 
 __device__ uint32_t idx() {
-#if defined(MKN_GPU_ROCM)
+#if MKN_GPU_ROCM
   return mkn::gpu::hip::idx();
-#elif defined(MKN_GPU_CUDA)
+
+#elif MKN_GPU_CUDA
   return mkn::gpu::cuda::idx();
-#elif defined(MKN_GPU_CPU)
+
+#elif MKN_GPU_CPU
   return mkn::gpu::cpu::idx();
+
 #else
 #error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
+
 #endif
 }

diff --git a/inc/mkn/gpu/alloc.hpp b/inc/mkn/gpu/alloc.hpp
index b442240..da2441c 100644
--- a/inc/mkn/gpu/alloc.hpp
+++ b/inc/mkn/gpu/alloc.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,9 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_ALLOC_HPP_
 #define _MKN_GPU_ALLOC_HPP_
 
-template <typename T>
-class ManagedAllocator {
-  using This = ManagedAllocator<T>;
+template <typename T>
+class MknGPUAllocator {
+  using This = MknGPUAllocator<T>;
 
  public:
   using pointer = T*;
@@ -44,7 +44,7 @@ class ManagedAllocator {
   template <typename U>
   struct rebind {
-    using other = ManagedAllocator<U>;
+    using other = MknGPUAllocator<U>;
   };
 
   T* allocate(std::size_t const n) const {
@@ -70,20 +70,91 @@ class ManagedAllocator {
   }
 };
 
+template <typename T>
+class NoConstructAllocator : public MknGPUAllocator<T> {
+ public:
+  template <typename U>
+  struct rebind {
+    using other = NoConstructAllocator<U>;
+  };
+
+  template <typename U, typename... Args>
+  void construct(U* /*ptr*/, Args&&... /*args*/) {}  // nothing
+  template <typename U>
+  void construct(U* /*ptr*/) noexcept(std::is_nothrow_default_constructible<U>::value) {}
+};
+
+template <typename T>
+std::vector<T, MknGPUAllocator<T>>& as_super(
+    std::vector<T, NoConstructAllocator<T>>& v) {
+  return *reinterpret_cast<std::vector<T, MknGPUAllocator<T>>*>(&v);
+}
+
+template <typename T>
+class ManagedAllocator : public MknGPUAllocator<T> {
+ public:
+  template <typename U>
+  struct rebind {
+    using other = ManagedAllocator<U>;
+  };
+};
+
+template <typename T>
+std::vector<T, MknGPUAllocator<T>>& as_super(std::vector<T, ManagedAllocator<T>>& v) {
+  return *reinterpret_cast<std::vector<T, MknGPUAllocator<T>>*>(&v);
+}
+
 template <typename T, typename Size>
-void copy(T* const dst, T const* const src, Size size) {
-  auto dst_p = Pointer{dst};
-  auto src_p = Pointer{src};
+void copy(T* dst, T* src, Size size) {
+  assert(dst and src);
 
-  bool to_send = dst_p.is_device_ptr() && src_p.is_host_ptr();
-  bool to_take = dst_p.is_host_ptr() && src_p.is_device_ptr();
+  Pointer src_p{src};
+  Pointer dst_p{dst};
 
-  if (to_send)
+  auto to_send = [&]() { return dst_p.is_device_ptr() && src_p.is_host_ptr(); };
+  auto to_take = [&]() { return dst_p.is_host_ptr() && src_p.is_device_ptr(); };
+  auto on_host = [&]() { return dst_p.is_host_ptr() && src_p.is_host_ptr(); };
+  auto on_device = [&]() { return dst_p.is_device_ptr() && src_p.is_device_ptr(); };
+
+  if (on_host())
+    std::copy(src, src + size, dst);
+  else if (on_device())
+    copy_on_device(dst, src, size);
+  else if (to_send())
     send(dst, src, size);
-  else if (to_take)
-    take(dst, src, size);
+  else if (to_take())
+    take(src, dst, size);
   else
     throw std::runtime_error("Unsupported operation (PR welcome)");
 }
 
+template <typename T>
+auto& reserve(std::vector<T, NoConstructAllocator<T>>& v, std::size_t const& s,
+              bool mem_copy = true) {
+  if (s <= v.capacity()) {
+    v.reserve(s);
+    return v;
+  }
+  std::vector<T, NoConstructAllocator<T>> cpy{NoConstructAllocator<T>{}};
+  cpy.reserve(s);
+  cpy.resize(v.size());
+  if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
+  v = std::move(cpy);
+  return v;
+}
+
+template <typename T>
+auto& resize(std::vector<T, NoConstructAllocator<T>>& v, std::size_t const& s,
+             bool mem_copy = true) {
+  if (s <= v.capacity()) {
+    v.resize(s);
+    return v;
+  }
+  std::vector<T, NoConstructAllocator<T>> cpy{NoConstructAllocator<T>{}};
+  cpy.resize(s);
+  if (mem_copy and v.size()) copy(cpy.data(), v.data(), v.size());
+  v = std::move(cpy);
+  return v;
+}
+
 #endif /* _MKN_GPU_ALLOC_HPP_ */
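(For orientation: the point of NoConstructAllocator plus the free reserve()/resize() helpers above is to grow GPU-visible vectors without default-constructing every element on the host, while as_super() reinterprets the vector through the base allocator, which does construct. A minimal usage sketch — the element type and sizes are illustrative, and the template spellings follow the reconstruction above rather than a verified upstream source:

    std::vector<double, NoConstructAllocator<double>> v;
    resize(v, 1 << 20);             // grows storage; elements are left unconstructed
    reserve(v, 1 << 21);            // reallocates, then copy()s the old contents across
    as_super(v).emplace_back(0.0);  // base-allocator view: this element IS constructed

test/any/construct.cpp later in this patch asserts exactly this constructed/unconstructed split.)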
diff --git a/inc/mkn/gpu/asio.hpp b/inc/mkn/gpu/asio.hpp
index e9be973..c2795da 100644
--- a/inc/mkn/gpu/asio.hpp
+++ b/inc/mkn/gpu/asio.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp
new file mode 100644
index 0000000..bae675c
--- /dev/null
+++ b/inc/mkn/gpu/cli.hpp
@@ -0,0 +1,67 @@
+/**
+Copyright (c) 2024, Philip Deegan.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Philip Deegan nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +// IWYU pragma: private, include "mkn/gpu.hpp" +#ifndef _MKN_GPU_CLI_HPP_ +#define _MKN_GPU_CLI_HPP_ + +#include +#include + +#include "mkn/kul/env.hpp" + +namespace mkn::gpu { + +template +struct Cli { + // + + auto bx_threads() const { + char const* ENV = "MKN_GPU_BX_THREADS"; + if (mkn::kul::env::EXISTS(ENV)) { + return as(mkn::kul::env::GET(ENV)); + } + return dev.maxThreadsPerBlock; + } + + template + auto static as(std::string const& from) { + T t; + std::stringstream ss(from); + ss >> t; + return t; + } + + Device const& dev; +}; + +} /* namespace mkn::gpu */ + +#endif /*_MKN_GPU_CLI_HPP_*/ diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 8b34013..36de6f8 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "mkn/kul/assert.hpp" #include "mkn/kul/threads.hpp" +#include "mkn/gpu/cli.hpp" #include "mkn/gpu/def.hpp" #include @@ -82,6 +83,15 @@ struct dim3 { std::size_t x = 1, y = 1, z = 1; }; +void setLimitMallocHeapSize(std::size_t const& /*bytes*/) { + // noop +} + +auto supportsCooperativeLaunch(int const /*dev*/ = 0) { + int supportsCoopLaunch = 0; + return supportsCoopLaunch; +} + struct Stream { Stream() {} ~Stream() {} @@ -93,6 +103,19 @@ struct Stream { std::size_t stream = 0; }; +struct StreamEvent { + StreamEvent(Stream&) {} + ~StreamEvent() {} + + auto& operator()() { return event; }; + void record() { ; } + bool finished() const { return true; } + void reset() {} + + Stream stream; + std::size_t event = 0; +}; + template struct Pointer { Pointer(T* _t) : t{_t} {} @@ -129,7 +152,7 @@ void alloc_managed(T*& p, Size size) { MKN_GPU_ASSERT(p = reinterpret_cast(std::malloc(size * sizeof(T)))); } -void destroy(void* p) { +void inline destroy(void* p) { KLOG(TRC); std::free(p); } @@ -146,6 +169,12 @@ void destroy_host(T*& p) { std::free(p); } +template +void copy_on_device(T* dst, T const* src, Size size = 1) { + KLOG(TRC); + MKN_GPU_ASSERT(std::memcpy(dst, src, size * sizeof(T))); +} + template void send(void* p, void* t, Size size = 1) { KLOG(TRC); @@ -177,7 +206,7 @@ void take_async(T* p, Span& span, Stream& /*stream*/, std::size_t start) { take(p, span.data(), span.size(), start); } -void sync() {} +void inline sync() {} #include "mkn/gpu/alloc.hpp" #include "mkn/gpu/device.hpp" @@ -186,7 +215,7 @@ namespace detail { static thread_local std::size_t idx = 0; } -template +template void launch(F f, dim3 g, dim3 b, std::size_t /*ds*/, std::size_t /*stream*/, Args&&... 
args) { std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z); KLOG(TRC) << N; @@ -252,6 +281,8 @@ static void global_gd_kernel(F& f, std::size_t s, Args... args) { #include "launchers.hpp" +void grid_sync() {} + } /* namespace MKN_GPU_NS */ #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index edd7794..dd96528 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -34,13 +34,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - #include "mkn/kul/log.hpp" #include "mkn/kul/span.hpp" #include "mkn/kul/tuple.hpp" #include "mkn/kul/assert.hpp" +#include "mkn/gpu/cli.hpp" +#include +#include #include "mkn/gpu/def.hpp" // #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess)) @@ -50,10 +51,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); + if (abort) std::abort(); } } +namespace mkn::gpu::cuda { + +template +__device__ SIZE idx() { + SIZE width = gridDim.x * blockDim.x; + SIZE height = gridDim.y * blockDim.y; + SIZE x = blockDim.x * blockIdx.x + threadIdx.x; + SIZE y = blockDim.y * blockIdx.y + threadIdx.y; + SIZE z = blockDim.z * blockIdx.z + threadIdx.z; + return x + (y * width) + (z * width * height); +} + +template +__device__ SIZE block_idx_x() { + return blockIdx.x; +} + +} // namespace mkn::gpu::cuda + +// + #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS #define MKN_GPU_NS mkn::gpu::cuda #else @@ -62,6 +84,16 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = namespace MKN_GPU_NS { +void setLimitMallocHeapSize(std::size_t const& bytes) { + MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); +} + +auto supportsCooperativeLaunch(int const dev = 0) { + int supportsCoopLaunch = 0; + MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev)); + return supportsCoopLaunch; +} + struct Stream { Stream() { MKN_GPU_ASSERT(result = cudaStreamCreate(&stream)); } ~Stream() { MKN_GPU_ASSERT(result = cudaStreamDestroy(stream)); } @@ -74,6 +106,23 @@ struct Stream { cudaStream_t stream; }; +struct StreamEvent { + StreamEvent(Stream& stream_) : stream{stream_} { reset(); } + ~StreamEvent() { /*MKN_GPU_ASSERT(result = cudaEventDestroy(event));*/ } + + auto& operator()() { return event; }; + void record() { MKN_GPU_ASSERT(result = cudaEventRecord(event, stream())); } + bool finished() const { return cudaEventQuery(event) == cudaSuccess; } + void reset() { + if (event) MKN_GPU_ASSERT(result = cudaEventDestroy(event)); + MKN_GPU_ASSERT(result = cudaEventCreate(&event)); + } + + Stream& stream; + cudaError_t result; + cudaEvent_t event = nullptr; +}; + template struct Pointer { Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(cudaPointerGetAttributes(&attributes, t)); } @@ -112,7 +161,7 @@ void alloc_managed(T*& p, Size size) { MKN_GPU_ASSERT(cudaMallocManaged((void**)&p, size * sizeof(T))); } -void destroy(void* p) { +void inline destroy(void* p) { KLOG(TRC); MKN_GPU_ASSERT(cudaFree(p)); } @@ -129,6 +178,12 @@ void destroy_host(T*& ptr) { MKN_GPU_ASSERT(cudaFreeHost(ptr)); } +template +void copy_on_device(T* dst, T const* src, 
Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice));
+}
+
 template <typename Size>
 void send(void* p, void* t, Size size = 1) {
   KLOG(TRC);
@@ -147,23 +202,6 @@ void take(T const* p, T* t, Size size = 1, Size start = 0) {
   MKN_GPU_ASSERT(cudaMemcpy(t, p + start, size * sizeof(T), cudaMemcpyDeviceToHost));
 }
 
-template <typename T, typename Size>
-void copy(T* dst, T const* src, Size size = 1, Size start = 0) {
-  KLOG(TRC);
-  Pointer p{dst};
-  if (p.is_host_ptr())
-    take(src, dst, size, start);
-  else
-    send(dst, src, size, start);
-}
-
-template <typename T>
-void copy(std::vector<T>& dst, std::vector<T> const& src) {
-  KLOG(TRC);
-  assert(dst.size() >= src.size());
-  copy(dst.data(), src.data(), dst.size());
-}
-
 template <typename T, typename Size>
 void send_async(T* p, T const* t, Stream& stream, Size size = 1, Size start = 0) {
   KLOG(TRC);
@@ -185,19 +223,34 @@ void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
   stream()));
 }
 
-void sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
+void inline sync() { MKN_GPU_ASSERT(cudaDeviceSynchronize()); }
+void inline sync(cudaStream_t stream) { MKN_GPU_ASSERT(cudaStreamSynchronize(stream)); }
 
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <typename F, typename... Args>
+template <bool _sync = true, bool _coop = false, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, cudaStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
-      [&](auto&&... params) { f<<<g, b, ds, s>>>(params...); },
+      [&](auto&&... params) {
+        if constexpr (_coop) {
+          auto address_of = [](auto& a) { return (void*)&a; };
+          void* kernelArgs[] = {address_of(params)...};
+          cudaLaunchCooperativeKernel((void*)f, g, b, kernelArgs, ds, s);
+        } else {
+          f<<<g, b, ds, s>>>(params...);
+        }
+        MKN_GPU_ASSERT(cudaGetLastError());
+      },
       devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  sync();
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
 }
 
 //
@@ -219,16 +272,23 @@ struct Launcher {
 };
 
 struct GLauncher : public Launcher {
-  GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
-    [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
+  GLauncher(std::size_t const& s, std::size_t const& _dev = 0)
+      : Launcher{dim3{}, dim3{}}, dev{_dev}, count{s} {
+    MKN_GPU_ASSERT(cudaGetDeviceProperties(&devProp, dev));
 
-    b.x = devProp.maxThreadsPerBlock;
+    resize(s);
+  }
+
+  void resize(std::size_t const& s, std::size_t const& bx = 0) {
+    b.x = bx > 0 ? bx : cli.bx_threads();
     g.x = s / b.x;
     if ((s % b.x) > 0) ++g.x;
   }
 
+  std::size_t dev = 0;
   std::size_t count = 0;
   cudaDeviceProp devProp;
+  mkn::gpu::Cli<cudaDeviceProp> cli{devProp};
 };
 
 template <typename Container, typename T>
 void fill(Container& c, T val) {
 }
 
 //
-void prinfo(size_t dev = 0) {
+void inline prinfo(size_t dev = 0) {
   cudaDeviceProp devProp;
   [[maybe_unused]] auto ret = cudaGetDeviceProperties(&devProp, dev);
   KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
   KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;
 }
 
+__device__ void grid_sync() {
+  namespace cg = cooperative_groups;
+  cg::grid_group grid = cg::this_grid();
+  assert(grid.is_valid());
+  grid.sync();
+}
+
 }  // namespace MKN_GPU_NS
 
 #undef MKN_GPU_ASSERT
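(The _coop branch above is what makes grid_sync() legal: a kernel may only call the cooperative-groups grid barrier when launched through cudaLaunchCooperativeKernel, and only on devices where supportsCooperativeLaunch() reports support. A sketch of the intended call shape, mirroring test/any/coop.cpp from later in this patch — the <true> template argument spelling is an assumption from the reconstructed GDLauncher declaration:

    ManagedVector<S> mem{NUM};
    auto* view = mem.data();
    if (mkn::gpu::supportsCooperativeLaunch()) {
      mkn::gpu::GDLauncher<true> launcher{NUM};  // assumed: _coop = true
      launcher([=] __device__() {
        auto i = mkn::gpu::idx();
        mkn::gpu::grid_sync();  // whole-grid barrier; cooperative launch only
        view[i].f0 = view[i].d0 + 1;
      });
    }

Note the test itself leaves the invocation commented out in main(), pending hardware that supports cooperative launch.)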
diff --git a/inc/mkn/gpu/cuda/def.hpp b/inc/mkn/gpu/cuda/def.hpp
deleted file mode 100644
index 37abb46..0000000
--- a/inc/mkn/gpu/cuda/def.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-
-// IWYU pragma: private, include "mkn/gpu/def.hpp"
-
-#ifndef _MKN_GPU_CUDA_DEF_HPP_
-#define _MKN_GPU_CUDA_DEF_HPP_
-
-#include <cuda_runtime.h>
-
-namespace mkn::gpu::cuda {
-
-template <typename SIZE = uint32_t>
-__device__ SIZE idx() {
-  SIZE width = gridDim.x * blockDim.x;
-  SIZE height = gridDim.y * blockDim.y;
-
-  SIZE x = blockDim.x * blockIdx.x + threadIdx.x;
-  SIZE y = blockDim.y * blockIdx.y + threadIdx.y;
-  SIZE z = blockDim.z * blockIdx.z + threadIdx.z;
-  return x + (y * width) + (z * width * height);
-}
-
-}  // namespace mkn::gpu::cuda
-
-#endif /*_MKN_GPU_CUDA_DEF_HPP_*/

diff --git a/inc/mkn/gpu/def.hpp b/inc/mkn/gpu/def.hpp
index da3a26c..d25689a 100644
--- a/inc/mkn/gpu/def.hpp
+++ b/inc/mkn/gpu/def.hpp
@@ -1,20 +1,8 @@
-
-
 #ifndef _MKN_GPU_DEF_HPP_
 #define _MKN_GPU_DEF_HPP_
 
 #include
 
-#if defined(MKN_GPU_ROCM)
-#include "mkn/gpu/rocm/def.hpp"
-#elif defined(MKN_GPU_CUDA)
-#include "mkn/gpu/cuda/def.hpp"
-#elif defined(MKN_GPU_CPU)
-
-#elif !defined(MKN_GPU_FN_PER_NS) || MKN_GPU_FN_PER_NS == 0
-#error "UNKNOWN GPU / define MKN_GPU_ROCM or MKN_GPU_CUDA"
-#endif
-
 namespace mkn::gpu {
 
 #if defined(MKN_GPU_CPU)

diff --git a/inc/mkn/gpu/defines.hpp b/inc/mkn/gpu/defines.hpp
new file mode 100644
index 0000000..6f98830
--- /dev/null
+++ b/inc/mkn/gpu/defines.hpp
@@ -0,0 +1,53 @@
+
+
+#ifndef _MKN_GPU_DEFINES_HPP_
+#define _MKN_GPU_DEFINES_HPP_
+
+#include
+
+#if !defined(MKN_GPU_FN_PER_NS)
+#define MKN_GPU_FN_PER_NS 0
+#endif
+
+#if !defined(MKN_GPU_ROCM) and __has_include("hip/hip_runtime.h")
+#define MKN_GPU_ROCM 1
+#endif
+#if !defined(MKN_GPU_ROCM)
+#define MKN_GPU_ROCM 0
+#endif
+
+#if !defined(MKN_GPU_CUDA) and __has_include(<cuda_runtime.h>)
+#define MKN_GPU_CUDA 1
+#endif
+#if !defined(MKN_GPU_CUDA)
+#define MKN_GPU_CUDA 0
+#endif
+
+#if MKN_GPU_CUDA == 1 && MKN_GPU_ROCM == 1 && MKN_GPU_FN_PER_NS == 0
+#define MKN_GPU_FN_PER_NS 1
+#endif
+
+#if MKN_GPU_ROCM == 1
+#include "mkn/gpu/rocm.hpp"
+#endif
+
+#if MKN_GPU_CUDA
+#include "mkn/gpu/cuda.hpp"
+#endif
+
+#if MKN_GPU_FN_PER_NS == 1 || MKN_GPU_CPU == 1
+#include "mkn/gpu/cpu.hpp"
+#endif
+
+namespace mkn::gpu {
+
+struct CompileFlags {
+  bool constexpr static withCUDA = MKN_GPU_CUDA;
+  bool constexpr static withROCM = MKN_GPU_ROCM;
+  bool constexpr static withCPU = !MKN_GPU_ROCM and !MKN_GPU_CUDA;
+  bool constexpr static perNamespace = MKN_GPU_FN_PER_NS;
+};
+
+} /* namespace mkn::gpu */
+
+#endif /*_MKN_GPU_DEFINES_HPP_*/

diff --git a/inc/mkn/gpu/device.hpp b/inc/mkn/gpu/device.hpp
index 7580180..04cc9e8 100644
--- a/inc/mkn/gpu/device.hpp
+++ b/inc/mkn/gpu/device.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
Redistribution and use in source and binary forms, with or without @@ -100,6 +100,9 @@ struct DeviceMem { auto& size() const { return s; } + auto* data() { return p; } + auto* data() const { return p; } + std::size_t s = 0; T* p = nullptr; bool owned = false; diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index adbd213..67bd510 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2020, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,12 +31,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_LAUNCHERS_HPP_ #define _MKN_GPU_LAUNCHERS_HPP_ +template struct GDLauncher : public GLauncher { GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {} template auto operator()(F&& f, Args&&... args) { - _launch(std::forward(f), + _launch(s, std::forward(f), + as_values(std::forward_as_tuple(args...), std::make_index_sequence()), + count, args...); + } + + template + auto stream(Stream& s, F&& f, Args&&... args) { + _launch(s.stream, std::forward(f), as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } @@ -48,9 +56,9 @@ struct GDLauncher : public GLauncher { return T{nullptr}; } - template - void _launch(F&& f, std::tuple*, Args&&... args) { - MKN_GPU_NS::launch(&global_gd_kernel, g, b, ds, s, f, args...); + template + void _launch(S& _s, F&& f, std::tuple*, Args&&... args) { + MKN_GPU_NS::launch<_sync, _coop>(&global_gd_kernel, g, b, ds, _s, f, args...); } }; diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp new file mode 100644 index 0000000..f50fac4 --- /dev/null +++ b/inc/mkn/gpu/multi_launch.hpp @@ -0,0 +1,160 @@ +/** +Copyright (c) 2024, Philip Deegan. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Philip Deegan nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#ifndef _MKN_GPU_MULTI_LAUNCH_HPP_
+#define _MKN_GPU_MULTI_LAUNCH_HPP_
+
+#include
+#include
+#include
+#include
+#include
+
+#include "mkn/gpu.hpp"
+
+namespace mkn::gpu {
+
+enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT };
+
+template <typename Strat>
+struct StreamFunction {
+  StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {}
+  virtual ~StreamFunction() {}
+  virtual void run(std::uint32_t const) {};
+
+  Strat& strat;
+  StreamFunctionMode mode;
+};
+
+template <typename Strat, typename Fn>
+struct StreamDeviceFunction : StreamFunction<Strat> {
+  using Super = StreamFunction<Strat>;
+  using Super::strat;
+
+  StreamDeviceFunction(Strat& strat, Fn&& fn_)
+      : Super{strat, StreamFunctionMode::DEVICE_WAIT}, fn{fn_} {}
+  void run(std::uint32_t const i) override {
+    mkn::gpu::GDLauncher{strat.datas[i].size()}.stream(
+        strat.streams[i], [=, fn = fn] __device__() mutable { fn(i); });
+  }
+
+  Fn fn;
+};
+
+template <typename Strat, typename Fn>
+struct StreamHostFunction : StreamFunction<Strat> {
+  using Super = StreamFunction<Strat>;
+  StreamHostFunction(Strat& strat, Fn&& fn_)
+      : Super{strat, StreamFunctionMode::HOST_WAIT}, fn{fn_} {}
+  void run(std::uint32_t const i) override { fn(i); }
+  Fn fn;
+};
+
+template <typename Datas>
+struct StreamLauncher {
+  using This = StreamLauncher<Datas>;
+  using T = typename Datas::value_type::value_type;
+
+  StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) {
+    for (auto& s : streams) events.emplace_back(s);
+  }
+
+  ~StreamLauncher() { sync(); }
+
+  void sync() noexcept {
+    for (auto& s : streams) s.sync();
+  }
+
+  template <typename Fn>
+  auto& dev(Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamDeviceFunction<This, Fn>>(self, std::forward<Fn>(fn)));
+    return self;
+  }
+  template <typename Fn>
+  auto& host(Fn&& fn) {
+    fns.emplace_back(std::make_shared<StreamHostFunction<This, Fn>>(self, std::forward<Fn>(fn)));
+    return self;
+  }
+
+  void operator()() {
+    using namespace std::chrono_literals;
+
+    if (fns.size() == 0) return;
+
+    for (std::size_t i = 0; i < datas.size(); ++i) self(i);
+
+    do {
+      for (std::size_t i = 0; i < datas.size(); ++i) {
+        if (is_finished(i)) continue;
+        if (is_fn_finished(i)) {
+          data_step[i] += 1;
+          if (not is_finished(i)) self(i);
+        }
+      }
+      std::this_thread::sleep_for(1ms);  // make sleep time configurable
+    } while (!is_finished());
+  }
+
+  void operator()(std::uint32_t const i) {
+    auto const& step = data_step[i];
+    fns[step]->run(i);
+    if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record();
+  }
+
+  bool is_finished() const {
+    std::uint32_t finished = 0;
+    for (std::size_t i = 0; i < datas.size(); ++i)
+      if (is_finished(i)) ++finished;
+    return finished == datas.size();
+  }
+
+  bool is_finished(std::uint32_t idx) const { return data_step[idx] == fns.size(); }
+
+  bool is_fn_finished(std::uint32_t i) {
+    auto const b = [&]() {
+      auto const& step = data_step[i];
+      if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true;
+      return events[i].finished();
+    }();
+    if (b) events[i].reset();
+    return b;
+  }
+
+  Datas& datas;
+  std::vector<std::shared_ptr<StreamFunction<This>>> fns;
+  std::vector<Stream> streams;
+  std::vector<StreamEvent> events;
+  std::vector<std::size_t> data_step;
+  This& self = *this;
+};
+
+}  // namespace mkn::gpu
+
+#endif /* _MKN_GPU_MULTI_LAUNCH_HPP_ */
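(StreamLauncher is a small per-dataset pipeline: each element of datas gets its own Stream, the chained .dev()/.host() steps advance independently per stream, and StreamEvent::finished() is polled to know when a device step has completed. The call shape, as exercised by test/any/async_streaming.cpp added below — types abridged, views being per-dataset raw pointers captured by value:

    std::vector<ManagedVector<double>> vecs(C, ManagedVector<double>(NUM, 0));
    StreamLauncher{vecs}
        .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; })
        .host([&](auto i) { /* CPU-side step for dataset i */ })
        .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })();
    // the trailing () runs the pipeline and returns once every dataset has
    // passed through every step; ~StreamLauncher then syncs all streams

)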
diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp
index a09b5a6..6ca519c 100644
--- a/inc/mkn/gpu/rocm.hpp
+++ b/inc/mkn/gpu/rocm.hpp
@@ -1,5 +1,5 @@
 /**
-Copyright (c) 2020, Philip Deegan.
+Copyright (c) 2024, Philip Deegan.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -32,15 +32,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef _MKN_GPU_ROCM_HPP_
 #define _MKN_GPU_ROCM_HPP_
 
-#include "hip/hip_runtime.h"
-
 #include "mkn/kul/log.hpp"
 #include "mkn/kul/span.hpp"
 #include "mkn/kul/tuple.hpp"
 #include "mkn/kul/assert.hpp"
 
+#include "mkn/gpu/cli.hpp"
+#include "hip/hip_runtime.h"
 #include "mkn/gpu/def.hpp"
 
+#include "hip/hip_cooperative_groups.h"
+
 //
 
 #define MKN_GPU_ASSERT(ans) \
   { gpuAssert((ans), __FILE__, __LINE__); }
 
 inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
   if (code != hipSuccess) {
     fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
-    if (abort) exit(code);
+    if (abort) std::abort();
   }
 }
 
+namespace mkn::gpu::hip {
+
+template <typename SIZE = uint32_t>
+__device__ SIZE idx() {
+  SIZE width = hipGridDim_x * hipBlockDim_x;
+  SIZE height = hipGridDim_y * hipBlockDim_y;
+
+  SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+  SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;
+  return x + (y * width) + (z * width * height);  // max 4294967296
+}
+
+}  // namespace mkn::gpu::hip
+
+//
+
 #if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
 #define MKN_GPU_NS mkn::gpu::hip
 #else
@@ -60,6 +79,17 @@ inline void gpuAssert(hipError_t code, const char* file, int line, bool abort =
 namespace MKN_GPU_NS {
 
+void setLimitMallocHeapSize(std::size_t const& bytes) {
+  MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
+}
+
+auto supportsCooperativeLaunch(int const dev = 0) {
+  int supportsCoopLaunch = 0;
+  MKN_GPU_ASSERT(
+      hipDeviceGetAttribute(&supportsCoopLaunch, hipDeviceAttributeCooperativeLaunch, dev));
+  return supportsCoopLaunch;
+}
+
 struct Stream {
   Stream() { MKN_GPU_ASSERT(result = hipStreamCreate(&stream)); }
   ~Stream() { MKN_GPU_ASSERT(result = hipStreamDestroy(stream)); }
@@ -72,17 +102,48 @@ struct Stream {
   hipStream_t stream;
 };
 
+struct StreamEvent {
+  StreamEvent(Stream& stream_) : stream{stream_} { reset(); }
+  ~StreamEvent() { /*MKN_GPU_ASSERT(result = hipEventDestroy(event));*/ }
+
+  auto& operator()() { return event; };
+  void record() { MKN_GPU_ASSERT(result = hipEventRecord(event, stream())); }
+  bool finished() const { return hipEventQuery(event) == hipSuccess; }
+  void reset() {
+    if (event) MKN_GPU_ASSERT(result = hipEventDestroy(event));
+    MKN_GPU_ASSERT(result = hipEventCreate(&event));
+  }
+
+  Stream& stream;
+  hipError_t result;
+  hipEvent_t event = nullptr;
+};
+
+// https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3
 template <typename T>
 struct Pointer {
-  Pointer(T* _t) : t{_t} { MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t)); }
+  Pointer(T* _t) : t{_t} {
+    assert(t);
+    MKN_GPU_ASSERT(hipPointerGetAttributes(&attributes, t));
+    type = attributes.type;
+  }
 
-  // bool is_unregistered_ptr() const { return attributes.type == 0; }
-  bool is_host_ptr() const { return attributes.hostPointer != nullptr; }
-  bool is_device_ptr() const { return attributes.devicePointer != nullptr; }
-  bool is_managed_ptr() const { return attributes.isManaged; }
+  bool is_unregistered_ptr() const {
+    return attributes.type == hipMemoryType::hipMemoryTypeUnregistered;
+  }
+  bool is_host_ptr() const {
+    return is_unregistered_ptr() || type == hipMemoryType::hipMemoryTypeHost;
+  }
+  bool is_device_ptr() const {
+    return type == hipMemoryType::hipMemoryTypeDevice || attributes.isManaged;
+  }
+  bool is_managed_ptr() const {
+    return attributes.isManaged || type == hipMemoryType::hipMemoryTypeUnified;
+  }
 
   T* t;
   hipPointerAttribute_t attributes;
+  hipMemoryType type = hipMemoryType::hipMemoryTypeUnregistered;
 };
 
 template <typename T>
@@ -109,7 +170,7 @@ void alloc_managed(T*& p, Size size) {
   MKN_GPU_ASSERT(hipMallocManaged((void**)&p, size * sizeof(T)));
 }
 
-void destroy(void* p) {
+void inline destroy(void* p) {
   KLOG(TRC);
   MKN_GPU_ASSERT(hipFree(p));
 }
@@ -126,6 +187,12 @@ void destroy_host(T* ptr) {
   MKN_GPU_ASSERT(hipHostFree(ptr));
 }
 
+template <typename T, typename Size>
+void copy_on_device(T* dst, T const* src, Size size = 1) {
+  KLOG(TRC);
+  MKN_GPU_ASSERT(hipMemcpy(dst, src, size * sizeof(T), hipMemcpyDeviceToDevice));
+}
+
 template <typename Size>
 void send(void* p, void* t, Size size = 1) {
   KLOG(TRC);
@@ -165,19 +232,34 @@ void take_async(T* p, Span& span, Stream& stream, std::size_t start) {
   stream()));
 }
 
-void sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
+void inline sync() { MKN_GPU_ASSERT(hipDeviceSynchronize()); }
+void inline sync(hipStream_t stream) { MKN_GPU_ASSERT(hipStreamSynchronize(stream)); }
 
 #include "mkn/gpu/alloc.hpp"
 #include "mkn/gpu/device.hpp"
 
-template <typename F, typename... Args>
+template <bool _sync = true, bool _coop = false, typename F, typename... Args>
 void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... args) {
   std::size_t N = (g.x * g.y * g.z) * (b.x * b.y * b.z);
   KLOG(TRC) << N;
   std::apply(
-      [&](auto&&... params) { hipLaunchKernelGGL(f, g, b, ds, s, params...); },
+      [&](auto&&... params) {
+        if constexpr (_coop) {
+          auto address_of = [](auto& a) { return (void*)&a; };
+          void* kernelArgs[] = {address_of(params)...};
+          hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s);
+        } else {
+          hipLaunchKernelGGL(f, g, b, ds, s, params...);
+        }
+        MKN_GPU_ASSERT(hipGetLastError());
+      },
      devmem_replace(std::forward_as_tuple(args...), std::make_index_sequence<sizeof...(Args)>()));
-  sync();
+  if constexpr (_sync) {
+    if (s)
+      sync(s);
+    else
+      sync();
+  }
 }
 
 // https://rocm-documentation.readthedocs.io/en/latest/Programming_Guides/HIP-GUIDE.html#calling-global-functions
@@ -202,13 +284,14 @@ struct GLauncher : public Launcher {
   GLauncher(std::size_t s, size_t dev = 0) : Launcher{dim3{}, dim3{}}, count{s} {
     [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
 
-    b.x = devProp.maxThreadsPerBlock;
+    b.x = cli.bx_threads();
     g.x = s / b.x;
     if ((s % b.x) > 0) ++g.x;
   }
 
   std::size_t count = 0;
   hipDeviceProp_t devProp;
+  mkn::gpu::Cli<hipDeviceProp_t> cli{devProp};
 };
 
 template <typename Container, typename T>
 void fill(Container& c, T val) {
 }
 
 // https://rocm-developer-tools.github.io/HIP/group__Device.html
-void prinfo(size_t dev = 0) {
+void inline prinfo(size_t dev = 0) {
   hipDeviceProp_t devProp;
   [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev);
   KOUT(NON) << " System version " << devProp.major << "."
<< devProp.minor; @@ -247,6 +330,13 @@ void prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +__device__ void grid_sync() { + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + assert(grid.is_valid()); + grid.sync(); +} + } // namespace MKN_GPU_NS #undef MKN_GPU_ASSERT diff --git a/inc/mkn/gpu/rocm/def.hpp b/inc/mkn/gpu/rocm/def.hpp deleted file mode 100644 index d556972..0000000 --- a/inc/mkn/gpu/rocm/def.hpp +++ /dev/null @@ -1,24 +0,0 @@ - -// IWYU pragma: private, include "mkn/gpu/def.hpp" - -#ifndef _MKN_GPU_ROCM_DEF_HPP_ -#define _MKN_GPU_ROCM_DEF_HPP_ - -#include "hip/hip_runtime.h" - -namespace mkn::gpu::hip { - -template -__device__ SIZE idx() { - SIZE width = hipGridDim_x * hipBlockDim_x; - SIZE height = hipGridDim_y * hipBlockDim_y; - - SIZE x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - SIZE y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - SIZE z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; - return x + (y * width) + (z * width * height); // max 4294967296 -} - -} // namespace mkn::gpu::hip - -#endif /*_MKN_GPU_ROCM_DEF_HPP_*/ diff --git a/inc/mkn/gpu/tuple.hpp b/inc/mkn/gpu/tuple.hpp index 704e77e..383a2b9 100644 --- a/inc/mkn/gpu/tuple.hpp +++ b/inc/mkn/gpu/tuple.hpp @@ -1,5 +1,5 @@ /** -Copyright (c) 2017, Philip Deegan. +Copyright (c) 2024, Philip Deegan. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/mkn.yaml b/mkn.yaml index 2bf616c..2d7b461 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -1,4 +1,4 @@ -#! clean build test run -tOp rocm -x res/mkn/hipcc +#! clean build test run -Op rocm -x res/mkn/hipcc -W name: mkn.gpu parent: headers @@ -10,13 +10,13 @@ profile: - name: rocm parent: headers - arg: -DMKN_GPU_ROCM + # arg: -DMKN_GPU_ROCM=1 test: test/any/(\w).cpp test/hip/(\w).cpp - name: cuda parent: headers - arg: -DMKN_GPU_CUDA + # arg: -DMKN_GPU_CUDA test: test/any/(\w).cpp test/cuda/(\w).cpp diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 8c6d86d..638b4c6 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -5,6 +5,11 @@ ## The following compile flags will likely be required with their specific values for your current hardware # --cuda-gpu-arch="sm_61" # sm_61 = nvidia 1080 ( # see: http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards +# +# Common issues: +# GPUassert: unknown error +# reboot or try `sudo modprobe nvidia_uvm` +# property: cxx_flags: --std=c++17 -fPIC -fsized-deallocation -Wno-unknown-cuda-version diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index d0dc3f7..3cb9a0a 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -1,8 +1,8 @@ ## Recommended settings commented out. 
-# local: -# repo: /mkn/r -# mod-repo: /mkn/m +local: + repo: /mkn/r + mod-repo: /mkn/m # remote: # repo: git@github.com:mkn/ diff --git a/test/any/add.cpp b/test/any/add.cpp index 860d7f0..9a21b08 100644 --- a/test/any/add.cpp +++ b/test/any/add.cpp @@ -30,10 +30,20 @@ __global__ void vectoradd1(T* a, T* b) { template uint32_t test_add1() { std::vector b(NUM); + + assert(mkn::gpu::Pointer{b.data()}.is_host_ptr()); + for (uint32_t i = 0; i < NUM; i++) b[i] = i; mkn::gpu::DeviceMem devA(NUM), devB(b); + + if constexpr (!mkn::gpu::CompileFlags::withCPU) { + assert(mkn::gpu::Pointer{devA.p}.is_device_ptr()); + } + mkn::gpu::Launcher{WIDTH, HEIGHT, TPB_X, TPB_Y}(vectoradd1, devA, devB); auto a = devA(); + + // assert(mkn::gpu::Pointer{a.data()}.is_device_ptr()); for (uint32_t i = 0; i < NUM; i++) if (a[i] != b[i] + 1) return 1; return 0; diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp new file mode 100644 index 0000000..b24ca81 --- /dev/null +++ b/test/any/async_streaming.cpp @@ -0,0 +1,45 @@ + +#include +#include +#include +#include +#include + +#include "mkn/gpu/multi_launch.hpp" + +std::uint32_t static constexpr NUM = 128 * 1024 * 1024; // ~ 1GB of doubles +std::size_t constexpr static C = 5; // ~ 5GB of doubles + +template +using ManagedVector = std::vector>; + +struct A { + std::uint32_t i0; +}; + +std::uint32_t test() { + using namespace mkn::gpu; + using T = double; + std::vector> vecs(C, ManagedVector(NUM, 0)); + for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); + + ManagedVector datas(C); + for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data(); + auto views = datas.data(); + + StreamLauncher{vecs} + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) + .host([&](auto i) mutable { vecs[i][0] += 1; }) + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // + (); + + for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + + return test(); +} diff --git a/test/any/construct.cpp b/test/any/construct.cpp new file mode 100644 index 0000000..188b729 --- /dev/null +++ b/test/any/construct.cpp @@ -0,0 +1,59 @@ + +#include "mkn/gpu.hpp" + +static constexpr uint32_t NUM = 5; + +template +using ManagedVector = std::vector>; + +template +using ManagedMemory = std::vector>; + +std::size_t alloced = 0; +struct S { + S() { ++alloced; } + + std::uint16_t s = 1; +}; + +std::uint32_t test_does_construct_on_resize() { + KLOG(INF); + alloced = 0; + ManagedVector mem{NUM}; + mem.resize(NUM + NUM); + return alloced != NUM + NUM; +} + +std::uint32_t test_does_not_construct_on_resize() { + KLOG(INF); + alloced = 0; + ManagedMemory mem{NUM}; + assert(mem.size() == 5 && "wrong size"); + resize(mem, NUM + NUM); + assert(mem.size() == 10 && "wrong size"); + + auto cap = mem.capacity(); + + KLOG(INF) << mem.capacity(); + as_super(mem).emplace_back(); // does construct + KLOG(INF) << mem.capacity(); + assert(mem.capacity() != cap && "capacity bad!"); + assert(mem[10].s == 1 && "bad construct!"); + + cap = mem.capacity(); + + resize(mem, NUM * 5); + assert(mem.size() == 25 && "wrong size"); + assert(mem.capacity() != cap && "capacity bad!"); + assert(mem[10].s == 1 && "bad copy!"); + + KLOG(INF) << mem[10].s; + KLOG(INF) << mem[11].s; + return alloced != 1 or mem[10].s != 1; +} + +int main() { + KOUT(NON) << __FILE__; + + return test_does_construct_on_resize() + test_does_not_construct_on_resize(); +} diff --git a/test/any/coop.cpp 
b/test/any/coop.cpp new file mode 100644 index 0000000..9e0d473 --- /dev/null +++ b/test/any/coop.cpp @@ -0,0 +1,41 @@ + +#include "mkn/gpu.hpp" + +static constexpr uint32_t WIDTH = 1024, HEIGHT = 1024; +static constexpr uint32_t NUM = WIDTH * HEIGHT; + +template +using ManagedVector = std::vector>; + +struct S { + float f0 = 1; + double d0 = 1; +}; + +std::uint32_t test_lambda_copy_capture_views() { + mkn::gpu::GDLauncher launcher{NUM}; + + ManagedVector mem{NUM}; + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; + + auto* view = mem.data(); + + launcher([=] __device__() { + auto i = mkn::gpu::idx(); + mkn::gpu::grid_sync(); + view[i].f0 = view[i].d0 + 1; + }); + + for (std::uint32_t i = 0; i < NUM; ++i) + if (view[i].f0 != view[i].d0 + 1) return 1; + + return 0; +} + +int main() { + KOUT(NON) << __FILE__; + + // if (mkn::gpu::supportsCooperativeLaunch()) return test_lambda_copy_capture_views(); + // KOUT(NON) << "Cooperative Launch not supported"; + return 0; +} diff --git a/test/any/managed.cpp b/test/any/managed.cpp index 158e960..c563885 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -21,6 +21,10 @@ __global__ void kernel(S* structs) { template std::uint32_t _test(L&& launcher) { ManagedVector mem{NUM}; + if constexpr (!mkn::gpu::CompileFlags::withCPU) { + assert(mkn::gpu::Pointer{mem.data()}.is_managed_ptr()); + } + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; launcher(kernel, mem); From c0fb26128b07b13716660e97be15af0c9977e2ac Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Sun, 14 Jul 2024 13:27:10 +0200 Subject: [PATCH 2/7] allow ref lambdas --- inc/mkn/gpu/launchers.hpp | 4 ++-- test/any/managed.cpp | 28 +++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 67bd510..8cbe770 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -37,7 +37,7 @@ struct GDLauncher : public GLauncher { template auto operator()(F&& f, Args&&... args) { - _launch(s, std::forward(f), + _launch(s, f, as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } @@ -57,7 +57,7 @@ struct GDLauncher : public GLauncher { } template - void _launch(S& _s, F&& f, std::tuple*, Args&&... args) { + void _launch(S& _s, F& f, std::tuple*, Args&&... 
args) { MKN_GPU_NS::launch<_sync, _coop>(&global_gd_kernel, g, b, ds, _s, f, args...); } }; diff --git a/test/any/managed.cpp b/test/any/managed.cpp index c563885..0c9634b 100644 --- a/test/any/managed.cpp +++ b/test/any/managed.cpp @@ -62,9 +62,31 @@ std::uint32_t test_lambda_copy_capture_views() { return _test_lambda_copy_capture_views(mkn::gpu::GDLauncher{NUM}); } +std::uint32_t test_lambda_ref_copy_capture_views() { + mkn::gpu::GDLauncher launcher{NUM}; + + ManagedVector mem{NUM}; + for (std::uint32_t i = 0; i < NUM; ++i) mem[i].d0 = i; + + auto* view = mem.data(); + + auto fn = [=] __device__() { + auto i = mkn::gpu::idx(); + view[i].f0 = view[i].d0 + 1; + }; + + launcher(fn); + + for (std::uint32_t i = 0; i < NUM; ++i) + if (view[i].f0 != view[i].d0 + 1) return 1; + + return 0; +} + int main() { KOUT(NON) << __FILE__; - return test() + // - test_guess() + // - test_lambda_copy_capture_views(); + return test() + // + test_guess() + // + test_lambda_copy_capture_views() + // + test_lambda_ref_copy_capture_views(); } From 923b6e5770068dd0d6a37defc40293dd235d9276 Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Wed, 17 Jul 2024 17:59:15 +0200 Subject: [PATCH 3/7] up --- inc/mkn/gpu/launchers.hpp | 2 +- inc/mkn/gpu/multi_launch.hpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 8cbe770..c17cb56 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -44,7 +44,7 @@ struct GDLauncher : public GLauncher { template auto stream(Stream& s, F&& f, Args&&... args) { - _launch(s.stream, std::forward(f), + _launch(s.stream, f, as_values(std::forward_as_tuple(args...), std::make_index_sequence()), count, args...); } diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index f50fac4..b0491d4 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -47,7 +47,7 @@ template struct StreamFunction { StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {} virtual ~StreamFunction() {} - virtual void run(std::uint32_t const) {}; + virtual void run(std::uint32_t const) {} Strat& strat; StreamFunctionMode mode; @@ -80,7 +80,6 @@ struct StreamHostFunction : StreamFunction { template struct StreamLauncher { using This = StreamLauncher; - using T = typename Datas::value_type::value_type; StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) { for (auto& s : streams) events.emplace_back(s); From 583105120692d586d8f2e2501d4b3d98cefcb7e3 Mon Sep 17 00:00:00 2001 From: PhilipDeegan Date: Thu, 25 Jul 2024 16:58:58 +0200 Subject: [PATCH 4/7] threaded stream launcher --- inc/mkn/gpu/multi_launch.hpp | 166 ++++++++++++++++++++++++++++++++--- test/any/async_streaming.cpp | 60 ++++++++++++- 2 files changed, 212 insertions(+), 14 deletions(-) diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index b0491d4..66626b8 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -34,7 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include +#include #include #include "mkn/gpu.hpp" @@ -42,12 +45,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace mkn::gpu { enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT }; +enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; template struct StreamFunction { StreamFunction(Strat& strat_, StreamFunctionMode mode_) : strat{strat_}, mode{mode_} {} virtual ~StreamFunction() {} - virtual void run(std::uint32_t const) {} + virtual void run(std::uint32_t const) = 0; Strat& strat; StreamFunctionMode mode; @@ -61,6 +65,8 @@ struct StreamDeviceFunction : StreamFunction { StreamDeviceFunction(Strat& strat, Fn&& fn_) : Super{strat, StreamFunctionMode::DEVICE_WAIT}, fn{fn_} {} void run(std::uint32_t const i) override { + strat.events[i].record(); + mkn::gpu::GDLauncher{strat.datas[i].size()}.stream( strat.streams[i], [=, fn = fn] __device__() mutable { fn(i); }); } @@ -77,9 +83,10 @@ struct StreamHostFunction : StreamFunction { Fn fn; }; -template +template struct StreamLauncher { - using This = StreamLauncher; + using This = StreamLauncher; + using Self = std::conditional_t, This, Self_>; StreamLauncher(Datas& datas_) : datas{datas_}, streams(datas.size()), data_step(datas.size(), 0) { for (auto& s : streams) events.emplace_back(s); @@ -92,13 +99,13 @@ struct StreamLauncher { } template - auto& dev(Fn&& fn) { - fns.emplace_back(std::make_shared>(self, std::forward(fn))); + Self& dev(Fn&& fn) { + fns.emplace_back(std::make_shared>(self, std::forward(fn))); return self; } template - auto& host(Fn&& fn) { - fns.emplace_back(std::make_shared>(self, std::forward(fn))); + Self& host(Fn&& fn) { + fns.emplace_back(std::make_shared>(self, std::forward(fn))); return self; } @@ -123,6 +130,7 @@ struct StreamLauncher { void operator()(std::uint32_t const i) { auto const& step = data_step[i]; + assert(step < fns.size()); fns[step]->run(i); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -142,16 +150,154 @@ struct StreamLauncher { if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true; return events[i].finished(); }(); - if (b) events[i].reset(); + if (b) { + events[i].reset(); + } return b; } Datas& datas; - std::vector>> fns; + std::vector>> fns; std::vector streams; std::vector events; std::vector data_step; - This& self = *this; + Self& self = *reinterpret_cast(this); +}; + +enum class SFS : std::uint16_t { FIRST = 0, BUSY, WAIT, FIN }; +enum class SFP : std::uint16_t { WORK = 0, NEXT, SKIP }; + +template +struct AsyncStreamHostFunction : StreamFunction { + using Super = StreamFunction; + using Super::strat; + AsyncStreamHostFunction(Strat& strat, Fn&& fn_) + : Super{strat, StreamFunctionMode::HOST_WAIT}, fn{fn_} {} + void run(std::uint32_t const i) override { + fn(i); + strat.status[i] = SFS::WAIT; + } + Fn fn; +}; + +template +struct ThreadedStreamLauncher : public StreamLauncher> { + using This = ThreadedStreamLauncher; + using Super = StreamLauncher; + using Super::datas; + using Super::events; + using Super::fns; + + constexpr static std::size_t wait_ms = 1; + constexpr static std::size_t wait_max_ms = 100; + + ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 1) + : Super{datas}, n_threads{_n_threads} { + thread_status.resize(n_threads, SFP::NEXT); + status.resize(datas.size(), SFS::FIRST); + } + + ~ThreadedStreamLauncher() { join(); } + + template + This& host(Fn&& fn) { + fns.emplace_back( + std::make_shared>(*this, std::forward(fn))); + return *this; + } + + void operator()() { join(); } + Super& super() { return *this; } + void super(std::size_t const& idx) { return super()(idx); } + + bool 
is_fn_finished(std::uint32_t i) { + auto const b = [&]() { + if (fns[step[i]]->mode == StreamFunctionMode::HOST_WAIT) return status[i] == SFS::WAIT; + return events[i].finished(); + }(); + if (b) { + events[i].reset(); + status[i] = SFS::WAIT; + } + return b; + } + void thread_fn(std::size_t const& /*tid*/) { + std::size_t waitms = wait_ms; + while (!done) { + auto const& [ts, idx] = get_work(); + + if (ts == SFP::WORK) { + waitms = wait_ms; + super(idx); + + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(waitms)); + waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10; + if (check_finished()) done = 1; + } + } + } + + bool check_finished() { + for (std::size_t i = 0; i < datas.size(); ++i) + if (status[i] != SFS::FIN) return false; + return true; + } + + std::pair get_work(std::size_t const& start = 0) { + std::unique_lock lk(work_); + for (std::size_t i = start; i < datas.size(); ++i) { + if (status[i] == SFS::BUSY) { + if (is_fn_finished(i)) status[i] = SFS::WAIT; + + } else if (status[i] == SFS::WAIT) { + ++step[i]; + + if (Super::is_finished(i)) { + status[i] = SFS::FIN; + continue; + } + + status[i] = SFS::BUSY; + return std::make_pair(SFP::WORK, i); + + } else if (status[i] == SFS::FIRST) { + status[i] = SFS::BUSY; + return std::make_pair(SFP::WORK, i); + } + } + + return std::make_pair(SFP::SKIP, 0); + } + + This& join(bool const& clear = false) { + if (!started) start(); + if (joined) return *this; + joined = true; + + for (auto& t : threads) t.join(); + if (clear) threads.clear(); + return *this; + } + + This& start() { + if (started) return *this; + started = 1; + for (std::size_t i = 0; i < n_threads; ++i) + threads.emplace_back([i = i, this]() { thread_fn(i); }); + return *this; + } + + std::size_t const n_threads = 1; + std::vector threads; + + std::mutex work_; + std::vector status; + std::vector thread_status; + std::vector& step = Super::data_step; + + private: + bool joined = false, started = false, done = false; }; } // namespace mkn::gpu diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index b24ca81..ed4e7d4 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -5,10 +5,13 @@ #include #include +#include "mkn/kul/dbg.hpp" #include "mkn/gpu/multi_launch.hpp" -std::uint32_t static constexpr NUM = 128 * 1024 * 1024; // ~ 1GB of doubles -std::size_t constexpr static C = 5; // ~ 5GB of doubles +using namespace std::chrono_literals; + +std::uint32_t static constexpr NUM = 128 * 1024; // ~ 1MB of doubles +std::size_t constexpr static C = 5; // ~ 5MB of doubles template using ManagedVector = std::vector>; @@ -20,6 +23,9 @@ struct A { std::uint32_t test() { using namespace mkn::gpu; using T = double; + + KUL_DBG_FUNC_ENTER; + std::vector> vecs(C, ManagedVector(NUM, 0)); for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); @@ -29,17 +35,63 @@ std::uint32_t test() { StreamLauncher{vecs} .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) - .host([&](auto i) mutable { vecs[i][0] += 1; }) + .host([&](auto i) mutable { + std::this_thread::sleep_for(200ms); + for (auto& e : vecs[i]) e += 1; + }) + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // + (); + + for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + + std::size_t val = 5; + for (auto const& vec : vecs) { + for (auto const& e : vec) + if (e != val) return 1; + ++val; + }; + + return 0; +} + +std::uint32_t test_threaded() { + using namespace mkn::gpu; + using T = 
double; + + KUL_DBG_FUNC_ENTER; + + std::vector> vecs(C, ManagedVector(NUM, 0)); + for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); + + ManagedVector datas(C); + for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data(); + auto views = datas.data(); + + using namespace std::chrono_literals; + + ThreadedStreamLauncher{vecs, 6} + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) + .host([&](auto i) mutable { + std::this_thread::sleep_for(200ms); + for (auto& e : vecs[i]) e += 1; + }) .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // (); for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + std::size_t val = 5; + for (auto const& vec : vecs) { + for (auto const& e : vec) + if (e != val) return 1; + ++val; + }; + return 0; } int main() { KOUT(NON) << __FILE__; - return test(); + return test() + test_threaded(); } From f1871c44150f279f76b01e9840db8a699ceba737 Mon Sep 17 00:00:00 2001 From: p Date: Sat, 27 Jul 2024 17:40:57 +0100 Subject: [PATCH 5/7] event streaming --- inc/mkn/gpu/cuda.hpp | 41 ++++++++++++++++++++++++++++++------ inc/mkn/gpu/multi_launch.hpp | 5 ++++- res/mkn/clang_cuda.yaml | 3 +++ test/any/async_streaming.cpp | 2 +- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index dd96528..df1812c 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -108,19 +108,46 @@ struct Stream { struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { /*MKN_GPU_ASSERT(result = cudaEventDestroy(event));*/ } + ~StreamEvent() { + if (start) { + MKN_GPU_ASSERT(result = cudaEventDestroy(start)) + } + if (stop) { + MKN_GPU_ASSERT(result = cudaEventDestroy(stop)) + } + } + + StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { + that.start = nullptr; + that.stop = nullptr; + } - auto& operator()() { return event; }; - void record() { MKN_GPU_ASSERT(result = cudaEventRecord(event, stream())); } - bool finished() const { return cudaEventQuery(event) == cudaSuccess; } + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()() { return stop; }; + void record() { + if (stage == 0) { + MKN_GPU_ASSERT(result = cudaEventRecord(start, stream())); + ++stage; + } else { + MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream())); + ++stage; + } + } + bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; } void reset() { - if (event) MKN_GPU_ASSERT(result = cudaEventDestroy(event)); - MKN_GPU_ASSERT(result = cudaEventCreate(&event)); + if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + MKN_GPU_ASSERT(result = cudaEventCreate(&start)); + if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); + MKN_GPU_ASSERT(result = cudaEventCreate(&stop)); + stage = 0; } Stream& stream; cudaError_t result; - cudaEvent_t event = nullptr; + cudaEvent_t start = nullptr, stop = nullptr; + std::uint16_t stage = 0; }; template diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 66626b8..e46425b 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -132,6 +132,8 @@ struct StreamLauncher { auto const& step = data_step[i]; assert(step < fns.size()); fns[step]->run(i); + assert(i < events.size()); + assert(step < fns.size()); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -222,6 +224,7 @@ 
struct ThreadedStreamLauncher : public StreamLauncher Date: Mon, 29 Jul 2024 23:18:45 +0200 Subject: [PATCH 6/7] set device --- inc/mkn/gpu/cpu.hpp | 6 +++--- inc/mkn/gpu/cuda.hpp | 2 ++ inc/mkn/gpu/multi_launch.hpp | 3 +-- inc/mkn/gpu/rocm.hpp | 5 ++++- test/any/async_streaming.cpp | 1 - 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 36de6f8..5936969 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -83,9 +83,9 @@ struct dim3 { std::size_t x = 1, y = 1, z = 1; }; -void setLimitMallocHeapSize(std::size_t const& /*bytes*/) { - // noop -} +void setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/ + +void setDevice(std::size_t const& /*dev*/) {} /*noop*/ auto supportsCooperativeLaunch(int const /*dev*/ = 0) { int supportsCoopLaunch = 0; diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index df1812c..7addcca 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -88,6 +88,8 @@ void setLimitMallocHeapSize(std::size_t const& bytes) { MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes)); } +void setDevice(std::size_t const& dev) { MKN_GPU_ASSERT(cudaSetDevice(dev)); } + auto supportsCooperativeLaunch(int const dev = 0) { int supportsCoopLaunch = 0; MKN_GPU_ASSERT(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev)); diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index e46425b..57e7b55 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -133,7 +133,6 @@ struct StreamLauncher { assert(step < fns.size()); fns[step]->run(i); assert(i < events.size()); - assert(step < fns.size()); if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); } @@ -224,7 +223,7 @@ struct ThreadedStreamLauncher : public StreamLauncher Date: Sat, 3 Aug 2024 15:47:58 +0200 Subject: [PATCH 7/7] support barrier --- .github/workflows/build.yml | 2 +- inc/mkn/gpu/cli.hpp | 20 ++-------- inc/mkn/gpu/cpu.hpp | 11 ++++-- inc/mkn/gpu/cuda.hpp | 41 ++++++++++++++------ inc/mkn/gpu/multi_launch.hpp | 75 +++++++++++++++++++++++++----------- inc/mkn/gpu/rocm.hpp | 63 +++++++++++++++++++++++++----- mkn.yaml | 2 +- res/mkn/clang_cuda.yaml | 2 +- res/mkn/hipcc.yaml | 2 +- test/any/async_streaming.cpp | 17 +++----- 10 files changed, 157 insertions(+), 78 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 810b748..3c667f2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,4 +16,4 @@ jobs: run: | curl -Lo mkn https://github.com/mkn/mkn/releases/download/latest/mkn_nix chmod +x mkn - KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++17" test -W 9 + KLOG=3 ./mkn clean build run -dtKOgp cpu -a "-std=c++20" test -W 9 diff --git a/inc/mkn/gpu/cli.hpp b/inc/mkn/gpu/cli.hpp index bae675c..6a40948 100644 --- a/inc/mkn/gpu/cli.hpp +++ b/inc/mkn/gpu/cli.hpp @@ -32,33 +32,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef _MKN_GPU_CLI_HPP_ #define _MKN_GPU_CLI_HPP_ -#include -#include - #include "mkn/kul/env.hpp" +#include "mkn/kul/string.hpp" namespace mkn::gpu { template struct Cli { - // + constexpr static inline char const* MKN_GPU_BX_THREADS = "MKN_GPU_BX_THREADS"; auto bx_threads() const { - char const* ENV = "MKN_GPU_BX_THREADS"; - if (mkn::kul::env::EXISTS(ENV)) { - return as(mkn::kul::env::GET(ENV)); - } + if (kul::env::EXISTS(MKN_GPU_BX_THREADS)) + return kul::String::INT32(kul::env::GET(MKN_GPU_BX_THREADS)); return dev.maxThreadsPerBlock; } - template - auto static as(std::string const& from) { - T t; - std::stringstream ss(from); - ss >> t; - return t; - } - Device const& dev; }; diff --git a/inc/mkn/gpu/cpu.hpp b/inc/mkn/gpu/cpu.hpp index 5936969..bedac17 100644 --- a/inc/mkn/gpu/cpu.hpp +++ b/inc/mkn/gpu/cpu.hpp @@ -108,12 +108,17 @@ struct StreamEvent { ~StreamEvent() {} auto& operator()() { return event; }; - void record() { ; } - bool finished() const { return true; } - void reset() {} + auto& record() { + ++stage; + return *this; + } + auto& wait() { return *this; } + bool finished() const { return stage == 2; } + void reset() { stage = 0; } Stream stream; std::size_t event = 0; + std::uint16_t stage = 0; }; template diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index 7addcca..9278c6d 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "mkn/gpu/def.hpp" -// #define MKN_GPU_ASSERT(x) (KASSERT((x) == cudaSuccess)) +// #define MKN_GPU_ASSERT(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } @@ -110,14 +110,7 @@ struct Stream { struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { - if (start) { - MKN_GPU_ASSERT(result = cudaEventDestroy(start)) - } - if (stop) { - MKN_GPU_ASSERT(result = cudaEventDestroy(stop)) - } - } + ~StreamEvent() { clear(); } StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { that.start = nullptr; @@ -128,7 +121,7 @@ struct StreamEvent { StreamEvent& operator=(StreamEvent const&) = delete; auto& operator()() { return stop; }; - void record() { + auto& record() { if (stage == 0) { MKN_GPU_ASSERT(result = cudaEventRecord(start, stream())); ++stage; @@ -136,12 +129,25 @@ struct StreamEvent { MKN_GPU_ASSERT(result = cudaEventRecord(stop, stream())); ++stage; } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = cudaStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); } bool finished() const { return stage == 2 and cudaEventQuery(stop) == cudaSuccess; } void reset() { - if (start) MKN_GPU_ASSERT(result = cudaEventDestroy(start)); + clear(); MKN_GPU_ASSERT(result = cudaEventCreate(&start)); - if (stop) MKN_GPU_ASSERT(result = cudaEventDestroy(stop)); MKN_GPU_ASSERT(result = cudaEventCreate(&stop)); stage = 0; } @@ -356,6 +362,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + cudaMemGetInfo(&free_t, &total_t); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... 
%f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/inc/mkn/gpu/multi_launch.hpp b/inc/mkn/gpu/multi_launch.hpp index 57e7b55..c75b820 100644 --- a/inc/mkn/gpu/multi_launch.hpp +++ b/inc/mkn/gpu/multi_launch.hpp @@ -31,20 +31,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _MKN_GPU_MULTI_LAUNCH_HPP_ #define _MKN_GPU_MULTI_LAUNCH_HPP_ -#include +#include #include -#include -#include -#include #include -#include +#include +#include +#include +#include +#include #include #include "mkn/gpu.hpp" namespace mkn::gpu { -enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT }; +enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER }; enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY }; template @@ -129,11 +130,14 @@ struct StreamLauncher { } void operator()(std::uint32_t const i) { - auto const& step = data_step[i]; + auto const step = data_step[i]; + assert(step < fns.size()); - fns[step]->run(i); assert(i < events.size()); - if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record(); + + // if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) events[i].stream.sync(); + fns[step]->run(i); + if (fns[step]->mode == StreamFunctionMode::DEVICE_WAIT) events[i].record().wait(); } bool is_finished() const { @@ -145,7 +149,7 @@ struct StreamLauncher { bool is_finished(std::uint32_t idx) const { return data_step[idx] == fns.size(); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { auto const& step = data_step[i]; if (fns[step]->mode == StreamFunctionMode::HOST_WAIT) return true; @@ -181,6 +185,24 @@ struct AsyncStreamHostFunction : StreamFunction { Fn fn; }; +template +struct StreamBarrierFunction : StreamFunction { + using Super = StreamFunction; + using Super::strat; + + StreamBarrierFunction(Strat& strat) + : Super{strat, StreamFunctionMode::BARRIER}, + sync_point{std::ssize(strat.datas), on_completion} {} + + void run(std::uint32_t const /*i*/) override { [[maybe_unused]] auto ret = sync_point.arrive(); } + + std::function on_completion = [&]() { + for (auto& stat : strat.status) stat = SFS::WAIT; + }; + + std::barrier sync_point; +}; + template struct ThreadedStreamLauncher : public StreamLauncher> { using This = ThreadedStreamLauncher; @@ -192,8 +214,9 @@ struct ThreadedStreamLauncher : public StreamLauncher>(*this)); + return *this; + } + void operator()() { join(); } Super& super() { return *this; } void super(std::size_t const& idx) { return super()(idx); } - bool is_fn_finished(std::uint32_t i) { + bool is_fn_finished(std::uint32_t const& i) { auto const b = [&]() { if (fns[step[i]]->mode == StreamFunctionMode::HOST_WAIT) return status[i] == SFS::WAIT; return events[i].finished(); @@ -222,8 +250,9 @@ struct ThreadedStreamLauncher : public StreamLauncher= wait_max_ms ? wait_max_ms : waitms + 10; - if (check_finished()) done = 1; + continue; } + + std::this_thread::sleep_for(std::chrono::milliseconds(waitms)); + waitms = waitms >= wait_max_ms ? 
wait_max_ms : waitms + 10; } } @@ -247,12 +275,12 @@ struct ThreadedStreamLauncher : public StreamLauncher get_work(std::size_t const& start = 0) { - std::unique_lock lk(work_); + std::scoped_lock lk(work_); for (std::size_t i = start; i < datas.size(); ++i) { if (status[i] == SFS::BUSY) { if (is_fn_finished(i)) status[i] = SFS::WAIT; - - } else if (status[i] == SFS::WAIT) { + } + if (status[i] == SFS::WAIT) { ++step[i]; if (Super::is_finished(i)) { @@ -268,7 +296,7 @@ struct ThreadedStreamLauncher : public StreamLauncher threads; std::mutex work_; diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 7059d5a..9917879 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -104,22 +104,56 @@ struct Stream { hipStream_t stream; }; +// + struct StreamEvent { StreamEvent(Stream& stream_) : stream{stream_} { reset(); } - ~StreamEvent() { /*MKN_GPU_ASSERT(result = hipEventDestroy(event));*/ + ~StreamEvent() { clear(); } + + StreamEvent(StreamEvent&& that) : stream{that.stream}, start{that.start}, stop{that.stop} { + that.start = nullptr; + that.stop = nullptr; } - auto& operator()() { return event; }; - void record() { MKN_GPU_ASSERT(result = hipEventRecord(event, stream())); } - bool finished() const { return hipEventQuery(event) == hipSuccess; } + StreamEvent(StreamEvent const&) = delete; + StreamEvent& operator=(StreamEvent const&) = delete; + + auto& operator()() { return stop; }; + auto& record() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipEventRecord(start, stream())); + ++stage; + } else { + MKN_GPU_ASSERT(result = hipEventRecord(stop, stream())); + ++stage; + } + return *this; + } + auto& wait() { + if (stage == 0) { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), start)); + } else { + MKN_GPU_ASSERT(result = hipStreamWaitEvent(stream(), stop)); + } + return *this; + } + + void clear() { + if (start) MKN_GPU_ASSERT(result = hipEventDestroy(start)); + if (stop) MKN_GPU_ASSERT(result = hipEventDestroy(stop)); + } + bool finished() const { return stage == 2 and hipEventQuery(stop) == hipSuccess; } void reset() { - if (event) MKN_GPU_ASSERT(result = hipEventDestroy(event)); - MKN_GPU_ASSERT(result = hipEventCreate(&event)); + clear(); + MKN_GPU_ASSERT(result = hipEventCreate(&start)); + MKN_GPU_ASSERT(result = hipEventCreate(&stop)); + stage = 0; } Stream& stream; hipError_t result; - hipEvent_t event = nullptr; + hipEvent_t start = nullptr, stop = nullptr; + std::uint16_t stage = 0; }; // https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/group___global_defs.html#gaea86e91d3cd65992d787b39b218435a3 @@ -250,7 +284,7 @@ void launch(F&& f, dim3 g, dim3 b, std::size_t ds, hipStream_t& s, Args&&... arg if constexpr (_coop) { auto address_of = [](auto& a) { return (void*)&a; }; void* kernelArgs[] = {(address_of(params), ...)}; - hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s); + MKN_GPU_ASSERT(hipLaunchCooperativeKernel(f, g, b, kernelArgs, ds, s)); } else { hipLaunchKernelGGL(f, g, b, ds, s, params...); } @@ -322,7 +356,7 @@ void fill(Container& c, T val) { // https://rocm-developer-tools.github.io/HIP/group__Device.html void inline prinfo(size_t dev = 0) { hipDeviceProp_t devProp; - [[maybe_unused]] auto ret = hipGetDeviceProperties(&devProp, dev); + MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev)); KOUT(NON) << " System version " << devProp.major << "." 
<< devProp.minor; KOUT(NON) << " agent name " << devProp.name; KOUT(NON) << " cores " << devProp.multiProcessorCount; @@ -333,6 +367,17 @@ void inline prinfo(size_t dev = 0) { KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock; } +void print_gpu_mem_used() { + float free_m = 0, total_m = 0, used_m = 0; + std::size_t free_t = 0, total_t = 0; + MKN_GPU_ASSERT(hipMemGetInfo(&free_t, &total_t)); + free_m = free_t / 1048576.0; + total_m = total_t / 1048576.0; + used_m = total_m - free_m; + printf(" mem free %zu .... %f MB mem total %zu....%f MB mem used %f MB\n", free_t, free_m, + total_t, total_m, used_m); +} + __device__ void grid_sync() { namespace cg = cooperative_groups; cg::grid_group grid = cg::this_grid(); diff --git a/mkn.yaml b/mkn.yaml index 2d7b461..9f20fa4 100644 --- a/mkn.yaml +++ b/mkn.yaml @@ -1,4 +1,4 @@ -#! clean build test run -Op rocm -x res/mkn/hipcc -W +#! clean build test run -tOqp rocm -x res/mkn/hipcc -W name: mkn.gpu parent: headers diff --git a/res/mkn/clang_cuda.yaml b/res/mkn/clang_cuda.yaml index 7f0aedd..5654326 100644 --- a/res/mkn/clang_cuda.yaml +++ b/res/mkn/clang_cuda.yaml @@ -12,7 +12,7 @@ # property: - cxx_flags: --std=c++17 -fPIC -fsized-deallocation -Wno-unknown-cuda-version + cxx_flags: --std=c++20 -fPIC -fsized-deallocation -Wno-unknown-cuda-version cxx_cuda: -x cuda --cuda-gpu-arch="sm_61" -Xclang -fcuda-allow-variadic-functions # local: diff --git a/res/mkn/hipcc.yaml b/res/mkn/hipcc.yaml index 3cb9a0a..e123216 100644 --- a/res/mkn/hipcc.yaml +++ b/res/mkn/hipcc.yaml @@ -18,6 +18,6 @@ env: file: - type: cpp:cxx:cc archiver: ar -cr - compiler: hipcc -std=c++17 -fPIC + compiler: hipcc -std=c++20 -fPIC linker: hipcc diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index b79a700..fb0b825 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -39,10 +39,7 @@ std::uint32_t test() { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -54,7 +51,7 @@ std::uint32_t test() { return 0; } -std::uint32_t test_threaded() { +std::uint32_t test_threaded(std::size_t const& nthreads = 2) { using namespace mkn::gpu; using T = double; @@ -69,16 +66,14 @@ std::uint32_t test_threaded() { using namespace std::chrono_literals; - ThreadedStreamLauncher{vecs, 6} + ThreadedStreamLauncher{vecs, nthreads} .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 1; }) .host([&](auto i) mutable { std::this_thread::sleep_for(200ms); for (auto& e : vecs[i]) e += 1; }) - .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; }) // - (); - - for (auto const& vec : vecs) std::cout << __LINE__ << " " << vec[0] << std::endl; + .barrier() + .dev([=] __device__(auto i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; for (auto const& vec : vecs) { @@ -92,5 +87,5 @@ std::uint32_t test_threaded() { int main() { KOUT(NON) << __FILE__; - return test() + test_threaded(); + return test() + test_threaded() + test_threaded(6); }
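
A note on the ThreadedStreamLauncher worker loop: each thread repeatedly asks get_work() for a runnable (stream, step) pair, and when nothing is runnable it sleeps, stretching the sleep by 10ms per idle pass up to wait_max_ms; doing real work snaps the sleep back down. The switch from std::unique_lock to std::scoped_lock in get_work() fits this design, since nothing ever waits on a condition variable there. A host-only sketch of the backoff policy (the function name and the default values here are illustrative assumptions, not the library's):

    #include <atomic>
    #include <chrono>
    #include <cstddef>
    #include <functional>
    #include <thread>

    // Poll for work; idle passes sleep progressively longer, capped at wait_max_ms.
    inline void poll_worker(std::atomic<bool> const& done, std::function<bool()> try_work,
                            std::size_t const wait_ms = 25, std::size_t const wait_max_ms = 250) {
      std::size_t waitms = wait_ms;
      while (!done) {
        if (try_work()) {    // ran a step on some stream
          waitms = wait_ms;  // reset the backoff
          continue;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(waitms));
        waitms = waitms >= wait_max_ms ? wait_max_ms : waitms + 10;
      }
    }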
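
The StreamEvent reworked in PATCH 5/7 replaces the single event with a start/stop pair driven by a stage counter: the first record() records the start event, the second records stop, and finished() only consults the stop event once both have been recorded. A standalone sketch of the same lifecycle against the plain CUDA runtime (asserts stand in for the library's MKN_GPU_ASSERT; the type name is illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cuda_runtime.h>

    struct TwoStageEvent {
      explicit TwoStageEvent(cudaStream_t s) : stream{s} {
        assert(cudaEventCreate(&start) == cudaSuccess);
        assert(cudaEventCreate(&stop) == cudaSuccess);
      }
      ~TwoStageEvent() {
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
      }
      void record() {  // call once before the async work, once after
        cudaEventRecord(stage == 0 ? start : stop, stream);
        ++stage;
      }
      bool finished() const {  // true only after both records and the stop event lands
        return stage == 2 && cudaEventQuery(stop) == cudaSuccess;
      }
      cudaStream_t stream{};
      cudaEvent_t start{}, stop{};
      std::uint16_t stage{0};
    };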
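
One detail worth flagging in the patch's move constructor: it transfers start and stop and nulls them in the donor so the destructor cannot double-destroy, but it does not carry stage across, so an event moved after a record() would report stage 0. As used in the launcher the events vector only grows before anything is recorded, so this is harmless there; for other uses, a move that carries the whole state would look like the following addition to the sketch above (not what the patch does):

    // Move support for the sketch above: steal the handles, carry the stage,
    // and null the donor so its destructor becomes a no-op.
    TwoStageEvent(TwoStageEvent&& that)
        : stream{that.stream}, start{that.start}, stop{that.stop}, stage{that.stage} {
      that.start = nullptr;
      that.stop = nullptr;
      that.stage = 0;
    }
    TwoStageEvent(TwoStageEvent const&) = delete;
    TwoStageEvent& operator=(TwoStageEvent const&) = delete;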
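
On PATCH 6/7's setDevice and the print_gpu_mem_used helpers: cudaSetDevice binds the calling host thread to a device, and cudaMemGetInfo reports free/total bytes for whichever device is current on that thread, which is why the 1048576.0 divisor (1024 * 1024) turns the byte counts into MiB. A compilable sketch of the same query:

    #include <cstddef>
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaSetDevice(0);  // per host thread: bind before creating streams or querying
      std::size_t free_b = 0, total_b = 0;
      cudaMemGetInfo(&free_b, &total_b);  // bytes, for the currently bound device
      std::printf("mem used %.1f MiB of %.1f MiB\n",
                  (total_b - free_b) / 1048576.0, total_b / 1048576.0);
    }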
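
The StreamBarrierFunction added in PATCH 7/7 is presumably why the build flags move to -std=c++20: it is built on std::barrier, sized to the number of data streams, with a completion function that flips every stream's status back to WAIT once the last one arrives. A self-contained sketch of that arrive-plus-completion shape (worker count and the message are illustrative):

    #include <barrier>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      constexpr std::ptrdiff_t n = 5;        // one expected arrival per data stream
      auto on_completion = []() noexcept {   // runs once, after the last arrival
        std::puts("all streams synced; release them back to WAIT");
      };
      std::barrier sync_point{n, on_completion};
      std::vector<std::jthread> workers;
      for (std::ptrdiff_t i = 0; i < n; ++i)
        workers.emplace_back([&] { [[maybe_unused]] auto token = sync_point.arrive(); });
    }  // jthreads join on destruction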
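
The reworked Cli::bx_threads() in PATCH 7/7 reads the MKN_GPU_BX_THREADS environment variable through the kul helpers and falls back to the device's maxThreadsPerBlock. The same behaviour using only the standard library (std::getenv and std::stoi standing in for kul::env and kul::String::INT32):

    #include <cstdlib>
    #include <string>

    inline int bx_threads(int const device_max_threads_per_block) {
      if (char const* env = std::getenv("MKN_GPU_BX_THREADS"))
        return std::stoi(env);  // the library parses via kul::String::INT32 instead
      return device_max_threads_per_block;
    }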
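
Finally, the rocm.hpp change wrapping hipLaunchCooperativeKernel in MKN_GPU_ASSERT matters because cooperative launches can fail at launch time: every block must be resident simultaneously for grid_sync() to be safe, and the runtime rejects grids that exceed device occupancy rather than hanging. A CUDA-side sketch with the failure checked (sizes are illustrative):

    #include <cassert>
    #include <cooperative_groups.h>
    #include <cuda_runtime.h>

    __global__ void bump_then_sync(float* x) {
      namespace cg = cooperative_groups;
      cg::grid_group grid = cg::this_grid();
      x[grid.thread_rank()] += 1.f;
      grid.sync();  // grid-wide barrier; only valid under a cooperative launch
    }

    int main() {
      float* d_x = nullptr;
      assert(cudaMalloc(&d_x, 256 * sizeof(float)) == cudaSuccess);
      void* args[] = {&d_x};
      // returns an error (instead of deadlocking) if 1x256 cannot be co-resident
      assert(cudaLaunchCooperativeKernel((void*)bump_then_sync, dim3(1), dim3(256),
                                         args, 0, nullptr) == cudaSuccess);
      assert(cudaDeviceSynchronize() == cudaSuccess);
      cudaFree(d_x);
    }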