// NOTE(review): this text is a git patch (unified diff) whose line breaks have been collapsed; it is not a C++ translation unit.
// It touches four files: examples/CMakeLists.txt (registers the kernel-reduction executable),
// examples/kernel-reduction.cpp (new example: min-reduction attempted via RAJA::kernel_param and via RAJA::forall),
// include/RAJA/kernel-reduce.cpp (new file; a fragment that uses col_range, row_range, dot_range, Aview, Bview,
// Cview and checkResult without declaring them in the visible text), and
// include/RAJA/pattern/params/reducer.hpp (adds a commented-out VOp alias inside struct Reducer).
// Several template argument lists look truncated (e.g. "RAJA::View> Aview"); presumably lost in extraction - confirm against the original commit.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4dfd2fbc10..b203870e04 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -101,6 +101,11 @@ raja_add_executable( NAME kernel-dynamic-tile SOURCES kernel-dynamic-tile.cpp) +raja_add_executable( + NAME kernel-reduction + SOURCES kernel-reduction.cpp) + + raja_add_executable( NAME resource-kernel SOURCES resource-kernel.cpp) diff --git a/examples/kernel-reduction.cpp b/examples/kernel-reduction.cpp new file mode 100644 index 0000000000..d681f866d1 --- /dev/null +++ b/examples/kernel-reduction.cpp @@ -0,0 +1,73 @@ +#include "RAJA/RAJA.hpp" +#include "RAJA/index/RangeSegment.hpp" +#include "memoryManager.hpp" + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + +// Work-in-progress example: minimum reduction over the entries of an N x N matrix +using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0> // min update; TODO: is an explicit RAJA::Params<0> argument needed on this Lambda?
+ > + > + > + >; + // _matmult_3lambdakernel_cuda_end + + using VALOPLOC_INT_MIN = RAJA::expt::ValLocOp; + using VALOP_INT_MIN = RAJA::expt::ValOp; + // RAJA::expt::Reduce(&cuda_min), + int cuda_min = 0; + + int seq_sum = 0; + int N = 10000; + + RAJA::TypedRangeSegment row_range(0, N); + RAJA::TypedRangeSegment col_range(0, N); + + RAJA::resources::Cuda cuda_res; + int *A = memoryManager::allocate(N * N); + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + A[col + row * N] = -row; + } + } + + RAJA::View> Aview(A, N, N); + + // FIXME: the kernel_param call below doesn't compile: + // no known conversion from + // 'RAJA::expt::detail::Reducer, int, RAJA::expt::ValOp>' + // to 'VALOP_INT_MIN &' + RAJA::kernel_param( + // segments + RAJA::make_tuple(col_range, row_range), + // params + RAJA::make_tuple(RAJA::expt::Reduce(&cuda_min)), + //RAJA::tuple(0.0), + // lambda 0 (the only lambda; invoked by Lambda<0> in EXEC_POL8) + [=] RAJA_DEVICE (int col, int row, VALOP_INT_MIN &_cuda_min) { + _cuda_min.min(Aview(row, col)); + //double& a){ + //a += Aview(row, col); + } + + ); + + // compiles: same reducer passed to RAJA::forall; note it only reads Aview(i, 0) + RAJA::forall>(cuda_res, RAJA::RangeSegment(0, N), + RAJA::expt::Reduce(&cuda_min), + [=] RAJA_DEVICE (int i, VALOP_INT_MIN &_cuda_min) { + _cuda_min.min(Aview(i, 0)); + } + + ); + + std::cout << "MIN VAL = " << cuda_min << std::endl; + //checkResult(Cview, N); +}; diff --git a/include/RAJA/kernel-reduce.cpp b/include/RAJA/kernel-reduce.cpp new file mode 100644 index 0000000000..2d0636e06c --- /dev/null +++ b/include/RAJA/kernel-reduce.cpp @@ -0,0 +1,41 @@ +#include "RAJA/RAJA.hpp" + +using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, // row + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // col + RAJA::statement::Lambda<0, RAJA::Params<0>>, // dot = 0.0 + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<1> // dot += ... + >, + RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>> // set C = ...
+ > + > + > + >; + // _matmult_3lambdakernel_cuda_end + + RAJA::kernel_param( + RAJA::make_tuple(col_range, row_range, dot_range), + + RAJA::tuple{0.0}, // thread local variable for 'dot' + + // lambda 0 + [=] RAJA_DEVICE (double& dot) { + dot = 0.0; + }, + + // lambda 1 + [=] RAJA_DEVICE (int col, int row, int k, double& dot) { + dot += Aview(row, k) * Bview(k, col); + }, + + // lambda 2 + [=] RAJA_DEVICE (int col, int row, double& dot) { + Cview(row, col) = dot; + } + + ); + + checkResult(Cview, N); \ No newline at end of file diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp index bb8595f621..92cee9bb8d 100644 --- a/include/RAJA/pattern/params/reducer.hpp +++ b/include/RAJA/pattern/params/reducer.hpp @@ -68,7 +68,7 @@ struct Reducer : public ForallParamBase { using op = Op; using value_type = T; // This is a basic data type - + //using VOp = ValOp; Reducer() = default; // Basic data type constructor