Skip to content

Commit

Permalink
Update code MPI OpenMP and CUDA
Browse files Browse the repository at this point in the history
  • Loading branch information
lemoinep committed Sep 27, 2023
1 parent db4ab76 commit 804dab3
Show file tree
Hide file tree
Showing 18 changed files with 2,407 additions and 0 deletions.
103 changes: 103 additions & 0 deletions src/Cuda/Block/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@

#project(MPI_bcast)

######## A simple cmakelists.txt file for ... #############

cmake_minimum_required(VERSION 3.17)
#set(CMAKE_CXX_STANDARD 14)
#set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_CXX_COMPILER "/usr/local/bin/g++")
#set(CMAKE_C_COMPILER "/usr/bin/clang-14")
#set(CMAKE_CXX_COMPILER "/usr/bin/clang++-14")
#set(CMAKE_CXX_COMPILER "/usr/bin/gcc")
#set(CMAKE_CXX_COMPILER "/usr/bin/g++-11")


# Optional MPI support (disabled: flip FALSE to TRUE to enable).
if(FALSE)
  find_package(MPI REQUIRED)
  if(MPI_FOUND)
    # Fix: the variable reference was written "{MPI_CXX_LIBRARIES}" (missing `$`),
    # which printed the literal text instead of the library list.
    message("${MPI_CXX_LIBRARIES}")
  else()
    message(SEND_ERROR "This application cannot compile without MPI")
  endif()
endif()


# Optional OpenMP support (disabled: flip FALSE to TRUE to enable).
if(FALSE)
  find_package(OpenMP)
  if(OpenMP_CXX_FOUND)
    message("${OpenMP_CXX_LIBRARIES}")
  else()
    # Fix: the message said "OpenMPI" (an MPI implementation) but this branch
    # is about OpenMP.
    message(SEND_ERROR "This application cannot compile without OpenMP")
  endif()
endif()


# CUDA is mandatory: every target below is built from a .cu source.
find_package(CUDA REQUIRED)

if(CUDA_FOUND)
  message("${CUDA_CXX_LIBRARIES}")
else()
  message(SEND_ERROR "This application cannot compile without CUDA")
endif()

add_definitions(-D_FORCE_INLINES)

#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_21 -std=c++11)

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3)

# Build one executable per .cu file found in this directory.
file(GLOB WFOPenMP_SRC
  "*.cu"
)

foreach(myfile ${WFOPenMP_SRC})
  get_filename_component(myname ${myfile} NAME_WLE)
  get_filename_component(dirname ${myfile} DIRECTORY)
  # Fix: the original printed ${dir_src}, which is never defined anywhere in
  # this file; ${dirname} is the variable actually computed above.
  message("${myname}.cu | ${dirname}")

  cuda_add_executable(${myname} "${myname}.cu")

  #target_link_libraries( ${myname} -lfoobar -ljoestuff )

  #if(MPI_FOUND)
  #  include_directories(SYSTEM ${MPI_INCLUDES_PATH})
  #  target_include_directories(${myname} PUBLIC ${MPI_CXX_INCLUDE_DIRS})
  #  target_link_libraries(${myname} PUBLIC ${MPI_CXX_LIBRARIES})
  #endif()
endforeach()


# Disabled duplicate of the loop above for .cpp sources, kept for reference.
if(FALSE)
  file(GLOB WFOPenMP_SRC
    "*.cpp"
    "*.h"
  )

  foreach(myfile ${WFOPenMP_SRC})
    get_filename_component(myname ${myfile} NAME_WLE)
    get_filename_component(dirname ${myfile} DIRECTORY)
    message("${myname}.cpp | ${dirname}")

    cuda_add_executable(${myname} "${myname}.cpp")
  endforeach()
endif()


########### end ####################################
46 changes: 46 additions & 0 deletions src/Cuda/Block/block_floyd.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2023 Pierre Talbot

#include "utility.hpp"
#include <string>

void floyd_warshall_cpu(std::vector<std::vector<int>>& d) {
size_t n = d.size();
for(int k = 0; k < n; ++k) {
for(int i = 0; i < n; ++i) {
for(int j = 0; j < n; ++j) {
if(d[i][j] > d[i][k] + d[k][j]) {
d[i][j] = d[i][k] + d[k][j];
}
}
}
}
}

int main(int argc, char** argv) {
  // Two command-line arguments are required: matrix dimension and block size.
  if(argc != 3) {
    std::cout << "usage: " << argv[0] << " <matrix size> <block size>" << std::endl;
    exit(1);
  }
  const size_t n = std::stoi(argv[1]);
  const size_t block_size = std::stoi(argv[2]);

  // I. Build a random N x N distance matrix on the host.
  std::vector<std::vector<int>> host_distances = initialize_distances(n);
  // `std::vector` is host-only, so mirror it as a plain `int**` in managed memory.
  int** device_distances = initialize_gpu_distances(host_distances);

  // II. Time the sequential CPU version of Floyd-Warshall.
  long cpu_ms = benchmark_one_ms([&]{ floyd_warshall_cpu(host_distances); });
  std::cout << "CPU: " << cpu_ms << " ms" << std::endl;

  // III. Floyd-Warshall on GPU (single block of size `block_size`).
  // TODO
  // NOTE(review): `block_size` is parsed but unused until the kernel exists.

  // IV. Both versions must agree; then release the managed matrix.
  check_equal_matrix(host_distances, device_distances);
  deallocate_gpu_distances(device_distances, n);
  return 0;
}
66 changes: 66 additions & 0 deletions src/Cuda/Block/block_min.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright 2023 Pierre Talbot

#include "utility.hpp"
#include <climits>

// Single-block kernel: partitions `v` into blockDim.x contiguous chunks and
// lets thread t write the minimum of chunk t into local_min[t] (INT_MAX when
// the chunk is empty). `local_min` must hold at least blockDim.x ints; the
// host finishes the reduction after synchronizing.
// NOTE(review): neighbouring threads read addresses a whole chunk apart, so
// these loads are presumably not coalesced — compare parallel_min_stride.
__global__ void parallel_min(int* v, size_t n, int* local_min) {
  const size_t chunk = n / blockDim.x + (n % blockDim.x != 0); // ceil(n / blockDim.x)
  const size_t begin = threadIdx.x * chunk;
  const size_t end = min(n, begin + chunk);
  int best = INT_MAX;
  for(size_t i = begin; i < end; ++i) {
    best = min(best, v[i]);
  }
  local_min[threadIdx.x] = best;
}

// Single-block kernel computing per-thread minima of `v` with stride
// blockDim.x: at each step the threads of a warp touch consecutive addresses.
// Thread t leaves its partial minimum in local_min[t] (INT_MAX if it saw no
// element); the host reduces local_min after synchronizing.
__global__ void parallel_min_stride(int* v, size_t n, int* local_min) {
  int best = INT_MAX;
  for(size_t i = threadIdx.x; i < n; i += blockDim.x) {
    best = min(best, v[i]);
  }
  local_min[threadIdx.x] = best;
}

int main(int argc, char** argv) {
  // Require the vector length and the number of threads in the single block.
  if(argc != 3) {
    std::cout << "usage: " << argv[0] << " <vector size> <threads-per-block>" << std::endl;
    std::cout << "example: " << argv[0] << " 1000000000 512" << std::endl;
    exit(1);
  }
  const size_t n = std::stoi(argv[1]);
  const size_t threads_per_block = std::stoi(argv[2]);

  // I. Allocate the input and the per-thread result buffer in managed memory.
  int* v = init_random_vector(n);
  int* local_min;
  CUDIE(cudaMallocManaged(&local_min, sizeof(int) * threads_per_block));

  // II. One block; thread `i` leaves the minimum of its share in local_min[i].
  long gpu_ms = benchmark_ms([&]{
    parallel_min<<<1, threads_per_block>>>(v, n, local_min);
    CUDIE(cudaDeviceSynchronize());
  });
  std::cout << "GPU: " << gpu_ms << " ms" << std::endl;

  // Same computation, but with a strided access pattern.
  long gpu_strided_ms = benchmark_ms([&]{
    parallel_min_stride<<<1, threads_per_block>>>(v, n, local_min);
    CUDIE(cudaDeviceSynchronize());
  });
  std::cout << "GPU (contiguous memory accesses): " << gpu_strided_ms << " ms" << std::endl;

  // III. Fold the per-thread minima into the final GPU answer on the host.
  int res = local_min[0];
  for(size_t i = 1; i < threads_per_block; ++i) {
    res = std::min(res, local_min[i]);
  }
  std::cout << "Minimum on GPU: " << res << std::endl;

  // IV. Sequential CPU reference for validation.
  int cpu_res = INT_MAX;
  for(size_t i = 0; i < n; ++i) {
    cpu_res = std::min(cpu_res, v[i]);
  }
  std::cout << "Minimum on CPU: " << cpu_res << std::endl;

  cudaFree(v);
  cudaFree(local_min);
}
81 changes: 81 additions & 0 deletions src/Cuda/Block/block_shared_floyd.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright 2023 Pierre Talbot

#include "utility.hpp"
#include <string>

// Map the 2D coordinate (x, y) of a row-major n x n matrix to its flat index.
__forceinline__ __device__ int dim2D(int x, int y, int n) {
  return n * x + y;
}

// Floyd-Warshall on a single block, staging the whole matrix in shared memory.
// Launch contract: <<<1, T, n * n * sizeof(int)>>> — the third launch argument
// must provide n*n ints of dynamic shared memory (see the launch in main).
// Threads split the columns `j` between them; every thread walks all rows `i`.
__global__ void floyd_warshall_gpu(int** d, size_t n) {
  // Copy the matrix into the shared memory.
  extern __shared__ int d2[];
  for(int i = 0; i < n; ++i) {
    for(int j = threadIdx.x; j < n; j += blockDim.x) {
      d2[dim2D(i, j, n)] = d[i][j];
    }
  }
  // Barrier: the whole matrix must be staged before anyone reads it.
  __syncthreads();
  // Compute on the shared memory.
  for(int k = 0; k < n; ++k) {
    for(int i = 0; i < n; ++i) {
      for(int j = threadIdx.x; j < n; j += blockDim.x) {
        if(d2[dim2D(i,j,n)] > d2[dim2D(i,k,n)] + d2[dim2D(k,j,n)]) {
          d2[dim2D(i,j,n)] = d2[dim2D(i,k,n)] + d2[dim2D(k,j,n)];
        }
      }
    }
    // Barrier per k: iteration k+1 reads entries written by other threads.
    // NOTE(review): within iteration k, threads read row k and column k while
    // the threads owning them may write; this relies on the Floyd-Warshall
    // invariant that row/column k do not change at step k — confirm inputs
    // have no negative self-loops.
    __syncthreads();
  }
  // Copy the matrix back to the global memory.
  for(int i = 0; i < n; ++i) {
    for(int j = threadIdx.x; j < n; j += blockDim.x) {
      d[i][j] = d2[dim2D(i, j, n)];
    }
  }
}

void floyd_warshall_cpu(std::vector<std::vector<int>>& d) {
size_t n = d.size();
for(int k = 0; k < n; ++k) {
for(int i = 0; i < n; ++i) {
for(int j = 0; j < n; ++j) {
if(d[i][j] > d[i][k] + d[k][j]) {
d[i][j] = d[i][k] + d[k][j];
}
}
}
}
}

int main(int argc, char** argv) {
  // Required arguments: matrix dimension and threads in the single GPU block.
  if(argc != 3) {
    std::cout << "usage: " << argv[0] << " <matrix size> <threads-per-block>" << std::endl;
    exit(1);
  }
  const size_t n = std::stoi(argv[1]);
  const size_t threads_per_block = std::stoi(argv[2]);

  // I. Build a random N x N distance matrix on the host.
  std::vector<std::vector<int>> host_distances = initialize_distances(n);
  // `std::vector` is host-only, so mirror it as a plain `int**` in managed memory.
  int** device_distances = initialize_gpu_distances(host_distances);

  // II. Time the sequential CPU version.
  long cpu_ms = benchmark_one_ms([&]{ floyd_warshall_cpu(host_distances); });
  std::cout << "CPU: " << cpu_ms << " ms" << std::endl;

  // III. Time the GPU version: one block, n*n ints of dynamic shared memory.
  long gpu_ms = benchmark_one_ms([&]{
    floyd_warshall_gpu<<<1, threads_per_block, n * n * sizeof(int)>>>(device_distances, n);
    CUDIE(cudaDeviceSynchronize());
  });
  std::cout << "GPU: " << gpu_ms << " ms" << std::endl;

  // IV. Both versions must agree; then release the managed matrix.
  check_equal_matrix(host_distances, device_distances);
  deallocate_gpu_distances(device_distances, n);
  return 0;
}
38 changes: 38 additions & 0 deletions src/Cuda/Block/block_shared_min.ccu
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2023 Pierre Talbot

#include "utility.hpp"
#include <algorithm>
#include <climits> // fixed: was <climit>, a nonexistent header (needed for INT_MAX)

// Single-block minimum reduction: computes min(v[0..n-1]) into *res.
// A per-block scratch buffer of blockDim.x ints is allocated on the device
// heap by thread 0 and shared with the block through a __shared__ pointer.
__global__ void parallel_min(int* v, size_t n, int* res) {
  __shared__ int* local_min;
  if(threadIdx.x == 0) {
    local_min = new int[blockDim.x];
  }
  // Make the allocation visible to every thread before it is dereferenced.
  __syncthreads();
  // Each thread owns slot threadIdx.x: initialize it and fold in its strided
  // share of `v` (previously thread 0 initialized all slots serially).
  local_min[threadIdx.x] = INT_MAX;
  for(size_t i = threadIdx.x; i < n; i += blockDim.x) {
    local_min[threadIdx.x] = min(local_min[threadIdx.x], v[i]);
  }
  // All partial minima must be written before thread 0 reduces them.
  __syncthreads();
  if(threadIdx.x == 0) {
    *res = local_min[0];
    for(size_t i = 1; i < blockDim.x; ++i) {
      *res = min(*res, local_min[i]);
    }
    // Fix: the buffer was never freed, leaking device-heap memory per launch.
    delete[] local_min;
  }
}

int main() {
  const size_t n = 100000000;
  // Managed allocations are reachable from both host and device.
  int* v = init_random_vector(n);
  int* res;
  CUDIE(cudaMallocManaged(&res, sizeof(int)));
  // One block of 256 threads computes the global minimum into *res.
  parallel_min<<<1, 256>>>(v, n, res);
  CUDIE(cudaDeviceSynchronize());
  std::cout << "Minimum: " << *res << std::endl;
  cudaFree(v);
  cudaFree(res);
}
47 changes: 47 additions & 0 deletions src/Cuda/Block/grid_floyd.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright 2023 Pierre Talbot

#include "utility.hpp"
#include <string>

void floyd_warshall_cpu(std::vector<std::vector<int>>& d) {
size_t n = d.size();
for(int k = 0; k < n; ++k) {
for(int i = 0; i < n; ++i) {
for(int j = 0; j < n; ++j) {
if(d[i][j] > d[i][k] + d[k][j]) {
d[i][j] = d[i][k] + d[k][j];
}
}
}
}
}

int main(int argc, char** argv) {
  // Required arguments: matrix size, threads per block, and number of blocks.
  if(argc != 4) {
    std::cout << "usage: " << argv[0] << " <matrix size> <threads-per-block> <num-blocks>" << std::endl;
    exit(1);
  }
  const size_t n = std::stoi(argv[1]);
  const size_t threads_per_block = std::stoi(argv[2]);
  const size_t num_blocks = std::stoi(argv[3]);

  // I. Build a random N x N distance matrix on the host.
  std::vector<std::vector<int>> host_distances = initialize_distances(n);
  // `std::vector` is host-only, so mirror it as a plain `int**` in managed memory.
  int** device_distances = initialize_gpu_distances(host_distances);

  // II. Time the sequential CPU version.
  long cpu_ms = benchmark_one_ms([&]{ floyd_warshall_cpu(host_distances); });
  std::cout << "CPU: " << cpu_ms << " ms" << std::endl;

  // III. Running Floyd Warshall on the whole GPU grid.
  // TODO: call the kernel `n` times for each value of `k` (move the outer loop outside of the kernel).
  // NOTE(review): `threads_per_block` and `num_blocks` stay unused until then.

  // IV. Both versions must agree; then release the managed matrix.
  check_equal_matrix(host_distances, device_distances);
  deallocate_gpu_distances(device_distances, n);
  return 0;
}
Loading

0 comments on commit 804dab3

Please sign in to comment.