diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..2c318f4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,37 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 2.8) +PROJECT(rcm) +INCLUDE(cmake/SBELUtils.cmake) + +SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +enable_cuda_support() +MESSAGE(STATUS "Cuda version: ${CUDA_VERSION}") + +SET(RCM_HEADERS + rcm/common.h + rcm/rcm_um.h + rcm/exception.h + rcm/timer.h + ) + +SET(RCM_CUHEADERS + rcm/device/kernels.cuh + ) + +SET(MMIO_FILES + mm_io/mm_io.h + mm_io/mm_io.c + ) + +SOURCE_GROUP("Headers" FILES ${MC64_HEADERS}) +SOURCE_GROUP("CUDA Headers" FILES ${MC64_CUHEADERS}) +SOURCE_GROUP("MM_IO" FILES ${MMIO_FILES}) + +INCLUDE_DIRECTORIES( + ${CMAKE_SOURCE_DIR} + ) + +IF(NOT (${CUDA_VERSION} VERSION_LESS "6.0")) + cuda_add_executable(driver_um driver_um.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES}) + cuda_add_executable(testing testing.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES}) +ENDIF() diff --git a/SBELUtils.cmake b/SBELUtils.cmake new file mode 100644 index 0000000..c2e36c5 --- /dev/null +++ b/SBELUtils.cmake @@ -0,0 +1,166 @@ +#################################################### +## Only modify if you know what you're doing. ## +#################################################### + + +# Helps Eclipse/CDT find our include directories +set(CMAKE_VERBOSE_MAKEFILE on) + +# Detect the bitness of our machine (eg 32- or 64-bit) +# C-equiv: sizeof(void*) +# Alt: 8*sizeof(void*) +math(EXPR CMAKE_ARCH_BITNESS 8*${CMAKE_SIZEOF_VOID_P}) + +# For non-multi-configuration generators (eg, make, Eclipse) +# The Visual Studio generator creates a single project with all these +set(CMAKE_BUILD_TYPE "Release" CACHE STRING "For single-configuration generators (e.g. make) set the type of build: Release, Debug, RelWithDebugInfo, MinSizeRel") +SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebugInfo" "MinSizeRel") + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable MPI Support - ## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring MPI options +macro(enable_mpi_support) + + find_package("MPI" REQUIRED) + + # Add the MPI-specific compiler and linker flags + # Also, search for #includes in MPI's paths + + list(APPEND CMAKE_C_COMPILE_FLAGS ${MPI_C_COMPILE_FLAGS}) + list(APPEND CMAKE_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + include_directories(${MPI_C_INCLUDE_PATH}) + + list(APPEND CMAKE_CXX_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) + list(APPEND CMAKE_CXX_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) + include_directories(${MPI_CXX_INCLUDE_PATH}) + +endmacro(enable_mpi_support) +# Done configuring MPI Options + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable OpenMP Support - ## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring OpenMP options +macro(enable_openmp_support) + + find_package("OpenMP" REQUIRED) + + # Add the OpenMP-specific compiler and linker flags + list(APPEND CMAKE_CXX_FLAGS ${OpenMP_CXX_FLAGS}) + list(APPEND CMAKE_C_FLAGS ${OpenMP_C_FLAGS}) + +endmacro(enable_openmp_support) +# Done configuring OpenMP Options + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable CUDA Support - 
## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring CUDA options +# This is ugly... +macro(enable_cuda_support) + + # Hide a number of options from the default CMake screen + mark_as_advanced(CLEAR CUDA_BUILD_CUBIN) + mark_as_advanced(CLEAR CUDA_SDK_ROOT_DIR) + mark_as_advanced(CLEAR CUDA_TOOLKIT_ROOT_DIR) + mark_as_advanced(CLEAR CUDA_VERBOSE_BUILD) + mark_as_advanced(CLEAR CUDA_FAST_MATH) + mark_as_advanced(CLEAR CUDA_USE_CUSTOM_COMPILER) + mark_as_advanced(CLEAR CUDA_VERBOSE_PTX) + mark_as_advanced(CLEAR CUDA_DEVICE_VERSION) + + # select Compute Capability + # This needs to be manually updated when devices with new CCs come out + set(CUDA_DEVICE_VERSION "20" CACHE STRING "CUDA Device Version") + set_property(CACHE CUDA_DEVICE_VERSION PROPERTY STRINGS "10" "11" "12" "13" "20" "21" "30" "35") + + # Enable fast-math for CUDA (_not_ GCC) + set(CUDA_FAST_MATH TRUE CACHE BOOL "Use Fast Math Operations") + + # Tell nvcc to use a separate compiler for non-CUDA code. + # This is useful if you need to use an older of GCC than comes by default + set(CUDA_USE_CUSTOM_COMPILER FALSE CACHE BOOL "Use Custom Compiler") + set(CUDA_CUSTOM_COMPILER "" CACHE STRING "Custom C++ Compiler for CUDA If Needed") + + # Shows register usage, etc + set(CUDA_VERBOSE_PTX TRUE CACHE BOOL "Show Verbose Kernel Info During Compilation") + + + # Let's get going... + find_package("CUDA" REQUIRED) + + # Frequently used in the examples + cuda_include_directories(${CUDA_SDK_ROOT_DIR}/common/inc) + cuda_include_directories(${CUDA_SDK_ROOT_DIR}/../shared/inc) + + set(CUDA_SDK_LIB_DIR ${CUDA_SDK_ROOT_DIR}/common/lib + ${CUDA_SDK_ROOT_DIR}/lib ${CUDA_SDK_ROOT_DIR}/../shared/lib) + + # these are no longer needed + # # Find path to shrutil libs, from CUDA SDK + # find_library(LIBSHRUTIL + # NAMES shrUtils${CMAKE_ARCH_BITNESS} shrutil_${CMAKE_SYSTEM_PROCESSOR} + # PATHS ${CUDA_SDK_LIB_DIR}) + # find_library(LIBSHRUTIL_DBG + # NAMES shrUtils${CMAKE_ARCH_BITNESS}D shrutil_${CMAKE_SYSTEM_PROCESSOR}D + # PATHS ${CUDA_SDK_LIB_DIR}) + # + # # Find path to cutil libs, from CUDA SDK + # find_library(LIBCUTIL + # NAMES cutil${CMAKE_ARCH_BITNESS} cutil_${CMAKE_SYSTEM_PROCESSOR} + # PATHS ${CUDA_SDK_LIB_DIR}) + # find_library(LIBCUTIL_DBG + # NAMES cutil${arch}D cutil_${CMAKE_SYSTEM_PROCESSOR}D + # PATHS ${CUDA_SDK_LIB_DIR}) + + # Set custom compiler flags + set(CUDA_NVCC_FLAGS "" CACHE STRING "" FORCE) + + if(CUDA_USE_CUSTOM_COMPILER) + mark_as_advanced(CLEAR CUDA_CUSTOM_COMPILER) + list(APPEND CUDA_NVCC_FLAGS "-ccbin=${CUDA_CUSTOM_COMPILER}") + else() + mark_as_advanced(FORCE CUDA_CUSTOM_COMPILER) + endif() + + # Macro for setting the Compute Capability + macro(set_compute_capability cc) + list(APPEND CUDA_NVCC_FLAGS "-gencode=arch=compute_${cc},code=sm_${cc}") + list(APPEND CUDA_NVCC_FLAGS "-gencode=arch=compute_${cc},code=compute_${cc}") + endmacro(set_compute_capability) + + # Tell nvcc to compile for the selected Compute Capability + # This can also be called from the main CMakeLists.txt to enable + # support for additional CCs + set_compute_capability(${CUDA_DEVICE_VERSION}) + + # Enable fast-math if selected + if(CUDA_FAST_MATH) + list(APPEND CUDA_NVCC_FLAGS "-use_fast_math") + endif() + + # Enable verbose compile if selected + if(CUDA_VERBOSE_PTX) + list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=-v") + endif() +endmacro(enable_cuda_support) +# Done configuring CUDA options diff --git a/cmake/SBELUtils.cmake b/cmake/SBELUtils.cmake new file mode 
100644 index 0000000..4df6ea1 --- /dev/null +++ b/cmake/SBELUtils.cmake @@ -0,0 +1,166 @@ +#################################################### +## Only modify if you know what you're doing. ## +#################################################### + + +# Helps Eclipse/CDT find our include directories +set(CMAKE_VERBOSE_MAKEFILE on) + +# Detect the bitness of our machine (eg 32- or 64-bit) +# C-equiv: sizeof(void*) +# Alt: 8*sizeof(void*) +math(EXPR CMAKE_ARCH_BITNESS 8*${CMAKE_SIZEOF_VOID_P}) + +# For non-multi-configuration generators (eg, make, Eclipse) +# The Visual Studio generator creates a single project with all these +set(CMAKE_BUILD_TYPE "Release" CACHE STRING "For single-configuration generators (e.g. make) set the type of build: Release, Debug, RelWithDebugInfo, MinSizeRel") +SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebugInfo" "MinSizeRel") + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable MPI Support - ## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring MPI options +macro(enable_mpi_support) + + find_package("MPI" REQUIRED) + + # Add the MPI-specific compiler and linker flags + # Also, search for #includes in MPI's paths + + list(APPEND CMAKE_C_COMPILE_FLAGS ${MPI_C_COMPILE_FLAGS}) + list(APPEND CMAKE_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + include_directories(${MPI_C_INCLUDE_PATH}) + + list(APPEND CMAKE_CXX_COMPILE_FLAGS ${MPI_CXX_COMPILE_FLAGS}) + list(APPEND CMAKE_CXX_LINK_FLAGS ${MPI_CXX_LINK_FLAGS}) + include_directories(${MPI_CXX_INCLUDE_PATH}) + +endmacro(enable_mpi_support) +# Done configuring MPI Options + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable OpenMP Support - ## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring OpenMP options +macro(enable_openmp_support) + + find_package("OpenMP" REQUIRED) + + # Add the OpenMP-specific compiler and linker flags + list(APPEND CMAKE_CXX_FLAGS ${OpenMP_CXX_FLAGS}) + list(APPEND CMAKE_C_FLAGS ${OpenMP_C_FLAGS}) + +endmacro(enable_openmp_support) +# Done configuring OpenMP Options + + +#################################################### +## ---------------------------------------------- ## +## - - ## +## - Enable CUDA Support - ## +## - - ## +## ---------------------------------------------- ## +#################################################### + +# Begin configuring CUDA options +# This is ugly... 
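# With the defaults chosen below (CUDA_DEVICE_VERSION "20", CUDA_FAST_MATH and
# CUDA_VERBOSE_PTX both ON), enable_cuda_support() leaves CUDA_NVCC_FLAGS set
# to roughly the following (a sketch of the expected expansion, not output
# captured from an actual configure run):
#
#   -gencode=arch=compute_20,code=sm_20
#   -gencode=arch=compute_20,code=compute_20
#   -use_fast_math
#   --ptxas-options=-v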
+macro(enable_cuda_support) + + # Hide a number of options from the default CMake screen + mark_as_advanced(CLEAR CUDA_BUILD_CUBIN) + mark_as_advanced(CLEAR CUDA_SDK_ROOT_DIR) + mark_as_advanced(CLEAR CUDA_TOOLKIT_ROOT_DIR) + mark_as_advanced(CLEAR CUDA_VERBOSE_BUILD) + mark_as_advanced(CLEAR CUDA_FAST_MATH) + mark_as_advanced(CLEAR CUDA_USE_CUSTOM_COMPILER) + mark_as_advanced(CLEAR CUDA_VERBOSE_PTX) + mark_as_advanced(CLEAR CUDA_DEVICE_VERSION) + + # select Compute Capability + # This needs to be manually updated when devices with new CCs come out + set(CUDA_DEVICE_VERSION "20" CACHE STRING "CUDA Device Version") + set_property(CACHE CUDA_DEVICE_VERSION PROPERTY STRINGS "10" "11" "12" "13" "20" "21" "30" "35") + + # Enable fast-math for CUDA (_not_ GCC) + set(CUDA_FAST_MATH TRUE CACHE BOOL "Use Fast Math Operations") + + # Tell nvcc to use a separate compiler for non-CUDA code. + # This is useful if you need to use an older of GCC than comes by default + set(CUDA_USE_CUSTOM_COMPILER FALSE CACHE BOOL "Use Custom Compiler") + set(CUDA_CUSTOM_COMPILER "" CACHE STRING "Custom C++ Compiler for CUDA If Needed") + + # Shows register usage, etc + set(CUDA_VERBOSE_PTX TRUE CACHE BOOL "Show Verbose Kernel Info During Compilation") + + + # Let's get going... + find_package("CUDA" REQUIRED) + + # Frequently used in the examples + cuda_include_directories(${CUDA_SDK_ROOT_DIR}/common/inc) + cuda_include_directories(${CUDA_SDK_ROOT_DIR}/../shared/inc) + + set(CUDA_SDK_LIB_DIR ${CUDA_SDK_ROOT_DIR}/common/lib + ${CUDA_SDK_ROOT_DIR}/lib ${CUDA_SDK_ROOT_DIR}/../shared/lib) + + # these are no longer needed + # # Find path to shrutil libs, from CUDA SDK + # find_library(LIBSHRUTIL + # NAMES shrUtils${CMAKE_ARCH_BITNESS} shrutil_${CMAKE_SYSTEM_PROCESSOR} + # PATHS ${CUDA_SDK_LIB_DIR}) + # find_library(LIBSHRUTIL_DBG + # NAMES shrUtils${CMAKE_ARCH_BITNESS}D shrutil_${CMAKE_SYSTEM_PROCESSOR}D + # PATHS ${CUDA_SDK_LIB_DIR}) + # + # # Find path to cutil libs, from CUDA SDK + # find_library(LIBCUTIL + # NAMES cutil${CMAKE_ARCH_BITNESS} cutil_${CMAKE_SYSTEM_PROCESSOR} + # PATHS ${CUDA_SDK_LIB_DIR}) + # find_library(LIBCUTIL_DBG + # NAMES cutil${arch}D cutil_${CMAKE_SYSTEM_PROCESSOR}D + # PATHS ${CUDA_SDK_LIB_DIR}) + + # Set custom compiler flags + set(CUDA_NVCC_FLAGS "" CACHE STRING "" FORCE) + + if(CUDA_USE_CUSTOM_COMPILER) + mark_as_advanced(CLEAR CUDA_CUSTOM_COMPILER) + list(APPEND CUDA_NVCC_FLAGS "-ccbin=${CUDA_CUSTOM_COMPILER}") + else() + mark_as_advanced(FORCE CUDA_CUSTOM_COMPILER) + endif() + + # Macro for setting the Compute Capability + macro(set_compute_capability cc) + list(APPEND CUDA_NVCC_FLAGS "-gencode=arch=compute_${cc},code=sm_${cc}") + list(APPEND CUDA_NVCC_FLAGS "-gencode=arch=compute_${cc},code=compute_${cc}") + endmacro(set_compute_capability) + + # Tell nvcc to compile for the selected Compute Capability + # This can also be called from the main CMakeLists.txt to enable + # support for additional CCs + set_compute_capability(${CUDA_DEVICE_VERSION}) + + # Enable fast-math if selected + if(CUDA_FAST_MATH) + list(APPEND CUDA_NVCC_FLAGS "-use_fast_math") + endif() + + # Enable verbose compile if selected + if(CUDA_VERBOSE_PTX) + list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=-v") + endif() +endmacro(enable_cuda_support) +# Done configuring CUDA options diff --git a/driver_um.cu b/driver_um.cu new file mode 100644 index 0000000..30db9ea --- /dev/null +++ b/driver_um.cu @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +extern "C" { +#include "mm_io/mm_io.h" +} + +#include +#include + 
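// Worked example of the COO -> CSR conversion that rcm::coo2csr performs in
// main() below (0-based indices, 3x3 matrix with nonzeros (0,0)=2, (1,1)=2,
// (2,0)=-1 and (2,2)=2):
//
//   COO input:   row_i = {0, 1, 2, 2}, col_j = {0, 1, 0, 2}, vals = {2, 2, -1, 2}
//   CSR output:  row_offsets    = {0, 1, 2, 4}
//                column_indices = {0, 1, 0, 2}
//                values         = {2, 2, -1, 2}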
+typedef typename rcm::ManagedVector IntVector; +typedef typename rcm::ManagedVector DoubleVector; + +int main(int argc, char **argv) +{ + if (argc < 3) { + std::cout << "Usage:\n driver input_file output_file" << std::endl; + return 1; + } + + // Read matrix from file (COO format) + MM_typecode matcode; + int M, N, nnz; + int* row_i; + int* col_j; + double* vals; + + std::cout << "Read MTX file... "; + int err = mm_read_mtx_crd(argv[1], &M, &N, &nnz, &row_i, &col_j, &vals, &matcode); + if (err != 0) { + std::cout << "error: " << err << std::endl; + return 1; + } + std::cout << "M = " << M << " N = " << N << " nnz = " << nnz << std::endl; + + // Switch to 0-based indices + for (int i = 0; i < nnz; i++) { + row_i[i]--; + col_j[i]--; + } + // Convert to CSR format and load into thrust vectors. + IntVector row_offsets(N + 1); + IntVector column_indices(nnz); + DoubleVector values(nnz); + + std::cout << "Convert to CSR" << std::endl; + rcm::coo2csr(M, N, nnz, row_i, col_j, vals, row_offsets, column_indices, values); + + // Print thrust vectors + /* + std::cout << "Row offsets\n"; + thrust::copy(row_offsets.begin(), row_offsets.end(), std::ostream_iterator(std::cout, "\n")); + std::cout << "Column indices\n"; + thrust::copy(column_indices.begin(), column_indices.end(), std::ostream_iterator(std::cout, "\n")); + std::cout << "Values\n"; + thrust::copy(values.begin(), values.end(), std::ostream_iterator(std::cout, "\n")); + */ + + // Run the RCM algorithm + rcm::RCM_UM algo(row_offsets, column_indices, values); + + std::cout << "Run RCM... "; + try { + algo.execute(); + } catch (const rcm::system_error& se) { + std::cout << "error " << se.reason() << std::endl; + return 1; + } + std::cout << "success" << std::endl; + + std::cout << "Write output file " << argv[2] << std::endl; + std::ofstream fout; + fout.open(argv[2]); + //// algo.print(fout); + fout.close(); + + return 0; +} diff --git a/mm_io/mm_io.c b/mm_io/mm_io.c new file mode 100644 index 0000000..6999991 --- /dev/null +++ b/mm_io/mm_io.c @@ -0,0 +1,844 @@ +# include +# include +# include +# include +# include + +# include "mm_io.h" + +/******************************************************************************/ + +int mm_is_valid ( MM_typecode matcode ) + +/******************************************************************************/ +/* + Purpose: + + MM_IS_VALID checks whether the MM header information is valid. + + Modified: + + 31 October 2008 + + Parameters: + + Input, MM_typecode MATCODE, the header information. + + Output, int MM_IS_VALID, is TRUE if the matrix code is valid. +*/ +{ + if ( !mm_is_matrix ( matcode ) ) + { + return 0; + } + + if ( mm_is_dense ( matcode ) && mm_is_pattern ( matcode ) ) + { + return 0; + } + + if ( mm_is_real ( matcode ) && mm_is_hermitian ( matcode ) ) + { + return 0; + } + + if ( mm_is_pattern ( matcode ) && + ( mm_is_hermitian ( matcode ) || mm_is_skew ( matcode ) ) ) + { + return 0; + } + return 1; +} +/******************************************************************************/ + +int mm_read_banner ( FILE *f, MM_typecode *matcode ) + +/******************************************************************************/ +/* + Purpose: + + MM_READ_BANNER reads the header line of an MM file. + + Modified: + + 31 October 2008 + + Parameters: + + Input, FILE *F, a pointer to the input file. + + Output, MM_typecode *MATCODE, the header information. 
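  Example:

    The banner must be the first line of the file; for a sparse real matrix
    with no symmetry it reads

      %%MatrixMarket matrix coordinate real general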
+*/ +{ + char line[MM_MAX_LINE_LENGTH]; + char banner[MM_MAX_TOKEN_LENGTH]; + char mtx[MM_MAX_TOKEN_LENGTH]; + char crd[MM_MAX_TOKEN_LENGTH]; + char data_type[MM_MAX_TOKEN_LENGTH]; + char storage_scheme[MM_MAX_TOKEN_LENGTH]; + char *p; + + mm_clear_typecode ( matcode ); + + if ( fgets ( line, MM_MAX_LINE_LENGTH, f ) == NULL ) + { + return MM_PREMATURE_EOF; + } + + if ( sscanf ( line, "%s %s %s %s %s", banner, mtx, crd, data_type, + storage_scheme ) != 5 ) + { + return MM_PREMATURE_EOF; + } + + for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */ + for (p=crd; *p!='\0'; *p=tolower(*p),p++); + for (p=data_type; *p!='\0'; *p=tolower(*p),p++); + for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++); +/* + check for banner +*/ + if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0) + return MM_NO_HEADER; + +/* + first field should be "mtx" +*/ + if (strcmp(mtx, MM_MTX_STR) != 0) + return MM_UNSUPPORTED_TYPE; + mm_set_matrix(matcode); + +/* + second field describes whether this is a sparse matrix (in coordinate + storgae) or a dense array +*/ + if (strcmp(crd, MM_SPARSE_STR) == 0) + mm_set_sparse(matcode); + else + if (strcmp(crd, MM_DENSE_STR) == 0) + mm_set_dense(matcode); + else + return MM_UNSUPPORTED_TYPE; + +/* + third field +*/ + if (strcmp(data_type, MM_REAL_STR) == 0) + { + mm_set_real(matcode); + } + else if (strcmp(data_type, MM_COMPLEX_STR) == 0) + { + mm_set_complex(matcode); + } + else if (strcmp(data_type, MM_PATTERN_STR) == 0) + { + mm_set_pattern(matcode); + } + else if (strcmp(data_type, MM_INT_STR) == 0) + { + mm_set_integer(matcode); + } + else + { + return MM_UNSUPPORTED_TYPE; + } +/* + fourth field +*/ + + if (strcmp(storage_scheme, MM_GENERAL_STR) == 0) + mm_set_general(matcode); + else + if (strcmp(storage_scheme, MM_SYMM_STR) == 0) + mm_set_symmetric(matcode); + else + if (strcmp(storage_scheme, MM_HERM_STR) == 0) + mm_set_hermitian(matcode); + else + if (strcmp(storage_scheme, MM_SKEW_STR) == 0) + mm_set_skew(matcode); + else + return MM_UNSUPPORTED_TYPE; + + + return 0; +} +/******************************************************************************/ + +int mm_read_mtx_array_size ( FILE *f, int *M, int *N ) + +/******************************************************************************/ +/* + Purpose: + + MM_READ_MTX_ARRAY_SIZE reads the size line of an MM array file. + + Modified: + + 03 November 2008 + + Parameters: + + Input, FILE *F, a pointer to the input file. + + Output, int *M, the number of rows, as read from the file. + + Output, int *N, the number of columns, as read from the file. + + Output, MM_READ_MTX_ARRAY_SIZE, an error flag. + 0, no error. 
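  Example:

    After the banner and any '%' comment lines, an array file carries a size
    line of the form "M N", for instance "10 5" for a 10 by 5 dense array.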
+*/ +{ + char line[MM_MAX_LINE_LENGTH]; + int num_items_read; +/* + set return null parameter values, in case we exit with errors +*/ + *M = 0; + *N = 0; +/* + now continue scanning until you reach the end-of-comments +*/ + do + { + if ( fgets ( line, MM_MAX_LINE_LENGTH, f ) == NULL ) + { + return MM_PREMATURE_EOF; + } + } while ( line[0] == '%' ); +/* + line[] is either blank or has M,N, nz +*/ + if ( sscanf ( line, "%d %d", M, N ) == 2 ) + { + return 0; + } + else + { + do + { + num_items_read = fscanf ( f, "%d %d", M, N ); + if ( num_items_read == EOF ) + { + return MM_PREMATURE_EOF; + } + } + while ( num_items_read != 2 ); + } + return 0; +} +/******************************************************************************/ + +int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, + double **val, MM_typecode *matcode) + +/******************************************************************************/ +/* + Purpose: + + MM_READ_MTX_CRD reads the values in an MM coordinate file. + + Discussion: + + This function allocates the storage for the arrays. + + mm_read_mtx_crd() fills M, N, nz, array of values, and return + type code, e.g. 'MCRS' + + if matrix is complex, values[] is of size 2*nz, + (nz pairs of real/imaginary values) + + Modified: + + 31 October 2008 + + Parameters: + +*/ +{ + int ret_code; + FILE *f; + + if (strcmp(fname, "stdin") == 0) f=stdin; + else + if ((f = fopen(fname, "r")) == NULL) + return MM_COULD_NOT_READ_FILE; + + + if ((ret_code = mm_read_banner(f, matcode)) != 0) + return ret_code; + + if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && + mm_is_matrix(*matcode))) + return MM_UNSUPPORTED_TYPE; + + if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0) + return ret_code; + + + *I = (int *) malloc(*nz * sizeof(int)); + *J = (int *) malloc(*nz * sizeof(int)); + *val = NULL; + + if (mm_is_complex(*matcode)) + { + *val = (double *) malloc(*nz * 2 * sizeof(double)); + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, + *matcode); + if (ret_code != 0) return ret_code; + } + else if (mm_is_real(*matcode)) + { + *val = (double *) malloc(*nz * sizeof(double)); + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, + *matcode); + if (ret_code != 0) return ret_code; + } + + else if (mm_is_pattern(*matcode)) + { + ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, + *matcode); + if (ret_code != 0) return ret_code; + } + + if (f != stdin) fclose(f); + return 0; +} +/******************************************************************************/ + +int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], + double val[], MM_typecode matcode) + +/******************************************************************************/ +/* + Purpose: + + MM_READ_MTX_CRD_DATA reads the values in an MM coordinate file. + + Discussion: + + This function assumes the array storage has already been allocated. + + Modified: + + 31 October 2008 + + Parameters: + + Input, FILE *F, a pointer to the input file. +*/ +{ + int i; + + if (mm_is_complex(matcode)) + { + for (i=0; i +// Restore NDEBUG mode if it was active. +# ifdef NDEBUG_ACTIVE +# define NDEBUG +# undef NDEBUG_ACTIVE +# endif +#else +// Include the assert.h header file using whatever the +// current definition of NDEBUG is. 
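// For reference, the complete idiom used by this header keeps assert() active
// even in builds that define NDEBUG. Sketched in full (the opening half of the
// #ifdef block is assumed to follow the standard pattern):
//
//   #ifdef NDEBUG               // asserts disabled (e.g. a Release build)
//   #  undef  NDEBUG            // re-enable them just for <assert.h>
//   #  define NDEBUG_ACTIVE
//   #  include <assert.h>
//      (restore NDEBUG, as done in the block above)
//   #else
//   #  include <assert.h>       // NDEBUG not set; nothing to juggle
//   #endif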
+# include +#endif + +# include +# include +# include + + +// ---------------------------------------------------------------------------- + + +namespace rcm { + +const unsigned int BLOCK_SIZE = 512; + +const unsigned int MAX_GRID_DIMENSION = 32768; + +inline +void kernelConfigAdjust(int &numThreads, int &numBlockX, const int numThreadsMax) { + if (numThreads > numThreadsMax) { + numBlockX = (numThreads + numThreadsMax - 1) / numThreadsMax; + numThreads = numThreadsMax; + } +} + +inline +void kernelConfigAdjust(int &numThreads, int &numBlockX, int &numBlockY, const int numThreadsMax, const int numBlockXMax) { + kernelConfigAdjust(numThreads, numBlockX, numThreadsMax); + if (numBlockX > numBlockXMax) { + numBlockY = (numBlockX + numBlockXMax - 1) / numBlockXMax; + numBlockX = numBlockXMax; + } +} + +template +class ManagedVector { + T* m_p_array; + size_t m_size; + +public: + typedef typename thrust::device_ptr iterator; + + ManagedVector(): m_p_array(0), m_size(0) {} + ManagedVector(size_t n): m_size(n) { + cudaMallocManaged(&m_p_array, sizeof(T) * n); + } + ManagedVector(size_t n, T val): m_size(n) { + cudaMallocManaged(&m_p_array, sizeof(T) * n); + thrust::fill(thrust::cuda::par, m_p_array, m_p_array + n, val); + cudaDeviceSynchronize(); + } + ManagedVector(const ManagedVector &a): m_size(a.m_size) { + cudaMallocManaged(&m_p_array, sizeof(T) * a.m_size); + thrust::copy(thrust::cuda::par, a.m_p_array, a.m_p_array + a.m_size, m_p_array); + cudaDeviceSynchronize(); + } + ~ManagedVector() {cudaFree(m_p_array);} + + ManagedVector& operator=(const ManagedVector &a) { + if (m_size < a.m_size) { + m_size = a.m_size; + cudaFree(m_p_array); + cudaMallocManaged(&m_p_array, sizeof(T) * a.m_size); + thrust::copy(thrust::cuda::par, a.m_p_array, a.m_p_array + a.m_size, m_p_array); + cudaDeviceSynchronize(); + } else { + m_size = a.m_size; + thrust::copy(thrust::cuda::par, a.m_p_array, a.m_p_array + a.m_size, m_p_array); + cudaDeviceSynchronize(); + } + + return *this; + } + + thrust::device_ptr begin() const {return thrust::device_pointer_cast(&m_p_array[0]);} + thrust::device_ptr end() const {return thrust::device_pointer_cast(&m_p_array[m_size]);} + + T& operator[](size_t n) {return m_p_array[n];} + const T& operator[](size_t n) const {return m_p_array[n];} + + size_t size() const {return m_size;} + + void resize(size_t n) { + if (m_size >= n) m_size = n; + else { + T *p_tmp; + cudaMallocManaged(&p_tmp, sizeof(T) * n); + + if (m_size > 0) { + thrust::copy(thrust::cuda::par, m_p_array, m_p_array + m_size, p_tmp); + cudaDeviceSynchronize(); + } + + m_size = n; + cudaFree(m_p_array); + + m_p_array = p_tmp; + } + } + +}; + +// ----------------------------------------------------------------------------- +// Convert matrix from COO to CSR format +// ----------------------------------------------------------------------------- +template +void coo2csr(const int n_row, + const int n_col, + const int nnz, + const int Ai[], + const int Aj[], + const double Ax[], + Ivec& Bp, + Ivec& Bj, + Rvec& Bx) +{ + //compute number of non-zero entries per row of A + for (int i = 0; i <= n_row; i++) + Bp[i] = 0; + + for (int i = 0; i < nnz; i++) + Bp[Ai[i]]++; + + //cumsum the nnz per row to get Bp[] + for(int i = 0, cumsum = 0; i < n_row; i++) { + int temp = Bp[i]; + Bp[i] = cumsum; + cumsum += temp; + } + Bp[n_row] = nnz; + + //write Aj,Ax into Bj,Bx + for(int i = 0; i < nnz; i++) { + int row = Ai[i]; + int dest = Bp[row]; + + Bj[dest] = Aj[i]; + Bx[dest] = Ax[i]; + + Bp[row]++; + } + + for(int i = 0, last = 0; i <= 
n_row; i++) { + int temp = Bp[i]; + Bp[i] = last; + last = temp; + } +} + + +} // namespace rcm + + +#endif diff --git a/rcm/device/kernels.cuh b/rcm/device/kernels.cuh new file mode 100644 index 0000000..8866945 --- /dev/null +++ b/rcm/device/kernels.cuh @@ -0,0 +1,57 @@ +#ifndef KERNELS_CUH +#define KERNELS_CUH + +namespace rcm { + +namespace device { + +__global__ void generalToSymmetric(int nnz, + const int * row_indices, + const int * column_indices, + int * new_row_indices, + int * new_column_indices) +{ + int tid = threadIdx.x + (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x; + + if (tid >= nnz) return; + + new_row_indices[tid << 1] = row_indices[tid]; + new_column_indices[tid << 1] = column_indices[tid]; + new_row_indices[(tid << 1) + 1] = column_indices[tid]; + new_column_indices[(tid << 1) + 1] = row_indices[tid]; +} + +__global__ void achieveLevels(int N, + const int* row_offsets, + const int* column_indices, + bool* frontier, + int* visited, + int* updated_by, + int* levels) +{ + int bid = blockIdx.x + blockIdx.y * gridDim.x; + + if (bid >= N) return; + + if (!frontier[bid]) return; + + if (threadIdx.x == 0) + frontier[bid] = false; + + int start_idx = row_offsets[bid], end_idx = row_offsets[bid + 1]; + int cur_cost = levels[bid]; + + for (int tid = start_idx + threadIdx.x; tid < end_idx; tid += blockDim.x) { + int column = column_indices[tid]; + if (visited[column]) continue; + visited[column] = true; + frontier[column] = true; + updated_by[column] = bid + 1; + levels[column] = cur_cost + 1; + } +} + +} // namespace device +} // namespace rcm + +#endif diff --git a/rcm/exception.h b/rcm/exception.h new file mode 100644 index 0000000..3454a00 --- /dev/null +++ b/rcm/exception.h @@ -0,0 +1,41 @@ +#ifndef EXCEPTION_H +#define EXCEPTION_H + +#include +#include + +namespace rcm { + +class system_error : public std::runtime_error +{ +public: + enum Reason + { + Negative_MC64_weight = -1, + Matrix_singular = -2 + }; + + system_error(Reason reason, + const std::string& what_arg) + : std::runtime_error(what_arg), + m_reason(reason) + {} + + system_error(Reason reason, + const char* what_arg) + : std::runtime_error(what_arg), + m_reason(reason) + {} + + virtual ~system_error() throw() {} + + Reason reason() const {return m_reason;} + +private: + Reason m_reason; +}; + +} // namespace rcm + + +#endif diff --git a/rcm/rcm_um.h b/rcm/rcm_um.h new file mode 100644 index 0000000..14d2f5f --- /dev/null +++ b/rcm/rcm_um.h @@ -0,0 +1,266 @@ +#ifndef RCM_UM_H +#define RCM_UM_H + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "mm_io/mm_io.h" +} + +namespace rcm { + +class RCM_UM +{ +private: + typedef ManagedVector IntVector; + typedef ManagedVector DoubleVector; + typedef ManagedVector BoolVector; + + typedef typename thrust::tuple IntTuple; + typedef typename thrust::zip_iterator NodeIterator; + + IntVector m_row_offsets; + IntVector m_column_indices; + DoubleVector m_values; + + IntVector m_perm; + int m_half_bandwidth; + int m_half_bandwidth_original; + + size_t m_n; + size_t m_nnz; + + + template + static void offsets_to_indices(const IVector& offsets, IVector& indices) + { + // convert compressed row offsets into uncompressed row indices + thrust::fill(indices.begin(), indices.end(), 0); + thrust::scatter( thrust::counting_iterator(0), + thrust::counting_iterator(offsets.size()-1), + offsets.begin(), + indices.begin()); + 
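		// At this point 'indices' holds each row's index only at that row's
		// first slot (everything else is still zero); the inclusive max-scan
		// below propagates those markers forward, so every nonzero ends up
		// labelled with its row index (CSR offsets expanded back to COO rows).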
thrust::inclusive_scan(indices.begin(), indices.end(), indices.begin(), thrust::maximum()); + } + + template + static void indices_to_offsets(const IVector& indices, IVector& offsets) + { + // convert uncompressed row indices into compressed row offsets + thrust::lower_bound(indices.begin(), + indices.end(), + thrust::counting_iterator(0), + thrust::counting_iterator(offsets.size()), + offsets.begin()); + } + +public: + RCM_UM(const IntVector& row_offsets, + const IntVector& column_indices, + const DoubleVector& values) + : m_row_offsets(row_offsets), + m_column_indices(column_indices), + m_values(values) + { + size_t n = row_offsets.size() - 1; + m_perm.resize(n); + m_n = n; + m_nnz = m_values.size(); + } + + ~RCM_UM() {} + + void execute(); + + int getHalfBandwidth() const {return m_half_bandwidth;} + int getHalfBandwidthOriginal() const {return m_half_bandwidth_original;} + + struct Difference: public thrust::binary_function + { + inline + __host__ __device__ + int operator() (const int &a, const int &b) const { + return abs(a-b); + } + }; + + struct ExtendedDifference: public thrust::binary_function + { + int *m_perm; + + ExtendedDifference(int *perm): m_perm(perm) {} + inline + __host__ __device__ + int operator() (const int &a, const int &b) const { + return abs(m_perm[a]-m_perm[b]); + } + }; + + struct TupleCompare + { + inline + __host__ __device__ + bool operator() (IntTuple a, IntTuple b) const + { + int a_level = thrust::get<0>(a), b_level = thrust::get<0>(b); + if (a_level != b_level) return a_level < b_level; + int a_updated_by = thrust::get<1>(a), b_updated_by = thrust::get<1>(b); + if (a_updated_by != b_updated_by) return a_updated_by < b_updated_by; + return thrust::get<2>(a) < thrust::get<2>(b); + } + }; +}; + +void +RCM_UM::execute() +{ + BoolVector frontier(m_n); + IntVector visited(m_n); + BoolVector tried(m_n, false); + + IntVector updated_by(m_n); + IntVector levels(m_n); + + const int ITER_COUNT = 5; + + IntVector row_indices(m_nnz); + + IntVector tmp_row_offsets(m_n + 1); + IntVector tmp_row_indices(m_nnz << 1); + IntVector tmp_column_indices(m_nnz << 1); + IntVector degrees(m_n + 1); + IntVector tmp_degrees(m_n); + + offsets_to_indices(m_row_offsets, row_indices); + + { + int numThreads = m_nnz, numBlockX = 1, numBlockY = 1; + + kernelConfigAdjust(numThreads, numBlockX, numBlockY, BLOCK_SIZE, MAX_GRID_DIMENSION); + dim3 grids(numBlockX, numBlockY); + + const int* p_row_indices = thrust::raw_pointer_cast(&row_indices[0]); + const int* p_column_indices = thrust::raw_pointer_cast(&m_column_indices[0]); + int * p_new_row_indices = thrust::raw_pointer_cast(&tmp_row_indices[0]); + int * p_new_column_indices = thrust::raw_pointer_cast(&tmp_column_indices[0]); + device::generalToSymmetric<<>>(m_nnz, p_row_indices, p_column_indices, p_new_row_indices, p_new_column_indices); + + thrust::sort_by_key(tmp_row_indices.begin(), tmp_row_indices.end(), tmp_column_indices.begin()); + indices_to_offsets(tmp_row_indices, tmp_row_offsets); + thrust::adjacent_difference(tmp_row_offsets.begin(), tmp_row_offsets.end(), degrees.begin()); + + m_half_bandwidth = m_half_bandwidth_original = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum(), Difference()); + + thrust::sequence(m_perm.begin(), m_perm.end()); + cudaDeviceSynchronize(); + } + + IntVector tmp_reordering(m_n); + IntVector tmp_perm(m_n); + + for (int i = 0; i < ITER_COUNT; i++) { + int S = 0; + + if (i > 0) { + while(tried[S]) + S = rand() % m_n; + } + + tried[S] = true; + + 
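		// One reordering attempt, rooted at node S:
		//   1. A level-synchronous BFS (device::achieveLevels) assigns every
		//      node a level and records the node that discovered it
		//      (updated_by); if the frontier empties before all nodes are
		//      visited, the search restarts from the first unvisited node.
		//   2. Nodes are then sorted by (level, updated_by, degree), i.e. the
		//      Cuthill-McKee ordering, with tmp_reordering carrying node ids.
		//   3. Scattering a counting sequence through tmp_reordering inverts
		//      the ordering into a permutation, whose half-bandwidth is
		//      measured; the best permutation over ITER_COUNT starts is kept.
		//      (Reversing the ordering would not change the half-bandwidth,
		//      so the classic "reverse" step is skipped.)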
+ thrust::fill(frontier.begin(), frontier.end(), false); + thrust::fill(visited.begin(), visited.end(), 0); + thrust::sequence(tmp_reordering.begin(), tmp_reordering.end()); + cudaDeviceSynchronize(); + + frontier[S] = true; + visited[S] = 1; + levels[S] = 0; + updated_by[S] = -1; + + int last = 0; + + + for (int l = 0; l < m_n; l ++) + { + int sum = thrust::reduce(visited.begin(), visited.end()); + + if (sum >= m_n) break; + + sum = thrust::reduce(frontier.begin(), frontier.end()); + + if (sum == 0) { + for (int j = last; j < m_n; j++) + if (!visited[j]) { + visited[j] = 1; + frontier[j] = true; + levels[j] = l; + updated_by[j] = -1; + last = j; + tried[j] = true; + break; + } + } + + int numBlockX = m_n, numBlockY = 1; + kernelConfigAdjust(numBlockX, numBlockY, MAX_GRID_DIMENSION); + dim3 grids(numBlockX, numBlockY); + + const int *p_row_offsets = thrust::raw_pointer_cast(&tmp_row_offsets[0]); + const int *p_column_indices = thrust::raw_pointer_cast(&tmp_column_indices[0]); + bool * p_frontier = thrust::raw_pointer_cast(&frontier[0]); + int * p_visited = thrust::raw_pointer_cast(&visited[0]); + int * p_updated_by = thrust::raw_pointer_cast(&updated_by[0]); + int * p_levels = thrust::raw_pointer_cast(&levels[0]); + + device::achieveLevels<<>>(m_n, p_row_offsets, p_column_indices, p_frontier, p_visited, p_updated_by, p_levels); + cudaDeviceSynchronize(); + } + + thrust::copy(degrees.begin() + 1, degrees.end(), tmp_degrees.begin()); + + + thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(levels.begin(), updated_by.begin(), tmp_degrees.begin(), tmp_reordering.begin() )), + thrust::make_zip_iterator(thrust::make_tuple(levels.end(), updated_by.end(), tmp_degrees.end(), tmp_reordering.end())), + TupleCompare() + ); + + + thrust::scatter(thrust::make_counting_iterator(0), + thrust::make_counting_iterator((int)(m_n)), + tmp_reordering.begin(), + tmp_perm.begin()); + + int *p_perm = thrust::raw_pointer_cast(&tmp_perm[0]); + int tmp_half_bandwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum(), ExtendedDifference(p_perm)); + + if (tmp_half_bandwidth < m_half_bandwidth) { + m_half_bandwidth = tmp_half_bandwidth; + m_perm = tmp_perm; + } + + cudaDeviceSynchronize(); + + } + +} + +} // end namespace rcm + +#endif diff --git a/rcm/timer.h b/rcm/timer.h new file mode 100644 index 0000000..153fc46 --- /dev/null +++ b/rcm/timer.h @@ -0,0 +1,126 @@ +/** \file timer.h + * \brief CPU and GPU timer classes. + */ + +#ifndef TIMER_H +#define TIMER_H + +#include +#include + +#ifdef WIN32 +#include +#else +#include +#endif + + +namespace rcm { + + +/// Base timer class. +class Timer { +public: + virtual ~Timer() {} + virtual void Start()=0; + virtual void Stop()=0; + virtual double getElapsed()=0; +}; + + +/// GPU timer. +/** + * CUDA-based GPU timer. + */ +class GPUTimer : public Timer { +protected: + int gpu_idx; + cudaEvent_t timeStart; + cudaEvent_t timeEnd; +public: + GPUTimer(int g_idx = 0) { + gpu_idx = g_idx; + + cudaEventCreate(&timeStart); + cudaEventCreate(&timeEnd); + } + + virtual ~GPUTimer() { + cudaEventDestroy(timeStart); + cudaEventDestroy(timeEnd); + } + + virtual void Start() { + cudaEventRecord(timeStart, 0); + } + + virtual void Stop() { + cudaEventRecord(timeEnd, 0); + cudaEventSynchronize(timeEnd); + } + + virtual double getElapsed() { + float elapsed; + cudaEventElapsedTime(&elapsed, timeStart, timeEnd); + return elapsed; + } +}; + + +/// CPU timer. 
+/** + * CPU timer using the performance counter for WIN32 and + * gettimeofday() for Linux. + */ +#ifdef WIN32 + +class CPUTimer : public Timer +{ +public: + CPUTimer() + { + QueryPerformanceFrequency(&m_frequency); + } + ~CPUTimer() {} + + virtual void Start() {QueryPerformanceCounter(&m_start);} + virtual void Stop() {QueryPerformanceCounter(&m_stop);} + + virtual double getElapsed() { + return (m_stop.QuadPart - m_start.QuadPart) * 1000.0 / m_frequency.QuadPart; + } + +private: + LARGE_INTEGER m_frequency; + LARGE_INTEGER m_start; + LARGE_INTEGER m_stop; +}; + +#else // WIN32 + +class CPUTimer : public Timer { +protected: + timeval timeStart; + timeval timeEnd; +public: + virtual ~CPUTimer() {} + + virtual void Start() { + gettimeofday(&timeStart, 0); + } + + virtual void Stop() { + gettimeofday(&timeEnd, 0); + } + + virtual double getElapsed() { + return 1000.0 * (timeEnd.tv_sec - timeStart.tv_sec) + (timeEnd.tv_usec - timeStart.tv_usec) / 1000.0; + } +}; + +#endif // WIN32 + +} // namespace mc64 + + +#endif diff --git a/testing.cu b/testing.cu new file mode 100644 index 0000000..9d313fb --- /dev/null +++ b/testing.cu @@ -0,0 +1,126 @@ +#include +#include +#include +#include + +extern "C" { +#include "mm_io/mm_io.h" +} + +#include +#include +#include + +typedef typename rcm::ManagedVector IntVector; +typedef typename rcm::ManagedVector DoubleVector; + +enum TestColor { + COLOR_NO = 0, + COLOR_RED, + COLOR_GREEN +}; + +// ----------------------------------------------------------------------------- + +class OutputItem +{ + public: + OutputItem(std::ostream &o) : m_o(o), m_additional_item_count(19) {} + + int m_additional_item_count; + + template + void operator() (T item, TestColor c = COLOR_NO) + { + m_o << "\n"; + switch (c) + { + case COLOR_RED: + m_o << "

" << item << "

\n"; + break; + case COLOR_GREEN: + m_o << "

" << item << "

\n"; + break; + default: + m_o << "

" << item << "

\n"; + break; + } + m_o << "\n"; + } + + private: + std::ostream &m_o; +}; + +int main(int argc, char **argv) +{ + if (argc < 2) { + std::cout << "Usage:\n driver input_file" << std::endl; + return 1; + } + + // Read matrix from file (COO format) + MM_typecode matcode; + int M, N, nnz; + int* row_i; + int* col_j; + double* vals; + + int err = mm_read_mtx_crd(argv[1], &M, &N, &nnz, &row_i, &col_j, &vals, &matcode); + if (err != 0) { + std::cout << "error: " << err << std::endl; + return 1; + } + + // Switch to 0-based indices + for (int i = 0; i < nnz; i++) { + row_i[i]--; + col_j[i]--; + } + // Convert to CSR format and load into thrust vectors. + IntVector row_offsets(N + 1); + IntVector column_indices(nnz); + DoubleVector values(nnz); + + rcm::coo2csr(M, N, nnz, row_i, col_j, vals, row_offsets, column_indices, values); + + // Run the RCM algorithm + rcm::RCM_UM algo(row_offsets, column_indices, values); + OutputItem outputItem(std::cout); + + std::cout << "" << std::endl; + { + std::string fileMat(argv[1]); + int i; + for (i = fileMat.size()-1; i>=0 && fileMat[i] != '/' && fileMat[i] != '\\'; i--); + i++; + fileMat = fileMat.substr(i); + + size_t j = fileMat.rfind(".mtx"); + if (j != std::string::npos) + outputItem( fileMat.substr(0, j)); + else + outputItem( fileMat); + } + outputItem(N); + outputItem(nnz); + + rcm::CPUTimer cpu_timer; + try { + cpu_timer.Start(); + algo.execute(); + cpu_timer.Stop(); + } catch (const rcm::system_error& se) { + outputItem(""); + outputItem(""); + outputItem(""); + std::cout << "" << std::endl; + return 1; + } + outputItem(algo.getHalfBandwidthOriginal()); + outputItem(algo.getHalfBandwidth()); + outputItem(cpu_timer.getElapsed()); + std::cout << "" << std::endl; + + return 0; +}