diff --git a/.gitignore b/.gitignore index 27c6a0f5..ede69a7d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,14 @@ perf.* .#* \#*# .~lock.*# + +# vscode settings +.vscode/settings +.vscode/ +src/.vscode + +# profiling workspace +profiling/ + +# cmake build dir +build_cmake/ \ No newline at end of file diff --git a/CMakeConfig.h.in b/CMakeConfig.h.in new file mode 100644 index 00000000..20d82e74 --- /dev/null +++ b/CMakeConfig.h.in @@ -0,0 +1,87 @@ +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_ENDIAN_H 1 + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +#cmakedefine HAVE_FSEEKO 1 + +/* Define to 1 if you have the `getopt' function. */ +#cmakedefine HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getopt_long' function. */ +#cmakedefine HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_LIMITS_H 1 + +/* Define to 1 if you have the `memcpy' function. */ +#cmakedefine HAVE_MEMCPY 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +#cmakedefine HAVE_NDIR_H 1 + +/* Define to 1 if stdbool.h conforms to C99. */ +#cmakedefine HAVE_STDBOOL_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDIO_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strcasecmp' function. */ +#cmakedefine HAVE_STRCASECMP 1 + +/* Define to 1 if you have the `strchr' function. */ +#cmakedefine HAVE_STRCHR 1 + +/* Define to 1 if you have the `stricmp' function. */ +#cmakedefine HAVE_STRICMP 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STRING_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_SYS_DIR_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_SYS_NDIR_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_UNISTD_H 1 + +/* Define to 1 if all of the C90 standard headers exist (not just the ones + required in a freestanding environment). This macro is provided for + backward compatibility; new code need not use it. 
*/ +#cmakedefine STDC_HEADERS 1 + +#cmakedefine ENABLE_CUDA 1 + +/* Name of package */ +#define PACKAGE "@PROJECT_NAME@" + +/* Version number of package */ +#define VERSION "@PROJECT_VERSION@" diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..1f999e0e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,260 @@ +cmake_minimum_required(VERSION 3.23) + +project(par2cmdline VERSION 0.9.0 LANGUAGES CXX) + +# Optionally enable CUDA +option(ENABLE_CUDA "Enable CUDA" OFF) + +if (ENABLE_CUDA) + # Check if CUDA is available + include(CheckLanguage) + check_language(CUDA) + if (CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDA REQUIRED) + include_directories("${CUDA_INCLUDE_DIRS}") + else () + message(STATUS "No CUDA compiler found") + endif () +endif (ENABLE_CUDA) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CUDA_STANDARD_REQUIRED True) + +if(NOT (CMAKE_BUILD_TYPE OR DEFINED ENV{CMAKE_BUILD_TYPE})) + set(CMAKE_BUILD_TYPE Release) +endif(NOT (CMAKE_BUILD_TYPE OR DEFINED ENV{CMAKE_BUILD_TYPE})) +message(STATUS ${CMAKE_BUILD_TYPE}) + +# Check existing headers and generate config.h +# for multi-platform support +include(CheckIncludeFiles) +include(CheckSymbolExists) +include(CheckFunctionExists) + +check_include_files(dirent.h HAVE_DIRENT_H) +check_include_files(endian.h HAVE_ENDIAN_H) +check_include_files(getopt.h HAVE_GETOPT_H) +check_include_files(inttypes.h HAVE_INTTYPES_H) +check_include_files(limits.h HAVE_LIMITS_H) +check_include_files(ndir.h HAVE_NDIR_H) +check_include_files(stdbool.h HAVE_STDBOOL_H) +check_include_files(stdint.h HAVE_STDINT_H) +check_include_files(stdio.h HAVE_STDIO_H) +check_include_files(stdlib.h HAVE_STDLIB_H) +check_include_files(strings.h HAVE_STRINGS_H) +check_include_files(string.h HAVE_STRING_H) +check_include_files(sys/dir.h HAVE_SYS_DIR_H) +check_include_files(sys/ndir.h HAVE_SYS_NDIR_H) +check_include_files(sys/stat.h HAVE_SYS_STAT_H) +check_include_files(sys/types.h HAVE_SYS_TYPES_H) +check_include_files(unistd.h HAVE_UNISTD_H) + +check_include_files("stdlib.h;stdarg.h;string.h;float.h" StandardHeadersExist) +if(StandardHeadersExist) + + check_function_exists(memchr memchrExists) + if(memchrExists) + + check_symbol_exists(free stdlib.h freeExists) + if(freeExists) + + message(STATUS "ANSI C header files - found") + set(STDC_HEADERS 1 CACHE INTERNAL "System has ANSI C header files") + + endif(freeExists) + endif(memchrExists) +endif(StandardHeadersExist) + +check_function_exists(fseeko HAVE_FSEEKO) +check_function_exists(getopt HAVE_GETOPT) +check_function_exists(getopt_long HAVE_GETOPT_LONG) +check_function_exists(memcpy HAVE_MEMCPY) +check_function_exists(strcasecmp HAVE_STRCASECMP) +check_function_exists(strchr HAVE_STRCHR) +check_function_exists(stricmp HAVE_STRICMP) + +# replacement for AC_C_BIGENDIAN +include (TestBigEndian) +test_big_endian(WORDS_BIGENDIAN) + +# generate config.h +configure_file(CMakeConfig.h.in ${PROJECT_BINARY_DIR}/config.h) + +add_compile_options( + -Wall -pipe -fstack-protector-strong + # $<$:-O3> + # $<$:-O0> + # $<$:-g> +) + +# compile targets +set(CMAKE_STATIC_LIBRARY_PREFIX "") +set(LIBPAR2SRC + src/crc.cpp + src/creatorpacket.cpp + src/criticalpacket.cpp + src/datablock.cpp + src/descriptionpacket.cpp + src/diskfile.cpp + src/filechecksummer.cpp + src/galois.cpp + src/mainpacket.cpp + src/md5.cpp + src/par1fileformat.cpp + src/par1repairer.cpp + src/par1repairersourcefile.cpp + src/par2creator.cpp + src/par2creatorsourcefile.cpp + 
src/par2fileformat.cpp + src/par2repairer.cpp + src/par2repairersourcefile.cpp + src/recoverypacket.cpp + src/reedsolomon.cpp + src/verificationhashtable.cpp + src/verificationpacket.cpp + src/libpar2.cpp +) + +set(PAR2SRC + src/par2cmdline.cpp + src/commandline.cpp +) + +# add cuda source files if enabled cuda +if (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + set(LIBPAR2SRC + ${LIBPAR2SRC} + src/galois_cu.cu + src/par2creator.cu + src/reedsolomon.cu + ) + # set_source_files_properties(${PAR2SRC} PROPERTIES LANGUAGE CUDA) + # message(STATUS "${LIBPAR2SRC}") + +endif (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + +# build libpar2 +add_library(libpar2 STATIC ${LIBPAR2SRC}) + +# build par2 +add_executable(par2 ${PAR2SRC}) + +add_compile_definitions(HAVE_CONFIG_H) + +include_directories(${CMAKE_BINARY_DIR}) + +target_link_libraries(par2 PRIVATE libpar2) + +# If OpenMP is installed, link against openmp +find_package(OpenMP) +if (OPENMP_CXX_FOUND) + target_compile_options(libpar2 PRIVATE ${OpenMP_CXX_FLAGS}) + target_link_libraries(par2 PRIVATE OpenMP::OpenMP_CXX) + target_link_libraries(libpar2 PRIVATE OpenMP::OpenMP_CXX) +endif() + +if (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + set_target_properties(libpar2 PROPERTIES CUDA_ARCHITECTURES "all-major") +endif (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + +# =========================== +# Testing +enable_testing() + +# Build unit tests +add_executable(letype_test src/letype_test.cpp) +add_executable(crc_test src/crc_test.cpp src/crc.cpp) +add_executable(md5_test src/md5_test.cpp src/md5.cpp) +add_executable(diskfile_test src/diskfile_test.cpp src/diskfile.cpp) +add_executable(libpar2_test src/libpar2_test.cpp) +target_link_libraries(libpar2_test PRIVATE libpar2) + +add_executable(commandline_test src/commandline_test.cpp src/commandline.cpp) +target_link_libraries(commandline_test PRIVATE libpar2) + +add_executable(descriptionpacket_test src/descriptionpacket_test.cpp src/descriptionpacket.cpp) +target_link_libraries(descriptionpacket_test PRIVATE libpar2) + +add_executable(criticalpacket_test src/criticalpacket_test.cpp src/criticalpacket.cpp) +target_link_libraries(criticalpacket_test PRIVATE libpar2) + +add_executable(reedsolomon_test src/reedsolomon_test.cpp src/reedsolomon.cpp) +add_executable(galois_test src/galois_test.cpp src/galois.cpp) + +set(TEST_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tests) +set_target_properties( + letype_test + crc_test + md5_test + diskfile_test + libpar2_test + commandline_test + descriptionpacket_test + criticalpacket_test + reedsolomon_test + galois_test + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${TEST_OUTPUT_DIR} +) + +# Run tests +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests DESTINATION ${CMAKE_BINARY_DIR}) +find_program (BASH_PROGRAM bash) +if (BASH_PROGRAM) + + add_test(NAME test1 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test1) + add_test(NAME test2 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test2) + add_test(NAME test3 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test3) + add_test(NAME test4 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test4) + add_test(NAME test5 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test5) + add_test(NAME test6 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test6) + add_test(NAME test7 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test7) + add_test(NAME test8 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test8) + add_test(NAME test9 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test9) + 
add_test(NAME test10 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test10) + add_test(NAME test11 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test11) + add_test(NAME test12 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test12) + add_test(NAME test13 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test13) + add_test(NAME test14 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test14) + add_test(NAME test15 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test15) + add_test(NAME test16 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test16) + add_test(NAME test17 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test17) + add_test(NAME test18 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test18) + add_test(NAME test19 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test19) + add_test(NAME test20 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test20) + add_test(NAME test21 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test21) + add_test(NAME test22 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test22) + add_test(NAME test23 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test23) + add_test(NAME test24 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test24) + add_test(NAME test25 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test25) + add_test(NAME test26 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test26) + add_test(NAME test27 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test27) + add_test(NAME test28 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test28) + add_test(NAME unit_tests COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/unit_tests) + +endif (BASH_PROGRAM) + +# ========================== +# install +message(STATUS "Install prefix: ${CMAKE_INSTALL_PREFIX}") +install(TARGETS par2 + DESTINATION bin) + +add_custom_command( + TARGET par2 + POST_BUILD + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2create${CMAKE_EXECUTABLE_SUFFIX} + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2verify${CMAKE_EXECUTABLE_SUFFIX} + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2repair${CMAKE_EXECUTABLE_SUFFIX} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/par2create${CMAKE_EXECUTABLE_SUFFIX} + ${CMAKE_CURRENT_BINARY_DIR}/par2verify${CMAKE_EXECUTABLE_SUFFIX} + ${CMAKE_CURRENT_BINARY_DIR}/par2repair${CMAKE_EXECUTABLE_SUFFIX} + DESTINATION bin +) \ No newline at end of file diff --git a/README.md b/README.md index 36f2d4a2..c2fec960 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,16 @@ For macOS you can install llvm via homebrew to get OpenMP support. See *INSTALL* for full details on how to use the *configure* script. +## Compiling par2cmdline with CUDA (NVIDIA GPU) acceleration +To compile with CUDA option enabled on Linux and other Unix variantes, use the following commands: + + mkdir build_cmake + cd build_cmake + cmake -DENABLE_CUDA=ON .. + make + make check + make install + ## Using par2cmdline The command line parameters for par2cmdline are as follow: @@ -101,6 +111,7 @@ The command line parameters for par2cmdline are as follow: -n : Number of recovery files (don't use both -n and -l) -m : Memory (in MB) to use -t : Number of threads to use (Auto-detected) + -C : Use CUDA device to accelerate recovery files creation -v [-v] : Be more verbose -q [-q] : Be more quiet (-qq gives silence) -p : Purge backup files and par files on successful recovery or diff --git a/config.h.in b/config.h.in index d386e8cc..4caab69a 100644 --- a/config.h.in +++ b/config.h.in @@ -31,9 +31,6 @@ /* Define to 1 if you have the `memcpy' function. 
*/ #undef HAVE_MEMCPY -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - /* Define to 1 if you have the header file, and it defines `DIR'. */ #undef HAVE_NDIR_H @@ -105,7 +102,9 @@ /* Define to the version of this package. */ #undef PACKAGE_VERSION -/* Define to 1 if you have the ANSI C header files. */ +/* Define to 1 if all of the C90 standard headers exist (not just the ones + required in a freestanding environment). This macro is provided for + backward compatibility; new code need not use it. */ #undef STDC_HEADERS /* Version number of package */ @@ -123,11 +122,6 @@ # endif #endif -/* Enable large inode numbers on Mac OS X 10.5. */ -#ifndef _DARWIN_USE_64_BIT_INODE -# define _DARWIN_USE_64_BIT_INODE 1 -#endif - /* Number of bits in a file offset, on hosts where this is settable. */ #undef _FILE_OFFSET_BITS diff --git a/configure.ac b/configure.ac old mode 100755 new mode 100644 index 8d8150f2..932cb872 --- a/configure.ac +++ b/configure.ac @@ -22,8 +22,8 @@ dnl Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA dnl -*- Autoconf -*- dnl Process this file with autoconf to produce a configure script. -AC_PREREQ(2.62) -AC_INIT([par2cmdline], [0.8.1], [ike.devolder@gmail.com]) +AC_PREREQ([2.71]) +AC_INIT([par2cmdline],[0.8.1],[ike.devolder@gmail.com]) AC_CONFIG_SRCDIR([src/par2cmdline.cpp]) AC_CANONICAL_HOST @@ -48,7 +48,15 @@ AC_LANG(C++) dnl Checks for header files. AC_HEADER_DIRENT AC_HEADER_STDBOOL -AC_HEADER_STDC +m4_warn([obsolete], +[The preprocessor macro `STDC_HEADERS' is obsolete. + Except in unusual embedded environments, you can safely include all + ISO C90 headers unconditionally.])dnl +# Autoupdate added the next two lines to ensure that your configure +# script's behavior did not change. They are probably safe to remove. 
+AC_CHECK_INCLUDES_DEFAULT +AC_PROG_EGREP + AC_CHECK_HEADERS([stdio.h] [endian.h]) AC_CHECK_HEADERS([getopt.h] [limits.h]) diff --git a/src/commandline.cpp b/src/commandline.cpp index 6745bf30..12c59255 100644 --- a/src/commandline.cpp +++ b/src/commandline.cpp @@ -22,6 +22,11 @@ #include #include #include "commandline.h" +#ifdef ENABLE_CUDA + #include + #include + #include "helper_cuda.cuh" +#endif using namespace std; #ifdef _MSC_VER @@ -47,6 +52,9 @@ CommandLine::CommandLine(void) , nthreads(0) // 0 means use default number , filethreads( _FILE_THREADS ) // default from header file #endif +#ifdef ENABLE_CUDA +, useCuda(false) +#endif , parfilename() , rawfilenames() , extrafiles() @@ -136,6 +144,9 @@ void CommandLine::usage(void) " -l : Limit size of recovery files (don't use both -u and -l)\n" " -n : Number of recovery files (don't use both -n and -l)\n" " -R : Recurse into subdirectories\n" +#ifdef ENABLE_CUDA + " -C : Use CUDA device to accelerate recovery files creation\n" +#endif "\n"; cout << "Example:\n" @@ -206,20 +217,20 @@ bool CommandLine::ReadArgs(int argc, const char * const *argv) { if (argv[0] == string("-h") || argv[0] == string("--help")) { - usage(); - return true; + usage(); + return true; } else if (argv[0] == string("-V") || argv[0] == string("--version")) { - showversion(); - return true; + showversion(); + return true; } else if (argv[0] == string("-VV")) { - showversion(); - cout << endl; - banner(); - return true; + showversion(); + cout << endl; + banner(); + return true; } } } @@ -425,6 +436,18 @@ bool CommandLine::ReadArgs(int argc, const char * const *argv) } break; #endif +#ifdef ENABLE_CUDA + case 'C': + { + if (operation != opCreate) + { + cerr << "As of now, CUDA acceleration is only supported on creating." << endl; + return false; + } + useCuda = true; + } + break; +#endif case 'r': // Set the amount of redundancy required { @@ -932,10 +955,15 @@ bool CommandLine::CheckValuesAndSetDefaults() { noiselevel = nlNormal; } - // Default memorylimit of 128MB + // Default memorylimit of half physical memory if (memorylimit == 0) { u64 TotalPhysicalMemory = GetTotalPhysicalMemory(); +#ifdef ENABLE_CUDA + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, 0)); + u64 TotalVideoMemory = prop.totalGlobalMem; +#endif if (TotalPhysicalMemory == 0) { @@ -946,8 +974,12 @@ bool CommandLine::CheckValuesAndSetDefaults() { // Half of total physical memory memorylimit = (size_t)(TotalPhysicalMemory / 1048576 / 2); +#ifdef ENABLE_CUDA + // 3/4 of total vram or half of total ram, whichever is smaller. 
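+      // For example (illustrative numbers only): with 16 GiB of RAM and 8 GiB of VRAM,
+      // half of RAM is 8192 MB but 3/4 of VRAM is 6144 MB, so memorylimit becomes 6144 here.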
+ memorylimit = min(memorylimit, (size_t) (TotalVideoMemory * 3 / 4 / 1048576)); +#endif } - // convert to megabytes + // convert from megabytes memorylimit *= 1048576; if (noiselevel >= nlDebug) @@ -1143,11 +1175,11 @@ bool CommandLine::ComputeBlockSize() { u64 largestfilesize = 0; for (vector::const_iterator i=extrafiles.begin(); i!=extrafiles.end(); i++) { - u64 filesize = filesize_cache.get(*i); - if (filesize > largestfilesize) - { - largestfilesize = filesize; - } + u64 filesize = filesize_cache.get(*i); + if (filesize > largestfilesize) + { + largestfilesize = filesize; + } } blocksize = (largestfilesize + 3) & ~3; } diff --git a/src/commandline.h b/src/commandline.h index e505d187..f3c088f5 100644 --- a/src/commandline.h +++ b/src/commandline.h @@ -91,13 +91,13 @@ class CommandLine u32 GetFirstRecoveryBlock(void) const {return firstblock;} u32 GetRecoveryFileCount(void) const {return recoveryfilecount;} u32 GetRecoveryBlockCount(void) const {return recoveryblockcount;} - Scheme GetRecoveryFileScheme(void) const {return recoveryfilescheme;} + Scheme GetRecoveryFileScheme(void) const {return recoveryfilescheme;} size_t GetMemoryLimit(void) const {return memorylimit;} - NoiseLevel GetNoiseLevel(void) const {return noiselevel;} + NoiseLevel GetNoiseLevel(void) const {return noiselevel;} string GetParFilename(void) const {return parfilename;} string GetBasePath(void) const {return basepath;} - const vector& GetExtraFiles(void) const {return extrafiles;} + const vector& GetExtraFiles(void) const {return extrafiles;} bool GetPurgeFiles(void) const {return purgefiles;} bool GetRecursive(void) const {return recursive;} bool GetSkipData(void) const {return skipdata;} @@ -107,6 +107,10 @@ class CommandLine u32 GetFileThreads(void) {return filethreads;} #endif +#ifdef ENABLE_CUDA + bool GetUseCuda(void) const {return useCuda;} +#endif + static bool ComputeRecoveryBlockCount(u32 *recoveryblockcount, u32 sourceblockcount, @@ -153,6 +157,10 @@ class CommandLine u32 nthreads; // Default number of threads u32 filethreads; // Number of threads for file processing #endif + +#ifdef ENABLE_CUDA + bool useCuda; +#endif // NOTE: using the "-t" option to set the number of threads does not // end up here, but results in a direct call to "omp_set_num_threads" diff --git a/src/galois.h b/src/galois.h index a86c76ad..9980940c 100644 --- a/src/galois.h +++ b/src/galois.h @@ -101,6 +101,7 @@ class Galois Bits = GaloisTable::Bits, Count = GaloisTable::Count, Limit = GaloisTable::Limit, + Generator = generator, }; protected: diff --git a/src/galois_cu.cu b/src/galois_cu.cu new file mode 100644 index 00000000..1aa4dd97 --- /dev/null +++ b/src/galois_cu.cu @@ -0,0 +1,12 @@ +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif + +template<> +bool GaloisCu16::uploaded = false; \ No newline at end of file diff --git a/src/galois_cu.cuh b/src/galois_cu.cuh new file mode 100644 index 00000000..a5bd3aa3 --- /dev/null +++ b/src/galois_cu.cuh @@ -0,0 +1,285 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. 
+// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#pragma once + +// This source file defines the CUDA version of Galois object for carrying out +// arithmetic in GF(2^16) using the generator 0x1100B on CUDA device. + +// Also defined are the GaloisTable object (which contains log and +// anti log tables for use in multiplication and division), and +// the GaloisLongMultiplyTable object (which contains tables for +// carrying out multiplation of 16-bit galois numbers 8 bits at a time). + +// CUDA Device global galois log/antilog table object +template +__device__ GaloisTable *d_table; + +// For readability +#define D_TABLE d_table + + +template +class GaloisCu +{ +public: + typedef valuetype ValueType; + + // Basic constructors + __device__ __host__ GaloisCu(void) {}; + __device__ __host__ GaloisCu(ValueType v); + + // Construct from CPU Galois variable in the same field. + __device__ __host__ GaloisCu(const Galois g); + + // Copy and assignment + __device__ __host__ GaloisCu(const GaloisCu &right) {value = right.value;} + __device__ __host__ GaloisCu& operator = (const GaloisCu &right) { value = right.value; return *this;} + + // Addition + __device__ GaloisCu operator + (const GaloisCu &right) const { return (value ^ right.value); } + __device__ GaloisCu& operator += (const GaloisCu &right) { value ^= right.value; return *this;} + + // Subtraction + __device__ GaloisCu operator - (const GaloisCu &right) const { return (value ^ right.value); } + __device__ GaloisCu& operator -= (const GaloisCu &right) { value ^= right.value; return *this;} + + // Multiplication + __device__ GaloisCu operator * (const GaloisCu &right) const; + __device__ GaloisCu& operator *= (const GaloisCu &right); + + // Division + __device__ GaloisCu operator / (const GaloisCu &right) const; + __device__ GaloisCu& operator /= (const GaloisCu &right); + + // Power + __device__ GaloisCu pow(unsigned int right) const; + __device__ GaloisCu operator ^ (unsigned int right) const; + __device__ GaloisCu& operator ^= (unsigned int right); + + // Cast to value and value access + __device__ __host__ operator ValueType(void) const {return value;} + __device__ __host__ ValueType Value(void) const {return value;} + + // Direct log and antilog + __device__ ValueType Log(void) const; + __device__ ValueType ALog(void) const; + + // Upload Galois Table to CUDA device + static bool uploadTable(void) + { + if (uploaded) return true; + + GaloisTable table, *d; + + cudaErrchk( cudaMalloc((void**) &d, sizeof(GaloisTable)) ); + cudaErrchk( cudaMemcpy(d, &table, sizeof(GaloisTable), cudaMemcpyHostToDevice) ); + cudaErrchk( cudaMemcpyToSymbol(D_TABLE, &d, sizeof(d), 0, cudaMemcpyHostToDevice) ); + +#ifdef _DEBUG + printf("Copied Galois table to device.\n"); +#endif // _DEBUG + return true; + } + + 
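+  // Typical host-side usage (sketch): call GaloisCu16::uploadTable() once before launching any
+  // kernel that performs GaloisCu16 arithmetic, and pair it with freeTable() once no kernel needs the table.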
// Free Galois Table from CUDA device memory. + // To be called at the end of the program. + static void freeTable(void) + { + if (!uploaded) return; + cudaFree(D_TABLE); + uploaded = false; + } + + enum + { + Bits = GaloisTable::Bits, + Count = GaloisTable::Count, + Limit = GaloisTable::Limit, + }; + +protected: + ValueType value; + static bool uploaded; +}; + +template +__device__ __host__ inline GaloisCu::GaloisCu(typename GaloisCu::ValueType v) +{ + value = v; +} + +template +__device__ __host__ inline GaloisCu::GaloisCu(Galois g) +{ + value = g.Value; +} + +template +__device__ inline GaloisCu GaloisCu::operator * (const GaloisCu &right) const +{ + if(value == 0 || right.value == 0) return 0; + unsigned int sum = D_TABLE->log[value] + D_TABLE->log[right.value]; + if (sum >= Limit) + { + return D_TABLE->antilog[sum - Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator *= (const GaloisCu &right) +{ + if(value == 0 || right.value == 0) + { + value = 0; + } + else + { + unsigned int sum = D_TABLE->log[value] + D_TABLE->log[right.value]; + if (sum >= Limit) + { + value = D_TABLE->antilog[sum-Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + } + + return *this; +} + +template +__device__ inline GaloisCu GaloisCu::operator / (const GaloisCu &right) const +{ + if (value == 0) return 0; + + assert(right.value != 0); + if (right.value == 0) {return 0;} // Division by 0! + + int sum = D_TABLE->log[value] - D_TABLE->log[right.value]; + if (sum < 0) + { + return D_TABLE->antilog[sum+Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator /= (const GaloisCu &right) +{ + if (value == 0) return *this; + + assert(right.value); + if (right.value == 0) {return *this;} // Division by 0! 
+ + int sum = D_TABLE->log[value] - D_TABLE->log[right.value]; + if (sum < 0) + { + value = D_TABLE->antilog[sum+Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + + return *this; +} + +template +__device__ inline GaloisCu GaloisCu::pow(unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return D_TABLE->antilog[sum-Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu GaloisCu::operator ^ (unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return D_TABLE->antilog[sum-Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator ^= (unsigned int right) +{ + if (right == 0) {value = 1; return *this;} + if (value == 0) return *this; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + value = D_TABLE->antilog[sum-Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + + return *this; +} + +template +__device__ inline valuetype GaloisCu::Log(void) const +{ + return D_TABLE->log[value]; +} + +template +__device__ inline valuetype GaloisCu::ALog(void) const +{ + return D_TABLE->antilog[value]; +} + +typedef GaloisCu<8,0x11D,u8> GaloisCu8; +typedef GaloisCu<16,0x1100B,u16> GaloisCu16; diff --git a/src/galois_cu_test.cu b/src/galois_cu_test.cu new file mode 100644 index 00000000..5caaf3f3 --- /dev/null +++ b/src/galois_cu_test.cu @@ -0,0 +1,419 @@ +#include +#include +#include +#include +#include "libpar2.h" +#include "galois_cu.cuh" +#include "helper_cuda.cuh" +#include "../profiling/Timer.h" + +#define BLOCK_WIDTH 64 + +void TestMult(void); +void TestDiv(void); +void TestAdd(void); +void TestSub(void); +void CompareResults(const Galois16 *resCPU, const GaloisCu16 *resGPU, char op); + +__global__ +void callDevice( GaloisCu16 *a, GaloisCu16 *b ) +{ + printf( "On device\n" ); + + printf( "a * b is %hu\n", ( *a * *b ).Value() ); +} + +int main() +{ + GaloisCu16::uploadTable(); + + // u16 ua = 52235; + // u16 ub = 65521; + // GaloisCu16 da(ua), db(ub); + // Galois16 a(ua), b(ub); + // printf("%hu\n", (a * b).Value()); + // GaloisCu16 *dap, *dbp; + // cudaMalloc((void**) &dap, 2); + // cudaMalloc((void**) &dbp, 2); + // cudaMemcpy(dap, &da, 2, cudaMemcpyHostToDevice); + // cudaMemcpy(dbp, &db, 2, cudaMemcpyHostToDevice); + // callDevice<<<1, 1>>> (dap, dbp); + // cudaDeviceSynchronize(); + + + TestMult(); + printf("----------------------\n"); + TestDiv(); + printf("----------------------\n"); + TestAdd(); + printf("----------------------\n"); + TestSub(); + + return 0; +} + +void CompareResults(const Galois16 *resCPU, const GaloisCu16 *resGPU, char op) +{ + printf("Varifying results.\n"); + bool correct = true; + size_t resId = 0; + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + if ( resCPU[resId].Value() != resGPU[resId].Value() ) + { + correct = false; + printf("Result doesn't match! 
%u %c %u = %hu, but got %hu.\n", + i, op, j, resCPU[resId], resGPU[resId]); + } + ++resId; + } + } + + if (correct) + { + printf("Operation %c is correct!\n", op); + } +} + + +__global__ +void KerMult(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. + size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left * vars[ outRowIdx + i ]; + } + + +} + +void TestMult() +{ + Galois16 vals[ Galois16::Count ], *results, *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for multiplication result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + // results = ( Galois16* ) malloc(resultSz); + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate multiplication results on CPU. + // printf("Calculating reference results.\n"); + // { + // Timer timer("CPU-Serial"); + + // size_t resId = 0; + // for ( size_t i = 0; i < Galois16::Count; ++i ) + // { + // for ( size_t j = i; j < Galois16::Count; ++j ) + // { + // results[resId] = vals[i] * vals[j]; + // ++resId; + // } + // } + // } + // printf("Finished calculating reference results.\n"); + + // Calculate results using openmp + printf("Calculating OMP results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] * vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerMult<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '*'); + + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerDiv(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
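+  // The kernels emit a packed upper triangle of the Count x Count operand table: row r holds
+  // (Count - r) entries, so it starts at sum_{k<r} (Count - k) = r * (2*Count - r + 1) / 2, which is
+  // the ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2 expression computed just below.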
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left / vars[ outRowIdx + i ]; + } + +} + +void TestDiv() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] / vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerDiv<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '/'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerAdd(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left + vars[ outRowIdx + i ]; + } + +} + +void TestAdd() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] + vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerAdd<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '+'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerSub(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left - vars[ outRowIdx + i ]; + } + +} + +void TestSub() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] - vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerSub<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '-'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} \ No newline at end of file diff --git a/src/helper_cuda.cuh b/src/helper_cuda.cuh new file mode 100644 index 00000000..1a8b9985 --- /dev/null +++ b/src/helper_cuda.cuh @@ -0,0 +1,14 @@ +#pragma once +#include + +#define cudaErrchk(ans) { if ( !gpuAssert((ans), __FILE__, __LINE__) ) return false; } +inline bool gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + // if (abort) exit(code); + return false; + } + return true; +} \ No newline at end of file diff --git a/src/libpar2.cpp b/src/libpar2.cpp index bed2e535..0c68c434 100644 --- a/src/libpar2.cpp +++ b/src/libpar2.cpp @@ -27,6 +27,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const string &parfilename, const vector &extrafiles, @@ -44,6 +47,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP nthreads, filethreads, +#endif +#ifdef ENABLE_CUDA + useCuda, #endif parfilename, extrafiles, diff --git a/src/libpar2.h b/src/libpar2.h index f0f4e109..e6d58380 100644 --- a/src/libpar2.h +++ b/src/libpar2.h @@ -144,6 +144,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const std::string 
&parfilename, const std::vector &extrafiles, diff --git a/src/libpar2internal.h b/src/libpar2internal.h index 699a5570..f3147d22 100644 --- a/src/libpar2internal.h +++ b/src/libpar2internal.h @@ -245,6 +245,11 @@ using namespace std; #include "par1repairersourcefile.h" #include "par1repairer.h" +#ifdef __NVCC__ + #include "helper_cuda.cuh" + #include "galois_cu.cuh" +#endif + // Heap checking #ifdef _MSC_VER #define _CRTDBG_MAP_ALLOC diff --git a/src/par2cmdline.cpp b/src/par2cmdline.cpp index 48b87da5..f808d478 100644 --- a/src/par2cmdline.cpp +++ b/src/par2cmdline.cpp @@ -67,6 +67,9 @@ int main(int argc, char *argv[]) #ifdef _OPENMP commandline->GetNumThreads(), commandline->GetFileThreads(), +#endif +#ifdef ENABLE_CUDA + commandline->GetUseCuda(), #endif commandline->GetParFilename(), commandline->GetExtraFiles(), diff --git a/src/par2creator.cpp b/src/par2creator.cpp index 25f43dd3..dc7a5d90 100644 --- a/src/par2creator.cpp +++ b/src/par2creator.cpp @@ -95,6 +95,9 @@ Result Par2Creator::Process( #ifdef _OPENMP const u32 nthreads, const u32 _filethreads, +#endif +#ifdef ENABLE_CUDA + const bool _useCuda, #endif const string &parfilename, const vector &_extrafiles, @@ -108,6 +111,10 @@ Result Par2Creator::Process( filethreads = _filethreads; #endif +#ifdef ENABLE_CUDA + useCuda = _useCuda; +#endif + // Get information from commandline blocksize = _blocksize; const vector extrafiles = _extrafiles; @@ -188,20 +195,31 @@ Result Par2Creator::Process( // Set the total amount of data to be processed. progress = 0; totaldata = blocksize * sourceblockcount * recoveryblockcount; + + #ifdef ENABLE_CUDA + if (!useCuda) { + #endif + // Start at an offset of 0 within a block. + u64 blockoffset = 0; + while (blockoffset < blocksize) // Continue until the end of the block. + { + // Work out how much data to process this time. + size_t blocklength = (size_t)min((u64)chunksize, blocksize-blockoffset); - // Start at an offset of 0 within a block. - u64 blockoffset = 0; - while (blockoffset < blocksize) // Continue until the end of the block. - { - // Work out how much data to process this time. - size_t blocklength = (size_t)min((u64)chunksize, blocksize-blockoffset); + // Read source data, process it through the RS matrix and write it to disk. + if (!ProcessData(blockoffset, blocklength)) + return eFileIOError; - // Read source data, process it through the RS matrix and write it to disk. - if (!ProcessData(blockoffset, blocklength)) + blockoffset += blocklength; + } + #ifdef ENABLE_CUDA + } else { + // Read source data, process it through the RS matrix using GPU and write it to disk. 
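+    // Unlike the CPU path above, the CUDA path loads every source block for the current chunk into
+    // one large input buffer (see AllocateBuffers) and loops over the chunk offsets inside ProcessDataCu().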
+ if (!ProcessDataCu()) { return eFileIOError; - - blockoffset += blocklength; + } } + #endif if (noiselevel > nlQuiet) sout << "Writing recovery packets" << endl; @@ -299,10 +317,27 @@ bool Par2Creator::CalculateProcessBlockSize(size_t memorylimit) else { // Would single pass processing use too much memory - if (blocksize * recoveryblockcount > memorylimit) + // recoveryblockcount for outputbuffer and 1 for inputbuffer + u64 memoryNeed = blocksize * (recoveryblockcount + 1); + +#ifdef ENABLE_CUDA + if (useCuda) { + memoryNeed = blocksize * (recoveryblockcount + sourceblockcount); + } +#endif + + if (memoryNeed > memorylimit) { // Pick a size that is small enough - chunksize = ~3 & (memorylimit / recoveryblockcount); +#ifdef ENABLE_CUDA + if (!useCuda) { +#endif + chunksize = ~3 & (memorylimit / (recoveryblockcount + 1)); +#ifdef ENABLE_CUDA + } else { + chunksize = ~3 & (memorylimit / (recoveryblockcount + sourceblockcount)); + } +#endif deferhashcomputation = false; } @@ -705,7 +740,16 @@ bool Par2Creator::InitialiseOutputFiles(const string &parfilename) // Allocate memory buffers for reading and writing data to disk. bool Par2Creator::AllocateBuffers(void) { - inputbuffer = new u8[chunksize]; +#ifdef ENABLE_CUDA + if (!useCuda) { +#endif + inputbuffer = new u8[chunksize]; +#ifdef ENABLE_CUDA + } else { + inputbuffer = new u8[chunksize * sourceblockcount]; + } +#endif + outputbuffer = new u8[chunksize * recoveryblockcount]; if (inputbuffer == NULL || outputbuffer == NULL) diff --git a/src/par2creator.cu b/src/par2creator.cu new file mode 100644 index 00000000..88c5f31b --- /dev/null +++ b/src/par2creator.cu @@ -0,0 +1,140 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2019 Michael D. Nahas +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif + +// ProcessData, but on CUDA device. +bool Par2Creator::ProcessDataCu() +{ + // Start at an offset of 0 within a block. + // Continue until the end of the block. + u64 blockOffset = 0; + while (blockOffset < blocksize) { + // Work out how much data to process this time. 
+ size_t blockLen = (size_t) min((u64) chunksize, blocksize - blockOffset); + + // Clear the output buffer + memset(outputbuffer, 0, chunksize * recoveryblockcount); + + // If we have deferred computation of the file hash and block crc and hashes + // sourcefile and sourceindex will be used to update them during + // the main recovery block computation + vector::iterator sourcefile = sourcefiles.begin(); + u32 sourceindex = 0; + + vector::iterator sourceblock; + u32 inputIdx; + + DiskFile *lastopenfile = NULL; + + // Read blockLen bytes of each input block into inputbuffer + for ((sourceblock=sourceblocks.begin()),(inputIdx=0); + sourceblock != sourceblocks.end(); + ++sourceblock, ++inputIdx) + { + // Are we reading from a new file? + if (lastopenfile != (*sourceblock).GetDiskFile()) + { + // Close the last file + if (lastopenfile != NULL) + { + lastopenfile->Close(); + } + + // Open the new file + lastopenfile = (*sourceblock).GetDiskFile(); + if (!lastopenfile->Open()) + { + return false; + } + } + + // Read data from the current input block + if (!sourceblock->ReadData(blockOffset, blockLen, &((u8*) inputbuffer)[blockLen * inputIdx])) + return false; + + if (deferhashcomputation) + { + assert(blockOffset == 0 && blockLen == blocksize); + assert(sourcefile != sourcefiles.end()); + + (*sourcefile)->UpdateHashes(sourceindex, &((u8*) inputbuffer)[blockLen * inputIdx], blockLen); + } + + // Work out which source file the next block belongs to + if (++sourceindex >= (*sourcefile)->BlockCount()) + { + sourceindex = 0; + ++sourcefile; + } + } + + // Close the last file + if (lastopenfile != NULL) + { + lastopenfile->Close(); + } + + // Process the data through the RS matrix on GPU + if (!rs.ProcessCu(blockLen, 0, sourceblockcount - 1, inputbuffer, 0, recoveryblockcount - 1, outputbuffer)) { + return false; + } + + if (noiselevel > nlQuiet) + { + // Update a progress indicator + u32 oldfraction = (u32)(1000 * progress / totaldata); + progress += blockLen * sourceblockcount * recoveryblockcount; + u32 newfraction = (u32)(1000 * progress / totaldata); + + if (oldfraction != newfraction) + { + sout << "Processing: " << newfraction/10 << '.' << newfraction%10 << "%\r" << flush; + } + } + + // For each output block + for (u32 outputblock=0; outputblock nlQuiet) + sout << "Wrote " << recoveryblockcount * blockLen << " bytes to disk" << endl; + + blockOffset += blockLen; + } + + return true; +} diff --git a/src/par2creator.h b/src/par2creator.h index a28f8cde..24ec5b5e 100644 --- a/src/par2creator.h +++ b/src/par2creator.h @@ -38,6 +38,9 @@ class Par2Creator #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const string &parfilename, const vector &extrafiles, @@ -83,6 +86,11 @@ class Par2Creator // Read source data, process it through the RS matrix and write it to disk. bool ProcessData(u64 blockoffset, size_t blocklength); +#ifdef ENABLE_CUDA + // ProcessData, but on CUDA device. + bool ProcessDataCu(void); +#endif + // Finish computation of the recovery packets and write the headers to disk. bool WriteRecoveryPacketHeaders(void); @@ -112,6 +120,10 @@ class Par2Creator static u32 filethreads; // Number of threads for file processing #endif +#ifdef ENABLE_CUDA + bool useCuda; +#endif + u64 blocksize; // The size of each block. size_t chunksize; // How much of each block will be processed at a // time (due to memory constraints). 
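The reedsolomon.cu file added below performs all of its GF(2^16) arithmetic through the log/antilog tables that GaloisCu uploads to the device. As background, the table-based multiplication it relies on can be sketched in plain host-side C++; this is a minimal illustration rather than code from the patch, the GF16Tables/mul names are hypothetical, and only the 0x1100B generator is taken from the Galois16/GaloisCu16 typedefs above.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Log/antilog tables for GF(2^16) with the PAR2 generator polynomial 0x1100B.
    struct GF16Tables {
      std::vector<uint16_t> log, antilog;
      GF16Tables() : log(65536), antilog(65536) {
        uint32_t b = 1;
        for (uint32_t i = 0; i < 65535; ++i) {  // 65535 == Limit == Count - 1
          log[b] = static_cast<uint16_t>(i);
          antilog[i] = static_cast<uint16_t>(b);
          b <<= 1;
          if (b & 0x10000) b ^= 0x1100B;        // reduce modulo the generator
        }
      }
      // a * c in GF(2^16): add the logs, wrap at Limit, take the antilog.
      uint16_t mul(uint16_t a, uint16_t c) const {
        if (a == 0 || c == 0) return 0;
        uint32_t sum = log[a] + log[c];
        return antilog[sum >= 65535 ? sum - 65535 : sum];
      }
    };

    int main() {
      GF16Tables gf;
      std::printf("%u\n", (unsigned)gf.mul(0x1234, 0x5678));  // arbitrary example operands
      return 0;
    }

Because a product reduces to two table lookups plus one addition of logs modulo 65535, uploading the tables to the device once (GaloisCu::uploadTable above) and reusing them from every kernel is what makes the GPU port practical.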
diff --git a/src/reedsolomon.cu b/src/reedsolomon.cu new file mode 100644 index 00000000..5e5c3a3f --- /dev/null +++ b/src/reedsolomon.cu @@ -0,0 +1,399 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include + +#include "libpar2internal.h" +#include "helper_cuda.cuh" + +__global__ void ProcessKer( const int batchSz, // size in number of word of each batch + const void * __restrict__ inputData, // size: chunkSz * inputCount + const void * __restrict__ bases, // size: sizeof(G) * inputCount + const int inputCount, + const int outputCount, // # of output blocks + void * __restrict__ outputBuf, // size: chunkSz * outputCount + const void * __restrict__ exponents // size: sizeof(G) * outputCount + ); + +__global__ void ReduceKer( const u32 * __restrict__ input, // Countains results from ProcessKer + u32 * __restrict__ output, // Output Location + const int outputCount, // Number of output blocks + const int tileCount // Number of input tiles + ); + +void CUDART_CB WriteOutputCB( void *data ); + +typedef struct +{ + size_t batchSz; + size_t wordPerChunk; + size_t wordPerBatch; + size_t batchIdx; + size_t outCount; + Galois16 *finalOutput; + Galois16 *batchOutput; +} workload; + +#define TBLOCK_SZ 512 +#define MAX_THREAD 1024 +#define TILE_WIDTH 16 +#define SHARED_MEM_SZ 32768 + +// Calculate the contribution of words contained in inputBuf to specified output blocks. +template <> +bool ReedSolomon::ProcessCu( const size_t size, // size of one chunk of data + const u32 inputIdxStart, + const u32 inputIdxEnd, + const void *inputBuf, // inputCount * size input1-input2-... + const u32 outputIdxStart, + const u32 outputIdxEnd, + void *outputBuf ) // outputCount * size +{ + // BUG: Doesn't really respect memory limit for VRAM. + // CUDA Device compatible Galois type. + typedef GaloisCu Gd; + if ( !Gd::uploadTable() ) return false; + cudaFuncSetCacheConfig(ProcessKer, cudaFuncCachePreferL1); + + const u32 inCount = inputIdxEnd - inputIdxStart + 1; + const u32 outCount = outputIdxEnd - outputIdxStart + 1; + const u32 wordPerChunk = size / sizeof(Gd); + // Batch need to be 4-byte aligned + const u32 wordPerBatch = (TBLOCK_SZ * SHARED_MEM_SZ / ( MAX_THREAD * TILE_WIDTH * sizeof(Gd) ) - 1) & ~1; + const u32 tileCount = inCount / TILE_WIDTH + ( inCount % TILE_WIDTH != 0 ); + const u32 batchCount = ceil( (float) wordPerChunk / wordPerBatch ); + + /* + * size: chunk size + * VRam footprint: (inCount + outCount) * size + (inputcount + outCount) * sizeof(g) + sizeof(g::GaloisTable) + * + * + * Assume the total VRam footprint can be fitted into device vram. 
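+   * (The BUG note above applies here: a stricter implementation would also cap the batch size
+   *  against the amount of device memory actually free at run time.)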
+ * Align chunkSz to 4 bytes + * + * + */ + + // Allocate GPU memory buffers + Gd *d_bases, *d_exponents; + + cudaErrchk( cudaMalloc( (void**) &d_bases, inCount * sizeof(Gd) ) ); + cudaErrchk( cudaMalloc( (void**) &d_exponents, outCount * sizeof(Gd) ) ); + + // Copy bases and exponents to GPU + u16 *baseOffset = &database[inputIdxStart]; + u16 *exponents = new u16[outCount]; + for ( u32 i = outputIdxStart; i <= outputIdxEnd; ++i ) { + exponents[i - outputIdxStart] = outputrows[i].exponent; + } + + cudaErrchk( cudaMemcpyAsync( d_bases, baseOffset, inCount * sizeof(Gd), cudaMemcpyHostToDevice ) ); + cudaErrchk( cudaMemcpyAsync( d_exponents, exponents, outCount * sizeof(u16), cudaMemcpyHostToDevice ) ); + cudaErrchk( cudaDeviceSynchronize() ); + delete [] exponents; + + // Set kernel launch parameters + dim3 dimGrid( tileCount ); + dim3 dimBlock( TBLOCK_SZ ); + + // Create stream + cudaStream_t *stream = new cudaStream_t[batchCount]; + for ( u32 i = 0; i < batchCount; ++i ) { + cudaErrchk( cudaStreamCreateWithFlags( &stream[i], cudaStreamNonBlocking ) ); + } + + G *batchInput, *batchOutput; + cudaErrchk( cudaMallocHost( (void**) &batchInput, inCount * wordPerBatch * sizeof(G) ) ); + cudaErrchk( cudaMallocHost( (void**) &batchOutput, outCount * wordPerBatch * sizeof(G) ) ); + + Gd **d_input = new Gd*[batchCount]; + Gd **d_intermediate = new Gd*[batchCount]; + Gd **d_output = new Gd*[batchCount]; + + cudaEvent_t written, upload; + cudaErrchk( cudaEventCreate( &written, cudaEventDisableTiming ) ); + cudaErrchk( cudaEventCreate( &upload, cudaEventBlockingSync ) ); + + // Concurrent kernel invoking + for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) { + int batchSz = wordPerBatch; + if ( batchIdx == batchCount - 1 ) { + batchSz = wordPerChunk - batchIdx * wordPerBatch; + } + int batchSzAligned = batchSz + (batchSz & 1); + + // Allocate memory + // Gd *d_input, *d_intermediate, *d_output; + cudaErrchk( cudaMallocAsync( (void**) &d_input[batchIdx], inCount * batchSz * sizeof(Gd), stream[batchIdx] ) ); + cudaErrchk( cudaMallocAsync( (void**) &d_intermediate[batchIdx], tileCount * batchSzAligned * outCount * sizeof(Gd), stream[batchIdx] ) ); + cudaErrchk( cudaMallocAsync( (void**) &d_output[batchIdx], outCount * batchSzAligned * sizeof(Gd), stream[batchIdx] ) ); + + // Wait until the last iteration has sent all input data to GPU. + cudaErrchk( cudaEventSynchronize( upload ) ); + + // Copy input data to GPU + for ( u32 i = 0; i < inCount; ++i ) { + void *inputBufOffset = (char*) inputBuf + i * size + batchIdx * wordPerBatch * sizeof(G); + void *batchInputOffset = (char*) batchInput + i * batchSz * sizeof(G); + memcpy( batchInputOffset, inputBufOffset, batchSz * sizeof(G) ); + } + + cudaErrchk( cudaMemcpyAsync( d_input[batchIdx], batchInput, inCount * batchSz * sizeof(G), cudaMemcpyHostToDevice, stream[batchIdx] ) ); + cudaErrchk( cudaEventRecord( upload, stream[batchIdx] ) ); + + // Lauch Compute Kernel + ProcessKer<<>> + ( batchSz, + d_input[batchIdx], + d_bases, + inCount, + outCount, + d_intermediate[batchIdx], + d_exponents + ); + + // Lauch Reduce Kernel + dim3 dimBlockReduce( 32 ); + dim3 dimGridReduce( ceil( outCount / (float) dimBlockReduce.x ), batchSzAligned / 2 ); + ReduceKer<<>> + ( (u32*) d_intermediate[batchIdx], + (u32*) d_output[batchIdx], + outCount, + tileCount + ); + + // Wait until output from the last iteration has already + // been written to actual output buffer. 
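+    // (batchInput and batchOutput are single pinned host staging buffers shared by every batch, so
+    // the 'upload' and 'written' events are what serialize their reuse across loop iterations.)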
+    cudaErrchk( cudaStreamWaitEvent( stream[batchIdx], written ) );
+
+    // Copy the result to the batch output buffer.
+    cudaErrchk( cudaMemcpyAsync( batchOutput,
+                                 d_output[batchIdx],
+                                 batchSzAligned * outCount * sizeof(Gd),
+                                 cudaMemcpyDeviceToHost,
+                                 stream[batchIdx]
+                                 ) );
+
+    // Schedule a host callback that copies the batch output buffer into the actual output buffer.
+    workload *work = new workload;
+    work->batchSz = batchSz;
+    work->wordPerChunk = wordPerChunk;
+    work->wordPerBatch = wordPerBatch;
+    work->batchIdx = batchIdx;
+    work->outCount = outCount;
+    work->finalOutput = (G*) outputBuf;
+    work->batchOutput = batchOutput;
+    cudaErrchk( cudaLaunchHostFunc( stream[batchIdx], WriteOutputCB, work ) );
+    cudaErrchk( cudaEventRecord( written, stream[batchIdx] ) );
+
+    cudaErrchk( cudaFreeAsync( d_input[batchIdx], stream[batchIdx] ) );
+    cudaErrchk( cudaFreeAsync( d_intermediate[batchIdx], stream[batchIdx] ) );
+    cudaErrchk( cudaFreeAsync( d_output[batchIdx], stream[batchIdx] ) );
+  }
+  cudaErrchk( cudaDeviceSynchronize() );
+
+  // Destroy the streams.
+  for ( u32 i = 0; i < batchCount; ++i ) {
+    cudaErrchk( cudaStreamDestroy( stream[i] ) );
+  }
+
+  cudaFree( d_bases );
+  cudaFree( d_exponents );
+  cudaFreeHost( batchInput );
+  cudaFreeHost( batchOutput );
+  delete[] stream;
+  delete[] d_input;
+  delete[] d_intermediate;
+  delete[] d_output;
+
+  return true;
+}
+
+__global__ void ProcessKer( const int batchSz,
+                            const void * __restrict__ inputData,
+                            const void * __restrict__ bases,
+                            const int inputCount,
+                            const int outputCount,
+                            void * __restrict__ outputBuf,
+                            const void * __restrict__ exponents
+                            )
+{
+  /*
+    inputData:  I_1,1   I_1,2   ...  ...  I_1,batchSz
+                I_2,1   ...     ...  ...  I_2,batchSz
+                .
+                .
+                .
+                I_TILE_WIDTH*gridDim.x,1  ...  ...  I_TILE_WIDTH*gridDim.x,batchSz
+
+    outputBuf **Transposed**:
+    <Tile 1>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    O_2,1   ...     ...    ...  O_2,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 2>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 3>
+    .
+    .
+    <Tile tileCount>
+  */
+
+  // Needs (batchSzAligned * TILE_WIDTH + TILE_WIDTH) * sizeof(G) bytes of dynamic shared memory.
+  typedef GaloisCu16 G;
+  extern __shared__ char sharedMem[];
+
+  const int batchSzAligned = batchSz + (batchSz & 1);
+  G *smInput = (G *) sharedMem;  // Shared memory input buffer
+  G *smBases = (G *) ( sharedMem + batchSzAligned * TILE_WIDTH * sizeof(G) );
+
+  const int wordPerInt = sizeof(u32) / sizeof(G);
+  const int intPerBatch = batchSzAligned / wordPerInt;
+  const int outBufWidth = outputCount * gridDim.x;
+  const int outBufRowPos = outputCount * blockIdx.x;
+
+  // Load the input data and bases into shared memory.
+  for ( int i = 0; i < TILE_WIDTH; ++i )
+  {
+    int inputIdx = blockIdx.x * TILE_WIDTH + i;
+    for ( int j = threadIdx.x; j < batchSzAligned; j += blockDim.x )
+    {
+      if ( inputIdx < inputCount && j < batchSz ) {
+        smInput[i * batchSzAligned + j] = ((G *) inputData)[inputIdx * batchSz + j];
+      } else {
+        smInput[i * batchSzAligned + j] = 0;
+      }
+    }
+  }
+
+  for ( int i = threadIdx.x; i < TILE_WIDTH; i += blockDim.x )
+  {
+    int inputIdx = blockIdx.x * TILE_WIDTH + i;
+    if ( inputIdx < inputCount ) {
+      smBases[i] = ((G *) bases)[inputIdx];
+    } else {
+      smBases[i] = 0;
+    }
+  }
+
+  __syncthreads();
+
+  G factors[TILE_WIDTH];
+  u16 exponent;
+  u32 acc = 0;
+  u32 words, res;
+  // Each thread computes one output block at a time.
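+  // Output blocks are strided across the thread block: thread t handles
+  // outputs t, t + blockDim.x, t + 2 * blockDim.x, ... for this tile.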
+  for ( int i = threadIdx.x; i < outputCount; i += blockDim.x )
+  {
+    // Calculate the factors of this tile for this output block.
+    exponent = ((G *) exponents)[i].Value();
+    // #pragma unroll
+    for ( int ii = 0; ii < TILE_WIDTH; ++ii ) {
+      factors[ii] = smBases[ii].pow(exponent);
+    }
+
+    // For each int in the batch
+    for ( int j = 0; j < intPerBatch; ++j )
+    {
+      acc = 0;
+      // For each input block in the tile, calculate the contribution of the corresponding 2 words (one u32).
+      for ( int k = 0; k < TILE_WIDTH; ++k )
+      {
+        // factor = smBases[k].pow( exponent );
+        words = ((u32 *) smInput)[k * intPerBatch + j];
+        res = 0;
+        for ( int w = 0; w < wordPerInt; ++w )
+        {
+          ((G *) &res)[w] = factors[k] * ((G *) &words)[w];
+        }
+        acc ^= res;
+      }
+
+      // Write the result to outputBuf.
+      ((u32 *) outputBuf)[j * outBufWidth + outBufRowPos + i] = acc;
+    }
+  }
+}
+
+__global__ void ReduceKer( const u32 * __restrict__ input,   // Contains results from ProcessKer
+                           u32 * __restrict__ output,        // Output location
+                           const int outputCount,            // Number of output blocks
+                           const int tileCount               // Number of input tiles
+                           )
+{
+  /*
+    input **Transposed**:
+    <Tile 1>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    O_2,1   ...     ...    ...  O_2,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 2>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 3>
+    .
+    .
+    <Tile tileCount>
+  */
+  // 2D grid: the block at grid position (x, y) reduces the y-th pair of words (one u32)
+  // of output blocks x*blockDim.x through (x+1)*blockDim.x - 1.
+  const int ox = blockIdx.x * blockDim.x + threadIdx.x;
+  const int row = blockIdx.y * outputCount * tileCount;
+
+  if (ox >= outputCount) return;
+
+  u32 acc = 0;
+  int inputIdx = ox;
+  for ( int i = 0; i < tileCount; ++i ) {
+    acc ^= input[row + inputIdx];
+    inputIdx += outputCount;
+  }
+
+  output[gridDim.y * ox + blockIdx.y] = acc;
+}
+
+void CUDART_CB WriteOutputCB( void *data ) {
+  // Write the output from the batch output buffer into the actual output buffer.
+  workload *work = (workload *) data;
+  size_t batchSzAligned = work->batchSz + (work->batchSz & 1);
+  for ( u32 i = 0; i < work->outCount; ++i ) {
+    memcpy( &work->finalOutput[work->wordPerChunk * i + work->wordPerBatch * work->batchIdx],
+            &work->batchOutput[batchSzAligned * i],
+            work->batchSz * sizeof(Galois16)
+            );
+  }
+  delete work;
+}
+
diff --git a/src/reedsolomon.h b/src/reedsolomon.h
index 8746db6e..57ab7850 100644
--- a/src/reedsolomon.h
+++ b/src/reedsolomon.h
@@ -3,6 +3,7 @@
 //
 // Copyright (c) 2003 Peter Brian Clements
 // Copyright (c) 2019 Michael D. Nahas
+// Copyright (c) 2022 Xiuyan Wu
 //
 // par2cmdline is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -67,6 +68,19 @@ class ReedSolomon
                const void *inputbuffer,  // Buffer containing input data
                u32 outputindex,          // The row in the RS matrix
                void *outputbuffer);      // Buffer containing output data
+
+#ifdef ENABLE_CUDA
+  // Process a block of data for output blocks [outputIdxStart, outputIdxEnd] on a CUDA device.
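+  // Both index ranges are inclusive. Returns false if the Galois log/antilog
+  // tables cannot be uploaded to the CUDA device.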
+  bool ProcessCu( const size_t size,         // The size of the block of data
+                  const u32 inputIdxStart,   // The index of the first input block to be processed
+                  const u32 inputIdxEnd,     // The index of the last input block to be processed
+                  const void *inputBuf,      // Buffer containing input data
+                  const u32 outputIdxStart,  // The row for the first output block to process in RS matrix
+                  const u32 outputIdxEnd,    // The row for the last output block to process in RS matrix
+                  void *outputBuf );         // Buffer containing output data
+#endif
+
+
 private:
   bool InternalProcess(const g &factor, size_t size, const void *inputbuffer, void *outputbuffer); // Optimization
diff --git a/src/reedsolomon_cu_test.cu b/src/reedsolomon_cu_test.cu
new file mode 100644
index 00000000..8db4ba8a
--- /dev/null
+++ b/src/reedsolomon_cu_test.cu
@@ -0,0 +1,6 @@
+#include "reedsolomon.cu"
+
+int main()
+{
+
+}
\ No newline at end of file