diff --git a/.gitignore b/.gitignore index 27c6a0f5..ede69a7d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,14 @@ perf.* .#* \#*# .~lock.*# + +# vscode settings +.vscode/settings +.vscode/ +src/.vscode + +# profiling workspace +profiling/ + +# cmake build dir +build_cmake/ \ No newline at end of file diff --git a/CMakeConfig.h.in b/CMakeConfig.h.in new file mode 100644 index 00000000..20d82e74 --- /dev/null +++ b/CMakeConfig.h.in @@ -0,0 +1,87 @@ +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_ENDIAN_H 1 + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +#cmakedefine HAVE_FSEEKO 1 + +/* Define to 1 if you have the `getopt' function. */ +#cmakedefine HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getopt_long' function. */ +#cmakedefine HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_LIMITS_H 1 + +/* Define to 1 if you have the `memcpy' function. */ +#cmakedefine HAVE_MEMCPY 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +#cmakedefine HAVE_NDIR_H 1 + +/* Define to 1 if stdbool.h conforms to C99. */ +#cmakedefine HAVE_STDBOOL_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDIO_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strcasecmp' function. */ +#cmakedefine HAVE_STRCASECMP 1 + +/* Define to 1 if you have the `strchr' function. */ +#cmakedefine HAVE_STRCHR 1 + +/* Define to 1 if you have the `stricmp' function. */ +#cmakedefine HAVE_STRICMP 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_STRING_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_SYS_DIR_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#cmakedefine HAVE_SYS_NDIR_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_UNISTD_H 1 + +/* Define to 1 if all of the C90 standard headers exist (not just the ones + required in a freestanding environment). This macro is provided for + backward compatibility; new code need not use it. 
*/ +#cmakedefine STDC_HEADERS 1 + +#cmakedefine ENABLE_CUDA 1 + +/* Name of package */ +#define PACKAGE "@PROJECT_NAME@" + +/* Version number of package */ +#define VERSION "@PROJECT_VERSION@" diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..1f999e0e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,260 @@ +cmake_minimum_required(VERSION 3.23) + +project(par2cmdline VERSION 0.9.0 LANGUAGES CXX) + +# Optionally enable CUDA +option(ENABLE_CUDA "Enable CUDA" OFF) + +if (ENABLE_CUDA) + # Check if CUDA is available + include(CheckLanguage) + check_language(CUDA) + if (CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDA REQUIRED) + include_directories("${CUDA_INCLUDE_DIRS}") + else () + message(STATUS "No CUDA compiler found") + endif () +endif (ENABLE_CUDA) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CUDA_STANDARD_REQUIRED True) + +if(NOT (CMAKE_BUILD_TYPE OR DEFINED ENV{CMAKE_BUILD_TYPE})) + set(CMAKE_BUILD_TYPE Release) +endif(NOT (CMAKE_BUILD_TYPE OR DEFINED ENV{CMAKE_BUILD_TYPE})) +message(STATUS ${CMAKE_BUILD_TYPE}) + +# Check existing headers and generate config.h +# for multi-platform support +include(CheckIncludeFiles) +include(CheckSymbolExists) +include(CheckFunctionExists) + +check_include_files(dirent.h HAVE_DIRENT_H) +check_include_files(endian.h HAVE_ENDIAN_H) +check_include_files(getopt.h HAVE_GETOPT_H) +check_include_files(inttypes.h HAVE_INTTYPES_H) +check_include_files(limits.h HAVE_LIMITS_H) +check_include_files(ndir.h HAVE_NDIR_H) +check_include_files(stdbool.h HAVE_STDBOOL_H) +check_include_files(stdint.h HAVE_STDINT_H) +check_include_files(stdio.h HAVE_STDIO_H) +check_include_files(stdlib.h HAVE_STDLIB_H) +check_include_files(strings.h HAVE_STRINGS_H) +check_include_files(string.h HAVE_STRING_H) +check_include_files(sys/dir.h HAVE_SYS_DIR_H) +check_include_files(sys/ndir.h HAVE_SYS_NDIR_H) +check_include_files(sys/stat.h HAVE_SYS_STAT_H) +check_include_files(sys/types.h HAVE_SYS_TYPES_H) +check_include_files(unistd.h HAVE_UNISTD_H) + +check_include_files("stdlib.h;stdarg.h;string.h;float.h" StandardHeadersExist) +if(StandardHeadersExist) + + check_function_exists(memchr memchrExists) + if(memchrExists) + + check_symbol_exists(free stdlib.h freeExists) + if(freeExists) + + message(STATUS "ANSI C header files - found") + set(STDC_HEADERS 1 CACHE INTERNAL "System has ANSI C header files") + + endif(freeExists) + endif(memchrExists) +endif(StandardHeadersExist) + +check_function_exists(fseeko HAVE_FSEEKO) +check_function_exists(getopt HAVE_GETOPT) +check_function_exists(getopt_long HAVE_GETOPT_LONG) +check_function_exists(memcpy HAVE_MEMCPY) +check_function_exists(strcasecmp HAVE_STRCASECMP) +check_function_exists(strchr HAVE_STRCHR) +check_function_exists(stricmp HAVE_STRICMP) + +# replacement for AC_C_BIGENDIAN +include (TestBigEndian) +test_big_endian(WORDS_BIGENDIAN) + +# generate config.h +configure_file(CMakeConfig.h.in ${PROJECT_BINARY_DIR}/config.h) + +add_compile_options( + -Wall -pipe -fstack-protector-strong + # $<$:-O3> + # $<$:-O0> + # $<$:-g> +) + +# compile targets +set(CMAKE_STATIC_LIBRARY_PREFIX "") +set(LIBPAR2SRC + src/crc.cpp + src/creatorpacket.cpp + src/criticalpacket.cpp + src/datablock.cpp + src/descriptionpacket.cpp + src/diskfile.cpp + src/filechecksummer.cpp + src/galois.cpp + src/mainpacket.cpp + src/md5.cpp + src/par1fileformat.cpp + src/par1repairer.cpp + src/par1repairersourcefile.cpp + src/par2creator.cpp + src/par2creatorsourcefile.cpp + 
src/par2fileformat.cpp + src/par2repairer.cpp + src/par2repairersourcefile.cpp + src/recoverypacket.cpp + src/reedsolomon.cpp + src/verificationhashtable.cpp + src/verificationpacket.cpp + src/libpar2.cpp +) + +set(PAR2SRC + src/par2cmdline.cpp + src/commandline.cpp +) + +# add cuda source files if enabled cuda +if (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + set(LIBPAR2SRC + ${LIBPAR2SRC} + src/galois_cu.cu + src/par2creator.cu + src/reedsolomon.cu + ) + # set_source_files_properties(${PAR2SRC} PROPERTIES LANGUAGE CUDA) + # message(STATUS "${LIBPAR2SRC}") + +endif (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + +# build libpar2 +add_library(libpar2 STATIC ${LIBPAR2SRC}) + +# build par2 +add_executable(par2 ${PAR2SRC}) + +add_compile_definitions(HAVE_CONFIG_H) + +include_directories(${CMAKE_BINARY_DIR}) + +target_link_libraries(par2 PRIVATE libpar2) + +# If OpenMP is installed, link against openmp +find_package(OpenMP) +if (OPENMP_CXX_FOUND) + target_compile_options(libpar2 PRIVATE ${OpenMP_CXX_FLAGS}) + target_link_libraries(par2 PRIVATE OpenMP::OpenMP_CXX) + target_link_libraries(libpar2 PRIVATE OpenMP::OpenMP_CXX) +endif() + +if (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + set_target_properties(libpar2 PROPERTIES CUDA_ARCHITECTURES "all-major") +endif (ENABLE_CUDA AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "") + +# =========================== +# Testing +enable_testing() + +# Build unit tests +add_executable(letype_test src/letype_test.cpp) +add_executable(crc_test src/crc_test.cpp src/crc.cpp) +add_executable(md5_test src/md5_test.cpp src/md5.cpp) +add_executable(diskfile_test src/diskfile_test.cpp src/diskfile.cpp) +add_executable(libpar2_test src/libpar2_test.cpp) +target_link_libraries(libpar2_test PRIVATE libpar2) + +add_executable(commandline_test src/commandline_test.cpp src/commandline.cpp) +target_link_libraries(commandline_test PRIVATE libpar2) + +add_executable(descriptionpacket_test src/descriptionpacket_test.cpp src/descriptionpacket.cpp) +target_link_libraries(descriptionpacket_test PRIVATE libpar2) + +add_executable(criticalpacket_test src/criticalpacket_test.cpp src/criticalpacket.cpp) +target_link_libraries(criticalpacket_test PRIVATE libpar2) + +add_executable(reedsolomon_test src/reedsolomon_test.cpp src/reedsolomon.cpp) +add_executable(galois_test src/galois_test.cpp src/galois.cpp) + +set(TEST_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tests) +set_target_properties( + letype_test + crc_test + md5_test + diskfile_test + libpar2_test + commandline_test + descriptionpacket_test + criticalpacket_test + reedsolomon_test + galois_test + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${TEST_OUTPUT_DIR} +) + +# Run tests +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests DESTINATION ${CMAKE_BINARY_DIR}) +find_program (BASH_PROGRAM bash) +if (BASH_PROGRAM) + + add_test(NAME test1 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test1) + add_test(NAME test2 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test2) + add_test(NAME test3 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test3) + add_test(NAME test4 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test4) + add_test(NAME test5 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test5) + add_test(NAME test6 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test6) + add_test(NAME test7 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test7) + add_test(NAME test8 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test8) + add_test(NAME test9 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test9) + 
add_test(NAME test10 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test10) + add_test(NAME test11 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test11) + add_test(NAME test12 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test12) + add_test(NAME test13 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test13) + add_test(NAME test14 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test14) + add_test(NAME test15 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test15) + add_test(NAME test16 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test16) + add_test(NAME test17 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test17) + add_test(NAME test18 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test18) + add_test(NAME test19 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test19) + add_test(NAME test20 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test20) + add_test(NAME test21 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test21) + add_test(NAME test22 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test22) + add_test(NAME test23 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test23) + add_test(NAME test24 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test24) + add_test(NAME test25 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test25) + add_test(NAME test26 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test26) + add_test(NAME test27 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test27) + add_test(NAME test28 COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/test28) + add_test(NAME unit_tests COMMAND ${BASH_PROGRAM} ${CMAKE_BINARY_DIR}/tests/unit_tests) + +endif (BASH_PROGRAM) + +# ========================== +# install +message(STATUS "Install prefix: ${CMAKE_INSTALL_PREFIX}") +install(TARGETS par2 + DESTINATION bin) + +add_custom_command( + TARGET par2 + POST_BUILD + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2create${CMAKE_EXECUTABLE_SUFFIX} + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2verify${CMAKE_EXECUTABLE_SUFFIX} + COMMAND ln;-sf;par2${CMAKE_EXECUTABLE_SUFFIX};par2repair${CMAKE_EXECUTABLE_SUFFIX} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/par2create${CMAKE_EXECUTABLE_SUFFIX} + ${CMAKE_CURRENT_BINARY_DIR}/par2verify${CMAKE_EXECUTABLE_SUFFIX} + ${CMAKE_CURRENT_BINARY_DIR}/par2repair${CMAKE_EXECUTABLE_SUFFIX} + DESTINATION bin +) \ No newline at end of file diff --git a/README.md b/README.md index 36f2d4a2..c2fec960 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,16 @@ For macOS you can install llvm via homebrew to get OpenMP support. See *INSTALL* for full details on how to use the *configure* script. +## Compiling par2cmdline with CUDA (NVIDIA GPU) acceleration +To compile with CUDA option enabled on Linux and other Unix variantes, use the following commands: + + mkdir build_cmake + cd build_cmake + cmake -DENABLE_CUDA=ON .. + make + make check + make install + ## Using par2cmdline The command line parameters for par2cmdline are as follow: @@ -101,6 +111,7 @@ The command line parameters for par2cmdline are as follow: -n : Number of recovery files (don't use both -n and -l) -m : Memory (in MB) to use -t : Number of threads to use (Auto-detected) + -C : Use CUDA device to accelerate recovery files creation -v [-v] : Be more verbose -q [-q] : Be more quiet (-qq gives silence) -p : Purge backup files and par files on successful recovery or diff --git a/config.h.in b/config.h.in index d386e8cc..4caab69a 100644 --- a/config.h.in +++ b/config.h.in @@ -31,9 +31,6 @@ /* Define to 1 if you have the `memcpy' function. 
*/ #undef HAVE_MEMCPY -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - /* Define to 1 if you have the header file, and it defines `DIR'. */ #undef HAVE_NDIR_H @@ -105,7 +102,9 @@ /* Define to the version of this package. */ #undef PACKAGE_VERSION -/* Define to 1 if you have the ANSI C header files. */ +/* Define to 1 if all of the C90 standard headers exist (not just the ones + required in a freestanding environment). This macro is provided for + backward compatibility; new code need not use it. */ #undef STDC_HEADERS /* Version number of package */ @@ -123,11 +122,6 @@ # endif #endif -/* Enable large inode numbers on Mac OS X 10.5. */ -#ifndef _DARWIN_USE_64_BIT_INODE -# define _DARWIN_USE_64_BIT_INODE 1 -#endif - /* Number of bits in a file offset, on hosts where this is settable. */ #undef _FILE_OFFSET_BITS diff --git a/configure.ac b/configure.ac old mode 100755 new mode 100644 index 8d8150f2..932cb872 --- a/configure.ac +++ b/configure.ac @@ -22,8 +22,8 @@ dnl Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA dnl -*- Autoconf -*- dnl Process this file with autoconf to produce a configure script. -AC_PREREQ(2.62) -AC_INIT([par2cmdline], [0.8.1], [ike.devolder@gmail.com]) +AC_PREREQ([2.71]) +AC_INIT([par2cmdline],[0.8.1],[ike.devolder@gmail.com]) AC_CONFIG_SRCDIR([src/par2cmdline.cpp]) AC_CANONICAL_HOST @@ -48,7 +48,15 @@ AC_LANG(C++) dnl Checks for header files. AC_HEADER_DIRENT AC_HEADER_STDBOOL -AC_HEADER_STDC +m4_warn([obsolete], +[The preprocessor macro `STDC_HEADERS' is obsolete. + Except in unusual embedded environments, you can safely include all + ISO C90 headers unconditionally.])dnl +# Autoupdate added the next two lines to ensure that your configure +# script's behavior did not change. They are probably safe to remove. 
+AC_CHECK_INCLUDES_DEFAULT +AC_PROG_EGREP + AC_CHECK_HEADERS([stdio.h] [endian.h]) AC_CHECK_HEADERS([getopt.h] [limits.h]) diff --git a/src/commandline.cpp b/src/commandline.cpp index 6745bf30..12c59255 100644 --- a/src/commandline.cpp +++ b/src/commandline.cpp @@ -22,6 +22,11 @@ #include #include #include "commandline.h" +#ifdef ENABLE_CUDA + #include + #include + #include "helper_cuda.cuh" +#endif using namespace std; #ifdef _MSC_VER @@ -47,6 +52,9 @@ CommandLine::CommandLine(void) , nthreads(0) // 0 means use default number , filethreads( _FILE_THREADS ) // default from header file #endif +#ifdef ENABLE_CUDA +, useCuda(false) +#endif , parfilename() , rawfilenames() , extrafiles() @@ -136,6 +144,9 @@ void CommandLine::usage(void) " -l : Limit size of recovery files (don't use both -u and -l)\n" " -n : Number of recovery files (don't use both -n and -l)\n" " -R : Recurse into subdirectories\n" +#ifdef ENABLE_CUDA + " -C : Use CUDA device to accelerate recovery files creation\n" +#endif "\n"; cout << "Example:\n" @@ -206,20 +217,20 @@ bool CommandLine::ReadArgs(int argc, const char * const *argv) { if (argv[0] == string("-h") || argv[0] == string("--help")) { - usage(); - return true; + usage(); + return true; } else if (argv[0] == string("-V") || argv[0] == string("--version")) { - showversion(); - return true; + showversion(); + return true; } else if (argv[0] == string("-VV")) { - showversion(); - cout << endl; - banner(); - return true; + showversion(); + cout << endl; + banner(); + return true; } } } @@ -425,6 +436,18 @@ bool CommandLine::ReadArgs(int argc, const char * const *argv) } break; #endif +#ifdef ENABLE_CUDA + case 'C': + { + if (operation != opCreate) + { + cerr << "As of now, CUDA acceleration is only supported on creating." << endl; + return false; + } + useCuda = true; + } + break; +#endif case 'r': // Set the amount of redundancy required { @@ -932,10 +955,15 @@ bool CommandLine::CheckValuesAndSetDefaults() { noiselevel = nlNormal; } - // Default memorylimit of 128MB + // Default memorylimit of half physical memory if (memorylimit == 0) { u64 TotalPhysicalMemory = GetTotalPhysicalMemory(); +#ifdef ENABLE_CUDA + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, 0)); + u64 TotalVideoMemory = prop.totalGlobalMem; +#endif if (TotalPhysicalMemory == 0) { @@ -946,8 +974,12 @@ bool CommandLine::CheckValuesAndSetDefaults() { // Half of total physical memory memorylimit = (size_t)(TotalPhysicalMemory / 1048576 / 2); +#ifdef ENABLE_CUDA + // 3/4 of total vram or half of total ram, whichever is smaller. 
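+      // For example (illustrative numbers only): with 16 GiB of RAM and 8 GiB of VRAM,
+      // half of RAM is 8192 MB but 3/4 of VRAM is 6144 MB, so memorylimit becomes 6144 here.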
+ memorylimit = min(memorylimit, (size_t) (TotalVideoMemory * 3 / 4 / 1048576)); +#endif } - // convert to megabytes + // convert from megabytes memorylimit *= 1048576; if (noiselevel >= nlDebug) @@ -1143,11 +1175,11 @@ bool CommandLine::ComputeBlockSize() { u64 largestfilesize = 0; for (vector::const_iterator i=extrafiles.begin(); i!=extrafiles.end(); i++) { - u64 filesize = filesize_cache.get(*i); - if (filesize > largestfilesize) - { - largestfilesize = filesize; - } + u64 filesize = filesize_cache.get(*i); + if (filesize > largestfilesize) + { + largestfilesize = filesize; + } } blocksize = (largestfilesize + 3) & ~3; } diff --git a/src/commandline.h b/src/commandline.h index e505d187..f3c088f5 100644 --- a/src/commandline.h +++ b/src/commandline.h @@ -91,13 +91,13 @@ class CommandLine u32 GetFirstRecoveryBlock(void) const {return firstblock;} u32 GetRecoveryFileCount(void) const {return recoveryfilecount;} u32 GetRecoveryBlockCount(void) const {return recoveryblockcount;} - Scheme GetRecoveryFileScheme(void) const {return recoveryfilescheme;} + Scheme GetRecoveryFileScheme(void) const {return recoveryfilescheme;} size_t GetMemoryLimit(void) const {return memorylimit;} - NoiseLevel GetNoiseLevel(void) const {return noiselevel;} + NoiseLevel GetNoiseLevel(void) const {return noiselevel;} string GetParFilename(void) const {return parfilename;} string GetBasePath(void) const {return basepath;} - const vector& GetExtraFiles(void) const {return extrafiles;} + const vector& GetExtraFiles(void) const {return extrafiles;} bool GetPurgeFiles(void) const {return purgefiles;} bool GetRecursive(void) const {return recursive;} bool GetSkipData(void) const {return skipdata;} @@ -107,6 +107,10 @@ class CommandLine u32 GetFileThreads(void) {return filethreads;} #endif +#ifdef ENABLE_CUDA + bool GetUseCuda(void) const {return useCuda;} +#endif + static bool ComputeRecoveryBlockCount(u32 *recoveryblockcount, u32 sourceblockcount, @@ -153,6 +157,10 @@ class CommandLine u32 nthreads; // Default number of threads u32 filethreads; // Number of threads for file processing #endif + +#ifdef ENABLE_CUDA + bool useCuda; +#endif // NOTE: using the "-t" option to set the number of threads does not // end up here, but results in a direct call to "omp_set_num_threads" diff --git a/src/galois.h b/src/galois.h index a86c76ad..9980940c 100644 --- a/src/galois.h +++ b/src/galois.h @@ -101,6 +101,7 @@ class Galois Bits = GaloisTable::Bits, Count = GaloisTable::Count, Limit = GaloisTable::Limit, + Generator = generator, }; protected: diff --git a/src/galois_cu.cu b/src/galois_cu.cu new file mode 100644 index 00000000..1aa4dd97 --- /dev/null +++ b/src/galois_cu.cu @@ -0,0 +1,12 @@ +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif + +template<> +bool GaloisCu16::uploaded = false; \ No newline at end of file diff --git a/src/galois_cu.cuh b/src/galois_cu.cuh new file mode 100644 index 00000000..a5bd3aa3 --- /dev/null +++ b/src/galois_cu.cuh @@ -0,0 +1,285 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. 
+// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#pragma once + +// This source file defines the CUDA version of Galois object for carrying out +// arithmetic in GF(2^16) using the generator 0x1100B on CUDA device. + +// Also defined are the GaloisTable object (which contains log and +// anti log tables for use in multiplication and division), and +// the GaloisLongMultiplyTable object (which contains tables for +// carrying out multiplation of 16-bit galois numbers 8 bits at a time). + +// CUDA Device global galois log/antilog table object +template +__device__ GaloisTable *d_table; + +// For readability +#define D_TABLE d_table + + +template +class GaloisCu +{ +public: + typedef valuetype ValueType; + + // Basic constructors + __device__ __host__ GaloisCu(void) {}; + __device__ __host__ GaloisCu(ValueType v); + + // Construct from CPU Galois variable in the same field. + __device__ __host__ GaloisCu(const Galois g); + + // Copy and assignment + __device__ __host__ GaloisCu(const GaloisCu &right) {value = right.value;} + __device__ __host__ GaloisCu& operator = (const GaloisCu &right) { value = right.value; return *this;} + + // Addition + __device__ GaloisCu operator + (const GaloisCu &right) const { return (value ^ right.value); } + __device__ GaloisCu& operator += (const GaloisCu &right) { value ^= right.value; return *this;} + + // Subtraction + __device__ GaloisCu operator - (const GaloisCu &right) const { return (value ^ right.value); } + __device__ GaloisCu& operator -= (const GaloisCu &right) { value ^= right.value; return *this;} + + // Multiplication + __device__ GaloisCu operator * (const GaloisCu &right) const; + __device__ GaloisCu& operator *= (const GaloisCu &right); + + // Division + __device__ GaloisCu operator / (const GaloisCu &right) const; + __device__ GaloisCu& operator /= (const GaloisCu &right); + + // Power + __device__ GaloisCu pow(unsigned int right) const; + __device__ GaloisCu operator ^ (unsigned int right) const; + __device__ GaloisCu& operator ^= (unsigned int right); + + // Cast to value and value access + __device__ __host__ operator ValueType(void) const {return value;} + __device__ __host__ ValueType Value(void) const {return value;} + + // Direct log and antilog + __device__ ValueType Log(void) const; + __device__ ValueType ALog(void) const; + + // Upload Galois Table to CUDA device + static bool uploadTable(void) + { + if (uploaded) return true; + + GaloisTable table, *d; + + cudaErrchk( cudaMalloc((void**) &d, sizeof(GaloisTable)) ); + cudaErrchk( cudaMemcpy(d, &table, sizeof(GaloisTable), cudaMemcpyHostToDevice) ); + cudaErrchk( cudaMemcpyToSymbol(D_TABLE, &d, sizeof(d), 0, cudaMemcpyHostToDevice) ); + +#ifdef _DEBUG + printf("Copied Galois table to device.\n"); +#endif // _DEBUG + return true; + } + + 
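+  // Typical host-side usage (sketch): call GaloisCu16::uploadTable() once before launching any
+  // kernel that performs GaloisCu16 arithmetic, and pair it with freeTable() once no kernel needs the table.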
// Free Galois Table from CUDA device memory. + // To be called at the end of the program. + static void freeTable(void) + { + if (!uploaded) return; + cudaFree(D_TABLE); + uploaded = false; + } + + enum + { + Bits = GaloisTable::Bits, + Count = GaloisTable::Count, + Limit = GaloisTable::Limit, + }; + +protected: + ValueType value; + static bool uploaded; +}; + +template +__device__ __host__ inline GaloisCu::GaloisCu(typename GaloisCu::ValueType v) +{ + value = v; +} + +template +__device__ __host__ inline GaloisCu::GaloisCu(Galois g) +{ + value = g.Value; +} + +template +__device__ inline GaloisCu GaloisCu::operator * (const GaloisCu &right) const +{ + if(value == 0 || right.value == 0) return 0; + unsigned int sum = D_TABLE->log[value] + D_TABLE->log[right.value]; + if (sum >= Limit) + { + return D_TABLE->antilog[sum - Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator *= (const GaloisCu &right) +{ + if(value == 0 || right.value == 0) + { + value = 0; + } + else + { + unsigned int sum = D_TABLE->log[value] + D_TABLE->log[right.value]; + if (sum >= Limit) + { + value = D_TABLE->antilog[sum-Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + } + + return *this; +} + +template +__device__ inline GaloisCu GaloisCu::operator / (const GaloisCu &right) const +{ + if (value == 0) return 0; + + assert(right.value != 0); + if (right.value == 0) {return 0;} // Division by 0! + + int sum = D_TABLE->log[value] - D_TABLE->log[right.value]; + if (sum < 0) + { + return D_TABLE->antilog[sum+Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator /= (const GaloisCu &right) +{ + if (value == 0) return *this; + + assert(right.value); + if (right.value == 0) {return *this;} // Division by 0! 
+ + int sum = D_TABLE->log[value] - D_TABLE->log[right.value]; + if (sum < 0) + { + value = D_TABLE->antilog[sum+Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + + return *this; +} + +template +__device__ inline GaloisCu GaloisCu::pow(unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return D_TABLE->antilog[sum-Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu GaloisCu::operator ^ (unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return D_TABLE->antilog[sum-Limit]; + } + else + { + return D_TABLE->antilog[sum]; + } +} + +template +__device__ inline GaloisCu& GaloisCu::operator ^= (unsigned int right) +{ + if (right == 0) {value = 1; return *this;} + if (value == 0) return *this; + + unsigned int sum = D_TABLE->log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + value = D_TABLE->antilog[sum-Limit]; + } + else + { + value = D_TABLE->antilog[sum]; + } + + return *this; +} + +template +__device__ inline valuetype GaloisCu::Log(void) const +{ + return D_TABLE->log[value]; +} + +template +__device__ inline valuetype GaloisCu::ALog(void) const +{ + return D_TABLE->antilog[value]; +} + +typedef GaloisCu<8,0x11D,u8> GaloisCu8; +typedef GaloisCu<16,0x1100B,u16> GaloisCu16; diff --git a/src/galois_cu_test.cu b/src/galois_cu_test.cu new file mode 100644 index 00000000..5caaf3f3 --- /dev/null +++ b/src/galois_cu_test.cu @@ -0,0 +1,419 @@ +#include +#include +#include +#include +#include "libpar2.h" +#include "galois_cu.cuh" +#include "helper_cuda.cuh" +#include "../profiling/Timer.h" + +#define BLOCK_WIDTH 64 + +void TestMult(void); +void TestDiv(void); +void TestAdd(void); +void TestSub(void); +void CompareResults(const Galois16 *resCPU, const GaloisCu16 *resGPU, char op); + +__global__ +void callDevice( GaloisCu16 *a, GaloisCu16 *b ) +{ + printf( "On device\n" ); + + printf( "a * b is %hu\n", ( *a * *b ).Value() ); +} + +int main() +{ + GaloisCu16::uploadTable(); + + // u16 ua = 52235; + // u16 ub = 65521; + // GaloisCu16 da(ua), db(ub); + // Galois16 a(ua), b(ub); + // printf("%hu\n", (a * b).Value()); + // GaloisCu16 *dap, *dbp; + // cudaMalloc((void**) &dap, 2); + // cudaMalloc((void**) &dbp, 2); + // cudaMemcpy(dap, &da, 2, cudaMemcpyHostToDevice); + // cudaMemcpy(dbp, &db, 2, cudaMemcpyHostToDevice); + // callDevice<<<1, 1>>> (dap, dbp); + // cudaDeviceSynchronize(); + + + TestMult(); + printf("----------------------\n"); + TestDiv(); + printf("----------------------\n"); + TestAdd(); + printf("----------------------\n"); + TestSub(); + + return 0; +} + +void CompareResults(const Galois16 *resCPU, const GaloisCu16 *resGPU, char op) +{ + printf("Varifying results.\n"); + bool correct = true; + size_t resId = 0; + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + if ( resCPU[resId].Value() != resGPU[resId].Value() ) + { + correct = false; + printf("Result doesn't match! 
%u %c %u = %hu, but got %hu.\n", + i, op, j, resCPU[resId], resGPU[resId]); + } + ++resId; + } + } + + if (correct) + { + printf("Operation %c is correct!\n", op); + } +} + + +__global__ +void KerMult(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. + size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left * vars[ outRowIdx + i ]; + } + + +} + +void TestMult() +{ + Galois16 vals[ Galois16::Count ], *results, *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for multiplication result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + // results = ( Galois16* ) malloc(resultSz); + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate multiplication results on CPU. + // printf("Calculating reference results.\n"); + // { + // Timer timer("CPU-Serial"); + + // size_t resId = 0; + // for ( size_t i = 0; i < Galois16::Count; ++i ) + // { + // for ( size_t j = i; j < Galois16::Count; ++j ) + // { + // results[resId] = vals[i] * vals[j]; + // ++resId; + // } + // } + // } + // printf("Finished calculating reference results.\n"); + + // Calculate results using openmp + printf("Calculating OMP results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] * vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerMult<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '*'); + + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerDiv(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
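+  // The kernels emit a packed upper triangle of the Count x Count operand table: row r holds
+  // (Count - r) entries, so it starts at sum_{k<r} (Count - k) = r * (2*Count - r + 1) / 2, which is
+  // the ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2 expression computed just below.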
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left / vars[ outRowIdx + i ]; + } + +} + +void TestDiv() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] / vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerDiv<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '/'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerAdd(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left + vars[ outRowIdx + i ]; + } + +} + +void TestAdd() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] + vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerAdd<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '+'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} + +__global__ +void KerSub(__restrict__ GaloisCu16 *vars, __restrict__ GaloisCu16 *res) +{ + size_t outRowIdx = blockIdx.x * blockDim.x + threadIdx.x; // Also the idx of left var. 
+ size_t outColMax = GaloisCu16::Count - outRowIdx; + size_t outStartIdx = ( outColMax + GaloisCu16::Count + 1 ) * outRowIdx / 2; + GaloisCu16 left = vars[ outRowIdx ]; + + for ( size_t i = 0; i < outColMax; ++i ) + { + res[ outStartIdx + i ] = left - vars[ outRowIdx + i ]; + } + +} + +void TestSub() +{ + Galois16 vals[ Galois16::Count ], *resultsOmp; + GaloisCu16 valsCu[ GaloisCu16::Count ], *resultsCu; + + // Fill arrays with every element in GF(2^16) + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + vals[ i ] = Galois16( ( Galois16::ValueType ) i ); + valsCu[ i ] = GaloisCu16( ( Galois16::ValueType ) i ); + } + printf("Input arrays filled.\n"); + + // Allocate space for result + size_t resultSz = sizeof( Galois16 ) * ( Galois16::Count + 1 ) * Galois16::Count / 2; + resultsOmp = ( Galois16* ) malloc(resultSz); + cudaMallocHost( ( void** ) &resultsCu, resultSz ); + printf("Output arrays allocated\n"); + + // Calculate results using openmp + printf("Calculating Reference results.\n"); + { + Timer timer("CPU-OMP"); + + #pragma omp parallel for + for ( size_t i = 0; i < Galois16::Count; ++i ) + { + for ( size_t j = i; j < Galois16::Count; ++j ) + { + resultsOmp[i * ( 2 * Galois16::Count - i + 1 ) / 2 + j - i] = vals[i] - vals[j]; + } + } + } + + // Upload data to GPU + GaloisCu16 *d_valsCu, *d_resultsCu; + { + Timer timer("GPU"); + cudaErrchk( cudaMalloc( (void**) &d_valsCu, sizeof( valsCu ) ) ); + cudaErrchk( cudaMalloc( (void**) &d_resultsCu, resultSz ) ); + cudaErrchk( cudaMemcpy( d_valsCu, valsCu, sizeof( valsCu ), cudaMemcpyHostToDevice ) ); + printf("Input copied to GPU.\n"); + + // Launch kernel + dim3 dimBlock(BLOCK_WIDTH); + dim3 dimGrid(GaloisCu16::Count / BLOCK_WIDTH); + KerSub<<>> (d_valsCu, d_resultsCu); + cudaErrchk( cudaGetLastError() ); + printf("Kernel launched.\n"); + + cudaDeviceSynchronize(); + printf("Kernel completed.\n"); + + // Download results from GPU + cudaErrchk ( cudaMemcpy(resultsCu, d_resultsCu, resultSz, cudaMemcpyDeviceToHost) ); + } + + CompareResults(resultsOmp, resultsCu, '-'); + + // free(results); + free(resultsOmp); + cudaFreeHost(resultsCu); + cudaFree(d_resultsCu); +} \ No newline at end of file diff --git a/src/helper_cuda.cuh b/src/helper_cuda.cuh new file mode 100644 index 00000000..1a8b9985 --- /dev/null +++ b/src/helper_cuda.cuh @@ -0,0 +1,14 @@ +#pragma once +#include + +#define cudaErrchk(ans) { if ( !gpuAssert((ans), __FILE__, __LINE__) ) return false; } +inline bool gpuAssert(cudaError_t code, const char *file, int line) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + // if (abort) exit(code); + return false; + } + return true; +} \ No newline at end of file diff --git a/src/libpar2.cpp b/src/libpar2.cpp index bed2e535..0c68c434 100644 --- a/src/libpar2.cpp +++ b/src/libpar2.cpp @@ -27,6 +27,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const string &parfilename, const vector &extrafiles, @@ -44,6 +47,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP nthreads, filethreads, +#endif +#ifdef ENABLE_CUDA + useCuda, #endif parfilename, extrafiles, diff --git a/src/libpar2.h b/src/libpar2.h index f0f4e109..e6d58380 100644 --- a/src/libpar2.h +++ b/src/libpar2.h @@ -144,6 +144,9 @@ Result par2create(std::ostream &sout, #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const std::string 
&parfilename, const std::vector &extrafiles, diff --git a/src/libpar2internal.h b/src/libpar2internal.h index 699a5570..f3147d22 100644 --- a/src/libpar2internal.h +++ b/src/libpar2internal.h @@ -245,6 +245,11 @@ using namespace std; #include "par1repairersourcefile.h" #include "par1repairer.h" +#ifdef __NVCC__ + #include "helper_cuda.cuh" + #include "galois_cu.cuh" +#endif + // Heap checking #ifdef _MSC_VER #define _CRTDBG_MAP_ALLOC diff --git a/src/par2cmdline.cpp b/src/par2cmdline.cpp index 48b87da5..f808d478 100644 --- a/src/par2cmdline.cpp +++ b/src/par2cmdline.cpp @@ -67,6 +67,9 @@ int main(int argc, char *argv[]) #ifdef _OPENMP commandline->GetNumThreads(), commandline->GetFileThreads(), +#endif +#ifdef ENABLE_CUDA + commandline->GetUseCuda(), #endif commandline->GetParFilename(), commandline->GetExtraFiles(), diff --git a/src/par2creator.cpp b/src/par2creator.cpp index 25f43dd3..dc7a5d90 100644 --- a/src/par2creator.cpp +++ b/src/par2creator.cpp @@ -95,6 +95,9 @@ Result Par2Creator::Process( #ifdef _OPENMP const u32 nthreads, const u32 _filethreads, +#endif +#ifdef ENABLE_CUDA + const bool _useCuda, #endif const string &parfilename, const vector &_extrafiles, @@ -108,6 +111,10 @@ Result Par2Creator::Process( filethreads = _filethreads; #endif +#ifdef ENABLE_CUDA + useCuda = _useCuda; +#endif + // Get information from commandline blocksize = _blocksize; const vector extrafiles = _extrafiles; @@ -188,20 +195,31 @@ Result Par2Creator::Process( // Set the total amount of data to be processed. progress = 0; totaldata = blocksize * sourceblockcount * recoveryblockcount; + + #ifdef ENABLE_CUDA + if (!useCuda) { + #endif + // Start at an offset of 0 within a block. + u64 blockoffset = 0; + while (blockoffset < blocksize) // Continue until the end of the block. + { + // Work out how much data to process this time. + size_t blocklength = (size_t)min((u64)chunksize, blocksize-blockoffset); - // Start at an offset of 0 within a block. - u64 blockoffset = 0; - while (blockoffset < blocksize) // Continue until the end of the block. - { - // Work out how much data to process this time. - size_t blocklength = (size_t)min((u64)chunksize, blocksize-blockoffset); + // Read source data, process it through the RS matrix and write it to disk. + if (!ProcessData(blockoffset, blocklength)) + return eFileIOError; - // Read source data, process it through the RS matrix and write it to disk. - if (!ProcessData(blockoffset, blocklength)) + blockoffset += blocklength; + } + #ifdef ENABLE_CUDA + } else { + // Read source data, process it through the RS matrix using GPU and write it to disk. 
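+    // Unlike the CPU path above, the CUDA path loads every source block for the current chunk into
+    // one large input buffer (see AllocateBuffers) and loops over the chunk offsets inside ProcessDataCu().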
+ if (!ProcessDataCu()) { return eFileIOError; - - blockoffset += blocklength; + } } + #endif if (noiselevel > nlQuiet) sout << "Writing recovery packets" << endl; @@ -299,10 +317,27 @@ bool Par2Creator::CalculateProcessBlockSize(size_t memorylimit) else { // Would single pass processing use too much memory - if (blocksize * recoveryblockcount > memorylimit) + // recoveryblockcount for outputbuffer and 1 for inputbuffer + u64 memoryNeed = blocksize * (recoveryblockcount + 1); + +#ifdef ENABLE_CUDA + if (useCuda) { + memoryNeed = blocksize * (recoveryblockcount + sourceblockcount); + } +#endif + + if (memoryNeed > memorylimit) { // Pick a size that is small enough - chunksize = ~3 & (memorylimit / recoveryblockcount); +#ifdef ENABLE_CUDA + if (!useCuda) { +#endif + chunksize = ~3 & (memorylimit / (recoveryblockcount + 1)); +#ifdef ENABLE_CUDA + } else { + chunksize = ~3 & (memorylimit / (recoveryblockcount + sourceblockcount)); + } +#endif deferhashcomputation = false; } @@ -705,7 +740,16 @@ bool Par2Creator::InitialiseOutputFiles(const string &parfilename) // Allocate memory buffers for reading and writing data to disk. bool Par2Creator::AllocateBuffers(void) { - inputbuffer = new u8[chunksize]; +#ifdef ENABLE_CUDA + if (!useCuda) { +#endif + inputbuffer = new u8[chunksize]; +#ifdef ENABLE_CUDA + } else { + inputbuffer = new u8[chunksize * sourceblockcount]; + } +#endif + outputbuffer = new u8[chunksize * recoveryblockcount]; if (inputbuffer == NULL || outputbuffer == NULL) diff --git a/src/par2creator.cu b/src/par2creator.cu new file mode 100644 index 00000000..88c5f31b --- /dev/null +++ b/src/par2creator.cu @@ -0,0 +1,140 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2019 Michael D. Nahas +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif + +// ProcessData, but on CUDA device. +bool Par2Creator::ProcessDataCu() +{ + // Start at an offset of 0 within a block. + // Continue until the end of the block. + u64 blockOffset = 0; + while (blockOffset < blocksize) { + // Work out how much data to process this time. 
+ size_t blockLen = (size_t) min((u64) chunksize, blocksize - blockOffset); + + // Clear the output buffer + memset(outputbuffer, 0, chunksize * recoveryblockcount); + + // If we have deferred computation of the file hash and block crc and hashes + // sourcefile and sourceindex will be used to update them during + // the main recovery block computation + vector::iterator sourcefile = sourcefiles.begin(); + u32 sourceindex = 0; + + vector::iterator sourceblock; + u32 inputIdx; + + DiskFile *lastopenfile = NULL; + + // Read blockLen bytes of each input block into inputbuffer + for ((sourceblock=sourceblocks.begin()),(inputIdx=0); + sourceblock != sourceblocks.end(); + ++sourceblock, ++inputIdx) + { + // Are we reading from a new file? + if (lastopenfile != (*sourceblock).GetDiskFile()) + { + // Close the last file + if (lastopenfile != NULL) + { + lastopenfile->Close(); + } + + // Open the new file + lastopenfile = (*sourceblock).GetDiskFile(); + if (!lastopenfile->Open()) + { + return false; + } + } + + // Read data from the current input block + if (!sourceblock->ReadData(blockOffset, blockLen, &((u8*) inputbuffer)[blockLen * inputIdx])) + return false; + + if (deferhashcomputation) + { + assert(blockOffset == 0 && blockLen == blocksize); + assert(sourcefile != sourcefiles.end()); + + (*sourcefile)->UpdateHashes(sourceindex, &((u8*) inputbuffer)[blockLen * inputIdx], blockLen); + } + + // Work out which source file the next block belongs to + if (++sourceindex >= (*sourcefile)->BlockCount()) + { + sourceindex = 0; + ++sourcefile; + } + } + + // Close the last file + if (lastopenfile != NULL) + { + lastopenfile->Close(); + } + + // Process the data through the RS matrix on GPU + if (!rs.ProcessCu(blockLen, 0, sourceblockcount - 1, inputbuffer, 0, recoveryblockcount - 1, outputbuffer)) { + return false; + } + + if (noiselevel > nlQuiet) + { + // Update a progress indicator + u32 oldfraction = (u32)(1000 * progress / totaldata); + progress += blockLen * sourceblockcount * recoveryblockcount; + u32 newfraction = (u32)(1000 * progress / totaldata); + + if (oldfraction != newfraction) + { + sout << "Processing: " << newfraction/10 << '.' << newfraction%10 << "%\r" << flush; + } + } + + // For each output block + for (u32 outputblock=0; outputblock nlQuiet) + sout << "Wrote " << recoveryblockcount * blockLen << " bytes to disk" << endl; + + blockOffset += blockLen; + } + + return true; +} diff --git a/src/par2creator.h b/src/par2creator.h index a28f8cde..24ec5b5e 100644 --- a/src/par2creator.h +++ b/src/par2creator.h @@ -38,6 +38,9 @@ class Par2Creator #ifdef _OPENMP const u32 nthreads, const u32 filethreads, +#endif +#ifdef ENABLE_CUDA + const bool useCuda, #endif const string &parfilename, const vector &extrafiles, @@ -83,6 +86,11 @@ class Par2Creator // Read source data, process it through the RS matrix and write it to disk. bool ProcessData(u64 blockoffset, size_t blocklength); +#ifdef ENABLE_CUDA + // ProcessData, but on CUDA device. + bool ProcessDataCu(void); +#endif + // Finish computation of the recovery packets and write the headers to disk. bool WriteRecoveryPacketHeaders(void); @@ -112,6 +120,10 @@ class Par2Creator static u32 filethreads; // Number of threads for file processing #endif +#ifdef ENABLE_CUDA + bool useCuda; +#endif + u64 blocksize; // The size of each block. size_t chunksize; // How much of each block will be processed at a // time (due to memory constraints). 
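The reedsolomon.cu file added below performs all of its GF(2^16) arithmetic through the log/antilog tables that GaloisCu uploads to the device. As background, the table-based multiplication it relies on can be sketched in plain host-side C++; this is a minimal illustration rather than code from the patch, the GF16Tables/mul names are hypothetical, and only the 0x1100B generator is taken from the Galois16/GaloisCu16 typedefs above.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Log/antilog tables for GF(2^16) with the PAR2 generator polynomial 0x1100B.
    struct GF16Tables {
      std::vector<uint16_t> log, antilog;
      GF16Tables() : log(65536), antilog(65536) {
        uint32_t b = 1;
        for (uint32_t i = 0; i < 65535; ++i) {  // 65535 == Limit == Count - 1
          log[b] = static_cast<uint16_t>(i);
          antilog[i] = static_cast<uint16_t>(b);
          b <<= 1;
          if (b & 0x10000) b ^= 0x1100B;        // reduce modulo the generator
        }
      }
      // a * c in GF(2^16): add the logs, wrap at Limit, take the antilog.
      uint16_t mul(uint16_t a, uint16_t c) const {
        if (a == 0 || c == 0) return 0;
        uint32_t sum = log[a] + log[c];
        return antilog[sum >= 65535 ? sum - 65535 : sum];
      }
    };

    int main() {
      GF16Tables gf;
      std::printf("%u\n", (unsigned)gf.mul(0x1234, 0x5678));  // arbitrary example operands
      return 0;
    }

Because a product reduces to two table lookups plus one addition of logs modulo 65535, uploading the tables to the device once (GaloisCu::uploadTable above) and reusing them from every kernel is what makes the GPU port practical.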
diff --git a/src/reedsolomon.cu b/src/reedsolomon.cu new file mode 100644 index 00000000..5e5c3a3f --- /dev/null +++ b/src/reedsolomon.cu @@ -0,0 +1,399 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2022 Xiuyan Wu +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include + +#include "libpar2internal.h" +#include "helper_cuda.cuh" + +__global__ void ProcessKer( const int batchSz, // size in number of word of each batch + const void * __restrict__ inputData, // size: chunkSz * inputCount + const void * __restrict__ bases, // size: sizeof(G) * inputCount + const int inputCount, + const int outputCount, // # of output blocks + void * __restrict__ outputBuf, // size: chunkSz * outputCount + const void * __restrict__ exponents // size: sizeof(G) * outputCount + ); + +__global__ void ReduceKer( const u32 * __restrict__ input, // Countains results from ProcessKer + u32 * __restrict__ output, // Output Location + const int outputCount, // Number of output blocks + const int tileCount // Number of input tiles + ); + +void CUDART_CB WriteOutputCB( void *data ); + +typedef struct +{ + size_t batchSz; + size_t wordPerChunk; + size_t wordPerBatch; + size_t batchIdx; + size_t outCount; + Galois16 *finalOutput; + Galois16 *batchOutput; +} workload; + +#define TBLOCK_SZ 512 +#define MAX_THREAD 1024 +#define TILE_WIDTH 16 +#define SHARED_MEM_SZ 32768 + +// Calculate the contribution of words contained in inputBuf to specified output blocks. +template <> +bool ReedSolomon::ProcessCu( const size_t size, // size of one chunk of data + const u32 inputIdxStart, + const u32 inputIdxEnd, + const void *inputBuf, // inputCount * size input1-input2-... + const u32 outputIdxStart, + const u32 outputIdxEnd, + void *outputBuf ) // outputCount * size +{ + // BUG: Doesn't really respect memory limit for VRAM. + // CUDA Device compatible Galois type. + typedef GaloisCu Gd; + if ( !Gd::uploadTable() ) return false; + cudaFuncSetCacheConfig(ProcessKer, cudaFuncCachePreferL1); + + const u32 inCount = inputIdxEnd - inputIdxStart + 1; + const u32 outCount = outputIdxEnd - outputIdxStart + 1; + const u32 wordPerChunk = size / sizeof(Gd); + // Batch need to be 4-byte aligned + const u32 wordPerBatch = (TBLOCK_SZ * SHARED_MEM_SZ / ( MAX_THREAD * TILE_WIDTH * sizeof(Gd) ) - 1) & ~1; + const u32 tileCount = inCount / TILE_WIDTH + ( inCount % TILE_WIDTH != 0 ); + const u32 batchCount = ceil( (float) wordPerChunk / wordPerBatch ); + + /* + * size: chunk size + * VRam footprint: (inCount + outCount) * size + (inputcount + outCount) * sizeof(g) + sizeof(g::GaloisTable) + * + * + * Assume the total VRam footprint can be fitted into device vram. 
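+   * (The BUG note above applies here: a stricter implementation would also cap the batch size
+   *  against the amount of device memory actually free at run time.)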
+ * Align chunkSz to 4 bytes + * + * + */ + + // Allocate GPU memory buffers + Gd *d_bases, *d_exponents; + + cudaErrchk( cudaMalloc( (void**) &d_bases, inCount * sizeof(Gd) ) ); + cudaErrchk( cudaMalloc( (void**) &d_exponents, outCount * sizeof(Gd) ) ); + + // Copy bases and exponents to GPU + u16 *baseOffset = &database[inputIdxStart]; + u16 *exponents = new u16[outCount]; + for ( u32 i = outputIdxStart; i <= outputIdxEnd; ++i ) { + exponents[i - outputIdxStart] = outputrows[i].exponent; + } + + cudaErrchk( cudaMemcpyAsync( d_bases, baseOffset, inCount * sizeof(Gd), cudaMemcpyHostToDevice ) ); + cudaErrchk( cudaMemcpyAsync( d_exponents, exponents, outCount * sizeof(u16), cudaMemcpyHostToDevice ) ); + cudaErrchk( cudaDeviceSynchronize() ); + delete [] exponents; + + // Set kernel launch parameters + dim3 dimGrid( tileCount ); + dim3 dimBlock( TBLOCK_SZ ); + + // Create stream + cudaStream_t *stream = new cudaStream_t[batchCount]; + for ( u32 i = 0; i < batchCount; ++i ) { + cudaErrchk( cudaStreamCreateWithFlags( &stream[i], cudaStreamNonBlocking ) ); + } + + G *batchInput, *batchOutput; + cudaErrchk( cudaMallocHost( (void**) &batchInput, inCount * wordPerBatch * sizeof(G) ) ); + cudaErrchk( cudaMallocHost( (void**) &batchOutput, outCount * wordPerBatch * sizeof(G) ) ); + + Gd **d_input = new Gd*[batchCount]; + Gd **d_intermediate = new Gd*[batchCount]; + Gd **d_output = new Gd*[batchCount]; + + cudaEvent_t written, upload; + cudaErrchk( cudaEventCreate( &written, cudaEventDisableTiming ) ); + cudaErrchk( cudaEventCreate( &upload, cudaEventBlockingSync ) ); + + // Concurrent kernel invoking + for ( u32 batchIdx = 0; batchIdx < batchCount; ++batchIdx ) { + int batchSz = wordPerBatch; + if ( batchIdx == batchCount - 1 ) { + batchSz = wordPerChunk - batchIdx * wordPerBatch; + } + int batchSzAligned = batchSz + (batchSz & 1); + + // Allocate memory + // Gd *d_input, *d_intermediate, *d_output; + cudaErrchk( cudaMallocAsync( (void**) &d_input[batchIdx], inCount * batchSz * sizeof(Gd), stream[batchIdx] ) ); + cudaErrchk( cudaMallocAsync( (void**) &d_intermediate[batchIdx], tileCount * batchSzAligned * outCount * sizeof(Gd), stream[batchIdx] ) ); + cudaErrchk( cudaMallocAsync( (void**) &d_output[batchIdx], outCount * batchSzAligned * sizeof(Gd), stream[batchIdx] ) ); + + // Wait until the last iteration has sent all input data to GPU. + cudaErrchk( cudaEventSynchronize( upload ) ); + + // Copy input data to GPU + for ( u32 i = 0; i < inCount; ++i ) { + void *inputBufOffset = (char*) inputBuf + i * size + batchIdx * wordPerBatch * sizeof(G); + void *batchInputOffset = (char*) batchInput + i * batchSz * sizeof(G); + memcpy( batchInputOffset, inputBufOffset, batchSz * sizeof(G) ); + } + + cudaErrchk( cudaMemcpyAsync( d_input[batchIdx], batchInput, inCount * batchSz * sizeof(G), cudaMemcpyHostToDevice, stream[batchIdx] ) ); + cudaErrchk( cudaEventRecord( upload, stream[batchIdx] ) ); + + // Lauch Compute Kernel + ProcessKer<<>> + ( batchSz, + d_input[batchIdx], + d_bases, + inCount, + outCount, + d_intermediate[batchIdx], + d_exponents + ); + + // Lauch Reduce Kernel + dim3 dimBlockReduce( 32 ); + dim3 dimGridReduce( ceil( outCount / (float) dimBlockReduce.x ), batchSzAligned / 2 ); + ReduceKer<<>> + ( (u32*) d_intermediate[batchIdx], + (u32*) d_output[batchIdx], + outCount, + tileCount + ); + + // Wait until output from the last iteration has already + // been written to actual output buffer. 
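+    // (batchInput and batchOutput are single pinned host staging buffers shared by every batch, so
+    // the 'upload' and 'written' events are what serialize their reuse across loop iterations.)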
+    cudaErrchk( cudaStreamWaitEvent( stream[batchIdx], written ) );
+
+    // Copy the result to the batch output buffer.
+    cudaErrchk( cudaMemcpyAsync( batchOutput,
+                                 d_output[batchIdx],
+                                 batchSzAligned * outCount * sizeof(Gd),
+                                 cudaMemcpyDeviceToHost,
+                                 stream[batchIdx]
+                                 ) );
+
+    // Schedule a host callback that copies the batch output buffer into the actual output buffer.
+    workload *work = new workload;
+    work->batchSz = batchSz;
+    work->wordPerChunk = wordPerChunk;
+    work->wordPerBatch = wordPerBatch;
+    work->batchIdx = batchIdx;
+    work->outCount = outCount;
+    work->finalOutput = (G*) outputBuf;
+    work->batchOutput = batchOutput;
+    cudaErrchk( cudaLaunchHostFunc( stream[batchIdx], WriteOutputCB, work ) );
+    cudaErrchk( cudaEventRecord( written, stream[batchIdx] ) );
+
+    cudaErrchk( cudaFreeAsync( d_input[batchIdx], stream[batchIdx] ) );
+    cudaErrchk( cudaFreeAsync( d_intermediate[batchIdx], stream[batchIdx] ) );
+    cudaErrchk( cudaFreeAsync( d_output[batchIdx], stream[batchIdx] ) );
+  }
+  cudaErrchk( cudaDeviceSynchronize() );
+
+  // Destroy the streams.
+  for ( u32 i = 0; i < batchCount; ++i ) {
+    cudaErrchk( cudaStreamDestroy( stream[i] ) );
+  }
+
+  cudaFree( d_bases );
+  cudaFree( d_exponents );
+  cudaFreeHost( batchInput );
+  cudaFreeHost( batchOutput );
+  delete[] stream;
+  delete[] d_input;
+  delete[] d_intermediate;
+  delete[] d_output;
+
+  return true;
+}
+
+__global__ void ProcessKer( const int batchSz,
+                            const void * __restrict__ inputData,
+                            const void * __restrict__ bases,
+                            const int inputCount,
+                            const int outputCount,
+                            void * __restrict__ outputBuf,
+                            const void * __restrict__ exponents
+                            )
+{
+  /*
+    inputData:  I_1,1   I_1,2   ...  ...  I_1,batchSz
+                I_2,1   ...     ...  ...  I_2,batchSz
+                .
+                .
+                .
+                I_TILE_WIDTH*gridDim.x,1  ...  ...  I_TILE_WIDTH*gridDim.x,batchSz
+
+    outputBuf **Transposed**:
+    <Tile 1>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    O_2,1   ...     ...    ...  O_2,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 2>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 3>
+    .
+    .
+    <Tile tileCount>
+  */
+
+  // Needs (batchSzAligned * TILE_WIDTH + TILE_WIDTH) * sizeof(G) bytes of dynamic shared memory.
+  typedef GaloisCu16 G;
+  extern __shared__ char sharedMem[];
+
+  const int batchSzAligned = batchSz + (batchSz & 1);
+  G *smInput = (G *) sharedMem;  // Shared memory input buffer
+  G *smBases = (G *) ( sharedMem + batchSzAligned * TILE_WIDTH * sizeof(G) );
+
+  const int wordPerInt = sizeof(u32) / sizeof(G);
+  const int intPerBatch = batchSzAligned / wordPerInt;
+  const int outBufWidth = outputCount * gridDim.x;
+  const int outBufRowPos = outputCount * blockIdx.x;
+
+  // Load the input data and bases into shared memory.
+  for ( int i = 0; i < TILE_WIDTH; ++i )
+  {
+    int inputIdx = blockIdx.x * TILE_WIDTH + i;
+    for ( int j = threadIdx.x; j < batchSzAligned; j += blockDim.x )
+    {
+      if ( inputIdx < inputCount && j < batchSz ) {
+        smInput[i * batchSzAligned + j] = ((G *) inputData)[inputIdx * batchSz + j];
+      } else {
+        smInput[i * batchSzAligned + j] = 0;
+      }
+    }
+  }
+
+  for ( int i = threadIdx.x; i < TILE_WIDTH; i += blockDim.x )
+  {
+    int inputIdx = blockIdx.x * TILE_WIDTH + i;
+    if ( inputIdx < inputCount ) {
+      smBases[i] = ((G *) bases)[inputIdx];
+    } else {
+      smBases[i] = 0;
+    }
+  }
+
+  __syncthreads();
+
+  G factors[TILE_WIDTH];
+  u16 exponent;
+  u32 acc = 0;
+  u32 words, res;
+  // Each thread computes one output block at a time.
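+  // Output blocks are strided across the thread block: thread t handles
+  // outputs t, t + blockDim.x, t + 2 * blockDim.x, ... for this tile.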
+  for ( int i = threadIdx.x; i < outputCount; i += blockDim.x )
+  {
+    // Calculate the factors of this tile for this output block.
+    exponent = ((G *) exponents)[i].Value();
+    // #pragma unroll
+    for ( int ii = 0; ii < TILE_WIDTH; ++ii ) {
+      factors[ii] = smBases[ii].pow(exponent);
+    }
+
+    // For each int in the batch
+    for ( int j = 0; j < intPerBatch; ++j )
+    {
+      acc = 0;
+      // For each input block in the tile, calculate the contribution of the corresponding 2 words (one u32).
+      for ( int k = 0; k < TILE_WIDTH; ++k )
+      {
+        // factor = smBases[k].pow( exponent );
+        words = ((u32 *) smInput)[k * intPerBatch + j];
+        res = 0;
+        for ( int w = 0; w < wordPerInt; ++w )
+        {
+          ((G *) &res)[w] = factors[k] * ((G *) &words)[w];
+        }
+        acc ^= res;
+      }
+
+      // Write the result to outputBuf.
+      ((u32 *) outputBuf)[j * outBufWidth + outBufRowPos + i] = acc;
+    }
+  }
+}
+
+__global__ void ReduceKer( const u32 * __restrict__ input,   // Contains results from ProcessKer
+                           u32 * __restrict__ output,        // Output location
+                           const int outputCount,            // Number of output blocks
+                           const int tileCount               // Number of input tiles
+                           )
+{
+  /*
+    input **Transposed**:
+    <Tile 1>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    O_2,1   ...     ...    ...  O_2,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 2>
+    O_1,1   O_1,2   O_1,3  ...  O_1,batchSz
+    .
+    .
+    .
+    O_outputCount,1  ...  ...  O_outputCount,batchSz
+    <Tile 3>
+    .
+    .
+    <Tile tileCount>
+  */
+  // 2D grid: the block at grid position (x, y) reduces the y-th pair of words (one u32)
+  // of output blocks x*blockDim.x through (x+1)*blockDim.x - 1.
+  const int ox = blockIdx.x * blockDim.x + threadIdx.x;
+  const int row = blockIdx.y * outputCount * tileCount;
+
+  if (ox >= outputCount) return;
+
+  u32 acc = 0;
+  int inputIdx = ox;
+  for ( int i = 0; i < tileCount; ++i ) {
+    acc ^= input[row + inputIdx];
+    inputIdx += outputCount;
+  }
+
+  output[gridDim.y * ox + blockIdx.y] = acc;
+}
+
+void CUDART_CB WriteOutputCB( void *data ) {
+  // Write the output from the batch output buffer into the actual output buffer.
+  workload *work = (workload *) data;
+  size_t batchSzAligned = work->batchSz + (work->batchSz & 1);
+  for ( u32 i = 0; i < work->outCount; ++i ) {
+    memcpy( &work->finalOutput[work->wordPerChunk * i + work->wordPerBatch * work->batchIdx],
+            &work->batchOutput[batchSzAligned * i],
+            work->batchSz * sizeof(Galois16)
+            );
+  }
+  delete work;
+}
+
diff --git a/src/reedsolomon.h b/src/reedsolomon.h
index 8746db6e..57ab7850 100644
--- a/src/reedsolomon.h
+++ b/src/reedsolomon.h
@@ -3,6 +3,7 @@
 //
 // Copyright (c) 2003 Peter Brian Clements
 // Copyright (c) 2019 Michael D. Nahas
+// Copyright (c) 2022 Xiuyan Wu
 //
 // par2cmdline is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -67,6 +68,19 @@ class ReedSolomon
                const void *inputbuffer,  // Buffer containing input data
                u32 outputindex,          // The row in the RS matrix
                void *outputbuffer);      // Buffer containing output data
+
+#ifdef ENABLE_CUDA
+  // Process a block of data for output blocks [outputIdxStart, outputIdxEnd] on a CUDA device.
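+  // Both index ranges are inclusive. Returns false if the Galois log/antilog
+  // tables cannot be uploaded to the CUDA device.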
+  bool ProcessCu( const size_t size,         // The size of the block of data
+                  const u32 inputIdxStart,   // The index of the first input block to be processed
+                  const u32 inputIdxEnd,     // The index of the last input block to be processed
+                  const void *inputBuf,      // Buffer containing input data
+                  const u32 outputIdxStart,  // The row for the first output block to process in RS matrix
+                  const u32 outputIdxEnd,    // The row for the last output block to process in RS matrix
+                  void *outputBuf );         // Buffer containing output data
+#endif
+
+
 private:
   bool InternalProcess(const g &factor, size_t size, const void *inputbuffer, void *outputbuffer); // Optimization
diff --git a/src/reedsolomon_cu_test.cu b/src/reedsolomon_cu_test.cu
new file mode 100644
index 00000000..8db4ba8a
--- /dev/null
+++ b/src/reedsolomon_cu_test.cu
@@ -0,0 +1,6 @@
+#include "reedsolomon.cu"
+
+int main()
+{
+
+}
\ No newline at end of file