[Project] Add existing legacy code
EgorOrachyov committed Apr 14, 2021
1 parent ca5079c commit 645f00d
Showing 773 changed files with 8,941,207 additions and 1,430 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -1,5 +1,9 @@
# Idea files
# IDE files
/**/.idea

# Build files
/**/cmake-build-debug
/**/*build*

# Python cache
/**/__pycache__
3 changes: 3 additions & 0 deletions .gitmodules
@@ -1,3 +1,6 @@
[submodule "deps/gtest"]
path = deps/gtest
url = https://github.com/google/googletest.git
[submodule "deps/cub"]
path = deps/cub
url = https://github.com/NVIDIA/cub.git
57 changes: 50 additions & 7 deletions CMakeLists.txt
@@ -1,15 +1,58 @@
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
# CuBool library Cmake config file
# Add this file as sub-directory to your project to use library functionality

cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
project(spbla LANGUAGES CXX)

option(SPBLA_WITH_CPU_BACKEND "Build library with cpu backend as fallback" ON)
option(SPBLA_WITH_CUDA_BACKEND "Build library with cuda backend" OFF)
option(SPBLA_WITH_OPENCL_BACKEND "Build library with opencl backend" OFF)
# Exposed to the user build options
option(SPBLA_WITH_CUDA "Build library with cuda backend (default)" ON)
option(SPBLA_WITH_SEQUENTIAL "Build library with cpu sequential backend (fallback)" ON)
option(SPBLA_BUILD_TESTS "Build project unit-tests with gtest" ON)
option(SPBLA_COPY_TO_PY_PACKAGE "Copy compiled shared library into python package folder (for package use purposes)" ON)

set(SPBLA_VERSION_MAJOR 1)
set(SPBLA_VERSION_MINOR 0)
set(SPBLA_VERSION_SUB 0)

set(SPBLA_DEBUG OFF)
set(SPBLA_RELEASE OFF)

if (${CMAKE_BUILD_TYPE} MATCHES Release)
    message(STATUS "Build cubool in release mode")
    set(SPBLA_RELEASE ON)
elseif (${CMAKE_BUILD_TYPE} MATCHES Debug)
    message(STATUS "Build cubool in debug mode")
    set(SPBLA_DEBUG ON)
else()
    message(STATUS "Build cubool in release mode (default: was not specified)")
    set(SPBLA_RELEASE ON)
    set(CMAKE_BUILD_TYPE Release)
endif()

option(SPBLA_WITH_TESTS "Build library source code with provided unit tests" ON)
# Configure cuda dependencies
if (SPBLA_WITH_CUDA)
message(STATUS "Add cub as cuda utility")
set(CUB_ENABLE_HEADER_TESTING OFF CACHE BOOL "" FORCE)
set(CUB_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
set(CUB_ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE)
add_subdirectory(deps/cub)
add_library(cub INTERFACE IMPORTED)
target_link_libraries(cub INTERFACE CUB::CUB)

if (SPBLA_WITH_TESTS)
message(STATUS "Add gtest dependency for library unit tests")
message(STATUS "Add nsparse library as crs matrix multiplication backend")
add_subdirectory(deps/nsparse)
endif()

if (SPBLA_BUILD_TESTS)
message(STATUS "Add googletest as unit-testing library")
add_subdirectory(deps/gtest)
endif()

# Actual cxx implementation
add_subdirectory(spbla)

# Copy scripts into binary directory
file(COPY scripts DESTINATION ${CMAKE_BINARY_DIR}/)

# Copy python related stuff
file(COPY python DESTINATION ${CMAKE_BINARY_DIR}/)
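
As the header comment above suggests, a downstream project can pull the library in as a subdirectory. A minimal consumer sketch (illustrative only; it assumes the exported target is named `spbla`, which this commit does not state explicitly):

cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
project(spbla_consumer LANGUAGES CXX)

# Pull the library sources in as a subdirectory (the path is hypothetical).
add_subdirectory(deps/spbla)

add_executable(app main.cpp)
# Assumption: the library defines a linkable target called `spbla`.
target_link_libraries(app PRIVATE spbla)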
3 changes: 2 additions & 1 deletion README.md
@@ -1,2 +1,3 @@
# spbla
Sparse Boolean Linear Algebra

Sparse Boolean linear algebra for CPU/GPU computations.
1 change: 1 addition & 0 deletions deps/cub
Submodule cub added at a8910a
11 changes: 11 additions & 0 deletions deps/nsparse/CMakeLists.txt
@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.15)
project(nsparse_um LANGUAGES CXX CUDA)

add_library(nsparse_um INTERFACE)
target_include_directories(nsparse_um INTERFACE include/)
target_link_libraries(nsparse_um INTERFACE cub)
target_compile_options(nsparse_um INTERFACE $<$<COMPILE_LANGUAGE:CUDA>: --expt-relaxed-constexpr --expt-extended-lambda>)

if (SPBLA_BUILD_NSPARSE_TESTS)
    add_subdirectory(test)
endif()
25 changes: 25 additions & 0 deletions deps/nsparse/README.md
@@ -0,0 +1,25 @@
# NSPARSE-UNIFIED-MEMORY

Sparse Boolean matrix multiplication and addition for matrices in
compressed sparse row (CSR) format, which uses the CUDA unified memory
allocator to manage large GPU allocations.

The original source code is hosted [here](https://github.com/YaccConstructor/RedisGraph).
For more details, see the branches `CFPQ-gpu` and `CFPQ-gpu-um`, where the source code
lives under `deps/cfpq/algorithms/cuda/nsparse`.
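
To make the storage scheme concrete, here is a rough sketch (not taken from the original sources) of a Boolean CSR matrix whose index arrays live in CUDA unified memory:

#include <cuda_runtime.h>

// Illustrative Boolean CSR container: no value array is needed, since the
// stored pattern itself encodes the "true" entries.
struct csr_bool_matrix {
  unsigned rows = 0;
  unsigned* row_ptr = nullptr;  // rows + 1 offsets into col_idx
  unsigned* col_idx = nullptr;  // one column index per stored entry
};

csr_bool_matrix make_csr(unsigned rows, unsigned nnz) {
  csr_bool_matrix m;
  m.rows = rows;
  // Unified memory is visible to both host and device and is paged on demand,
  // which is what allows large matrices to be allocated on the GPU.
  cudaMallocManaged(&m.row_ptr, (rows + 1) * sizeof(unsigned));
  cudaMallocManaged(&m.col_idx, nnz * sizeof(unsigned));
  return m;
}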

## Useful links

- High-performance and Memory-saving
Sparse General Matrix-Matrix Multiplication for NVIDIA Pascal GPU
[paper](https://ieeexplore.ieee.org/document/8025284)
- GPU Merge Path - A GPU Merging Algorithm
[paper](https://www.researchgate.net/publication/254462662_GPU_merge_path_a_GPU_merging_algorithm)
- Context-Free Path Querying with Single-Path
Semantics by Matrix Multiplication
[paper](https://www.researchgate.net/publication/342164347_Context-Free_Path_Querying_with_Single-Path_Semantics_by_Matrix_Multiplication)

## Also

The author of the original implementation is Artyom Khoroshev
(GitHub: [profile](https://github.com/akhoroshev)).
39 changes: 39 additions & 0 deletions deps/nsparse/include/nsparse/detail/add_values.cuh
@@ -0,0 +1,39 @@
#pragma once

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Needed for thrust::device_ptr and assert used below.
#include <thrust/device_ptr.h>

#include <cassert>

namespace nsparse {

// Writes `value` into every slot of the skeleton CSR matrix whose (row, column)
// position also appears among the edges; one thread block processes one row.
template <typename value_type, typename index_type>
__global__ void add_values(thrust::device_ptr<const index_type> skeleton_col_idx,
                           thrust::device_ptr<const index_type> skeleton_row_idx,
                           thrust::device_ptr<value_type> values,
                           thrust::device_ptr<const index_type> edges_col_idx,
                           thrust::device_ptr<const index_type> edges_row_idx, value_type value) {
  auto rid = blockIdx.x;

  index_type skeleton_row_begin = skeleton_row_idx[rid];
  index_type skeleton_row_end = skeleton_row_idx[rid + 1];

  index_type edges_row_begin = edges_row_idx[rid];
  index_type edges_row_end = edges_row_idx[rid + 1];

  // Threads of the block walk the edges of this row in a strided fashion.
  for (auto i = edges_row_begin + threadIdx.x; i < edges_row_end; i += blockDim.x) {
    index_type edges_column = edges_col_idx[i];

    // Linear search for the matching column in the skeleton row.
    bool was_found = false;
    for (auto j = skeleton_row_begin; j < skeleton_row_end; j++) {
      index_type skeleton_column = skeleton_col_idx[j];

      if (skeleton_column == edges_column) {
        values[j] = value;
        was_found = true;
        break;
      }
    }
    // Every edge is expected to be present in the skeleton pattern.
    assert(was_found);
  }
}

} // namespace nsparse
23 changes: 23 additions & 0 deletions deps/nsparse/include/nsparse/detail/add_values.h
@@ -0,0 +1,23 @@
#pragma once

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>

#include <nsparse/detail/util.h>

#include <nsparse/detail/add_values.cuh>

namespace nsparse {

// Host-side wrapper: launches one 64-thread block per matrix row.
template <typename value_type, typename index_type>
void add_values(index_type rows, const thrust::device_vector<index_type>& skeleton_col_idx,
                const thrust::device_vector<index_type>& skeleton_row_idx,
                thrust::device_vector<value_type>& values,
                const thrust::device_vector<index_type>& edges_col_idx,
                const thrust::device_vector<index_type>& edges_row_idx, value_type value) {
  if (rows > 0)
    add_values<<<rows, 64>>>(skeleton_col_idx.data(), skeleton_row_idx.data(), values.data(),
                             edges_col_idx.data(), edges_row_idx.data(), value);
}

} // namespace nsparse
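
A minimal usage sketch of this wrapper (illustrative only, not part of the commit), with a tiny hand-built skeleton and edge structure:

#include <thrust/device_vector.h>
#include <vector>

#include <nsparse/detail/add_values.h>

int main() {
  // Skeleton CSR pattern: row 0 stores columns {0, 2}, row 1 stores column {1}.
  std::vector<unsigned> h_skel_rows = {0, 2, 3};
  std::vector<unsigned> h_skel_cols = {0, 2, 1};

  // Edges to stamp into the skeleton: (0, 2) and (1, 1).
  std::vector<unsigned> h_edge_rows = {0, 1, 2};
  std::vector<unsigned> h_edge_cols = {2, 1};

  thrust::device_vector<unsigned> skel_rows = h_skel_rows;
  thrust::device_vector<unsigned> skel_cols = h_skel_cols;
  thrust::device_vector<unsigned> edge_rows = h_edge_rows;
  thrust::device_vector<unsigned> edge_cols = h_edge_cols;
  thrust::device_vector<float> values(h_skel_cols.size(), 0.0f);

  // Writes 1.0f into values[1] (row 0, col 2) and values[2] (row 1, col 1).
  nsparse::add_values<float, unsigned>(2, skel_cols, skel_rows, values,
                                       edge_cols, edge_rows, 1.0f);
  cudaDeviceSynchronize();
  return 0;
}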
67 changes: 67 additions & 0 deletions deps/nsparse/include/nsparse/detail/bitonic.cuh
@@ -0,0 +1,67 @@
#pragma once

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cooperative_groups.h>

namespace nsparse {

// Swaps keyA and keyB when they violate the requested order (dir = 1 means ascending).
template <typename T>
__device__ void Comparator(T& keyA, T& keyB, uint dir) {
  T t;

  if ((keyA > keyB) == dir) {
    t = keyA;
    keyA = keyB;
    keyB = t;
  }
}

// Bitonic sort of a shared-memory array of exactly array_size elements (array_size
// must be a power of two), executed cooperatively by the given thread group.
template <typename T, uint array_size, typename group_t>
__device__ void bitonic_sort_shared(group_t group, T* s_key, uint dir = 1) {
  // Build bitonic sequences of increasing size.
  for (uint size = 2; size < array_size; size <<= 1) {
    for (uint stride = size / 2; stride > 0; stride >>= 1) {
      group.sync();
      for (uint id = group.thread_rank(); id < array_size / 2; id += group.size()) {
        uint ddd = dir ^ ((id & (size / 2)) != 0);

        uint pos = 2 * id - (id & (stride - 1));
        Comparator(s_key[pos + 0], s_key[pos + stride], ddd);
      }
    }
  }

  // Final merge of the whole array in the requested direction.
  for (uint stride = array_size / 2; stride > 0; stride >>= 1) {
    group.sync();
    for (uint id = group.thread_rank(); id < array_size / 2; id += group.size()) {
      uint pos = 2 * id - (id & (stride - 1));
      Comparator(s_key[pos + 0], s_key[pos + stride], dir);
    }
  }
  group.sync();
}

// Same bitonic network over an array in global memory, executed by all threads of
// the current block; array_size must be a power of two.
template <typename T>
__device__ void bitonicSortGlobal(T* key, T array_size, uint dir = 1) {
  for (uint size = 2; size < array_size; size <<= 1) {
    for (uint stride = size / 2; stride > 0; stride >>= 1) {
      __syncthreads();
      for (uint id = threadIdx.x; id < array_size / 2; id += blockDim.x) {
        uint ddd = dir ^ ((id & (size / 2)) != 0);

        uint pos = 2 * id - (id & (stride - 1));
        Comparator(key[pos + 0], key[pos + stride], ddd);
      }
    }
  }

  for (uint stride = array_size / 2; stride > 0; stride >>= 1) {
    __syncthreads();
    for (uint id = threadIdx.x; id < array_size / 2; id += blockDim.x) {
      uint pos = 2 * id - (id & (stride - 1));
      Comparator(key[pos + 0], key[pos + stride], dir);
    }
  }
}

} // namespace nsparse
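
A hypothetical kernel (not part of this commit) showing how `bitonic_sort_shared` might be driven: each block sorts a fixed 64-element tile staged in shared memory, with short tiles expected to be padded by the caller.

#include <cooperative_groups.h>

#include <nsparse/detail/bitonic.cuh>

__global__ void sort_tiles(unsigned* keys) {
  __shared__ unsigned s_key[64];
  auto block = cooperative_groups::this_thread_block();

  // Stage this block's 64-key tile into shared memory.
  for (unsigned i = threadIdx.x; i < 64; i += blockDim.x)
    s_key[i] = keys[blockIdx.x * 64 + i];

  // Cooperative bitonic sort; the array size must be a power of two.
  nsparse::bitonic_sort_shared<unsigned, 64>(block, s_key);

  // Write the sorted tile back to global memory.
  for (unsigned i = threadIdx.x; i < 64; i += blockDim.x)
    keys[blockIdx.x * 64 + i] = s_key[i];
}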