diff --git a/benchmarks/src/Makefile b/benchmarks/src/Makefile
deleted file mode 100644
index ce403d231..000000000
--- a/benchmarks/src/Makefile
+++ /dev/null
@@ -1,103 +0,0 @@
-# USE G++-10 for baremetal testing, G++-12 for Docker use
-# Check if running within a Docker container
-IS_DOCKER := $(shell test -f /.dockerenv && echo 1)
-
-ifeq ($(IS_DOCKER),1)
-    # If running within a Docker container, use g++-12
-    CC := g++-12
-else
-    # If not running within a Docker container, use g++-10
-    CC := g++-10
-endif
-
-CCSTD = -std=c++20 -march=native -lpthread
-CCFLGS = -pg -g -Wall -Wextra -pedantic -Wno-unused-result -Wparentheses -Wsign-compare
-
-NVCC = nvcc
-
-PROJDIR = $(realpath $(CURDIR))
-SRCDIR = $(PROJDIR)/src
-CPP = $(shell find $(PROJDIR)/src -name '*.cpp')
-MAIN = benchmark.cpp
-BIN = benchsys
-OBJDIR = $(PROJDIR)/obj
-
-# check if nvcc (CUDA compiler) is available
-ifeq ($(shell command -v nvcc -V 2> /dev/null),)
-    HAS_NVCC =
-    OBJ_CUDA =
-
-# has NVCC
-else
-    # set CUDA defs
-    CUDA = $(shell find $(PROJDIR)/src -name '*.cu')
-    HAS_NVCC = -D__HAS_NVCC__
-    NVCC_FLGS = -pg -g -Wno-deprecated-gpu-targets
-    OBJ_CUDA = $(patsubst $(SRCDIR)/%.cu,$(OBJDIR)/%.o,$(CUDA))
-
-$(OBJDIR)/%.o: $(SRCDIR)/%.cu
-	@mkdir -p $(@D)
-	$(NVCC) -c ${HAS_NVCC} $(NVCC_FLGS) $< -o $@
-
-endif
-OBJ_CPP = $(patsubst $(SRCDIR)/%.cpp,$(OBJDIR)/%.o,$(CPP))
-OBJ = $(OBJ_CPP) $(OBJ_CUDA)
-
-$(OBJDIR)/%.o: $(SRCDIR)/%.cpp
-	@mkdir -p $(@D)
-	$(CC) -c $(HAS_NVCC) $(CCSTD) $(CCFLGS) $< -o $@
-
-$(BIN): $(OBJ)
ifeq ($(HAS_NVCC), -D__HAS_NVCC__)
-	$(NVCC) $(MAIN) $(HAS_NVCC) $(NVCC_FLGS) $^ -o $@
-else
-	$(CC) $(MAIN) $(HAS_NVCC) $(CCSTD) $(CCFLGS) $^ -o $@
-endif
-
-run_bench:
-	./${BIN} -b
-
-run_daemon:
-	./${BIN} -d
-
-gprof:
-	gprof ${BIN} gmon.out > gprof.txt
-
-callgrind:
-	valgrind --tool=callgrind ./${BIN}
-	#callgrind_annotate callgrind.out
-
-flamegraph:
-	sudo perf record -g ./${BIN}
-	sudo perf script | sudo ../FlameGraph/stackcollapse-perf.pl | sudo ../FlameGraph/flamegraph.pl > rpi.svg
-
-docker_img:
-	sudo docker build . -t benchmarks
-
-docker_run:
-	sudo docker run --privileged -it benchmarks:latest /bin/bash
-
-gpu_docker_img:
-	sudo nvidia-docker build . -t benchmarks
-
-gpu_docker_run:
-	sudo nvidia-docker run --privileged -it benchmarks:latest /bin/bash
-
-avail_macros:
-	gcc -dM -E -
diff --git a/benchmarks/src/lib/montecarlo.cuh b/benchmarks/src/lib/montecarlo.cuh
deleted file mode 100644
--- a/benchmarks/src/lib/montecarlo.cuh
+++ /dev/null
-#include <curand_kernel.h>
-
-/**
- * @brief Driver for CUDA Monte Carlo method
- * @param dev
- * @param devStates
- */
-void run_gpu_monte_carlo(float *dev, curandState *devStates);
-
-#endif
diff --git a/benchmarks/src/lib/montecarlo.hpp b/benchmarks/src/lib/montecarlo.hpp
deleted file mode 100644
index 206d53a75..000000000
--- a/benchmarks/src/lib/montecarlo.hpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef MONTECARLO_HPP
-#define MONTECARLO_HPP
-#define PI 3.1415926535
-
-/**
- * @brief Monte Carlo method for predicting the number representing pi
- * TODO
- */
-double monte_carlo(int total_trials);
-
-#endif
diff --git a/benchmarks/src/lib/primes.cuh b/benchmarks/src/lib/primes.cuh
deleted file mode 100644
index 71de36061..000000000
--- a/benchmarks/src/lib/primes.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Primality test GPU methods header
- */
-#ifndef __PRIMES_CUH__
-#define __PRIMES_CUH__
-
-#define TRIALS_PER_THREAD 4096
-#define BLOCKS 256
-#define THREADS 256
-#define PI 3.1415926535 // known value of pi
-
-#include <cstdint>
-#include
-
-/**
- * @brief Driver for CUDA Miller Rabin method
- * @param
- * @param
- */
-void run_gpu_miller_rabin(const uint32_t *input,
-                          bool *output,
-                          int iters,
-                          int threads,
-                          int blocks);
-
-#endif
diff --git a/benchmarks/src/lib/primes.hpp b/benchmarks/src/lib/primes.hpp
deleted file mode 100644
index 38dfac6e9..000000000
--- a/benchmarks/src/lib/primes.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef PRIMES_HPP
-#define PRIMES_HPP
-#include <cstdint>
-
-/**
- * @brief Utility modulo multiply function finding (a * b) % m
- *
- * @param a first operand of the multiplication
- * @param b second operand of the multiplication
- * @param m modulus used to limit the result
- *
- * @return result of (a * b) % m
- */
-uint32_t mod_mul(uint32_t a, uint32_t b, uint32_t m);
-
-/**
- * @brief Utility modulo power function finding (a ^ b) % m
- *
- * @param a base for exponentiation
- * @param b exponent
- * @param m modulus used to limit the result
- *
- * @return result of (a^b) % m
- */
-uint32_t mod_pow(uint32_t a, uint32_t b, uint32_t m);
-
-/**
- * @brief witness utility function for miller_rabin checks if a given witness
- * 'a' indicates that 'n' is composite
- *
- * @param n number being tested for primality
- * @param d value 'd' computed from 'n' during the primality test
- * @param a witness value being tested
- * @param s number of times 'd' can be divided by 2 (s = log2(d))
- *
- * @return true/false (bool)
- */
-bool witness(uint32_t n, uint32_t d, uint32_t a, uint32_t s);
-
-/**
- * @brief Modified primes algorithm
- *
- * @param n target number (uint64_t)
- * @param iters iterations determine accuracy (uint64_t)
- *
- * return true/false (bool)
- */
-bool miller_rabin(uint32_t n, uint32_t iters);
-
-#endif
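
For context, one way the primality API deleted above can be exercised from the CPU side — a minimal, hypothetical usage sketch (the include path, bound, and iteration count are illustrative assumptions, not taken from the repository):

    // Hypothetical usage sketch of the deleted primes API (not repository code).
    // Counts probable primes below a small bound with the Miller-Rabin test.
    #include "lib/primes.hpp"
    #include <cstdint>
    #include <iostream>

    int main() {
        const uint32_t limit = 100000; // illustrative bound
        const uint32_t iters = 10;     // accuracy/work trade-off, mirroring the default in primes.cpp
        uint32_t count = 0;
        for (uint32_t n = 2; n < limit; ++n) {
            count += miller_rabin(n, iters); // bool converts to 0/1
        }
        std::cout << "probable primes below " << limit << ": " << count << std::endl;
        return 0;
    }
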
diff --git a/benchmarks/src/lib/threadpool.hpp b/benchmarks/src/lib/threadpool.hpp
deleted file mode 100644
index 89254f52c..000000000
--- a/benchmarks/src/lib/threadpool.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-#ifndef THREADS_HPP
-#define THREADS_HPP
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <mutex>
-#include <queue>
-#include <thread>
-#include <vector>
-
-class ThreadPool {
-  private:
-    // VECTOR of threads to execute tasks
-    std::vector<std::thread> workers;
-    // QUEUE of tasks to be executed
-    std::queue<std::function<void()>> tasks;
-    // MUTEX synchronizing access to the QUEUE of tasks
-    std::mutex queue_mutex;
-    // CONDITIONAL to notify waiting threads when queue gets populated
-    std::condition_variable condition;
-    // BOOL indicating if ThreadPool should stop execution
-    bool stop;
-
-  public:
-    /**
-     * @brief Default constructor that creates a ThreadPool with the number
-     * of threads
-     */
-    ThreadPool() : ThreadPool(std::thread::hardware_concurrency()) {
-    }
-
-    /**
-     * @brief Constructs a ThreadPool with a given number of worker
-     * threads to dispatch functions.
-     * @param numThreads The number of worker threads to be created.
-     * @details Constructs a ThreadPool object with the specified number of
-     * worker threads.
-     */
-    explicit ThreadPool(int numThreads) : stop(false) {
-
-        // traverse through the number of threads specified
-        for (int i = 0; i < numThreads; ++i) {
-            // add a new thread to the vector storing workers using lambda
-            // function
-            workers.emplace_back([this] {
-                for (;;) {
-                    // worker thread creates task object that holds next task to
-                    // be executed
-                    std::function<void()> task_obj;
-
-                    // this "symbolizes" the critical section of the TheadPool
-                    // class
-                    {
-                        // worker thread locks queue_mutex
-                        std::unique_lock<std::mutex> lock(this->queue_mutex);
-                        // wait on conditional_variable (ThreadPool stop OR
-                        // queued task), wait() locks/unlocks based on condition
-                        // result
-                        this->condition.wait(lock, [this] {
-                            return this->stop || !this->tasks.empty();
-                        });
-                        // based on stop OR awaiting tasks, return from the
-                        // thread
-                        if (this->stop && this->tasks.empty()) {
-                            return;
-                        }
-
-                        // if above isnt met, move first task in TASKS queue to
-                        // the task object to transfer ownership
-                        task_obj = std::move(this->tasks.front());
-
-                        // pop the handed off task to make room for a new one.
-                        // only ONE thread should remove a task from the queue
-                        // at a time
-                        this->tasks.pop();
-                    }
-
-                    // EXECUTE THE HANDED OFF TASK
-                    task_obj();
-                }
-            });
-        }
-    }
-
-    /**
-     * @brief Enqueues a task to the thread pool.
-     * @tparam F Type of the function to be enqueued.
-     * @tparam Args Variadic template parameter pack of the arguments
-     * passed to the function.
-     * @param f Function to be enqueued.
-     * @param args Arguments passed to the function
-     * @return std::future<typename std::result_of<F(Args...)>::type> A
-     * future object that will contain the result of the function
-     * execution.
-     * @throw std::runtime_error If the ThreadPool has already been
-     * stopped.
-     */
-    template <class F, class... Args>
-    auto enqueue(F &&f, Args &&...args)
-        -> std::future<typename std::result_of<F(Args...)>::type> {

-        // this is the return type of the passed in function
-        using return_type = typename std::result_of<F(Args...)>::type;
-        // * SHARED POINTER to PACKAGED TASK used to store the passed in i
-        // function + its arguments
-        // * std::bind used to create function object binded to the
-        // function `f` + its args to the packaged tasks
-        // * std::forward used for forwarding an argument to another
-        // function
-        auto task = std::make_shared<std::packaged_task<return_type()>>(
-            std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-
-        // the FUTURE obj retrieves the return value of the function passed in
-        std::future<return_type> res = task->get_future();
-        {
-            // aquire lock on queue_mutex for synchronization
-            std::unique_lock<std::mutex> lock(queue_mutex);
-            // check if threadpool stop is initiated
-            if (stop) {
-                throw std::runtime_error("enqueue on stopped ThreadPool");
-            }
-            // add a task using emplace to the queue as a lambda that calls the
-            // packaged task
-            tasks.emplace([task]() { (*task)(); });
-        } // once this is hit, unique_lock is out of scope & mutex is
-          // automatically unlocked
-        // notify one waiting thread of one new task added to the queue
-        condition.notify_one();
-        // the return is the future object
-        return res;
-    }
-
-    ~ThreadPool() {
-        {
-            // lock queue_mutex & set stop to true
-            std::unique_lock<std::mutex> lock(queue_mutex);
-            stop = true;
-        }
-        // unblock all threads
-        condition.notify_all();
-        // treaverse threads and join
-        for (std::thread &worker : workers) {
-            worker.join();
-        }
-    }
-};
-
-/**
- * @brief A class that provides a function to dispatch a function call to a
- * thread pool and return a future object for obtaining the result.
- */
-class ThreadDispatch {
-  public:
-    /**
-     * @brief Dispatches a function call to a ThreadPool and returns a
-     * future object for obtaining the result.
-     * @tparam Function The type of the function to be dispatched.
-     * @tparam Args The types of the arguments to be passed to the
-     * function.
-     * @param pool The ThreadPool object to which the function call is
-     * dispatched.
-     * @param func The function to be dispatched.
-     * @param args The arguments to be passed to the function.
-     * @return A future object for obtaining the result of the dispatched
-     * function call.
-     */
-    template <typename Function, typename... Args>
-    auto dispatch(ThreadPool &pool, Function &&func, Args &&...args)
-        -> std::future<typename std::result_of<Function(Args...)>::type> {
-
-        // enqueue the function call to the thread pool
-        auto result = pool.enqueue(std::forward<Function>(func),
-                                   std::forward<Args>(args)...);
-
-        // return the future object to get the result later
-        return result;
-    }
-};
-
-#endif
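
The ThreadPool/ThreadDispatch pair above is self-contained, so a short usage sketch may help; this is a hypothetical example (task, thread count, and values are illustrative, not from the repository):

    // Hypothetical usage sketch of the deleted ThreadPool API (illustrative only).
    #include "lib/threadpool.hpp"
    #include <future>
    #include <iostream>
    #include <vector>

    int main() {
        ThreadPool pool(4);        // four worker threads
        ThreadDispatch dispatcher; // thin forwarding wrapper around ThreadPool::enqueue

        // Enqueue a few tasks; each call returns a std::future holding the result.
        std::vector<std::future<int>> results;
        for (int i = 0; i < 8; ++i) {
            results.push_back(dispatcher.dispatch(pool, [](int x) { return x * x; }, i));
        }

        // Block on each future; the pool joins its workers when it goes out of scope.
        for (auto &f : results) {
            std::cout << f.get() << ' ';
        }
        std::cout << std::endl;
        return 0;
    }
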
diff --git a/benchmarks/src/src/fourier.cpp b/benchmarks/src/src/fourier.cpp
deleted file mode 100644
index 769d5ede5..000000000
--- a/benchmarks/src/src/fourier.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Fast & Discrete Fourier Transforms
- */
-
-#include <cmath>
-#include <complex>
-#include <iostream>
-#include <vector>
-
-// compute the Discrete Fourier Transform (DFT) of a sequence
-std::vector<std::complex<double>>
-DFT(const std::vector<std::complex<double>> &x) {
-    int N = x.size(); // Size of the input sequence
-    std::vector<std::complex<double>> X(N);
-
-    for (int k = 0; k < N; k++) {
-        X[k] = 0;
-        for (int n = 0; n < N; n++) {
-            std::complex<double> exp_term =
-                std::polar(1.0, -2 * M_PI * k * n / N);
-            X[k] += x[n] * exp_term;
-        }
-    }
-
-    return X;
-}
-
-/*
-int main() {
-    std::vector<std::complex<double>> input_signal =
-        {1.0, 2.0, 3.0, 4.0, 9.0, 1.0, 20.0, 11.0};
-    std::vector<std::complex<double>> result = DFT(input_signal);
-
-    // Print the DFT result
-    for (int k = 0; k < result.size(); k++) {
-        std::cout << "X[" << k << "] = " << result[k] << std::endl;
-    }
-
-    return 0;
-}*/
diff --git a/benchmarks/src/src/montecarlo.cpp b/benchmarks/src/src/montecarlo.cpp
deleted file mode 100644
index 014394962..000000000
--- a/benchmarks/src/src/montecarlo.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * A Monte Carlo method is really a problem dealing with random distribution
- * and random sampling related technique. In this case and within this file
- * we use a "Monte Carlo Method" from what is talked about in the video below
- * specifically when the author talks about predicting the number pi π using
- * random sampling by "dropping marbles" into a square and determining how
- * many are dropped within 1/4 of the square. The number of successes divided
- * by the number of total attempts will our prediction of pi π
- *
- * https://www.youtube.com/watch?v=7ESK5SaP-bc
- */
-#include "../lib/montecarlo.hpp"
-#include <random>
-#include
-#include
-#include
-
-#define PI 3.1415926535
-
-double monte_carlo(int total_trials) {
-    int successes = 0;
-    double x, y;
-
-    // Initialize a random number generator
-    // TODO set a seed to be used for this PRNG and in montecarlo.cu
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<double> dis(0.0, 1.0);
-
-    for (int i = 0; i < total_trials; i++) {
-        x = dis(gen);
-        y = dis(gen);
-        // Check if the point is inside the unit circle
-        successes += (x * x + y * y <= 1.0);
-    }
-
-    // Estimate pi
-    double predicted_pi = 4.0 * static_cast<double>(successes) / total_trials;
-
-    return predicted_pi;
-}
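
The TODO in monte_carlo() above asks for an explicit seed shared with the GPU path; a minimal sketch of what a seeded variant could look like (the function name and seed parameter are hypothetical, not part of the deleted code):

    // Hypothetical seeded variant of monte_carlo(); sketch only.
    #include <cstdint>
    #include <random>

    double monte_carlo_seeded(int total_trials, uint64_t seed) {
        std::mt19937_64 gen(seed); // fixed seed -> reproducible estimate
        std::uniform_real_distribution<double> dis(0.0, 1.0);

        int successes = 0;
        for (int i = 0; i < total_trials; i++) {
            double x = dis(gen);
            double y = dis(gen);
            successes += (x * x + y * y <= 1.0); // point falls inside the quarter circle
        }
        return 4.0 * static_cast<double>(successes) / total_trials;
    }

The same seed value could then be passed as the first argument of curand_init() in the GPU kernel below (which currently hard-codes 1234) to keep the two estimators comparable.
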
diff --git a/benchmarks/src/src/montecarlo_gpu.cu b/benchmarks/src/src/montecarlo_gpu.cu
deleted file mode 100644
index 0681563d1..000000000
--- a/benchmarks/src/src/montecarlo_gpu.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "../lib/montecarlo.cuh"
-#include <cstdlib>
-#include <curand_kernel.h>
-#include
-#include
-#include
-#include
-
-__global__ void gpu_monte_carlo(float *estimate, curandState *states) {
-    unsigned int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
-    int points_in_circle = 0;
-    float x, y;
-    // Initialize CURAND
-    curand_init(1234, thread_id, 0, &states[thread_id]);
-
-    for (int i = 0; i < TRIALS_PER_THREAD; i++) {
-        x = curand_uniform(&states[thread_id]);
-        y = curand_uniform(&states[thread_id]);
-        points_in_circle +=
-            (x * x + y * y <= 1.0f); // count if x & y is in the circle.
-    }
-    estimate[thread_id] = 4.0f * points_in_circle /
-                          (float)TRIALS_PER_THREAD; // return estimate of pi
-}
-
-float host_monte_carlo(long trials) {
-    float x, y;
-    long points_in_circle = 0;
-    for (long i = 0; i < trials; i++) {
-        x = rand() / (float)RAND_MAX;
-        y = rand() / (float)RAND_MAX;
-        points_in_circle += (x * x + y * y <= 1.0f);
-    }
-    return 4.0f * points_in_circle / trials;
-}
-
-void run_gpu_monte_carlo(float *dev, curandState *devStates) {
-
-    gpu_monte_carlo<<<BLOCKS, THREADS>>>(dev, devStates);
-}
diff --git a/benchmarks/src/src/mtx.cpp b/benchmarks/src/src/mtx.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmarks/src/src/mtx.cu b/benchmarks/src/src/mtx.cu
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmarks/src/src/primes.cpp b/benchmarks/src/src/primes.cpp
deleted file mode 100644
index 128d6383a..000000000
--- a/benchmarks/src/src/primes.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "../lib/primes.hpp"
-#include <cstdint>
-#include <cstdlib>
-#include
-
-uint32_t mod_mul(uint32_t a, uint32_t b, uint32_t m) {
-    uint32_t res = 0;
-    while (b > 0) {
-        if (b & 1) {
-            res = (res + a) % m;
-        }
-        a = (2 * a) % m;
-        b >>= 1;
-    }
-    return res;
-}
-
-uint32_t mod_pow(uint32_t a, uint32_t b, uint32_t m) {
-    uint32_t res = 1;
-
-    a %= m;
-    while (b > 0) {
-        if (b & 1) {
-            res = mod_mul(res, a, m);
-        }
-        a = mod_mul(a, a, m);
-        b >>= 1;
-    }
-    return res;
-}
-
-bool witness(uint32_t n, uint32_t d, uint32_t a, uint32_t s) {
-    uint32_t x = mod_pow(a, d, n);
-
-    // likely prime, return false
-    if (x == 1 || x == n - 1) {
-        return false;
-    }
-
-    for (uint32_t r = 1; r < s; r++) {
-        x = mod_mul(x, x, n);
-        if (x == n - 1) {
-            return false;
-        }
-    }
-    return true;
-}
-
-bool miller_rabin(uint32_t n, uint32_t iters = 10) {
-    if (n < 2) {
-        return false;
-    }
-    if (n == 2 || n == 3) {
-        return true;
-    }
-    if (n % 2 == 0) {
-        return false;
-    }
-    uint32_t d = n - 1, s = 0;
-    while (d % 2 == 0) {
-        d /= 2;
-        s++;
-    }
-    for (uint32_t i = 0; i < iters; i++) {
-        uint32_t a = rand() % (n - 3) + 2;
-        if (witness(n, d, a, s)) {
-            return false;
-        }
-    }
-    return true;
-}
diff --git a/benchmarks/src/src/primes_gpu.cu b/benchmarks/src/src/primes_gpu.cu
deleted file mode 100644
index aac8c329d..000000000
--- a/benchmarks/src/src/primes_gpu.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "../lib/primes.cuh"
-#include <curand_kernel.h>
-#include
-#include
-#include
-#include
-
-__device__ uint32_t gpu_mod_mul(uint32_t a, uint32_t b, uint32_t m) {
-    uint32_t res = 0;
-    while (b > 0) {
-        if (b & 1) {
-            res = (res + a) % m;
-        }
-        a = (2 * a) % m;
-        b >>= 1;
-    }
-    return res;
-}
-
-__device__ uint32_t gpu_mod_pow(uint32_t a, uint32_t b, uint32_t m) {
-    uint32_t res = 1;
-
-    a %= m;
-    while (b > 0) {
-        if (b & 1) {
-            res = gpu_mod_mul(res, a, m);
-        }
-        a = gpu_mod_mul(a, a, m);
-        b >>= 1;
-    }
-    return res;
-}
-
-__device__ bool gpu_witness(uint32_t n, uint32_t d, uint32_t a, uint32_t s) {
-    uint32_t x = gpu_mod_pow(a, d, n);
-
-    if (x == 1 || x == n - 1) {
-        return false;
-    }
-
-    for (uint32_t r = 1; r < s; r++) {
-        x = gpu_mod_mul(x, x, n);
-        if (x == n - 1) {
-            return false;
-        }
-    }
-    return true;
-}
-
-__global__ void
-miller_rabin_kernel(const uint32_t *input, bool *output, int iters) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    uint32_t num = input[idx];
-    uint32_t d = num - 1, s = 0;
-
-    if (num < 2) {
-        output[idx] = false;
-        return;
-    }
-    if (num == 2 || num == 3) {
-        output[idx] = true;
-        return;
-    }
-    if (num % 2 == 0) {
-        output[idx] = false;
-        return;
-    }
-
-    while (d % 2 == 0) {
-        d /= 2;
-        s++;
-    }
-
-    curandState state;
-    curand_init(clock64(), idx, 0, &state);
-
-    for (int i = 0; i < iters; i++) {
-        uint32_t a = curand(&state) % (num - 3) + 2;
-        if (gpu_witness(num, d, a, s)) {
-            output[idx] = false;
-            return;
-        }
-    }
-
-    output[idx] = true;
-}
-
-void run_gpu_miller_rabin(const uint32_t *input,
-                          bool *output,
-                          int iters,
-                          int threads,
-                          int blocks) {
-
-    miller_rabin_kernel<<<blocks, threads>>>(input, output, iters);
-}
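
For completeness, the GPU wrapper above can be driven from the host roughly as follows — a hypothetical sketch assuming the BLOCKS/THREADS defaults from primes.cuh and the CUDA runtime API; the include path and candidate values are illustrative and error checking is omitted:

    // Hypothetical host-side driver for the deleted run_gpu_miller_rabin().
    #include "lib/primes.cuh"
    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const int n = BLOCKS * THREADS; // one candidate per launched thread
        uint32_t *h_in = new uint32_t[n];
        bool *h_out = new bool[n];
        for (int i = 0; i < n; ++i) {
            h_in[i] = 3 + 2 * i; // odd candidates only
        }

        uint32_t *d_in;
        bool *d_out;
        cudaMalloc((void **)&d_in, n * sizeof(uint32_t));
        cudaMalloc((void **)&d_out, n * sizeof(bool));
        cudaMemcpy(d_in, h_in, n * sizeof(uint32_t), cudaMemcpyHostToDevice);

        run_gpu_miller_rabin(d_in, d_out, /*iters=*/10, THREADS, BLOCKS);
        cudaMemcpy(h_out, d_out, n * sizeof(bool), cudaMemcpyDeviceToHost);

        int primes = 0;
        for (int i = 0; i < n; ++i) {
            primes += h_out[i];
        }
        std::printf("%d probable primes out of %d candidates\n", primes, n);

        cudaFree(d_in);
        cudaFree(d_out);
        delete[] h_in;
        delete[] h_out;
        return 0;
    }
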
diff --git a/benchmarks/src/src/sys.cpp b/benchmarks/src/sys.cpp
similarity index 100%
rename from benchmarks/src/src/sys.cpp
rename to benchmarks/src/sys.cpp
diff --git a/benchmarks/src/lib/sys.hpp b/benchmarks/src/sys.hpp
similarity index 100%
rename from benchmarks/src/lib/sys.hpp
rename to benchmarks/src/sys.hpp
diff --git a/experiment/blas1.c b/experiment/blas/blas1.c
similarity index 100%
rename from experiment/blas1.c
rename to experiment/blas/blas1.c
diff --git a/experiment/blas2.c b/experiment/blas/blas2.c
similarity index 100%
rename from experiment/blas2.c
rename to experiment/blas/blas2.c
diff --git a/experiment/csv.cpp b/experiment/csv/csv.cpp
similarity index 100%
rename from experiment/csv.cpp
rename to experiment/csv/csv.cpp
diff --git a/experiment/csv2.cpp b/experiment/csv/csv2.cpp
similarity index 100%
rename from experiment/csv2.cpp
rename to experiment/csv/csv2.cpp
diff --git a/experiment/csv3.cpp b/experiment/csv/csv3.cpp
similarity index 100%
rename from experiment/csv3.cpp
rename to experiment/csv/csv3.cpp
diff --git a/experiment/csv_intrin.cpp b/experiment/csv/csv_intrin.cpp
similarity index 100%
rename from experiment/csv_intrin.cpp
rename to experiment/csv/csv_intrin.cpp
diff --git a/experiment/sigproc.cpp b/experiment/sigproc/sigproc.cpp
similarity index 100%
rename from experiment/sigproc.cpp
rename to experiment/sigproc/sigproc.cpp
diff --git a/experiment/sigproc.hpp b/experiment/sigproc/sigproc.hpp
similarity index 100%
rename from experiment/sigproc.hpp
rename to experiment/sigproc/sigproc.hpp
diff --git a/experiment/t_sigproc.cpp b/experiment/sigproc/t_sigproc.cpp
similarity index 100%
rename from experiment/t_sigproc.cpp
rename to experiment/sigproc/t_sigproc.cpp