From 7cbf4de9264c099c67d019fb4433faf8ac4fc5af Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 14 Nov 2021 23:29:20 +0100 Subject: [PATCH 01/67] BLAS library WIP --- CMakeLists.txt | 8 ++- interface/Apfp.cpp | 3 +- interface/Apfp.h | 2 + interface/ApfpBlas.cpp | 144 +++++++++++++++++++++++++++++++++++++++++ interface/ApfpBlas.h | 25 +++++++ 5 files changed, 179 insertions(+), 3 deletions(-) create mode 100644 interface/ApfpBlas.cpp create mode 100644 interface/ApfpBlas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f864d3..901367e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ add_library(simulation ${APFP_KERNEL_FILES}) target_compile_options(simulation PRIVATE -Wno-unknown-pragmas -DAP_INT_MAX_W=${APFP_MAX_BITS}) target_link_libraries(simulation ${CMAKE_THREAD_LIBS_INIT}) -add_library(ApfpHostlib SHARED interface/Apfp.cpp) +add_library(ApfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp) target_link_libraries(ApfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) target_compile_definitions(ApfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) @@ -75,4 +75,8 @@ add_executable(UnitTests host/UnitTests.cpp) target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp simulation) add_test(UnitTests UnitTests) -install(TARGETS ApfpHostlib) \ No newline at end of file +install(TARGETS ApfpHostlib) +install(FILES + interface/Apfp.h + interface/ApfpBlas.h + DESTINATION include/apfp) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 9c6b4b6..e317213 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -31,7 +31,8 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De if (a.cols() != b.rows() || result->rows() != a.rows() || result->cols() != b.cols()) { throw std::logic_error("Matrix dimension mismatch"); } - auto kernel = program_->MakeKernel(::MatrixMultiplication, "MatrixMultiplication", a.buffer_, b.buffer_, result->buffer_, result->buffer_, + auto kernel = 
program_->MakeKernel("MatrixMultiplication", a.buffer_, b.buffer_, result->buffer_, result->buffer_, + // auto kernel = program_->MakeKernel(::MatrixMultiplication, "MatrixMultiplication", a.buffer_, b.buffer_, result->buffer_, result->buffer_, static_cast(a.rows()), static_cast(b.rows()), static_cast(result->cols())); kernel.ExecuteTask(); } diff --git a/interface/Apfp.h b/interface/Apfp.h index f17d39e..9df1057 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -7,6 +7,8 @@ #include "MatrixMultiplication.h" #include "PackedFloat.h" +#include + class DeviceMatrix; /// Object oriented interface for Apfp diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp new file mode 100644 index 0000000..e3febef --- /dev/null +++ b/interface/ApfpBlas.cpp @@ -0,0 +1,144 @@ +#include "ApfpBlas.h" +#include "Apfp.h" +#include "Config.h" +#include +#include + +static std::optional apfp; + +enum ApfpBlasUplo : char { + upper = 'U', + lower = 'L' +}; + +enum ApfpBlasTrans : char { + normal = 'N', + transpose = 'T', +}; + +int ApfpInit(unsigned int precision) { + try { + if (precision > kBits) { + // Requested bit width too large + return ApfpBlasError::bitwidth; + } + apfp.emplace(); + return ApfpBlasError::success; + }catch(...) { + // Unknown exception + return ApfpBlasError::unknown; + } +} + +int ApfpFinalize() { + apfp.reset(); + return ApfpBlasError::success; +} + +/// Copy the upper or lower triangle from an NxN matrix A to a full size buffer +template +void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { + auto dest_lda = N; + // Col major layout + for (unsigned long j = 0; j < N; ++j) { + for (unsigned long i = 0; i <= j; ++i) { + auto source = uplo == ApfpBlasUplo::lower ? 
A(i + j * LDA) : A(j + i * LDA); + buffer[i + j * dest_lda] = *source; + buffer[j + i * dest_lda] = *source; + } + } +} + +/// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A +template +void CopyToMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { + auto source_lda = N; + // Col major layout + for (unsigned long j = 0; j < N; ++j) { + for (unsigned long i = 0; i <= j; ++i) { + auto dest = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); + *dest = buffer[i + j * source_lda]; + } + } +} + +/// Copy from an NxK matrix A to a full size buffer +template +void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { + auto dest_lda = N; + // Col major layout + for (unsigned long j = 0; j < N; ++j) { + for (unsigned long i = 0; i < K; ++i) { + buffer[i + j * dest_lda] = *A(i + j * LDA); + } + } +} + +/// Copy to an NxK matrix A from a full size buffer +template +void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { + auto source_lda = N; + // Col major layout + for (unsigned long j = 0; j < N; ++j) { + for (unsigned long i = 0; i < K; ++i) { + *A(i + j * LDA) = buffer[i + j * source_lda]; + } + } +} + +template +int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { + try { + // ==== library input validation stuff ====f + if (std::toupper(uplo) != 'U' && std::toupper(uplo) != 'L') { return -1; } + auto uplo_validated = static_cast(uplo); + + if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } + // Let's not worry about this mode with N and K being different meanings for now + if (trans == ApfpBlasTrans::transpose) { + return ApfpBlasError::unimplemented; + } + + // We're not going to support weird edge 
cases + if (N == 0) { return -3; } + if (K == 0) { return -4; } + + if (LDA < N) { return -6; } + if (LDC < N) { return -8; } + + // ==== setup ==== + + std::vector host_a, host_c; + host_a.resize(N*K); + CopyFromMatrix(N, K, A, LDA, host_a.data()); + auto device_a = apfp->AllocateDeviceMatrix(N, K); + device_a.TransferToDevice(host_a.data(), host_a.size()); + + host_c.resize(N*N); + CopyFromMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); + auto device_c = apfp->AllocateDeviceMatrix(N, N); + device_c.TransferToDevice(host_c.data(), host_c.size()); + + // ==== compute and teardown ==== + // apfp.MatrixMultiply() + + device_c.TransferToHost(host_c.data(), host_c.size()); + CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); + } catch (...) { + return ApfpBlasError::unknown; + } + + return ApfpBlasError::success; +} + +/// See netlib's documentation on Syrk for usage. Alpha and beta unsupported +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, const ApfpInterfaceType* A, unsigned long LDA, ApfpInterfaceType* C, unsigned long LDC) { + auto a_ptr_function = [&](unsigned long i) -> const ApfpInterfaceType* { return &(A[i]); }; + auto c_ptr_function = [&](unsigned long i) -> ApfpInterfaceType* { return &(C[i]); }; + return ApfpSyrkTemplate(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); +} + +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { + return ApfpSyrkTemplate(uplo, trans, N, K, A, LDA, C, LDC); +} + diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h new file mode 100644 index 0000000..a078a2c --- /dev/null +++ b/interface/ApfpBlas.h @@ -0,0 +1,25 @@ +#pragma once +#include +#include + +using ApfpInterfaceType = mpf_t; +// +using IndexFunction = std::function; +using ConstIndexFunction = std::function; + +int ApfpInit(unsigned int precision); + +int ApfpFinalize(); + +/// See netlib's documentation on Syrk for 
usage. Alpha and beta unsupported +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, const ApfpInterfaceType* A, unsigned long LDA, ApfpInterfaceType* C, unsigned long LDC); +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); + +enum ApfpBlasError : int { + success = 0, + unknown = 1, + unimplemented = 2, + bitwidth = 3 +}; + + From 2d72035a1fe9aedbe0eb945b37e075c0cc698ca1 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 26 Nov 2021 23:07:06 +0100 Subject: [PATCH 02/67] Empty matrix is a noop --- interface/ApfpBlas.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index e3febef..2a8890b 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -99,13 +99,13 @@ int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, pt return ApfpBlasError::unimplemented; } - // We're not going to support weird edge cases - if (N == 0) { return -3; } - if (K == 0) { return -4; } - if (LDA < N) { return -6; } if (LDC < N) { return -8; } + // Empty matrix no-op + if (N == 0) { return ApfpBlasError::success; } + if (K == 0) { return ApfpBlasError::success; } + // ==== setup ==== std::vector host_a, host_c; From 08461cc2b0a30295319c3fb63e3e70fccb96e7eb Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 26 Nov 2021 23:12:11 +0100 Subject: [PATCH 03/67] put mpf_t in one place --- CMakeLists.txt | 3 ++- interface/Apfp.cpp | 6 +++--- interface/Apfp.h | 6 ++++-- interface/ApfpBlas.h | 2 +- interface/ApfpInterfaceType.cpp | 17 +++++++++++++++++ interface/ApfpInterfaceType.h | 12 ++++++++++++ 6 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 interface/ApfpInterfaceType.cpp create mode 100644 interface/ApfpInterfaceType.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 901367e..73e177a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-54,7 +54,7 @@ add_library(simulation ${APFP_KERNEL_FILES}) target_compile_options(simulation PRIVATE -Wno-unknown-pragmas -DAP_INT_MAX_W=${APFP_MAX_BITS}) target_link_libraries(simulation ${CMAKE_THREAD_LIBS_INIT}) -add_library(ApfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp) +add_library(ApfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp interface/ApfpInterfaceType.cpp) target_link_libraries(ApfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) target_compile_definitions(ApfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) @@ -79,4 +79,5 @@ install(TARGETS ApfpHostlib) install(FILES interface/Apfp.h interface/ApfpBlas.h + interface/ApfpInterfaceType.h DESTINATION include/apfp) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index e317213..e8b86fa 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -45,7 +45,7 @@ DeviceMatrix Apfp::Transpose(const DeviceMatrix& a) { throw std::exception(); } -void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size) { +void DeviceMatrix::TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Source host buffer size smaller than destination device matrix size"); } @@ -55,13 +55,13 @@ void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_ host_buffer.resize(cols() * rows()); std::transform(buffer_ptr, buffer_ptr + host_buffer.size(), host_buffer.begin(), - [](const mpf_t& a) { return PackedFloat(a); }); + [](const ApfpInterfaceType& a) { return PackedFloat(a); }); buffer_.CopyFromHost(0, host_buffer.size() * kLinesPerNumber, reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size) { +void DeviceMatrix::TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { if (rows() * cols() >= buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source device matrix size"); } 
diff --git a/interface/Apfp.h b/interface/Apfp.h index 9df1057..8c8223f 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -7,6 +7,8 @@ #include "MatrixMultiplication.h" #include "PackedFloat.h" +#include "ApfpInterfaceType.h" + #include class DeviceMatrix; @@ -60,9 +62,9 @@ class DeviceMatrix { /// Transfer from the host to the device /// TODO: Make this take input iterators - void TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size); + void TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); /// Transfer from the device to the host /// TODO: Make this take output iterators - void TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size); + void TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); }; diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index a078a2c..4b15f0e 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -1,8 +1,8 @@ #pragma once #include #include +#include "ApfpInterfaceType.h" -using ApfpInterfaceType = mpf_t; // using IndexFunction = std::function; using ConstIndexFunction = std::function; diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp new file mode 100644 index 0000000..920aa6b --- /dev/null +++ b/interface/ApfpInterfaceType.cpp @@ -0,0 +1,17 @@ +#include "ApfpInterfaceType.h" + +void InitApfpInterfaceType(ApfpInterfaceType value, unsigned int precision) { + mpf_init2(value, precision); +} + +void ClearApfpInterfaceType(ApfpInterfaceType value) { + mpf_clear(value); +} + +void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b) { + mpf_swap(a, b); +} + +void SetApfpInterfaceType(ApfpInterfaceType dest, ApfpInterfaceType source) { + mpf_set(dest, source); +} \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h new file mode 100644 index 0000000..660d334 --- /dev/null +++ b/interface/ApfpInterfaceType.h @@ -0,0 +1,12 @@ +#pragma once +#include + +using 
ApfpInterfaceType = mpf_t; + +void InitApfpInterfaceType(ApfpInterfaceType value, unsigned int precision); + +void ClearApfpInterfaceType(ApfpInterfaceType value); + +void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b); + +void SetApfpInterfaceType(ApfpInterfaceType dest, ApfpInterfaceType source); \ No newline at end of file From b531adc3213cd5536ec3df1b44dede4e7121c1f6 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 27 Nov 2021 00:25:48 +0100 Subject: [PATCH 04/67] PackFloat ToMpfr and ToGmp now const --- include/PackedFloat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/PackedFloat.h b/include/PackedFloat.h index 7d96e5a..90e3141 100644 --- a/include/PackedFloat.h +++ b/include/PackedFloat.h @@ -85,7 +85,7 @@ class PackedFloat { return *this; } - inline void ToGmp(mpf_t num) { + inline void ToGmp(mpf_t num) const { const size_t gmp_limbs = (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); constexpr size_t kNumLimbs = kMantissaBytes / sizeof(Limb); // GMP does not allow graceful rounding, so we cannot handle having insufficient bits in the target GMP number @@ -104,7 +104,7 @@ class PackedFloat { } } - inline void ToMpfr(mpfr_t num) { + inline void ToMpfr(mpfr_t num) const { // Copy the most significant bytes, padding zeros if necessary const auto mpfr_limbs = (mpfr_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); const size_t mpfr_bytes = mpfr_limbs * sizeof(mp_limb_t); From a59dd657f0bbf3e94d2896e12db41e352ba6c27c Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 27 Nov 2021 00:37:21 +0100 Subject: [PATCH 05/67] mpf_t/mpfr_t wrapper type --- interface/Apfp.cpp | 47 ++++++++++++++++++++++++++++----- interface/Apfp.h | 10 +++++++ interface/ApfpInterfaceType.cpp | 16 +++++++++-- interface/ApfpInterfaceType.h | 24 +++++++++++++++-- 4 files changed, 87 insertions(+), 10 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 
e8b86fa..51ea031 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -45,7 +45,8 @@ DeviceMatrix Apfp::Transpose(const DeviceMatrix& a) { throw std::exception(); } -void DeviceMatrix::TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { +template +void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Source host buffer size smaller than destination device matrix size"); } @@ -54,17 +55,51 @@ void DeviceMatrix::TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::si std::vector host_buffer; host_buffer.resize(cols() * rows()); - std::transform(buffer_ptr, buffer_ptr + host_buffer.size(), host_buffer.begin(), - [](const ApfpInterfaceType& a) { return PackedFloat(a); }); + for(std::size_t i = 0; i < host_buffer.size(); ++i) { + host_buffer[i] = PackedFloat(*buffer_ptr_func(i)); + } buffer_.CopyFromHost(0, host_buffer.size() * kLinesPerNumber, reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { - if (rows() * cols() >= buffer_size) { +void DeviceMatrix::TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return &buffer_ptr[i]; }, buffer_size); +} + +void DeviceMatrix::TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpfr_t dest) { + packed.ToMpfr(dest); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_t dest) { + packed.ToGmp(dest); +} + +template +void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { + if (rows() * cols() >= buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source 
device matrix size"); } - buffer_.CopyToHost(0, kLinesPerNumber * rows() * cols(), reinterpret_cast(buffer_ptr)); + std::vector host_buffer; + host_buffer.resize(cols() * rows()); + + buffer_.CopyToHost(0, kLinesPerNumber * rows() * cols(), reinterpret_cast(host_buffer.data())); + + ApfpInterfaceWrapper scratch; + for(std::size_t i = 0; i < host_buffer.size(); ++i) { + PackedFloatToInterfaceType(host_buffer[i], *buffer_ptr_func(i)); + } +} + +void DeviceMatrix::TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) { return &(buffer_ptr[i]); }, buffer_size); +} + +void DeviceMatrix::TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); } diff --git a/interface/Apfp.h b/interface/Apfp.h index 8c8223f..411c7b2 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -63,8 +63,18 @@ class DeviceMatrix { /// Transfer from the host to the device /// TODO: Make this take input iterators void TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); + void TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); + /// Transfer from the device to the host /// TODO: Make this take output iterators void TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); + void TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); + + private: + template + void TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size); + + template + void TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size); }; diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 920aa6b..6544cf2 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,6 +1,10 @@ #include "ApfpInterfaceType.h" -void InitApfpInterfaceType(ApfpInterfaceType value, unsigned int 
precision) { +void InitApfpInterfaceType(ApfpInterfaceType value) { + mpf_init(value); +} + +void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned int precision) { mpf_init2(value, precision); } @@ -12,6 +16,14 @@ void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b) { mpf_swap(a, b); } -void SetApfpInterfaceType(ApfpInterfaceType dest, ApfpInterfaceType source) { +void SetApfpInterfaceType(ApfpInterfaceType dest, const ApfpInterfaceType source) { mpf_set(dest, source); +} + +ApfpInterfaceWrapper::~ApfpInterfaceWrapper() { + ClearApfpInterfaceType(data_); +} + +ApfpInterfaceWrapper::ApfpInterfaceWrapper() { + InitApfpInterfaceType(data_); } \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index 660d334..55e809c 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -3,10 +3,30 @@ using ApfpInterfaceType = mpf_t; -void InitApfpInterfaceType(ApfpInterfaceType value, unsigned int precision); +void InitApfpInterfaceType(ApfpInterfaceType value); + +void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned int precision); void ClearApfpInterfaceType(ApfpInterfaceType value); void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b); -void SetApfpInterfaceType(ApfpInterfaceType dest, ApfpInterfaceType source); \ No newline at end of file +void SetApfpInterfaceType(ApfpInterfaceType dest, const ApfpInterfaceType source); + +/// Smart pointer-like wrapper class for GMP/MPFR types +class ApfpInterfaceWrapper { + ApfpInterfaceType data_; + +public: + ~ApfpInterfaceWrapper(); + + ApfpInterfaceWrapper(); + + ApfpInterfaceWrapper(ApfpInterfaceWrapper&) = delete; + + ApfpInterfaceWrapper& operator=(const ApfpInterfaceWrapper&) = delete; + + ApfpInterfaceType* get() { return &data_; } + + const ApfpInterfaceType* get() const { return &data_; } +}; \ No newline at end of file From d11fc10f604fb0732fdd789b8677a7dda1fe1b13 Mon Sep 17 00:00:00 2001 From: Chris 
Pattison Date: Sat, 27 Nov 2021 00:46:28 +0100 Subject: [PATCH 06/67] unsigned long for precision --- interface/ApfpBlas.cpp | 2 +- interface/ApfpBlas.h | 2 +- interface/ApfpInterfaceType.cpp | 2 +- interface/ApfpInterfaceType.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 2a8890b..426af02 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -16,7 +16,7 @@ enum ApfpBlasTrans : char { transpose = 'T', }; -int ApfpInit(unsigned int precision) { +int ApfpInit(unsigned long precision) { try { if (precision > kBits) { // Requested bit width too large diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 4b15f0e..95a3a4d 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -7,7 +7,7 @@ using IndexFunction = std::function; using ConstIndexFunction = std::function; -int ApfpInit(unsigned int precision); +int ApfpInit(unsigned long precision); int ApfpFinalize(); diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 6544cf2..7361f24 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -4,7 +4,7 @@ void InitApfpInterfaceType(ApfpInterfaceType value) { mpf_init(value); } -void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned int precision) { +void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned long precision) { mpf_init2(value, precision); } diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index 55e809c..da35340 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -5,7 +5,7 @@ using ApfpInterfaceType = mpf_t; void InitApfpInterfaceType(ApfpInterfaceType value); -void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned int precision); +void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned long precision); void ClearApfpInterfaceType(ApfpInterfaceType value); From 0745071ac96dfdbf19ca6ae06bfffa21ce6272f4 Mon Sep 17 
00:00:00 2001 From: Chris Pattison Date: Sat, 27 Nov 2021 00:47:55 +0100 Subject: [PATCH 07/67] overload of ApfpInterfaceType constructor with precision specified --- interface/ApfpInterfaceType.cpp | 4 ++++ interface/ApfpInterfaceType.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 7361f24..754da4a 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -26,4 +26,8 @@ ApfpInterfaceWrapper::~ApfpInterfaceWrapper() { ApfpInterfaceWrapper::ApfpInterfaceWrapper() { InitApfpInterfaceType(data_); +} + +ApfpInterfaceWrapper::ApfpInterfaceWrapper(unsigned long precision) { + Init2ApfpInterfaceType(data_, precision); } \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index da35340..d11ca09 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -22,6 +22,8 @@ class ApfpInterfaceWrapper { ApfpInterfaceWrapper(); + ApfpInterfaceWrapper(unsigned long precision); + ApfpInterfaceWrapper(ApfpInterfaceWrapper&) = delete; ApfpInterfaceWrapper& operator=(const ApfpInterfaceWrapper&) = delete; From 1da49df88e23a52e9598c8feb01f27c8980e4162 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 27 Nov 2021 01:12:19 +0100 Subject: [PATCH 08/67] ApfpInterfaceWrapper move semantics --- interface/ApfpInterfaceType.cpp | 14 +++++++++++++- interface/ApfpInterfaceType.h | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 754da4a..e7eec01 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -30,4 +30,16 @@ ApfpInterfaceWrapper::ApfpInterfaceWrapper() { ApfpInterfaceWrapper::ApfpInterfaceWrapper(unsigned long precision) { Init2ApfpInterfaceType(data_, precision); -} \ No newline at end of file +} + + +ApfpInterfaceWrapper::ApfpInterfaceWrapper(ApfpInterfaceWrapper&& other) { + 
SwapApfpInterfaceType(data_, other.data_); + ClearApfpInterfaceType(other.data_); +} + +ApfpInterfaceWrapper& ApfpInterfaceWrapper::operator=(ApfpInterfaceWrapper&& other) { + SwapApfpInterfaceType(data_, other.data_); + ClearApfpInterfaceType(other.data_); + return *this; +} diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index d11ca09..02d75ed 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -24,9 +24,13 @@ class ApfpInterfaceWrapper { ApfpInterfaceWrapper(unsigned long precision); + ApfpInterfaceWrapper(ApfpInterfaceWrapper&&); + ApfpInterfaceWrapper(ApfpInterfaceWrapper&) = delete; ApfpInterfaceWrapper& operator=(const ApfpInterfaceWrapper&) = delete; + + ApfpInterfaceWrapper& operator=(ApfpInterfaceWrapper&&); ApfpInterfaceType* get() { return &data_; } From dd0db458b7019cf89d7f9f24a481f5bec810db9a Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 27 Nov 2021 01:16:46 +0100 Subject: [PATCH 09/67] Fix memory leaks in BLAS library --- interface/ApfpBlas.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 426af02..1c9c727 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -37,57 +37,57 @@ int ApfpFinalize() { /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer template -void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { +void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i <= j; ++i) { auto source = uplo == ApfpBlasUplo::lower ? 
A(i + j * LDA) : A(j + i * LDA); - buffer[i + j * dest_lda] = *source; - buffer[j + i * dest_lda] = *source; + SetApfpInterfaceType(*buffer[i + j * dest_lda].get(), *source); + SetApfpInterfaceType(*buffer[j + i * dest_lda].get(), *source); } } } /// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A template -void CopyToMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { +void CopyToMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i <= j; ++i) { auto dest = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); - *dest = buffer[i + j * source_lda]; + SetApfpInterfaceType(*dest, *buffer[i + j * source_lda].get()); } } } /// Copy from an NxK matrix A to a full size buffer template -void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { +void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i < K; ++i) { - buffer[i + j * dest_lda] = *A(i + j * LDA); + SetApfpInterfaceType(*buffer[i + j * dest_lda].get(), *A(i + j * LDA)); } } } /// Copy to an NxK matrix A from a full size buffer template -void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceType* buffer) { +void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i < K; ++i) { - *A(i + j * LDA) = buffer[i + j * source_lda]; + SetApfpInterfaceType(*A(i + j * LDA), *buffer[i + j * 
source_lda].get()); } } } template -int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { +int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { try { // ==== library input validation stuff ====f if (std::toupper(uplo) != 'U' && std::toupper(uplo) != 'L') { return -1; } @@ -108,7 +108,7 @@ int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, pt // ==== setup ==== - std::vector host_a, host_c; + std::vector host_a, host_c; host_a.resize(N*K); CopyFromMatrix(N, K, A, LDA, host_a.data()); auto device_a = apfp->AllocateDeviceMatrix(N, K); @@ -120,7 +120,7 @@ int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, pt device_c.TransferToDevice(host_c.data(), host_c.size()); // ==== compute and teardown ==== - // apfp.MatrixMultiply() + // apfp.SymmetricRankKUpdate(host_a, host_a, *host_c); device_c.TransferToHost(host_c.data(), host_c.size()); CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); @@ -128,6 +128,8 @@ int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, pt return ApfpBlasError::unknown; } + return ApfpBlasError::unimplemented; + return ApfpBlasError::success; } @@ -135,10 +137,10 @@ int ApfpSyrkTemplate(char uplo, char trans, unsigned long N, unsigned long K, pt int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, const ApfpInterfaceType* A, unsigned long LDA, ApfpInterfaceType* C, unsigned long LDC) { auto a_ptr_function = [&](unsigned long i) -> const ApfpInterfaceType* { return &(A[i]); }; auto c_ptr_function = [&](unsigned long i) -> ApfpInterfaceType* { return &(C[i]); }; - return ApfpSyrkTemplate(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); + return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } int 
ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { - return ApfpSyrkTemplate(uplo, trans, N, K, A, LDA, C, LDC); + return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } From 2dd8b21fb4d9eea972f4e323b5e5a6c4408adec5 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 14 Dec 2021 04:36:30 +0100 Subject: [PATCH 10/67] Matrix Addition dummy --- interface/Apfp.cpp | 6 +++++- interface/Apfp.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 67d23a5..c729b5a 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -36,11 +36,15 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De kernel.ExecuteTask(); } +void Apfp::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { + throw std::exception(); +} + void Apfp::TransposeInPlace(DeviceMatrix*) { throw std::exception(); } -DeviceMatrix Apfp::Transpose(const DeviceMatrix& a) { +DeviceMatrix Apfp::Transpose(const DeviceMatrix&) { throw std::exception(); } diff --git a/interface/Apfp.h b/interface/Apfp.h index 411c7b2..6e4c30f 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -33,6 +33,9 @@ class Apfp { /// Three argument matrix multiply with supplied output buffer void MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + /// Three argument matrix addition with supplied output buffer + void MatrixAddition(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + // Transpose a matrix in place void TransposeInPlace(DeviceMatrix* a); From e4165cbd7bae30e14a43af97195b3e4d67f04ccf Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 14 Dec 2021 04:27:16 +0100 Subject: [PATCH 11/67] mpf_t |-> mpf_ptr in PackedFloat --- include/PackedFloat.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/PackedFloat.h 
b/include/PackedFloat.h index 0e7eeca..213f496 100644 --- a/include/PackedFloat.h +++ b/include/PackedFloat.h @@ -54,7 +54,7 @@ class PackedFloat { } #ifndef HLSLIB_SYNTHESIS // Interoperability with GMP/MPFR, but only on the host side - inline PackedFloat(const mpf_t num) { + inline PackedFloat(mpf_srcptr num) { // Copy the most significant bytes, padding zeros if necessary const auto num_limbs = std::min(size_t(std::abs(num->_mp_size)), (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t))); @@ -80,12 +80,12 @@ class PackedFloat { sign = num->_mpfr_sign < 0; // 1 if negative, 0 otherwise } - inline PackedFloat &operator=(const mpf_t num) { + inline PackedFloat &operator=(mpf_srcptr num) { *this = PackedFloat(num); return *this; } - inline void ToGmp(mpf_t num) const { + inline void ToGmp(mpf_ptr num) { const size_t gmp_limbs = (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); constexpr size_t kNumLimbs = kMantissaBytes / sizeof(Limb); // GMP does not allow graceful rounding, so we cannot handle having insufficient bits in the target GMP number From 7da55beccae7d1668142f5717fa9994a28401a15 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 14 Dec 2021 04:41:43 +0100 Subject: [PATCH 12/67] const ToGmp --- include/PackedFloat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/PackedFloat.h b/include/PackedFloat.h index 213f496..e55e874 100644 --- a/include/PackedFloat.h +++ b/include/PackedFloat.h @@ -85,7 +85,7 @@ class PackedFloat { return *this; } - inline void ToGmp(mpf_ptr num) { + inline void ToGmp(mpf_ptr num) const { const size_t gmp_limbs = (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); constexpr size_t kNumLimbs = kMantissaBytes / sizeof(Limb); // GMP does not allow graceful rounding, so we cannot handle having insufficient bits in the target GMP number From aeb9ce1df4ac00bda094cd0d2d5622b39a7550cb Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 
14 Dec 2021 04:55:43 +0100 Subject: [PATCH 13/67] Hostlib takes mpf_ptr. Host transpose/add syrk --- interface/Apfp.cpp | 18 +++++----- interface/Apfp.h | 4 +-- interface/ApfpBlas.cpp | 62 ++++++++++++++++++++++----------- interface/ApfpBlas.h | 6 ++-- interface/ApfpInterfaceType.cpp | 10 +++--- interface/ApfpInterfaceType.h | 17 +++++---- 6 files changed, 71 insertions(+), 46 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index c729b5a..c842e57 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -59,26 +59,26 @@ void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std:: host_buffer.resize(cols() * rows()); for(std::size_t i = 0; i < host_buffer.size(); ++i) { - host_buffer[i] = PackedFloat(*buffer_ptr_func(i)); + host_buffer[i] = PackedFloat(buffer_ptr_func(i)); } buffer_.CopyFromHost(0, host_buffer.size() * kLinesPerNumber, reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { - TransferToDeviceImpl([&](std::size_t i) { return &buffer_ptr[i]; }, buffer_size); +void DeviceMatrix::TransferToDevice(ApfpInterfaceTypeConstPtr buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr + i; }, buffer_size); } void DeviceMatrix::TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); } -void PackedFloatToInterfaceType(const PackedFloat& packed, mpfr_t dest) { +void PackedFloatToInterfaceType(const PackedFloat& packed, mpfr_ptr dest) { packed.ToMpfr(dest); } -void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_t dest) { +void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_ptr dest) { packed.ToGmp(dest); } @@ -95,14 +95,14 @@ void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::si ApfpInterfaceWrapper scratch; for(std::size_t i = 0; i < 
host_buffer.size(); ++i) { - PackedFloatToInterfaceType(host_buffer[i], *buffer_ptr_func(i)); + PackedFloatToInterfaceType(host_buffer[i], buffer_ptr_func(i)); } } -void DeviceMatrix::TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size) { - TransferToHostImpl([&](std::size_t i) { return &(buffer_ptr[i]); }, buffer_size); +void DeviceMatrix::TransferToHost(ApfpInterfaceTypePtr buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> ApfpInterfaceTypePtr { return buffer_ptr + i; }, buffer_size); } void DeviceMatrix::TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { - TransferToHostImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); + TransferToHostImpl([&](std::size_t i) -> ApfpInterfaceTypePtr { return buffer_ptr[i].get(); }, buffer_size); } diff --git a/interface/Apfp.h b/interface/Apfp.h index 6e4c30f..65400b7 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -65,13 +65,13 @@ class DeviceMatrix { /// Transfer from the host to the device /// TODO: Make this take input iterators - void TransferToDevice(const ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); + void TransferToDevice(ApfpInterfaceTypeConstPtr buffer_ptr, std::size_t buffer_size); void TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); /// Transfer from the device to the host /// TODO: Make this take output iterators - void TransferToHost(ApfpInterfaceType* buffer_ptr, std::size_t buffer_size); + void TransferToHost(ApfpInterfaceTypePtr buffer_ptr, std::size_t buffer_size); void TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); private: diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 1c9c727..b82d2c7 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -43,8 +43,8 @@ void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i 
<= j; ++i) { auto source = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); - SetApfpInterfaceType(*buffer[i + j * dest_lda].get(), *source); - SetApfpInterfaceType(*buffer[j + i * dest_lda].get(), *source); + SetApfpInterfaceType(buffer[i + j * dest_lda].get(), source); + SetApfpInterfaceType(buffer[j + i * dest_lda].get(), source); } } } @@ -57,7 +57,7 @@ void CopyToMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, u for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i <= j; ++i) { auto dest = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); - SetApfpInterfaceType(*dest, *buffer[i + j * source_lda].get()); + SetApfpInterfaceType(dest, buffer[i + j * source_lda].get()); } } } @@ -69,7 +69,19 @@ void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsig // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i < K; ++i) { - SetApfpInterfaceType(*buffer[i + j * dest_lda].get(), *A(i + j * LDA)); + SetApfpInterfaceType(buffer[i + j * dest_lda].get(), A(i + j * LDA)); + } + } +} + +/// Copy the transpose of a NxK matrix A to a full size buffer +template +void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { + auto dest_lda = K; + // Col major layout + for (unsigned long j = 0; j < N; ++j) { + for (unsigned long i = 0; i < K; ++i) { + SetApfpInterfaceType(buffer[j + i * dest_lda].get(), A(i + j * LDA)); } } } @@ -81,7 +93,7 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i < K; ++i) { - SetApfpInterfaceType(*A(i + j * LDA), *buffer[i + j * source_lda].get()); + SetApfpInterfaceType(A(i + j * LDA), buffer[i + j * source_lda].get()); } } } @@ -94,10 +106,6 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu auto uplo_validated = 
static_cast(uplo); if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } - // Let's not worry about this mode with N and K being different meanings for now - if (trans == ApfpBlasTrans::transpose) { - return ApfpBlasError::unimplemented; - } if (LDA < N) { return -6; } if (LDC < N) { return -8; } @@ -106,37 +114,51 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu if (N == 0) { return ApfpBlasError::success; } if (K == 0) { return ApfpBlasError::success; } + // Let's not worry about this mode with N and K being different meanings for now + if (trans == ApfpBlasTrans::transpose) { + return ApfpBlasError::unimplemented; + } // ==== setup ==== - - std::vector host_a, host_c; + std::vector host_a, host_a_transpose, host_c; host_a.resize(N*K); CopyFromMatrix(N, K, A, LDA, host_a.data()); auto device_a = apfp->AllocateDeviceMatrix(N, K); device_a.TransferToDevice(host_a.data(), host_a.size()); + host_a_transpose.resize(K*N); + CopyTransposeFromMatrix(N, K, A, LDA, host_a_transpose.data()); + auto device_a_transpose = apfp->AllocateDeviceMatrix(K, N); + device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size()); + host_c.resize(N*N); CopyFromMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); - auto device_c = apfp->AllocateDeviceMatrix(N, N); - device_c.TransferToDevice(host_c.data(), host_c.size()); // ==== compute and teardown ==== - // apfp.SymmetricRankKUpdate(host_a, host_a, *host_c); + auto mul_result = apfp->AllocateDeviceMatrix(N, N); + apfp->MatrixMultiplication(device_a, device_a_transpose, &mul_result); + std::vector host_result; + host_result.resize(N*N); + + mul_result.TransferToHost(host_result.data(), host_result.size()); + + ApfpInterfaceWrapper add_result; + for(unsigned long i = 0; i < host_result.size(); ++i) { + mpf_add(add_result.get(), host_result[i].get(), host_c[i].get()); + SetApfpInterfaceType(host_c[i].get(), add_result.get()); + } - 
device_c.TransferToHost(host_c.data(), host_c.size()); CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); } catch (...) { return ApfpBlasError::unknown; } - return ApfpBlasError::unimplemented; - return ApfpBlasError::success; } /// See netlib's documentation on Syrk for usage. Alpha and beta unsupported -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, const ApfpInterfaceType* A, unsigned long LDA, ApfpInterfaceType* C, unsigned long LDC) { - auto a_ptr_function = [&](unsigned long i) -> const ApfpInterfaceType* { return &(A[i]); }; - auto c_ptr_function = [&](unsigned long i) -> ApfpInterfaceType* { return &(C[i]); }; +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ApfpInterfaceTypeConstPtr A, unsigned long LDA, ApfpInterfaceTypePtr C, unsigned long LDC) { + auto a_ptr_function = [&](unsigned long i) -> ApfpInterfaceTypeConstPtr { return A + i; }; + auto c_ptr_function = [&](unsigned long i) -> ApfpInterfaceTypePtr { return C + i; }; return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 95a3a4d..b7cd248 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -4,15 +4,15 @@ #include "ApfpInterfaceType.h" // -using IndexFunction = std::function; -using ConstIndexFunction = std::function; +using IndexFunction = std::function; +using ConstIndexFunction = std::function; int ApfpInit(unsigned long precision); int ApfpFinalize(); /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, const ApfpInterfaceType* A, unsigned long LDA, ApfpInterfaceType* C, unsigned long LDC); +int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ApfpInterfaceTypeConstPtr A, unsigned long LDA, ApfpInterfaceTypePtr C, unsigned long LDC); int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); enum ApfpBlasError : int { diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index e7eec01..4982bf8 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,22 +1,22 @@ #include "ApfpInterfaceType.h" -void InitApfpInterfaceType(ApfpInterfaceType value) { +void InitApfpInterfaceType(ApfpInterfaceTypePtr value) { mpf_init(value); } -void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned long precision) { +void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision) { mpf_init2(value, precision); } -void ClearApfpInterfaceType(ApfpInterfaceType value) { +void ClearApfpInterfaceType(ApfpInterfaceTypePtr value) { mpf_clear(value); } -void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b) { +void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b) { mpf_swap(a, b); } -void SetApfpInterfaceType(ApfpInterfaceType dest, const ApfpInterfaceType source) { +void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source) { mpf_set(dest, source); } diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index 02d75ed..a875d57 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -2,16 +2,18 @@ #include using ApfpInterfaceType = mpf_t; +using ApfpInterfaceTypePtr = mpf_ptr; +using ApfpInterfaceTypeConstPtr = mpf_srcptr; -void InitApfpInterfaceType(ApfpInterfaceType value); +void 
InitApfpInterfaceType(ApfpInterfaceTypePtr value); -void Init2ApfpInterfaceType(ApfpInterfaceType value, unsigned long precision); +void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision); -void ClearApfpInterfaceType(ApfpInterfaceType value); +void ClearApfpInterfaceType(ApfpInterfaceTypePtr value); -void SwapApfpInterfaceType(ApfpInterfaceType a, ApfpInterfaceType b); +void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b); -void SetApfpInterfaceType(ApfpInterfaceType dest, const ApfpInterfaceType source); +void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source); /// Smart pointer-like wrapper class for GMP/MPFR types class ApfpInterfaceWrapper { @@ -32,7 +34,8 @@ class ApfpInterfaceWrapper { ApfpInterfaceWrapper& operator=(ApfpInterfaceWrapper&&); - ApfpInterfaceType* get() { return &data_; } + // This decays to the pointer type + ApfpInterfaceTypePtr get() { return data_; } - const ApfpInterfaceType* get() const { return &data_; } + ApfpInterfaceTypeConstPtr get() const { return data_; } }; \ No newline at end of file From bfcacd9647dbb46196c3479121857085f5119d5f Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 19 Dec 2021 23:08:58 +0100 Subject: [PATCH 14/67] MPFR BLAS interface --- CMakeLists.txt | 4 +++- interface/ApfpBlas.cpp | 2 +- interface/ApfpInterfaceType.cpp | 30 ++++++++++++++++++++++++++++++ interface/ApfpInterfaceType.h | 11 ++++++++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da5c71f..77294d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.") set(APFP_SEMANTICS "MPFR" CACHE STRING "Which semantics to use for floating point operations [GMP/MPFR].") set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP 
MPFR) +set(APFP_INTERFACE_TYPE "MPFR" CACHE STRING "Which data types to use for the interface [GMP/MPFR].") +set_property(CACHE APFP_INTERFACE_TYPE PROPERTY STRINGS GMP MPFR) # Validation and derived numbers math(EXPR APFP_ALIGNED "${APFP_BITS} % 512") @@ -26,7 +28,7 @@ find_package(MPFR REQUIRED) find_package(GMP REQUIRED) find_package(Threads REQUIRED) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS -DAPFP_${APFP_INTERFACE}_INTERFACE_TYPE") include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} ) configure_file(include/Config.h.in Config.h) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index b82d2c7..21cdb60 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -143,7 +143,7 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu ApfpInterfaceWrapper add_result; for(unsigned long i = 0; i < host_result.size(); ++i) { - mpf_add(add_result.get(), host_result[i].get(), host_c[i].get()); + AddApfpInterfaceType(add_result.get(), host_result[i].get(), host_c[i].get()); SetApfpInterfaceType(host_c[i].get(), add_result.get()); } diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 4982bf8..5b0337d 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,25 +1,55 @@ #include "ApfpInterfaceType.h" void InitApfpInterfaceType(ApfpInterfaceTypePtr value) { +#ifdef APFP_GMP_INTERFACE_TYPE mpf_init(value); +#else + mpfr_init(value); +#endif } void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision) { +#ifdef APFP_GMP_INTERFACE_TYPE mpf_init2(value, precision); +#else + mpfr_init(value); + mpfr_set_prec(value, precision); +#endif } void ClearApfpInterfaceType(ApfpInterfaceTypePtr value) { +#ifdef APFP_GMP_INTERFACE_TYPE mpf_clear(value); +#else + 
mpfr_clear(value); +#endif } void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b) { +#ifdef APFP_GMP_INTERFACE_TYPE mpf_swap(a, b); +#else + mpfr_swap(a, b); +#endif } void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source) { +#ifdef APFP_GMP_INTERFACE_TYPE mpf_set(dest, source); +#else + mpfr_set(dest, source, mpfr_get_default_rounding_mode()); +#endif } +void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_add(dest, a, b); +#else + mpfr_add(dest, a, b, mpfr_get_default_rounding_mode()); +#endif +} + + ApfpInterfaceWrapper::~ApfpInterfaceWrapper() { ClearApfpInterfaceType(data_); } diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index a875d57..eaa0134 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -1,10 +1,17 @@ #pragma once +#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types #include - using ApfpInterfaceType = mpf_t; using ApfpInterfaceTypePtr = mpf_ptr; using ApfpInterfaceTypeConstPtr = mpf_srcptr; +#else +#include +using ApfpInterfaceType = mpfr_t; +using ApfpInterfaceTypePtr = mpfr_ptr; +using ApfpInterfaceTypeConstPtr = mpfr_srcptr; +#endif + void InitApfpInterfaceType(ApfpInterfaceTypePtr value); void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision); @@ -15,6 +22,8 @@ void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b); void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source); +void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest); + /// Smart pointer-like wrapper class for GMP/MPFR types class ApfpInterfaceWrapper { ApfpInterfaceType data_; From b50c80e96f8d6da568cf7d677c8daff74dba0277 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 20 Dec 2021 02:15:42 +0100 Subject: [PATCH 15/67] Add unsigned 
long init and mul to wrapper header --- interface/ApfpInterfaceType.cpp | 15 +++++++++++++++ interface/ApfpInterfaceType.h | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 5b0337d..4b3b555 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -41,6 +41,14 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr s #endif } +void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set(dest, source); +#else + mpfr_set_si(dest, source, mpfr_get_default_rounding_mode()); +#endif +} + void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_add(dest, a, b); @@ -49,6 +57,13 @@ void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr #endif } +void MulApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_mul(dest, a, b); +#else + mpfr_mul(dest, a, b, mpfr_get_default_rounding_mode()); +#endif +} ApfpInterfaceWrapper::~ApfpInterfaceWrapper() { ClearApfpInterfaceType(data_); diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index eaa0134..a057c71 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -22,8 +22,12 @@ void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b); void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source); +void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source); + void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest); +void MulApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest); + /// Smart pointer-like wrapper class for GMP/MPFR types class 
ApfpInterfaceWrapper { ApfpInterfaceType data_; From 4947c85d13d341bc25f7f557b7e644bef84dbbe3 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 20 Dec 2021 03:13:08 +0100 Subject: [PATCH 16/67] Generate takes mpfr_ptr --- host/Random.cpp | 4 ++-- include/Random.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/host/Random.cpp b/host/Random.cpp index c5f58c4..b8776d8 100644 --- a/host/Random.cpp +++ b/host/Random.cpp @@ -29,12 +29,12 @@ __mpfr_struct RandomNumberGenerator::GenerateMpfr() { return num[0]; } -void RandomNumberGenerator::Generate(mpfr_t &num) { +void RandomNumberGenerator::Generate(mpfr_ptr num) { std::unique_lock lock(mutex_); mpfr_urandom(num, state_, kRoundingMode); } -void RandomNumberGenerator::Generate(mpf_t &num) { +void RandomNumberGenerator::Generate(mpf_ptr num) { std::unique_lock lock(mutex_); mpf_urandomb(num, state_, kMantissaBits); } diff --git a/include/Random.h b/include/Random.h index d282693..b1a71cb 100644 --- a/include/Random.h +++ b/include/Random.h @@ -23,13 +23,13 @@ class RandomNumberGenerator { __mpf_struct GenerateGmp(); /// Generate a random GMP number into the specified output variable. - void Generate(mpf_t &); + void Generate(mpf_ptr); /// Generate a random MPFR number. __mpfr_struct GenerateMpfr(); /// Generate a random MPFR into the specified output variable. 
- void Generate(mpfr_t &); + void Generate(mpfr_ptr); private: gmp_randstate_t state_; From d96de48d04b26d0017ab86ae6c05bea7cb17cfdf Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 20 Dec 2021 03:13:30 +0100 Subject: [PATCH 17/67] BLAS syrk unit test --- CMakeLists.txt | 6 +-- host/TestBlas.cpp | 106 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 host/TestBlas.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 77294d6..d9f5cf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ find_package(GMP REQUIRED) find_package(Threads REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS -DAPFP_${APFP_INTERFACE}_INTERFACE_TYPE") -include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} ) +include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} interface) configure_file(include/Config.h.in Config.h) @@ -79,8 +79,8 @@ target_link_libraries(TestHardware apfp simulation ${Vitis_LIBRARIES} ${GMP_LIBR enable_testing() add_test(TestSimulation TestSimulation 4 4 4) add_library(Catch host/Catch.cpp) -add_executable(UnitTests host/UnitTests.cpp) -target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp simulation) +add_executable(UnitTests host/UnitTests.cpp host/TestBlas.cpp) +target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp ApfpHostlib simulation) add_test(UnitTests UnitTests) install(TARGETS ApfpHostlib) diff --git a/host/TestBlas.cpp b/host/TestBlas.cpp new file mode 100644 index 0000000..74007e5 --- /dev/null +++ b/host/TestBlas.cpp @@ -0,0 +1,106 @@ +#include "Config.h" +#include +#include +#include + +// #include "ArithmeticOperations.h" +// #include "Karatsuba.h" +// #include "PackedFloat.h" +#include "Random.h" + +#include "ApfpBlas.h" + +void ApfpSetup() { +#ifdef APFP_GMP_INTERFACE_TYPE + 
mpf_set_default_prec(kMantissaBits); +#else + mpfr_set_default_prec(kMantissaBits); +#endif + ApfpInit(kMantissaBits); +} + +void ApfpTeardown() { + ApfpFinalize(); +} + +bool IsClose(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { + ApfpInterfaceWrapper diff, sum, ratio; +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_sub(diff.get(), a, b); + mpf_add(sum.get(), a, b); + mpf_div(ratio.get(), diff.get(), sum.get()); + auto exp = mpf_get_exp(ratio.get()); +#else + auto rounding_mode = mpfr_get_default_rounding_mode(); + mpfr_sub(diff.get(), a, b, rounding_mode); + mpfr_add(sum.get(), a, b, rounding_mode); + mpfr_div(ratio.get(), diff.get(), sum.get(), rounding_mode); + auto exp = mpfr_get_exp(ratio.get()); +#endif + // Require the numbers to match to the first 90% decimal places + return exp < -((kMantissaBits*3 * 9)/10); +} + +TEST_CASE("SYRK") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + std::array matrix_sizes {1, 8, 15, 16, 31, 32, 33}; + // Test SYRK + // In 'N' mode, we perform AA^T + C + // A is NxK (A : R^K -> R^N) + // C is NxN + // Matrices are stored column major because BLAS + for(auto N : matrix_sizes) { + for(auto K : matrix_sizes) { + std::vector a_matrix; + a_matrix.resize(N*K); + for(auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(N*N); + for(auto& v : c_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(N*N); + + // Compute reference result + ApfpInterfaceWrapper prod_temp, sum_temp; + for(unsigned long j = 0; j < N; ++j) { + // lower half + for(unsigned long i = 0; i < j; ++i) { + auto r_idx = i + j*N; + SetApfpInterfaceType(ref_result[r_idx].get(), c_matrix[r_idx].get()); + + for(unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + MulApfpInterfaceType(prod_temp.get(), a_matrix[i + k*N].get(), a_matrix[j + k*N].get()); + AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result[r_idx].get()); + 
SetApfpInterfaceType(ref_result[r_idx].get(), sum_temp.get()); + } + } + } + + // Use APFP BLAS library + auto error_code = ApfpSyrk('L', 'N', N, K, + [&](unsigned long i) { return a_matrix[i].get(); }, K, + [&](unsigned long i) { return c_matrix[i].get(); }, N); + REQUIRE(error_code == ApfpBlasError::success); + + // Check all entries are sufficiently close + ApfpInterfaceWrapper diff; + for(unsigned long j = 0; j < N; ++j) { + // lower half + for(unsigned long i = 0; i < j; ++i) { + REQUIRE(IsClose(ref_result[i + j*N].get(), c_matrix[i + j*N].get())); + } + } + } + } + + ApfpTeardown(); +} \ No newline at end of file From 4eb48810be1334995e4ec5c00026f519ff84d9d2 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 21 Dec 2021 20:05:34 +0100 Subject: [PATCH 18/67] Blas unit tests in separate executable --- CMakeLists.txt | 5 ++++- host/{TestBlas.cpp => BlasUnitTests.cpp} | 0 2 files changed, 4 insertions(+), 1 deletion(-) rename host/{TestBlas.cpp => BlasUnitTests.cpp} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9f5cf6..368b4cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,10 +79,13 @@ target_link_libraries(TestHardware apfp simulation ${Vitis_LIBRARIES} ${GMP_LIBR enable_testing() add_test(TestSimulation TestSimulation 4 4 4) add_library(Catch host/Catch.cpp) -add_executable(UnitTests host/UnitTests.cpp host/TestBlas.cpp) +add_executable(UnitTests host/UnitTests.cpp) target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp ApfpHostlib simulation) add_test(UnitTests UnitTests) +add_executable(BlasUnitTests host/BlasUnitTests.cpp) +target_link_libraries(BlasUnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp ApfpHostlib simulation) + install(TARGETS ApfpHostlib) install(FILES interface/Apfp.h diff --git a/host/TestBlas.cpp b/host/BlasUnitTests.cpp similarity index 100% rename from host/TestBlas.cpp rename to host/BlasUnitTests.cpp From c49e272659e39eaa5c2b0f262a9abd03d742cc66 Mon Sep 17 00:00:00 2001 From: Chris 
Pattison Date: Wed, 22 Dec 2021 23:18:30 +0100 Subject: [PATCH 19/67] Search for kernel in current working directory --- interface/Apfp.cpp | 13 ++++++++++++- interface/Apfp.h | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index c842e57..7e589ce 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -3,14 +3,25 @@ #include #include +#include #include "Config.h" Apfp::Apfp() { - program_.emplace(context_.MakeProgram(kernel_path_)); + program_.emplace(context_.MakeProgram(FindKernel())); lines_per_number_ = kLinesPerNumber; } +std::string Apfp::FindKernel() { + auto kernel_name = std::filesystem::path("MatrixMultiplication_hw.xclbin"); + { + auto kernel_current_directory = std::filesystem::current_path() / kernel_name; + if(std::filesystem::exists(kernel_current_directory)) { + return kernel_current_directory.string(); + } + } +} + DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { // This seems like poor encapsulation, is there a better way? 
diff --git a/interface/Apfp.h b/interface/Apfp.h index 65400b7..2457f26 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -19,8 +19,8 @@ class Apfp { std::optional program_; std::size_t lines_per_number_; - const std::string kernel_path_ = ""; - + + static std::string FindKernel(); public: Apfp(); From bdd9f35e30a8b3cd9a4548c3dab60171a131a2b4 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 26 Dec 2021 19:33:15 +0100 Subject: [PATCH 20/67] Throw an exception if we can't find the kernel --- interface/Apfp.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 7e589ce..7a1a121 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -20,6 +20,7 @@ std::string Apfp::FindKernel() { return kernel_current_directory.string(); } } + throw std::runtime_error("Unable to find FPGA kernel"); } DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { From adaba04881c39a3d926e86f12168eb6dfbd16146 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 26 Dec 2021 20:37:26 +0100 Subject: [PATCH 21/67] Guard against calling uninitialized library --- interface/ApfpBlas.cpp | 10 +++++++++- interface/ApfpBlas.h | 3 ++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 21cdb60..d8e1001 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -35,6 +35,10 @@ int ApfpFinalize() { return ApfpBlasError::success; } +bool ApfpIsInitialized() { + return apfp.has_value(); +} + /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer template void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { @@ -101,7 +105,11 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne template int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c
C, unsigned long LDC) { try { - // ==== library input validation stuff ====f + // ==== library input validation stuff ==== + if(!ApfpIsInitialized()) { + return ApfpBlasError::uninitialized; + } + if (std::toupper(uplo) != 'U' && std::toupper(uplo) != 'L') { return -1; } auto uplo_validated = static_cast(uplo); diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index b7cd248..954cd71 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -19,7 +19,8 @@ enum ApfpBlasError : int { success = 0, unknown = 1, unimplemented = 2, - bitwidth = 3 + bitwidth = 3, + uninitialized = 4, }; From 13aa0e99ffa36b50b2db1d9627b53ae71165a89a Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 26 Dec 2021 21:11:00 +0100 Subject: [PATCH 22/67] Add mechanism to get ApfpBlas error strings --- interface/ApfpBlas.cpp | 12 ++++++++++-- interface/ApfpBlas.h | 15 ++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index d8e1001..1e8f0ec 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -5,6 +5,7 @@ #include static std::optional apfp; +static std::string last_error_message; enum ApfpBlasUplo : char { upper = 'U', @@ -20,12 +21,14 @@ int ApfpInit(unsigned long precision) { try { if (precision > kBits) { // Requested bit width too large + last_error_message = "Requested bitwidth too large"; return ApfpBlasError::bitwidth; } apfp.emplace(); return ApfpBlasError::success; - }catch(...) 
{ + }catch(const std::exception& e) { // Unknown exception + last_error_message = e.what(); return ApfpBlasError::unknown; } } @@ -39,6 +42,10 @@ bool ApfpIsInitialized() { return apfp.has_value(); } +const char* ApfpErrorDescription() { + return last_error_message.c_str(); +} + /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer template void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { @@ -156,7 +163,8 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu } CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); - } catch (...) { + } catch(const std::exception& e) { + last_error_message = e.what(); return ApfpBlasError::unknown; } diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 954cd71..4276c2f 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -7,6 +7,11 @@ using IndexFunction = std::function; using ConstIndexFunction = std::function; + +/// Null terminated string describing the most recent library error if available +/// Pointer is only guaranteed to live until the next library call +const char* ApfpErrorDescription(); + int ApfpInit(unsigned long precision); int ApfpFinalize(); @@ -16,11 +21,11 @@ int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ApfpInterf int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); enum ApfpBlasError : int { - success = 0, - unknown = 1, - unimplemented = 2, - bitwidth = 3, - uninitialized = 4, + success = 0, + unknown = 1, + unimplemented = 2, + bitwidth = 3, + uninitialized = 4, }; From e2c32d897124815da01e8500833d5389f8ce077e Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 26 Dec 2021 21:11:28 +0100 Subject: [PATCH 23/67] Guard error code for ApfpInit in UnitTests --- host/BlasUnitTests.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 74007e5..25699f0 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -16,7 +16,8 @@ void ApfpSetup() { #else mpfr_set_default_prec(kMantissaBits); #endif - ApfpInit(kMantissaBits); + auto apfp_error_code = ApfpInit(kMantissaBits); + REQUIRE(apfp_error_code == ApfpBlasError::success); } void ApfpTeardown() { From 41f75a844e45dab80749aabef09003ef4a2b70c2 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sun, 26 Dec 2021 21:12:44 +0100 Subject: [PATCH 24/67] More sophisticated kernel search routine --- interface/Apfp.cpp | 50 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 7a1a121..c33530c 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -4,22 +4,60 @@ #include #include +#include +#include #include "Config.h" Apfp::Apfp() { - program_.emplace(context_.MakeProgram(FindKernel())); + auto kernel_path = FindKernel(); + program_.emplace(context_.MakeProgram(kernel_path)); lines_per_number_ = kLinesPerNumber; } std::string Apfp::FindKernel() { - auto kernel_name = std::filesystem::path("MatrixMultiplication_hw.xclbin"); - { - auto kernel_current_directory = std::filesystem::current_path() / kernel_name; - if(std::filesystem::exists(kernel_current_directory)) { - return kernel_current_directory.string(); + { // Specify a path to the APFP kernel manually + char* apfp_kernel_env_var = std::getenv("APFP_KERNEL"); + if(apfp_kernel_env_var != nullptr) { + auto kernel_override_path = std::filesystem::path(apfp_kernel_env_var); + + if (!std::filesystem::exists(kernel_override_path)) { + throw std::runtime_error("APFP kernel path specified with APFP_KERNEL environment variable does not exist"); + } + return kernel_override_path.string(); + } + } + + char* apfp_use_simulation_env_var = std::getenv("APFP_USE_SIMULATION"); + auto apfp_use_simulation = 
apfp_use_simulation_env_var != nullptr && !std::string(apfp_use_simulation_env_var).empty(); + auto kernel_name = std::filesystem::path(apfp_use_simulation ? "MatrixMultiplication_hw_emu.xclbin" : "MatrixMultiplication_hw.xclbin"); + + { // Search for the kernel in /lib, /usr/lib, LD_LIBRARY_PATH, current directory + std::vector search_paths; + // System dirs + search_paths.push_back(std::filesystem::path("/lib")); + search_paths.push_back(std::filesystem::path("/usr/lib")); + + // LD_LIBRARY_PATH + char* ld_library_path_env_var = std::getenv("LD_LIBRARY_PATH"); + auto ld_library_path = (ld_library_path_env_var == nullptr) ? "" : std::string(ld_library_path_env_var); + for(std::size_t begin = 0, end = std::string::npos; begin != end; begin = end) { + end = ld_library_path.find(":", begin); + search_paths.push_back(std::filesystem::path(ld_library_path.substr(begin, end))); + } + + // Current working directory + search_paths.push_back(std::filesystem::current_path()); + + // Search + for(auto candidate_dir : search_paths) { + auto candidate_kernel_path = candidate_dir / kernel_name; + if(std::filesystem::exists(candidate_kernel_path)) { + return candidate_kernel_path.string(); + } } } + throw std::runtime_error("Unable to find FPGA kernel"); } From 4ab5d11e68fb54738a007d6cc6c824be46f46a94 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 27 Dec 2021 00:37:59 +0100 Subject: [PATCH 25/67] Setup/teardown test case --- host/BlasUnitTests.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 25699f0..e4cfaa6 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -42,6 +42,11 @@ bool IsClose(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { return exp < -((kMantissaBits*3 * 9)/10); } +TEST_CASE("Init_Teardown") { + ApfpSetup(); + ApfpTeardown(); +} + TEST_CASE("SYRK") { ApfpSetup(); From 3c37a157bae723c40b611af2cb7e16cdcbcd0a9c Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 
27 Dec 2021 10:01:21 +0100 Subject: [PATCH 26/67] Fix buffer size check on TransferToHost --- interface/Apfp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index c33530c..44d7272 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -134,14 +134,14 @@ void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_ptr dest) { template void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { - if (rows() * cols() >= buffer_size) { + if (rows() * cols() > buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source device matrix size"); } std::vector host_buffer; host_buffer.resize(cols() * rows()); - buffer_.CopyToHost(0, kLinesPerNumber * rows() * cols(), reinterpret_cast(host_buffer.data())); + buffer_.CopyToHost(0, kLinesPerNumber * host_buffer.size(), reinterpret_cast(host_buffer.data())); ApfpInterfaceWrapper scratch; for(std::size_t i = 0; i < host_buffer.size(); ++i) { From 98f472136e5b974e8cb97c4f55ebd771f42ee3a7 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 27 Dec 2021 10:01:56 +0100 Subject: [PATCH 27/67] CopyTransposeFromMatrix destination LDA --- interface/ApfpBlas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 1e8f0ec..b83e640 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -88,7 +88,7 @@ void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsig /// Copy the transpose of a NxK matrix A to a full size buffer template void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { - auto dest_lda = K; + auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i < K; ++i) { From ee113bac88e099ddb42258e0735c5679bb3829da Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: 
Mon, 27 Dec 2021 10:18:09 +0100 Subject: [PATCH 28/67] Blas unit tests pass --- host/BlasUnitTests.cpp | 4 ++-- interface/ApfpBlas.cpp | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index e4cfaa6..c1b1dd1 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -51,7 +51,7 @@ TEST_CASE("SYRK") { ApfpSetup(); auto rng = RandomNumberGenerator(); - std::array matrix_sizes {1, 8, 15, 16, 31, 32, 33}; + std::array matrix_sizes {1, 8, 15, 16, 31, 32, 33, 0}; // Test SYRK // In 'N' mode, we perform AA^T + C // A is NxK (A : R^K -> R^N) @@ -93,7 +93,7 @@ TEST_CASE("SYRK") { // Use APFP BLAS library auto error_code = ApfpSyrk('L', 'N', N, K, - [&](unsigned long i) { return a_matrix[i].get(); }, K, + [&](unsigned long i) { return a_matrix[i].get(); }, N, [&](unsigned long i) { return c_matrix[i].get(); }, N); REQUIRE(error_code == ApfpBlasError::success); diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index b83e640..7a7d4ac 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -78,8 +78,8 @@ template void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto dest_lda = N; // Col major layout - for (unsigned long j = 0; j < N; ++j) { - for (unsigned long i = 0; i < K; ++i) { + for (unsigned long j = 0; j < K; ++j) { + for (unsigned long i = 0; i < N; ++i) { SetApfpInterfaceType(buffer[i + j * dest_lda].get(), A(i + j * LDA)); } } @@ -88,11 +88,11 @@ void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsig /// Copy the transpose of a NxK matrix A to a full size buffer template void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { - auto dest_lda = N; + auto dest_lda = K; // Col major layout - for (unsigned long j = 0; j < N; ++j) { - for (unsigned long i = 0; i < K; ++i) 
{ - SetApfpInterfaceType(buffer[j + i * dest_lda].get(), A(i + j * LDA)); + for (unsigned long j = 0; j < K; ++j) { + for (unsigned long i = 0; i < N; ++i) { + SetApfpInterfaceType(buffer[i * dest_lda + j].get(), A(i + j * LDA)); } } } @@ -102,8 +102,8 @@ template void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { auto source_lda = N; // Col major layout - for (unsigned long j = 0; j < N; ++j) { - for (unsigned long i = 0; i < K; ++i) { + for (unsigned long j = 0; j < K; ++j) { + for (unsigned long i = 0; i < N; ++i) { SetApfpInterfaceType(A(i + j * LDA), buffer[i + j * source_lda].get()); } } @@ -122,6 +122,11 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } + // Let's not worry about this mode with N and K being different meanings for now + if (trans == ApfpBlasTrans::transpose) { + return ApfpBlasError::unimplemented; + } + if (LDA < N) { return -6; } if (LDC < N) { return -8; } @@ -129,10 +134,6 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu if (N == 0) { return ApfpBlasError::success; } if (K == 0) { return ApfpBlasError::success; } - // Let's not worry about this mode with N and K being different meanings for now - if (trans == ApfpBlasTrans::transpose) { - return ApfpBlasError::unimplemented; - } // ==== setup ==== std::vector host_a, host_a_transpose, host_c; host_a.resize(N*K); From 1d53578da9e248337b30af7f8fb362628c1b1cbe Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:21:51 +0100 Subject: [PATCH 29/67] Move interface type to Config.h --- CMakeLists.txt | 2 +- include/Config.h.in | 2 ++ interface/ApfpInterfaceType.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 368b4cb..0919de4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ 
find_package(MPFR REQUIRED) find_package(GMP REQUIRED) find_package(Threads REQUIRED) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS -DAPFP_${APFP_INTERFACE}_INTERFACE_TYPE") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS") include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} interface) configure_file(include/Config.h.in Config.h) diff --git a/include/Config.h.in b/include/Config.h.in index e9e0ab1..957943f 100644 --- a/include/Config.h.in +++ b/include/Config.h.in @@ -7,3 +7,5 @@ constexpr int kTileSizeN = ${APFP_TILE_SIZE_N}; constexpr int kTileSizeM = ${APFP_TILE_SIZE_M}; constexpr auto kBuildDir = "${CMAKE_BINARY_DIR}"; static_assert(kBits % 8 == 0, "Number of bits must be byte-aligned."); + +#define APFP_${APFP_INTERFACE_TYPE}_INTERFACE_TYPE diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index a057c71..b124680 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -1,4 +1,6 @@ #pragma once +#include "Config.h" + #ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types #include using ApfpInterfaceType = mpf_t; From c6a86a7c99b7c874740ae110227d22482e2a7dbc Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:22:09 +0100 Subject: [PATCH 30/67] install kernels to lib --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0919de4..e4a627e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,3 +92,7 @@ install(FILES interface/ApfpBlas.h interface/ApfpInterfaceType.h DESTINATION include/apfp) +install(FILES + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw.xclbin + ${CMAKE_BINARY_DIR}/MatrixMultiplication_sw_emu.xclbin + DESTINATION lib) \ No newline at end of file From 4c228853f121f464ed7fb8b9661db02aab2d795f Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:29:52 +0100 
Subject: [PATCH 31/67] Compile under GMP interface type --- host/BlasUnitTests.cpp | 3 ++- interface/ApfpInterfaceType.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index c1b1dd1..47a1038 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -30,7 +30,8 @@ bool IsClose(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { mpf_sub(diff.get(), a, b); mpf_add(sum.get(), a, b); mpf_div(ratio.get(), diff.get(), sum.get()); - auto exp = mpf_get_exp(ratio.get()); + long exp; + mpf_get_d_2exp(&exp, ratio.get()); #else auto rounding_mode = mpfr_get_default_rounding_mode(); mpfr_sub(diff.get(), a, b, rounding_mode); diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 4b3b555..1fb3aee 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -43,7 +43,7 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr s void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source) { #ifdef APFP_GMP_INTERFACE_TYPE - mpf_set(dest, source); + mpf_set_ui(dest, source); #else mpfr_set_si(dest, source, mpfr_get_default_rounding_mode()); #endif From 2aa28f5555cca065b579ba04187a74788013e851 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:35:49 +0100 Subject: [PATCH 32/67] Fix closeness check in BlasUnitTest for a=b=0 --- host/BlasUnitTests.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 47a1038..60a874d 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -24,7 +24,20 @@ void ApfpTeardown() { ApfpFinalize(); } +bool IsZero(ApfpInterfaceTypeConstPtr a) { +#ifdef APFP_GMP_INTERFACE_TYPE + return mpf_sgn(a) == 0; +#else + return mpfr_sgn(a) == 0; +#endif +} + bool IsClose(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { + // Avoids divide by zero if a = b = 0 + if(IsZero(a) && IsZero(b)) 
{ + return true; + } + ApfpInterfaceWrapper diff, sum, ratio; #ifdef APFP_GMP_INTERFACE_TYPE mpf_sub(diff.get(), a, b); From 47022fd6ee20f4aeaa8c3a09a18427aefc1cc767 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:48:04 +0100 Subject: [PATCH 33/67] Use generators for SYRK test case --- host/BlasUnitTests.cpp | 86 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 60a874d..fb6f89e 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -65,59 +65,59 @@ TEST_CASE("SYRK") { ApfpSetup(); auto rng = RandomNumberGenerator(); - std::array matrix_sizes {1, 8, 15, 16, 31, 32, 33, 0}; + + unsigned long N = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); // Test SYRK // In 'N' mode, we perform AA^T + C // A is NxK (A : R^K -> R^N) // C is NxN // Matrices are stored column major because BLAS - for(auto N : matrix_sizes) { - for(auto K : matrix_sizes) { - std::vector a_matrix; - a_matrix.resize(N*K); - for(auto& v : a_matrix) { - rng.Generate(v.get()); - } + { + std::vector a_matrix; + a_matrix.resize(N*K); + for(auto& v : a_matrix) { + rng.Generate(v.get()); + } - std::vector c_matrix; - c_matrix.resize(N*N); - for(auto& v : c_matrix) { - rng.Generate(v.get()); - } + std::vector c_matrix; + c_matrix.resize(N*N); + for(auto& v : c_matrix) { + rng.Generate(v.get()); + } - std::vector ref_result; - ref_result.resize(N*N); - - // Compute reference result - ApfpInterfaceWrapper prod_temp, sum_temp; - for(unsigned long j = 0; j < N; ++j) { - // lower half - for(unsigned long i = 0; i < j; ++i) { - auto r_idx = i + j*N; - SetApfpInterfaceType(ref_result[r_idx].get(), c_matrix[r_idx].get()); - - for(unsigned long k = 0; k < K; ++k) { - // (AB)_ij = sum_k A(i,k)B(k,j) - MulApfpInterfaceType(prod_temp.get(), a_matrix[i + k*N].get(), a_matrix[j + k*N].get()); - 
AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result[r_idx].get()); - SetApfpInterfaceType(ref_result[r_idx].get(), sum_temp.get()); - } + std::vector ref_result; + ref_result.resize(N*N); + + // Compute reference result + ApfpInterfaceWrapper prod_temp, sum_temp; + for(unsigned long j = 0; j < N; ++j) { + // lower half + for(unsigned long i = 0; i < j; ++i) { + auto r_idx = i + j*N; + SetApfpInterfaceType(ref_result[r_idx].get(), c_matrix[r_idx].get()); + + for(unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + MulApfpInterfaceType(prod_temp.get(), a_matrix[i + k*N].get(), a_matrix[j + k*N].get()); + AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result[r_idx].get()); + SetApfpInterfaceType(ref_result[r_idx].get(), sum_temp.get()); } } + } - // Use APFP BLAS library - auto error_code = ApfpSyrk('L', 'N', N, K, - [&](unsigned long i) { return a_matrix[i].get(); }, N, - [&](unsigned long i) { return c_matrix[i].get(); }, N); - REQUIRE(error_code == ApfpBlasError::success); - - // Check all entries are sufficiently close - ApfpInterfaceWrapper diff; - for(unsigned long j = 0; j < N; ++j) { - // lower half - for(unsigned long i = 0; i < j; ++i) { - REQUIRE(IsClose(ref_result[i + j*N].get(), c_matrix[i + j*N].get())); - } + // Use APFP BLAS library + auto error_code = ApfpSyrk('L', 'N', N, K, + [&](unsigned long i) { return a_matrix[i].get(); }, N, + [&](unsigned long i) { return c_matrix[i].get(); }, N); + REQUIRE(error_code == ApfpBlasError::success); + + // Check all entries are sufficiently close + ApfpInterfaceWrapper diff; + for(unsigned long j = 0; j < N; ++j) { + // lower half + for(unsigned long i = 0; i < j; ++i) { + REQUIRE(IsClose(ref_result[i + j*N].get(), c_matrix[i + j*N].get())); } } } From 8c8e2c2ddf2b821bba9cb947dbd472b051fa8eef Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 05:58:34 +0100 Subject: [PATCH 34/67] Add config.h to install dirs --- CMakeLists.txt | 1 + 1 file changed, 1 
insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4a627e..2890c5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,7 @@ install(FILES interface/Apfp.h interface/ApfpBlas.h interface/ApfpInterfaceType.h + ${CMAKE_BINARY_DIR}/Config.h DESTINATION include/apfp) install(FILES ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw.xclbin From b1449e1f3ee1d183fb3497faef84b7b666766193 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 08:36:16 +0100 Subject: [PATCH 35/67] Support 'T' argument in syrk --- host/BlasUnitTests.cpp | 25 ++++++++++++++++--------- interface/ApfpBlas.cpp | 34 ++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index fb6f89e..e89e9cd 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -68,6 +68,7 @@ TEST_CASE("SYRK") { unsigned long N = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); unsigned long K = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); + char mode = GENERATE('N', 'T'); // Test SYRK // In 'N' mode, we perform AA^T + C // A is NxK (A : R^K -> R^N) @@ -95,21 +96,27 @@ TEST_CASE("SYRK") { // lower half for(unsigned long i = 0; i < j; ++i) { auto r_idx = i + j*N; - SetApfpInterfaceType(ref_result[r_idx].get(), c_matrix[r_idx].get()); + SetApfpInterfaceType(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); for(unsigned long k = 0; k < K; ++k) { - // (AB)_ij = sum_k A(i,k)B(k,j) - MulApfpInterfaceType(prod_temp.get(), a_matrix[i + k*N].get(), a_matrix[j + k*N].get()); - AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result[r_idx].get()); - SetApfpInterfaceType(ref_result[r_idx].get(), sum_temp.get()); + // A is NxK if N, KxN if T + if (mode == 'N') { + // (AB)_ij = sum_k A(i,k)B(k,j) + MulApfpInterfaceType(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); + } else { + // (AB)_ij = sum_k A(i,k) B(k,j) + MulApfpInterfaceType(prod_temp.get(), a_matrix.at(k + i*K).get(), 
a_matrix.at(k + j*K).get()); + } + AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result.at(r_idx).get()); + SetApfpInterfaceType(ref_result.at(r_idx).get(), sum_temp.get()); } } } // Use APFP BLAS library - auto error_code = ApfpSyrk('L', 'N', N, K, - [&](unsigned long i) { return a_matrix[i].get(); }, N, - [&](unsigned long i) { return c_matrix[i].get(); }, N); + auto error_code = ApfpSyrk('L', mode, N, K, + [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == 'N' ? N : K, + [&](unsigned long i) { return c_matrix.at(i).get(); }, N); REQUIRE(error_code == ApfpBlasError::success); // Check all entries are sufficiently close @@ -117,7 +124,7 @@ TEST_CASE("SYRK") { for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { - REQUIRE(IsClose(ref_result[i + j*N].get(), c_matrix[i + j*N].get())); + REQUIRE(IsClose(ref_result.at(i + j*N).get(), c_matrix.at(i + j*N).get())); } } } diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 7a7d4ac..57c5809 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -122,28 +122,34 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } - // Let's not worry about this mode with N and K being different meanings for now - if (trans == ApfpBlasTrans::transpose) { - return ApfpBlasError::unimplemented; - } - - if (LDA < N) { return -6; } + // A is NxK if 'N', KxN if 'T' + // C is always NxN + // N mode + // A A^T + C + // T mode + // A^T A + C + bool use_transpose = trans == ApfpBlasTrans::transpose; + + unsigned long A_rows = use_transpose ? K : N; + unsigned long A_cols = use_transpose ? N : K; + + if (LDA < (use_transpose ? 
K : N)) { return -6; } if (LDC < N) { return -8; } // Empty matrix no-op if (N == 0) { return ApfpBlasError::success; } - if (K == 0) { return ApfpBlasError::success; } + if (K == 0) { return ApfpBlasError::success; } // ==== setup ==== std::vector host_a, host_a_transpose, host_c; host_a.resize(N*K); - CopyFromMatrix(N, K, A, LDA, host_a.data()); - auto device_a = apfp->AllocateDeviceMatrix(N, K); + CopyFromMatrix(A_rows, A_cols, A, LDA, host_a.data()); + auto device_a = apfp->AllocateDeviceMatrix(A_rows, A_cols); device_a.TransferToDevice(host_a.data(), host_a.size()); host_a_transpose.resize(K*N); - CopyTransposeFromMatrix(N, K, A, LDA, host_a_transpose.data()); - auto device_a_transpose = apfp->AllocateDeviceMatrix(K, N); + CopyTransposeFromMatrix(A_rows, A_cols, A, LDA, host_a_transpose.data()); + auto device_a_transpose = apfp->AllocateDeviceMatrix(A_cols, A_rows); device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size()); host_c.resize(N*N); @@ -151,7 +157,11 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu // ==== compute and teardown ==== auto mul_result = apfp->AllocateDeviceMatrix(N, N); - apfp->MatrixMultiplication(device_a, device_a_transpose, &mul_result); + if(use_transpose) { + apfp->MatrixMultiplication(device_a_transpose, device_a, &mul_result); + } else { + apfp->MatrixMultiplication(device_a, device_a_transpose, &mul_result); + } std::vector host_result; host_result.resize(N*N); From 572a4cac3be84f13c90fc50e20341074a431e913 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 28 Dec 2021 08:38:33 +0100 Subject: [PATCH 36/67] Check upper/lower Syrk mode --- host/BlasUnitTests.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index e89e9cd..6dd7e49 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -69,6 +69,7 @@ TEST_CASE("SYRK") { unsigned long N = GENERATE(0, 1, 8, 15, 16, 31, 32, 
33); unsigned long K = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); char mode = GENERATE('N', 'T'); + char uplo_mode = GENERATE('U', 'L'); // Test SYRK // In 'N' mode, we perform AA^T + C // A is NxK (A : R^K -> R^N) @@ -94,7 +95,7 @@ TEST_CASE("SYRK") { ApfpInterfaceWrapper prod_temp, sum_temp; for(unsigned long j = 0; j < N; ++j) { // lower half - for(unsigned long i = 0; i < j; ++i) { + for(unsigned long i = 0; i < N; ++i) { auto r_idx = i + j*N; SetApfpInterfaceType(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); @@ -114,7 +115,7 @@ TEST_CASE("SYRK") { } // Use APFP BLAS library - auto error_code = ApfpSyrk('L', mode, N, K, + auto error_code = ApfpSyrk(uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == 'N' ? N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); REQUIRE(error_code == ApfpBlasError::success); @@ -124,7 +125,11 @@ TEST_CASE("SYRK") { for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { - REQUIRE(IsClose(ref_result.at(i + j*N).get(), c_matrix.at(i + j*N).get())); + if (uplo_mode == 'L') { + REQUIRE(IsClose(ref_result.at(i + j*N).get(), c_matrix.at(i + j*N).get())); + } else { + REQUIRE(IsClose(ref_result.at(j + i*N).get(), c_matrix.at(j + i*N).get())); + } } } } From 102c8189a3576c0000dc482dca96733b6ddcd725 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Wed, 29 Dec 2021 10:24:55 +0100 Subject: [PATCH 37/67] Fix MPFR wrapper argument order --- host/BlasUnitTests.cpp | 12 +++++------- interface/ApfpInterfaceType.cpp | 4 ++-- interface/ApfpInterfaceType.h | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 6dd7e49..f7fbbf2 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -66,8 +66,8 @@ TEST_CASE("SYRK") { auto rng = RandomNumberGenerator(); - unsigned long N = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); - unsigned long K = GENERATE(0, 1, 8, 15, 16, 31, 32, 33); + unsigned 
long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); char mode = GENERATE('N', 'T'); char uplo_mode = GENERATE('U', 'L'); // Test SYRK @@ -125,11 +125,9 @@ TEST_CASE("SYRK") { for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { - if (uplo_mode == 'L') { - REQUIRE(IsClose(ref_result.at(i + j*N).get(), c_matrix.at(i + j*N).get())); - } else { - REQUIRE(IsClose(ref_result.at(j + i*N).get(), c_matrix.at(j + i*N).get())); - } + auto ref_value = uplo_mode == 'L' ? ref_result.at(i + j*N).get() : ref_result.at(j + i*N).get(); + auto test_value = uplo_mode == 'L' ? c_matrix.at(i + j*N).get() : c_matrix.at(j + i*N).get(); + REQUIRE(IsClose(ref_value, test_value)); } } } diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 1fb3aee..d21067a 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -49,7 +49,7 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source) { #endif } -void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest) { +void AddApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_add(dest, a, b); #else @@ -57,7 +57,7 @@ void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr #endif } -void MulApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest) { +void MulApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_mul(dest, a, b); #else diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index b124680..7b534e2 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -26,9 +26,9 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, 
ApfpInterfaceTypeConstPtr s void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source); -void AddApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest); +void AddApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b); -void MulApfpInterfaceType(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b, ApfpInterfaceTypePtr dest); +void MulApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b); /// Smart pointer-like wrapper class for GMP/MPFR types class ApfpInterfaceWrapper { From b223108cbf8e2c8b18d203b32785c3e8aab93ed3 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Wed, 29 Dec 2021 10:27:11 +0100 Subject: [PATCH 38/67] Remove mystery character in CMakeLists.txt --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a399570..b709773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.0) project(apfp) -  + set(CMAKE_CXX_STANDARD 17) # Target options @@ -40,7 +40,7 @@ set(APFP_KERNEL_FILES device/MatrixMultiplication.cpp # Setup FPGA kernel targets add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES} - INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} + INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} ${GMP_INCLUDES} HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS" HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16" DEPENDS ${CMAKE_BINARY_DIR}/Config.h From 71a4cf87b0880255d02413623a6da9a8761bb6a4 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 02:59:05 +0100 Subject: [PATCH 39/67] Fix LD_LIBRARY_PATH search for FPGA kernel --- interface/Apfp.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 44d7272..a3b69ce 100644 
--- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -41,9 +41,12 @@ std::string Apfp::FindKernel() { // LD_LIBRARY_PATH char* ld_library_path_env_var = std::getenv("LD_LIBRARY_PATH"); auto ld_library_path = (ld_library_path_env_var == nullptr) ? "" : std::string(ld_library_path_env_var); - for(std::size_t begin = 0, end = std::string::npos; begin != end; begin = end) { - end = ld_library_path.find(":", begin); - search_paths.push_back(std::filesystem::path(ld_library_path.substr(begin, end))); + + for(std::string::iterator seg_begin = ld_library_path.begin(), seg_end; seg_begin < ld_library_path.end(); seg_begin = seg_end+1) { + seg_end = std::find(seg_begin, ld_library_path.end(), ':'); + + std::string candidate_path(seg_begin, seg_end); + search_paths.push_back(std::filesystem::path(candidate_path)); } // Current working directory From 14274fc08fc79afd5a19871d27141feaeed70322 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 03:20:18 +0100 Subject: [PATCH 40/67] Marginally more helpful error handling --- interface/Apfp.cpp | 8 ++++---- interface/Apfp.h | 25 +++++++++++++++++++++++++ interface/ApfpBlas.cpp | 7 ++++++- interface/ApfpBlas.h | 1 + 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index a3b69ce..7436b1e 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -61,7 +61,7 @@ std::string Apfp::FindKernel() { } } - throw std::runtime_error("Unable to find FPGA kernel"); + throw KernelNotFoundException("Unable to find FPGA kernel"); } DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { @@ -90,15 +90,15 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De } void Apfp::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { - throw std::exception(); + throw UnimplementedException(); } void Apfp::TransposeInPlace(DeviceMatrix*) { - throw std::exception(); + throw UnimplementedException(); } DeviceMatrix 
Apfp::Transpose(const DeviceMatrix&) { - throw std::exception(); + throw UnimplementedException(); } template diff --git a/interface/Apfp.h b/interface/Apfp.h index 2457f26..a11d598 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -81,3 +81,28 @@ class DeviceMatrix { template void TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size); }; + +// === Custom exception types === +struct ApfpException : public std::exception { + std::string e; + + ApfpException() { + e = ""; + } + + ApfpException(const std::string& what_arg) { + e = what_arg; + } + + virtual const char* what() const noexcept { + return e.c_str(); + } +}; + +struct KernelNotFoundException : public ApfpException { + using ApfpException::ApfpException; +}; + +struct UnimplementedException : public ApfpException { + using ApfpException::ApfpException; +}; diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 57c5809..c25edb2 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -26,7 +26,12 @@ int ApfpInit(unsigned long precision) { } apfp.emplace(); return ApfpBlasError::success; - }catch(const std::exception& e) { + + }catch(const KernelNotFoundException& e) { + last_error_message = e.what(); + return ApfpBlasError::kernel_not_found; + + } catch(const std::exception& e) { // Unknown exception last_error_message = e.what(); return ApfpBlasError::unknown; diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 4276c2f..13e65c7 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -26,6 +26,7 @@ enum ApfpBlasError : int { unimplemented = 2, bitwidth = 3, uninitialized = 4, + kernel_not_found = 5, }; From 0b435bec50d21919cfcddd91eb1ba008a2721851 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 03:33:30 +0100 Subject: [PATCH 41/67] GMP allows aliasing inputs --- host/BlasUnitTests.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 
f7fbbf2..2124840 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -92,7 +92,7 @@ TEST_CASE("SYRK") { ref_result.resize(N*N); // Compute reference result - ApfpInterfaceWrapper prod_temp, sum_temp; + ApfpInterfaceWrapper prod_temp; for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < N; ++i) { @@ -108,8 +108,7 @@ TEST_CASE("SYRK") { // (AB)_ij = sum_k A(i,k) B(k,j) MulApfpInterfaceType(prod_temp.get(), a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); } - AddApfpInterfaceType(sum_temp.get(), prod_temp.get(), ref_result.at(r_idx).get()); - SetApfpInterfaceType(ref_result.at(r_idx).get(), sum_temp.get()); + AddApfpInterfaceType(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); } } } From 1f4334850d8382637eb9ffa4b36bd6f9a8729b2f Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 22:36:50 +0100 Subject: [PATCH 42/67] Install hw emu kernel --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b709773..2a40fca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,5 @@ install(FILES DESTINATION include/apfp) install(FILES ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw.xclbin - ${CMAKE_BINARY_DIR}/MatrixMultiplication_sw_emu.xclbin + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw_emu.xclbin DESTINATION lib) \ No newline at end of file From 9856042e25373f9058d3f120180372c37c60a7c4 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 22:37:43 +0100 Subject: [PATCH 43/67] Do SYRK addition on the FPGA --- interface/ApfpBlas.cpp | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index c25edb2..beaa156 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -159,25 +159,18 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu host_c.resize(N*N); 
CopyFromMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); + auto device_c = apfp->AllocateDeviceMatrix(N, N); + device_c.TransferToDevice(host_c.data(), host_c.size()); // ==== compute and teardown ==== auto mul_result = apfp->AllocateDeviceMatrix(N, N); if(use_transpose) { - apfp->MatrixMultiplication(device_a_transpose, device_a, &mul_result); + apfp->MatrixMultiplication(device_a_transpose, device_a, &device_c); } else { - apfp->MatrixMultiplication(device_a, device_a_transpose, &mul_result); - } - std::vector host_result; - host_result.resize(N*N); - - mul_result.TransferToHost(host_result.data(), host_result.size()); - - ApfpInterfaceWrapper add_result; - for(unsigned long i = 0; i < host_result.size(); ++i) { - AddApfpInterfaceType(add_result.get(), host_result[i].get(), host_c[i].get()); - SetApfpInterfaceType(host_c[i].get(), add_result.get()); + apfp->MatrixMultiplication(device_a, device_a_transpose, &device_c); } + device_c.TransferToHost(host_c.data(), host_c.size()); CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); } catch(const std::exception& e) { last_error_message = e.what(); From 1e8cffe4b2231dd0b41181ba0fc44222a3a2fc6c Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 22:43:38 +0100 Subject: [PATCH 44/67] Move Apfp lib into namespace --- host/BlasUnitTests.cpp | 34 ++++++++++++++++----------------- interface/Apfp.cpp | 4 ++++ interface/Apfp.h | 4 ++++ interface/ApfpBlas.cpp | 3 +++ interface/ApfpBlas.h | 5 +++-- interface/ApfpInterfaceType.cpp | 4 ++++ interface/ApfpInterfaceType.h | 11 ++++++++--- 7 files changed, 43 insertions(+), 22 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 2124840..564fc50 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -16,15 +16,15 @@ void ApfpSetup() { #else mpfr_set_default_prec(kMantissaBits); #endif - auto apfp_error_code = ApfpInit(kMantissaBits); - REQUIRE(apfp_error_code == ApfpBlasError::success); + auto apfp_error_code = 
apfp::ApfpInit(kMantissaBits); + REQUIRE(apfp_error_code == apfp::ApfpBlasError::success); } void ApfpTeardown() { - ApfpFinalize(); + apfp::ApfpFinalize(); } -bool IsZero(ApfpInterfaceTypeConstPtr a) { +bool IsZero(apfp::ApfpInterfaceTypeConstPtr a) { #ifdef APFP_GMP_INTERFACE_TYPE return mpf_sgn(a) == 0; #else @@ -32,13 +32,13 @@ bool IsZero(ApfpInterfaceTypeConstPtr a) { #endif } -bool IsClose(ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { +bool IsClose(apfp::ApfpInterfaceTypeConstPtr a, apfp::ApfpInterfaceTypeConstPtr b) { // Avoids divide by zero if a = b = 0 if(IsZero(a) && IsZero(b)) { return true; } - ApfpInterfaceWrapper diff, sum, ratio; + apfp::ApfpInterfaceWrapper diff, sum, ratio; #ifdef APFP_GMP_INTERFACE_TYPE mpf_sub(diff.get(), a, b); mpf_add(sum.get(), a, b); @@ -76,51 +76,51 @@ TEST_CASE("SYRK") { // C is NxN // Matrices are stored column major because BLAS { - std::vector a_matrix; + std::vector a_matrix; a_matrix.resize(N*K); for(auto& v : a_matrix) { rng.Generate(v.get()); } - std::vector c_matrix; + std::vector c_matrix; c_matrix.resize(N*N); for(auto& v : c_matrix) { rng.Generate(v.get()); } - std::vector ref_result; + std::vector ref_result; ref_result.resize(N*N); // Compute reference result - ApfpInterfaceWrapper prod_temp; + apfp::ApfpInterfaceWrapper prod_temp; for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < N; ++i) { auto r_idx = i + j*N; - SetApfpInterfaceType(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + apfp::SetApfpInterfaceType(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); for(unsigned long k = 0; k < K; ++k) { // A is NxK if N, KxN if T if (mode == 'N') { // (AB)_ij = sum_k A(i,k)B(k,j) - MulApfpInterfaceType(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); + apfp::MulApfpInterfaceType(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); } else { // (AB)_ij = sum_k A(i,k) B(k,j) - MulApfpInterfaceType(prod_temp.get(), 
a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); + apfp::MulApfpInterfaceType(prod_temp.get(), a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); } - AddApfpInterfaceType(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + apfp::AddApfpInterfaceType(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); } } } // Use APFP BLAS library - auto error_code = ApfpSyrk(uplo_mode, mode, N, K, + auto error_code = apfp::ApfpSyrk(uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == 'N' ? N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); - REQUIRE(error_code == ApfpBlasError::success); + REQUIRE(error_code == apfp::ApfpBlasError::success); // Check all entries are sufficiently close - ApfpInterfaceWrapper diff; + apfp::ApfpInterfaceWrapper diff; for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 7436b1e..d4f9e92 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -9,6 +9,8 @@ #include "Config.h" +namespace apfp { + Apfp::Apfp() { auto kernel_path = FindKernel(); program_.emplace(context_.MakeProgram(kernel_path)); @@ -159,3 +161,5 @@ void DeviceMatrix::TransferToHost(ApfpInterfaceTypePtr buffer_ptr, std::size_t b void DeviceMatrix::TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { TransferToHostImpl([&](std::size_t i) -> ApfpInterfaceTypePtr { return buffer_ptr[i].get(); }, buffer_size); } + +} \ No newline at end of file diff --git a/interface/Apfp.h b/interface/Apfp.h index a11d598..87551e4 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -11,6 +11,8 @@ #include +namespace apfp { + class DeviceMatrix; /// Object oriented interface for Apfp @@ -106,3 +108,5 @@ struct KernelNotFoundException : public ApfpException { struct UnimplementedException : public ApfpException { using ApfpException::ApfpException; }; + +} \ No newline at end 
of file diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index beaa156..da6d1e5 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -4,6 +4,8 @@ #include #include +namespace apfp { + static std::optional apfp; static std::string last_error_message; @@ -191,3 +193,4 @@ int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndex return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } +} \ No newline at end of file diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 13e65c7..a0ed374 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -3,7 +3,8 @@ #include #include "ApfpInterfaceType.h" -// +namespace apfp { + using IndexFunction = std::function; using ConstIndexFunction = std::function; @@ -29,4 +30,4 @@ enum ApfpBlasError : int { kernel_not_found = 5, }; - +} \ No newline at end of file diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index d21067a..446a526 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,5 +1,7 @@ #include "ApfpInterfaceType.h" +namespace apfp { + void InitApfpInterfaceType(ApfpInterfaceTypePtr value) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_init(value); @@ -88,3 +90,5 @@ ApfpInterfaceWrapper& ApfpInterfaceWrapper::operator=(ApfpInterfaceWrapper&& oth ClearApfpInterfaceType(other.data_); return *this; } + +} \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index 7b534e2..17bee81 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -1,12 +1,15 @@ #pragma once #include "Config.h" -#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types #include +#include + +namespace apfp { + +#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types using ApfpInterfaceType = mpf_t; using ApfpInterfaceTypePtr = mpf_ptr; using ApfpInterfaceTypeConstPtr = mpf_srcptr; - #else #include using ApfpInterfaceType = mpfr_t; @@ -53,4 +56,6 @@ class 
ApfpInterfaceWrapper { ApfpInterfaceTypePtr get() { return data_; } ApfpInterfaceTypeConstPtr get() const { return data_; } -}; \ No newline at end of file +}; + +} \ No newline at end of file From c1fdfa973492cfbad91394b6171a91f46787702b Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 23:02:17 +0100 Subject: [PATCH 45/67] Make the interface type wrapping nicer --- host/BlasUnitTests.cpp | 34 ++++++++--------- interface/Apfp.cpp | 14 +++---- interface/Apfp.h | 8 ++-- interface/ApfpBlas.cpp | 68 ++++++++++++++++----------------- interface/ApfpBlas.h | 14 +++---- interface/ApfpInterfaceType.cpp | 42 ++++++++++---------- interface/ApfpInterfaceType.h | 58 +++++++++++++++------------- 7 files changed, 122 insertions(+), 116 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 564fc50..b985fe2 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -16,15 +16,15 @@ void ApfpSetup() { #else mpfr_set_default_prec(kMantissaBits); #endif - auto apfp_error_code = apfp::ApfpInit(kMantissaBits); - REQUIRE(apfp_error_code == apfp::ApfpBlasError::success); + auto apfp_error_code = apfp::Init(kMantissaBits); + REQUIRE(apfp_error_code == apfp::BlasError::success); } void ApfpTeardown() { - apfp::ApfpFinalize(); + apfp::Finalize(); } -bool IsZero(apfp::ApfpInterfaceTypeConstPtr a) { +bool IsZero(apfp::interface::ConstPtr a) { #ifdef APFP_GMP_INTERFACE_TYPE return mpf_sgn(a) == 0; #else @@ -32,13 +32,13 @@ bool IsZero(apfp::ApfpInterfaceTypeConstPtr a) { #endif } -bool IsClose(apfp::ApfpInterfaceTypeConstPtr a, apfp::ApfpInterfaceTypeConstPtr b) { +bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { // Avoids divide by zero if a = b = 0 if(IsZero(a) && IsZero(b)) { return true; } - apfp::ApfpInterfaceWrapper diff, sum, ratio; + apfp::interface::Wrapper diff, sum, ratio; #ifdef APFP_GMP_INTERFACE_TYPE mpf_sub(diff.get(), a, b); mpf_add(sum.get(), a, b); @@ -76,51 +76,51 @@ TEST_CASE("SYRK") { // C is NxN 
// Matrices are stored column major because BLAS { - std::vector a_matrix; + std::vector a_matrix; a_matrix.resize(N*K); for(auto& v : a_matrix) { rng.Generate(v.get()); } - std::vector c_matrix; + std::vector c_matrix; c_matrix.resize(N*N); for(auto& v : c_matrix) { rng.Generate(v.get()); } - std::vector ref_result; + std::vector ref_result; ref_result.resize(N*N); // Compute reference result - apfp::ApfpInterfaceWrapper prod_temp; + apfp::interface::Wrapper prod_temp; for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < N; ++i) { auto r_idx = i + j*N; - apfp::SetApfpInterfaceType(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); for(unsigned long k = 0; k < K; ++k) { // A is NxK if N, KxN if T if (mode == 'N') { // (AB)_ij = sum_k A(i,k)B(k,j) - apfp::MulApfpInterfaceType(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); } else { // (AB)_ij = sum_k A(i,k) B(k,j) - apfp::MulApfpInterfaceType(prod_temp.get(), a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); + apfp::interface::Mul(prod_temp.get(), a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); } - apfp::AddApfpInterfaceType(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); } } } // Use APFP BLAS library - auto error_code = apfp::ApfpSyrk(uplo_mode, mode, N, K, + auto error_code = apfp::Syrk(uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == 'N' ? 
N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); - REQUIRE(error_code == apfp::ApfpBlasError::success); + REQUIRE(error_code == apfp::BlasError::success); // Check all entries are sufficiently close - apfp::ApfpInterfaceWrapper diff; + apfp::interface::Wrapper diff; for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index d4f9e92..6436854 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -121,11 +121,11 @@ void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std:: reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToDevice(ApfpInterfaceTypeConstPtr buffer_ptr, std::size_t buffer_size) { +void DeviceMatrix::TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size) { TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr + i; }, buffer_size); } -void DeviceMatrix::TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { +void DeviceMatrix::TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size) { TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); } @@ -148,18 +148,18 @@ void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::si buffer_.CopyToHost(0, kLinesPerNumber * host_buffer.size(), reinterpret_cast(host_buffer.data())); - ApfpInterfaceWrapper scratch; + interface::Wrapper scratch; for(std::size_t i = 0; i < host_buffer.size(); ++i) { PackedFloatToInterfaceType(host_buffer[i], buffer_ptr_func(i)); } } -void DeviceMatrix::TransferToHost(ApfpInterfaceTypePtr buffer_ptr, std::size_t buffer_size) { - TransferToHostImpl([&](std::size_t i) -> ApfpInterfaceTypePtr { return buffer_ptr + i; }, buffer_size); +void DeviceMatrix::TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr + i; }, 
buffer_size); } -void DeviceMatrix::TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size) { - TransferToHostImpl([&](std::size_t i) -> ApfpInterfaceTypePtr { return buffer_ptr[i].get(); }, buffer_size); +void DeviceMatrix::TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr[i].get(); }, buffer_size); } } \ No newline at end of file diff --git a/interface/Apfp.h b/interface/Apfp.h index 87551e4..4eb004d 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -67,14 +67,14 @@ class DeviceMatrix { /// Transfer from the host to the device /// TODO: Make this take input iterators - void TransferToDevice(ApfpInterfaceTypeConstPtr buffer_ptr, std::size_t buffer_size); - void TransferToDevice(const ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); + void TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size); + void TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size); /// Transfer from the device to the host /// TODO: Make this take output iterators - void TransferToHost(ApfpInterfaceTypePtr buffer_ptr, std::size_t buffer_size); - void TransferToHost(ApfpInterfaceWrapper* buffer_ptr, std::size_t buffer_size); + void TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size); + void TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size); private: template diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index da6d1e5..ddffe34 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -9,40 +9,40 @@ namespace apfp { static std::optional apfp; static std::string last_error_message; -enum ApfpBlasUplo : char { +enum BlasUplo : char { upper = 'U', lower = 'L' }; -enum ApfpBlasTrans : char { +enum BlasTrans : char { normal = 'N', transpose = 'T', }; -int ApfpInit(unsigned long precision) { +int Init(unsigned long precision) { try { if (precision > kBits) { // 
Requested bit width too large last_error_message = "Requested bitwidth too large"; - return ApfpBlasError::bitwidth; + return BlasError::bitwidth; } apfp.emplace(); - return ApfpBlasError::success; + return BlasError::success; }catch(const KernelNotFoundException& e) { last_error_message = e.what(); - return ApfpBlasError::kernel_not_found; + return BlasError::kernel_not_found; } catch(const std::exception& e) { // Unknown exception last_error_message = e.what(); - return ApfpBlasError::unknown; + return BlasError::unknown; } } -int ApfpFinalize() { +int Finalize() { apfp.reset(); - return ApfpBlasError::success; + return BlasError::success; } bool ApfpIsInitialized() { @@ -55,63 +55,63 @@ const char* ApfpErrorDescription() { /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer template -void CopyFromMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { +void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i <= j; ++i) { - auto source = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); - SetApfpInterfaceType(buffer[i + j * dest_lda].get(), source); - SetApfpInterfaceType(buffer[j + i * dest_lda].get(), source); + auto source = uplo == BlasUplo::lower ? 
A(i + j * LDA) : A(j + i * LDA); + interface::Set(buffer[i + j * dest_lda].get(), source); + interface::Set(buffer[j + i * dest_lda].get(), source); } } } /// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A template -void CopyToMatrixUplo(ApfpBlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { +void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { for (unsigned long i = 0; i <= j; ++i) { - auto dest = uplo == ApfpBlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); - SetApfpInterfaceType(dest, buffer[i + j * source_lda].get()); + auto dest = uplo == BlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA); + interface::Set(dest, buffer[i + j * source_lda].get()); } } } /// Copy from an NxK matrix A to a full size buffer template -void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { +void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < K; ++j) { for (unsigned long i = 0; i < N; ++i) { - SetApfpInterfaceType(buffer[i + j * dest_lda].get(), A(i + j * LDA)); + interface::Set(buffer[i + j * dest_lda].get(), A(i + j * LDA)); } } } /// Copy the transpose of a NxK matrix A to a full size buffer template -void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { +void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { auto dest_lda = K; // Col major layout for (unsigned long j = 0; j < K; ++j) { for (unsigned long i = 0; i < N; ++i) { - SetApfpInterfaceType(buffer[i * dest_lda + j].get(), A(i 
+ j * LDA)); + interface::Set(buffer[i * dest_lda + j].get(), A(i + j * LDA)); } } } /// Copy to an NxK matrix A from a full size buffer template -void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, ApfpInterfaceWrapper* buffer) { +void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < K; ++j) { for (unsigned long i = 0; i < N; ++i) { - SetApfpInterfaceType(A(i + j * LDA), buffer[i + j * source_lda].get()); + interface::Set(A(i + j * LDA), buffer[i + j * source_lda].get()); } } } @@ -121,11 +121,11 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu try { // ==== library input validation stuff ==== if(!ApfpIsInitialized()) { - return ApfpBlasError::uninitialized; + return BlasError::uninitialized; } if (std::toupper(uplo) != 'U' && std::toupper(uplo) != 'L') { return -1; } - auto uplo_validated = static_cast(uplo); + auto uplo_validated = static_cast(uplo); if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } @@ -135,7 +135,7 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu // A A^T + C // T mode // A^T A + C - bool use_transpose = trans == ApfpBlasTrans::transpose; + bool use_transpose = trans == BlasTrans::transpose; unsigned long A_rows = use_transpose ? K : N; unsigned long A_cols = use_transpose ? 
N : K; @@ -144,11 +144,11 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu if (LDC < N) { return -8; } // Empty matrix no-op - if (N == 0) { return ApfpBlasError::success; } - if (K == 0) { return ApfpBlasError::success; } + if (N == 0) { return BlasError::success; } + if (K == 0) { return BlasError::success; } // ==== setup ==== - std::vector host_a, host_a_transpose, host_c; + std::vector host_a, host_a_transpose, host_c; host_a.resize(N*K); CopyFromMatrix(A_rows, A_cols, A, LDA, host_a.data()); auto device_a = apfp->AllocateDeviceMatrix(A_rows, A_cols); @@ -176,20 +176,20 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); } catch(const std::exception& e) { last_error_message = e.what(); - return ApfpBlasError::unknown; + return BlasError::unknown; } - return ApfpBlasError::success; + return BlasError::success; } /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ApfpInterfaceTypeConstPtr A, unsigned long LDA, ApfpInterfaceTypePtr C, unsigned long LDC) { - auto a_ptr_function = [&](unsigned long i) -> ApfpInterfaceTypeConstPtr { return A + i; }; - auto c_ptr_function = [&](unsigned long i) -> ApfpInterfaceTypePtr { return C + i; }; +int Syrk(char uplo, char trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC) { + auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; + auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { +int Syrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index a0ed374..c8ae75a 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -5,23 +5,23 @@ namespace apfp { -using IndexFunction = std::function; -using ConstIndexFunction = std::function; +using IndexFunction = std::function; +using ConstIndexFunction = std::function; /// Null terminated string describing the most recent library error if available /// Pointer is only guaranteed to live until the next library call const char* ApfpErrorDescription(); -int ApfpInit(unsigned long precision); +int Init(unsigned long precision); -int ApfpFinalize(); +int Finalize(); /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ApfpInterfaceTypeConstPtr A, unsigned long LDA, ApfpInterfaceTypePtr C, unsigned long LDC); -int ApfpSyrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); +int Syrk(char uplo, char trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC); +int Syrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); -enum ApfpBlasError : int { +enum BlasError : int { success = 0, unknown = 1, unimplemented = 2, diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index 446a526..aab6845 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,8 +1,8 @@ #include "ApfpInterfaceType.h" -namespace apfp { +namespace apfp::interface { -void InitApfpInterfaceType(ApfpInterfaceTypePtr value) { +void Init(Ptr value) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_init(value); #else @@ -10,7 +10,7 @@ void InitApfpInterfaceType(ApfpInterfaceTypePtr value) { #endif } -void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision) { +void Init2(Ptr value, unsigned long precision) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_init2(value, precision); #else @@ -19,7 +19,7 @@ void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision) #endif } -void ClearApfpInterfaceType(ApfpInterfaceTypePtr value) { +void Clear(Ptr value) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_clear(value); #else @@ -27,7 +27,7 @@ void ClearApfpInterfaceType(ApfpInterfaceTypePtr value) { #endif } -void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b) { +void Swap(Ptr a, Ptr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_swap(a, b); #else @@ -35,7 +35,7 @@ void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, 
ApfpInterfaceTypePtr b) { #endif } -void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source) { +void Set(Ptr dest, ConstPtr source) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_set(dest, source); #else @@ -43,7 +43,7 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr s #endif } -void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source) { +void Set(Ptr dest, long int source) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_set_ui(dest, source); #else @@ -51,7 +51,7 @@ void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source) { #endif } -void AddApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { +void Add(Ptr dest, ConstPtr a, ConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_add(dest, a, b); #else @@ -59,7 +59,7 @@ void AddApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a #endif } -void MulApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b) { +void Mul(Ptr dest, ConstPtr a, ConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_mul(dest, a, b); #else @@ -67,27 +67,27 @@ void MulApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a #endif } -ApfpInterfaceWrapper::~ApfpInterfaceWrapper() { - ClearApfpInterfaceType(data_); +Wrapper::~Wrapper() { + Clear(data_); } -ApfpInterfaceWrapper::ApfpInterfaceWrapper() { - InitApfpInterfaceType(data_); +Wrapper::Wrapper() { + Init(data_); } -ApfpInterfaceWrapper::ApfpInterfaceWrapper(unsigned long precision) { - Init2ApfpInterfaceType(data_, precision); +Wrapper::Wrapper(unsigned long precision) { + Init2(data_, precision); } -ApfpInterfaceWrapper::ApfpInterfaceWrapper(ApfpInterfaceWrapper&& other) { - SwapApfpInterfaceType(data_, other.data_); - ClearApfpInterfaceType(other.data_); +Wrapper::Wrapper(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); } -ApfpInterfaceWrapper& 
ApfpInterfaceWrapper::operator=(ApfpInterfaceWrapper&& other) { - SwapApfpInterfaceType(data_, other.data_); - ClearApfpInterfaceType(other.data_); +Wrapper& Wrapper::operator=(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); return *this; } diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index 17bee81..b0ef7d4 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -4,58 +4,64 @@ #include #include -namespace apfp { + +/* This header abstracts away the choice of MPFR or GMP in the interface + * It defines four types: Value, Ptr, ConstPtr, Wrapper + * The first three directly correspond to MPFR/GMP types + * The last one is a wrapper that manages the memory footprint with RAII + */ +namespace apfp::interface { #ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types -using ApfpInterfaceType = mpf_t; -using ApfpInterfaceTypePtr = mpf_ptr; -using ApfpInterfaceTypeConstPtr = mpf_srcptr; +using Value = mpf_t; +using Ptr = mpf_ptr; +using ConstPtr = mpf_srcptr; #else #include -using ApfpInterfaceType = mpfr_t; -using ApfpInterfaceTypePtr = mpfr_ptr; -using ApfpInterfaceTypeConstPtr = mpfr_srcptr; +using Value = mpfr_t; +using Ptr = mpfr_ptr; +using ConstPtr = mpfr_srcptr; #endif -void InitApfpInterfaceType(ApfpInterfaceTypePtr value); +void Init(Ptr value); -void Init2ApfpInterfaceType(ApfpInterfaceTypePtr value, unsigned long precision); +void Init2(Ptr value, unsigned long precision); -void ClearApfpInterfaceType(ApfpInterfaceTypePtr value); +void Clear(Ptr value); -void SwapApfpInterfaceType(ApfpInterfaceTypePtr a, ApfpInterfaceTypePtr b); +void Swap(Ptr a, Ptr b); -void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr source); +void Set(Ptr dest, ConstPtr source); -void SetApfpInterfaceType(ApfpInterfaceTypePtr dest, long int source); +void Set(Ptr dest, long int source); -void AddApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, 
ApfpInterfaceTypeConstPtr b); +void Add(Ptr dest, ConstPtr a, ConstPtr b); -void MulApfpInterfaceType(ApfpInterfaceTypePtr dest, ApfpInterfaceTypeConstPtr a, ApfpInterfaceTypeConstPtr b); +void Mul(Ptr dest, ConstPtr a, ConstPtr b); /// Smart pointer-like wrapper class for GMP/MPFR types -class ApfpInterfaceWrapper { - ApfpInterfaceType data_; +class Wrapper { + Value data_; public: - ~ApfpInterfaceWrapper(); + ~Wrapper(); - ApfpInterfaceWrapper(); + Wrapper(); - ApfpInterfaceWrapper(unsigned long precision); + Wrapper(unsigned long precision); - ApfpInterfaceWrapper(ApfpInterfaceWrapper&&); + Wrapper(Wrapper&&); - ApfpInterfaceWrapper(ApfpInterfaceWrapper&) = delete; + Wrapper(Wrapper&) = delete; - ApfpInterfaceWrapper& operator=(const ApfpInterfaceWrapper&) = delete; + Wrapper& operator=(const Wrapper&) = delete; - ApfpInterfaceWrapper& operator=(ApfpInterfaceWrapper&&); + Wrapper& operator=(Wrapper&&); // This decays to the pointer type - ApfpInterfaceTypePtr get() { return data_; } + Ptr get() { return data_; } - ApfpInterfaceTypeConstPtr get() const { return data_; } + ConstPtr get() const { return data_; } }; } \ No newline at end of file From 0677b71c0ecde9ed8fc11e9e580701f3878d9f5c Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 23:05:44 +0100 Subject: [PATCH 46/67] Rename ErrorDescription --- interface/ApfpBlas.cpp | 2 +- interface/ApfpBlas.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index ddffe34..1b6772f 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -49,7 +49,7 @@ bool ApfpIsInitialized() { return apfp.has_value(); } -const char* ApfpErrorDescription() { +const char* ErrorDescription() { return last_error_message.c_str(); } diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index c8ae75a..3ece1dc 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -11,7 +11,7 @@ using ConstIndexFunction = std::function; /// Null 
terminated string describing the most recent library error if available /// Pointer is only guaranteed to live until the next library call -const char* ApfpErrorDescription(); +const char* ErrorDescription(); int Init(unsigned long precision); From 03859eed9a24ed289f38cd3363b581c0c81fb17f Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 23:12:44 +0100 Subject: [PATCH 47/67] Enum class Uplo/Trans --- host/BlasUnitTests.cpp | 12 ++++++------ interface/ApfpBlas.cpp | 25 +++++-------------------- interface/ApfpBlas.h | 32 +++++++++++++++++++++----------- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index b985fe2..2496e8d 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -68,8 +68,8 @@ TEST_CASE("SYRK") { unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); - char mode = GENERATE('N', 'T'); - char uplo_mode = GENERATE('U', 'L'); + auto mode = GENERATE(apfp::BlasTrans::normal, apfp::BlasTrans::transpose); + auto uplo_mode = GENERATE(apfp::BlasUplo::upper, apfp::BlasUplo::lower); // Test SYRK // In 'N' mode, we perform AA^T + C // A is NxK (A : R^K -> R^N) @@ -101,7 +101,7 @@ TEST_CASE("SYRK") { for(unsigned long k = 0; k < K; ++k) { // A is NxK if N, KxN if T - if (mode == 'N') { + if (mode == apfp::BlasTrans::normal) { // (AB)_ij = sum_k A(i,k)B(k,j) apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); } else { @@ -115,7 +115,7 @@ TEST_CASE("SYRK") { // Use APFP BLAS library auto error_code = apfp::Syrk(uplo_mode, mode, N, K, - [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == 'N' ? N : K, + [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == apfp::BlasTrans::normal ? 
N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); REQUIRE(error_code == apfp::BlasError::success); @@ -124,8 +124,8 @@ TEST_CASE("SYRK") { for(unsigned long j = 0; j < N; ++j) { // lower half for(unsigned long i = 0; i < j; ++i) { - auto ref_value = uplo_mode == 'L' ? ref_result.at(i + j*N).get() : ref_result.at(j + i*N).get(); - auto test_value = uplo_mode == 'L' ? c_matrix.at(i + j*N).get() : c_matrix.at(j + i*N).get(); + auto ref_value = uplo_mode == apfp::BlasUplo::lower ? ref_result.at(i + j*N).get() : ref_result.at(j + i*N).get(); + auto test_value = uplo_mode == apfp::BlasUplo::lower ? c_matrix.at(i + j*N).get() : c_matrix.at(j + i*N).get(); REQUIRE(IsClose(ref_value, test_value)); } } diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 1b6772f..6dcc508 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -9,16 +9,6 @@ namespace apfp { static std::optional apfp; static std::string last_error_message; -enum BlasUplo : char { - upper = 'U', - lower = 'L' -}; - -enum BlasTrans : char { - normal = 'N', - transpose = 'T', -}; - int Init(unsigned long precision) { try { if (precision > kBits) { @@ -117,18 +107,13 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne } template -int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { +int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { try { // ==== library input validation stuff ==== if(!ApfpIsInitialized()) { return BlasError::uninitialized; } - if (std::toupper(uplo) != 'U' && std::toupper(uplo) != 'L') { return -1; } - auto uplo_validated = static_cast(uplo); - - if (std::toupper(trans) != 'N' && std::toupper(trans) != 'T') { return -2; } - // A is NxK if 'N', KxN if 'T' // C is always NxN // N mode @@ -160,7 +145,7 @@ 
int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size()); host_c.resize(N*N); - CopyFromMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); + CopyFromMatrixUplo(uplo, N, C, LDC, host_c.data()); auto device_c = apfp->AllocateDeviceMatrix(N, N); device_c.TransferToDevice(host_c.data(), host_c.size()); @@ -173,7 +158,7 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu } device_c.TransferToHost(host_c.data(), host_c.size()); - CopyToMatrixUplo(uplo_validated, N, C, LDC, host_c.data()); + CopyToMatrixUplo(uplo, N, C, LDC, host_c.data()); } catch(const std::exception& e) { last_error_message = e.what(); return BlasError::unknown; @@ -183,13 +168,13 @@ int ApfpSyrkImpl(char uplo, char trans, unsigned long N, unsigned long K, ptr_fu } /// See netlib's documentation on Syrk for usage. Alpha and beta unsupported -int Syrk(char uplo, char trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC) { +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC) { auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } -int Syrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 3ece1dc..10d2c30 100644 --- a/interface/ApfpBlas.h +++ 
b/interface/ApfpBlas.h @@ -5,6 +5,25 @@ namespace apfp { +enum BlasError : int { + success = 0, + unknown = 1, + unimplemented = 2, + bitwidth = 3, + uninitialized = 4, + kernel_not_found = 5, +}; + +enum class BlasUplo : char { + upper = 'U', + lower = 'L' +}; + +enum class BlasTrans : char { + normal = 'N', + transpose = 'T', +}; + using IndexFunction = std::function; using ConstIndexFunction = std::function; @@ -18,16 +37,7 @@ int Init(unsigned long precision); int Finalize(); /// See netlib's documentation on Syrk for usage. Alpha and beta unsupported -int Syrk(char uplo, char trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC); -int Syrk(char uplo, char trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); - -enum BlasError : int { - success = 0, - unknown = 1, - unimplemented = 2, - bitwidth = 3, - uninitialized = 4, - kernel_not_found = 5, -}; +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC); +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); } \ No newline at end of file From b1a768d348bfc0ef60350a1af51e86748d507d16 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 30 Dec 2021 23:15:08 +0100 Subject: [PATCH 48/67] Formatting because I keep forgetting --- host/BlasUnitTests.cpp | 54 +++++++++++----------- interface/Apfp.cpp | 47 ++++++++++--------- interface/Apfp.h | 18 ++++---- interface/ApfpBlas.cpp | 80 ++++++++++++++++++++------------- interface/ApfpBlas.h | 30 ++++++------- interface/ApfpInterfaceType.cpp | 3 +- interface/ApfpInterfaceType.h | 19 ++++---- 7 files changed, 139 insertions(+), 112 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 2496e8d..0dade4b 100644 --- 
a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -1,14 +1,14 @@ -#include "Config.h" #include #include #include +#include "Config.h" + // #include "ArithmeticOperations.h" // #include "Karatsuba.h" // #include "PackedFloat.h" -#include "Random.h" - #include "ApfpBlas.h" +#include "Random.h" void ApfpSetup() { #ifdef APFP_GMP_INTERFACE_TYPE @@ -25,7 +25,7 @@ void ApfpTeardown() { } bool IsZero(apfp::interface::ConstPtr a) { -#ifdef APFP_GMP_INTERFACE_TYPE +#ifdef APFP_GMP_INTERFACE_TYPE return mpf_sgn(a) == 0; #else return mpfr_sgn(a) == 0; @@ -34,7 +34,7 @@ bool IsZero(apfp::interface::ConstPtr a) { bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { // Avoids divide by zero if a = b = 0 - if(IsZero(a) && IsZero(b)) { + if (IsZero(a) && IsZero(b)) { return true; } @@ -53,7 +53,7 @@ bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { auto exp = mpfr_get_exp(ratio.get()); #endif // Require the numbers to match to the first 90% decimal places - return exp < -((kMantissaBits*3 * 9)/10); + return exp < -((kMantissaBits * 3 * 9) / 10); } TEST_CASE("Init_Teardown") { @@ -77,36 +77,38 @@ TEST_CASE("SYRK") { // Matrices are stored column major because BLAS { std::vector a_matrix; - a_matrix.resize(N*K); - for(auto& v : a_matrix) { + a_matrix.resize(N * K); + for (auto& v : a_matrix) { rng.Generate(v.get()); } std::vector c_matrix; - c_matrix.resize(N*N); - for(auto& v : c_matrix) { + c_matrix.resize(N * N); + for (auto& v : c_matrix) { rng.Generate(v.get()); } std::vector ref_result; - ref_result.resize(N*N); + ref_result.resize(N * N); // Compute reference result apfp::interface::Wrapper prod_temp; - for(unsigned long j = 0; j < N; ++j) { + for (unsigned long j = 0; j < N; ++j) { // lower half - for(unsigned long i = 0; i < N; ++i) { - auto r_idx = i + j*N; + for (unsigned long i = 0; i < N; ++i) { + auto r_idx = i + j * N; apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); - - for(unsigned long k = 
0; k < K; ++k) { + + for (unsigned long k = 0; k < K; ++k) { // A is NxK if N, KxN if T if (mode == apfp::BlasTrans::normal) { // (AB)_ij = sum_k A(i,k)B(k,j) - apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k*N).get(), a_matrix.at(j + k*N).get()); + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * N).get(), + a_matrix.at(j + k * N).get()); } else { // (AB)_ij = sum_k A(i,k) B(k,j) - apfp::interface::Mul(prod_temp.get(), a_matrix.at(k + i*K).get(), a_matrix.at(k + j*K).get()); + apfp::interface::Mul(prod_temp.get(), a_matrix.at(k + i * K).get(), + a_matrix.at(k + j * K).get()); } apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); } @@ -114,18 +116,20 @@ TEST_CASE("SYRK") { } // Use APFP BLAS library - auto error_code = apfp::Syrk(uplo_mode, mode, N, K, - [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == apfp::BlasTrans::normal ? N : K, - [&](unsigned long i) { return c_matrix.at(i).get(); }, N); + auto error_code = apfp::Syrk( + uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, + mode == apfp::BlasTrans::normal ? N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); REQUIRE(error_code == apfp::BlasError::success); // Check all entries are sufficiently close apfp::interface::Wrapper diff; - for(unsigned long j = 0; j < N; ++j) { + for (unsigned long j = 0; j < N; ++j) { // lower half - for(unsigned long i = 0; i < j; ++i) { - auto ref_value = uplo_mode == apfp::BlasUplo::lower ? ref_result.at(i + j*N).get() : ref_result.at(j + i*N).get(); - auto test_value = uplo_mode == apfp::BlasUplo::lower ? c_matrix.at(i + j*N).get() : c_matrix.at(j + i*N).get(); + for (unsigned long i = 0; i < j; ++i) { + auto ref_value = uplo_mode == apfp::BlasUplo::lower ? ref_result.at(i + j * N).get() + : ref_result.at(j + i * N).get(); + auto test_value = + uplo_mode == apfp::BlasUplo::lower ? 
c_matrix.at(i + j * N).get() : c_matrix.at(j + i * N).get(); REQUIRE(IsClose(ref_value, test_value)); } } diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 6436854..8195885 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -2,9 +2,9 @@ #include -#include -#include #include +#include +#include #include #include "Config.h" @@ -18,23 +18,26 @@ Apfp::Apfp() { } std::string Apfp::FindKernel() { - { // Specify a path to the APFP kernel manually + { // Specify a path to the APFP kernel manually char* apfp_kernel_env_var = std::getenv("APFP_KERNEL"); - if(apfp_kernel_env_var != nullptr) { + if (apfp_kernel_env_var != nullptr) { auto kernel_override_path = std::filesystem::path(apfp_kernel_env_var); - + if (!std::filesystem::exists(kernel_override_path)) { - throw std::runtime_error("APFP kernel path specified with APFP_KERNEL environment variable does not exist"); + throw std::runtime_error( + "APFP kernel path specified with APFP_KERNEL environment variable does not exist"); } return kernel_override_path.string(); - } + } } char* apfp_use_simulation_env_var = std::getenv("APFP_USE_SIMULATION"); - auto apfp_use_simulation = apfp_use_simulation_env_var != nullptr && !std::string(apfp_use_simulation_env_var).empty(); - auto kernel_name = std::filesystem::path(apfp_use_simulation ? "MatrixMultiplication_hw_emu.xclbin" : "MatrixMultiplication_hw.xclbin"); + auto apfp_use_simulation = + apfp_use_simulation_env_var != nullptr && !std::string(apfp_use_simulation_env_var).empty(); + auto kernel_name = std::filesystem::path(apfp_use_simulation ? 
"MatrixMultiplication_hw_emu.xclbin" + : "MatrixMultiplication_hw.xclbin"); - { // Search for the kernel in /lib, /usr/lib, LD_LIBRARY_PATH, current directory + { // Search for the kernel in /lib, /usr/lib, LD_LIBRARY_PATH, current directory std::vector search_paths; // System dirs search_paths.push_back(std::filesystem::path("/lib")); @@ -44,9 +47,10 @@ std::string Apfp::FindKernel() { char* ld_library_path_env_var = std::getenv("LD_LIBRARY_PATH"); auto ld_library_path = (ld_library_path_env_var == nullptr) ? "" : std::string(ld_library_path_env_var); - for(std::string::iterator seg_begin = ld_library_path.begin(), seg_end; seg_begin < ld_library_path.end(); seg_begin = seg_end+1) { + for (std::string::iterator seg_begin = ld_library_path.begin(), seg_end; seg_begin < ld_library_path.end(); + seg_begin = seg_end + 1) { seg_end = std::find(seg_begin, ld_library_path.end(), ':'); - + std::string candidate_path(seg_begin, seg_end); search_paths.push_back(std::filesystem::path(candidate_path)); } @@ -55,9 +59,9 @@ std::string Apfp::FindKernel() { search_paths.push_back(std::filesystem::current_path()); // Search - for(auto candidate_dir : search_paths) { + for (auto candidate_dir : search_paths) { auto candidate_kernel_path = candidate_dir / kernel_name; - if(std::filesystem::exists(candidate_kernel_path)) { + if (std::filesystem::exists(candidate_kernel_path)) { return candidate_kernel_path.string(); } } @@ -86,8 +90,9 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De if (a.cols() != b.rows() || result->rows() != a.rows() || result->cols() != b.cols()) { throw std::logic_error("Matrix dimension mismatch"); } - auto kernel = program_->MakeKernel("MatrixMultiplication", a.buffer_, b.buffer_, result->buffer_, result->buffer_, - static_cast(a.rows()), static_cast(b.rows()), static_cast(result->cols())); + auto kernel = + program_->MakeKernel("MatrixMultiplication", a.buffer_, b.buffer_, result->buffer_, result->buffer_, + 
static_cast(a.rows()), static_cast(b.rows()), static_cast(result->cols())); kernel.ExecuteTask(); } @@ -103,7 +108,7 @@ DeviceMatrix Apfp::Transpose(const DeviceMatrix&) { throw UnimplementedException(); } -template +template void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Source host buffer size smaller than destination device matrix size"); @@ -113,7 +118,7 @@ void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std:: std::vector host_buffer; host_buffer.resize(cols() * rows()); - for(std::size_t i = 0; i < host_buffer.size(); ++i) { + for (std::size_t i = 0; i < host_buffer.size(); ++i) { host_buffer[i] = PackedFloat(buffer_ptr_func(i)); } @@ -137,7 +142,7 @@ void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_ptr dest) { packed.ToGmp(dest); } -template +template void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source device matrix size"); @@ -149,7 +154,7 @@ void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::si buffer_.CopyToHost(0, kLinesPerNumber * host_buffer.size(), reinterpret_cast(host_buffer.data())); interface::Wrapper scratch; - for(std::size_t i = 0; i < host_buffer.size(); ++i) { + for (std::size_t i = 0; i < host_buffer.size(); ++i) { PackedFloatToInterfaceType(host_buffer[i], buffer_ptr_func(i)); } } @@ -162,4 +167,4 @@ void DeviceMatrix::TransferToHost(interface::Wrapper* buffer_ptr, std::size_t bu TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr[i].get(); }, buffer_size); } -} \ No newline at end of file +} // namespace apfp \ No newline at end of file diff --git a/interface/Apfp.h b/interface/Apfp.h index 4eb004d..2c12205 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -2,15 +2,13 @@ 
#include #include +#include #include +#include "ApfpInterfaceType.h" #include "MatrixMultiplication.h" #include "PackedFloat.h" -#include "ApfpInterfaceType.h" - -#include - namespace apfp { class DeviceMatrix; @@ -21,8 +19,9 @@ class Apfp { std::optional program_; std::size_t lines_per_number_; - + static std::string FindKernel(); + public: Apfp(); @@ -70,24 +69,23 @@ class DeviceMatrix { void TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size); void TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size); - /// Transfer from the device to the host /// TODO: Make this take output iterators void TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size); void TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size); private: - template + template void TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size); - template + template void TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size); }; // === Custom exception types === struct ApfpException : public std::exception { std::string e; - + ApfpException() { e = ""; } @@ -109,4 +107,4 @@ struct UnimplementedException : public ApfpException { using ApfpException::ApfpException; }; -} \ No newline at end of file +} // namespace apfp \ No newline at end of file diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 6dcc508..323730e 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -1,9 +1,11 @@ #include "ApfpBlas.h" -#include "Apfp.h" -#include "Config.h" + #include #include +#include "Apfp.h" +#include "Config.h" + namespace apfp { static std::optional apfp; @@ -18,12 +20,12 @@ int Init(unsigned long precision) { } apfp.emplace(); return BlasError::success; - - }catch(const KernelNotFoundException& e) { + + } catch (const KernelNotFoundException& e) { last_error_message = e.what(); return BlasError::kernel_not_found; - } catch(const std::exception& e) { + } catch (const 
std::exception& e) { // Unknown exception last_error_message = e.what(); return BlasError::unknown; @@ -44,8 +46,9 @@ const char* ErrorDescription() { } /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer -template -void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { +template +void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, + interface::Wrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { @@ -58,8 +61,9 @@ void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, uns } /// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A -template -void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { +template +void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, + interface::Wrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < N; ++j) { @@ -71,8 +75,9 @@ void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsig } /// Copy from an NxK matrix A to a full size buffer -template -void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { +template +void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, + interface::Wrapper* buffer) { auto dest_lda = N; // Col major layout for (unsigned long j = 0; j < K; ++j) { @@ -83,8 +88,9 @@ void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsig } /// Copy the transpose of a NxK matrix A to a full size buffer -template -void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { +template +void CopyTransposeFromMatrix(unsigned long N, 
unsigned long K, ptr_function_type A, unsigned long LDA, + interface::Wrapper* buffer) { auto dest_lda = K; // Col major layout for (unsigned long j = 0; j < K; ++j) { @@ -95,8 +101,9 @@ void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type } /// Copy to an NxK matrix A from a full size buffer -template -void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, interface::Wrapper* buffer) { +template +void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA, + interface::Wrapper* buffer) { auto source_lda = N; // Col major layout for (unsigned long j = 0; j < K; ++j) { @@ -106,11 +113,12 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne } } -template -int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { +template +int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, + unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { try { // ==== library input validation stuff ==== - if(!ApfpIsInitialized()) { + if (!ApfpIsInitialized()) { return BlasError::uninitialized; } @@ -125,33 +133,41 @@ int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long unsigned long A_rows = use_transpose ? K : N; unsigned long A_cols = use_transpose ? N : K; - if (LDA < (use_transpose ? K : N)) { return -6; } - if (LDC < N) { return -8; } + if (LDA < (use_transpose ? 
K : N)) { + return -6; + } + if (LDC < N) { + return -8; + } // Empty matrix no-op - if (N == 0) { return BlasError::success; } - if (K == 0) { return BlasError::success; } - + if (N == 0) { + return BlasError::success; + } + if (K == 0) { + return BlasError::success; + } + // ==== setup ==== std::vector host_a, host_a_transpose, host_c; - host_a.resize(N*K); + host_a.resize(N * K); CopyFromMatrix(A_rows, A_cols, A, LDA, host_a.data()); auto device_a = apfp->AllocateDeviceMatrix(A_rows, A_cols); device_a.TransferToDevice(host_a.data(), host_a.size()); - host_a_transpose.resize(K*N); + host_a_transpose.resize(K * N); CopyTransposeFromMatrix(A_rows, A_cols, A, LDA, host_a_transpose.data()); auto device_a_transpose = apfp->AllocateDeviceMatrix(A_cols, A_rows); device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size()); - host_c.resize(N*N); + host_c.resize(N * N); CopyFromMatrixUplo(uplo, N, C, LDC, host_c.data()); auto device_c = apfp->AllocateDeviceMatrix(N, N); device_c.TransferToDevice(host_c.data(), host_c.size()); // ==== compute and teardown ==== auto mul_result = apfp->AllocateDeviceMatrix(N, N); - if(use_transpose) { + if (use_transpose) { apfp->MatrixMultiplication(device_a_transpose, device_a, &device_c); } else { apfp->MatrixMultiplication(device_a, device_a_transpose, &device_c); @@ -159,7 +175,7 @@ int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long device_c.TransferToHost(host_c.data(), host_c.size()); CopyToMatrixUplo(uplo, N, C, LDC, host_c.data()); - } catch(const std::exception& e) { + } catch (const std::exception& e) { last_error_message = e.what(); return BlasError::unknown; } @@ -168,14 +184,16 @@ int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long } /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported -int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC) { +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, + interface::Ptr C, unsigned long LDC) { auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } -int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, + IndexFunction C, unsigned long LDC) { return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } -} \ No newline at end of file +} // namespace apfp \ No newline at end of file diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 10d2c30..bc2c7cc 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -1,23 +1,22 @@ #pragma once -#include +#include + #include + #include "ApfpInterfaceType.h" namespace apfp { enum BlasError : int { - success = 0, - unknown = 1, - unimplemented = 2, - bitwidth = 3, - uninitialized = 4, - kernel_not_found = 5, + success = 0, + unknown = 1, + unimplemented = 2, + bitwidth = 3, + uninitialized = 4, + kernel_not_found = 5, }; -enum class BlasUplo : char { - upper = 'U', - lower = 'L' -}; +enum class BlasUplo : char { upper = 'U', lower = 'L' }; enum class BlasTrans : char { normal = 'N', @@ -27,7 +26,6 @@ enum class BlasTrans : char { using IndexFunction = std::function; using ConstIndexFunction = std::function; - /// Null terminated string describing the most recent library error if available /// Pointer is only guaranteed to live until the next library call const char* 
ErrorDescription(); @@ -37,7 +35,9 @@ int Init(unsigned long precision); int Finalize(); /// See netlib's documentation on Syrk for usage. Alpha and beta unsupported -int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr C, unsigned long LDC); -int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, + interface::Ptr C, unsigned long LDC); +int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, + IndexFunction C, unsigned long LDC); -} \ No newline at end of file +} // namespace apfp \ No newline at end of file diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp index aab6845..d86a7fc 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -79,7 +79,6 @@ Wrapper::Wrapper(unsigned long precision) { Init2(data_, precision); } - Wrapper::Wrapper(Wrapper&& other) { Swap(data_, other.data_); Clear(other.data_); @@ -91,4 +90,4 @@ Wrapper& Wrapper::operator=(Wrapper&& other) { return *this; } -} \ No newline at end of file +} // namespace apfp::interface \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h index b0ef7d4..57ed2bf 100644 --- a/interface/ApfpInterfaceType.h +++ b/interface/ApfpInterfaceType.h @@ -1,9 +1,8 @@ #pragma once -#include "Config.h" - #include #include +#include "Config.h" /* This header abstracts away the choice of MPFR or GMP in the interface * It defines four types: Value, Ptr, ConstPtr, Wrapper @@ -12,7 +11,7 @@ */ namespace apfp::interface { -#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types +#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types using Value = mpf_t; using Ptr = 
mpf_ptr; using ConstPtr = mpf_srcptr; @@ -43,7 +42,7 @@ void Mul(Ptr dest, ConstPtr a, ConstPtr b); class Wrapper { Value data_; -public: + public: ~Wrapper(); Wrapper(); @@ -57,11 +56,15 @@ class Wrapper { Wrapper& operator=(const Wrapper&) = delete; Wrapper& operator=(Wrapper&&); - + // This decays to the pointer type - Ptr get() { return data_; } + Ptr get() { + return data_; + } - ConstPtr get() const { return data_; } + ConstPtr get() const { + return data_; + } }; -} \ No newline at end of file +} // namespace apfp::interface \ No newline at end of file From 78a45949455939a796acd217a82d0be6fb1a207f Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 31 Dec 2021 22:56:14 +0100 Subject: [PATCH 49/67] apfpHostlib naming convention --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a40fca..64bb070 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,9 +63,9 @@ add_library(simulation ${APFP_KERNEL_FILES}) target_compile_options(simulation PRIVATE -Wno-unknown-pragmas -DAP_INT_MAX_W=${APFP_MAX_BITS}) target_link_libraries(simulation ${CMAKE_THREAD_LIBS_INIT}) -add_library(ApfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp interface/ApfpInterfaceType.cpp) -target_link_libraries(ApfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) -target_compile_definitions(ApfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) +add_library(apfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp interface/ApfpInterfaceType.cpp) +target_link_libraries(apfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) +target_compile_definitions(apfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) # Executable used to run in simulation mode, calling the kernel as a C++ function directly add_executable(TestSimulation host/TestProgram.cpp) @@ -81,13 +81,13 @@ enable_testing() add_test(TestSimulation TestSimulation 4 4 4) add_library(Catch host/Catch.cpp) add_executable(UnitTests host/UnitTests.cpp) 
-target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp ApfpHostlib simulation) +target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) add_test(UnitTests UnitTests) add_executable(BlasUnitTests host/BlasUnitTests.cpp) -target_link_libraries(BlasUnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp ApfpHostlib simulation) +target_link_libraries(BlasUnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) -install(TARGETS ApfpHostlib) +install(TARGETS apfpHostlib) install(FILES interface/Apfp.h interface/ApfpBlas.h From 119185cc23ad2a564ee4368313f50730c21744a7 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 3 Jan 2022 13:29:09 +0100 Subject: [PATCH 50/67] Switch kernel to column major ordering --- device/MatrixMultiplication.cpp | 38 +++++++++++++------------- host/MatrixMultiplicationReference.cpp | 5 ++-- host/TestProgram.cpp | 10 +++---- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index 683c1e7..4cc171b 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -9,7 +9,7 @@ // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. 
template -void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -19,7 +19,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { a_to_feeder.Push(*reinterpret_cast(num)); } @@ -28,14 +28,14 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee } template <> -void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE ReadA_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - const auto num = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber]; + const auto num = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber]; a_to_feeder.Push(*reinterpret_cast(&num)); } } @@ -50,7 +50,7 @@ void ReadA(DramLine const *const mem, hlslib::Stream &a_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadA_K: for (int k = 0; k < size_k; ++k) { - ReadAInner(mem, a_to_feeder, size_k, n0, k); + ReadAInner(mem, a_to_feeder, size_n, n0, k); } } } @@ -89,7 +89,7 @@ void FeedA(hlslib::Stream &a_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int 
m0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -99,7 +99,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { b_to_feeder.Push(*reinterpret_cast(num)); } @@ -108,14 +108,14 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee } template <> -void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int m0, const int k) { #pragma HLS INLINE ReadB_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - const auto num = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber]; + const auto num = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber]; b_to_feeder.Push(*reinterpret_cast(&num)); } } @@ -130,7 +130,7 @@ void ReadB(DramLine const *const mem, hlslib::Stream &b_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadB_K: for (int k = 0; k < size_k; ++k) { - ReadBInner(mem, b_to_feeder, size_m, m0, k); + ReadBInner(mem, b_to_feeder, size_k, m0, k); } } } @@ -167,7 +167,7 @@ void FeedB(hlslib::Stream &b_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: @@ -177,7 +177,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS 
PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { c_to_feeder.Push(*reinterpret_cast(num)); } @@ -186,14 +186,14 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee } template <> -void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - const auto num = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber]; + const auto num = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber]; c_to_feeder.Push(*reinterpret_cast(&num)); } } @@ -207,7 +207,7 @@ void ReadC(DramLine const *const mem, hlslib::Stream &c_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadC_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { - ReadCInner(mem, c_to_feeder, size_m, n0, m0, n1); + ReadCInner(mem, c_to_feeder, size_n, n0, m0, n1); } } } @@ -287,7 +287,7 @@ void WriteCInner(hlslib::Stream &from_kernel, DramLine *const mem, } const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i] = num[i]; + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i] = num[i]; } } } @@ -304,7 +304,7 @@ void WriteCInner<1>(hlslib::Stream &from_kernel, DramLine *const me const auto num = from_kernel.Pop(); const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[((n0 * kTileSizeN + n1) * size_m + m0 * 
kTileSizeM + m1) * kLinesPerNumber] = + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber] = *reinterpret_cast(&num); } } @@ -351,7 +351,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const PackedFloat c_read = c_in.Pop(); const PackedFloat a = (m1 == 0) ? a_read : a_buffer; const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; - const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; + const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 + m1 * kTileSizeN]; a_buffer = a; b_buffer[m1] = b; // Ignore contributions from out-of-bound indices @@ -360,7 +360,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(), in_bounds ? b : PackedFloat::Zero(), c); // Write back to buffer - c_buffer[n1 * kTileSizeM + m1] = res; + c_buffer[n1 + m1 * kTileSizeN] = res; c_out.Push(res); } } diff --git a/host/MatrixMultiplicationReference.cpp b/host/MatrixMultiplicationReference.cpp index 42d2b98..3c0d180 100644 --- a/host/MatrixMultiplicationReference.cpp +++ b/host/MatrixMultiplicationReference.cpp @@ -8,8 +8,9 @@ void MatrixMultiplicationReference(mpfr_t const *a, mpfr_t const *b, mpfr_t *c, for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_mul(tmp, a[n * size_k + k], b[k * size_m + m], kRoundingMode); - mpfr_t &_c = c[n * size_m + m]; + // C(n, m) = sum_k A(n, k) B(k, m) + mpfr_mul(tmp, a[n + k * size_n], b[k + m * size_k], kRoundingMode); + mpfr_t &_c = c[n + m * size_n]; mpfr_add(_c, _c, tmp, kRoundingMode); } } diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp index e4c2eb2..3a6b414 100644 --- a/host/TestProgram.cpp +++ b/host/TestProgram.cpp @@ -109,8 +109,8 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) // Verify results for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - const PackedFloat res = c_host[n * size_m 
+ m]; - const PackedFloat ref(c_mpfr[n * size_m + m]); + const PackedFloat res = c_host[n + m * size_n]; + const PackedFloat ref(c_mpfr[n + m * size_n]); if (ref != res) { std::cerr << "Verification failed at (" << n << ", " << m << "):\n\t" << res << "\n\t" << ref << "\n"; return false; @@ -122,17 +122,17 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) // Clean up for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { - mpfr_clear(a_mpfr[n * size_k + k]); + mpfr_clear(a_mpfr[n + k * size_n]); } } for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(b_mpfr[k * size_m + m]); + mpfr_clear(b_mpfr[k + m * size_k]); } } for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(c_mpfr[n * size_m + m]); + mpfr_clear(c_mpfr[n + m * size_n]); } } From 8338d02a08266a13f36a9a10b41e6a528e9b6e23 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Mon, 3 Jan 2022 13:40:33 +0100 Subject: [PATCH 51/67] Remove extremely large volume simulation test cases --- scripts/run_simulation.sh | 58 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/scripts/run_simulation.sh b/scripts/run_simulation.sh index cee9fbd..f2725f4 100755 --- a/scripts/run_simulation.sh +++ b/scripts/run_simulation.sh @@ -1,13 +1,65 @@ #!/bin/bash sizes=(1 3 4 7 9 15 16 17 31 33 41) +small_sizes=(1 3 4 7 9 15 16 17) +large_sizes=(31 33 41) batch_size=12 -for n in "${sizes[@]}" + +for n in "${small_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${small_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + + +for n in "${small_sizes[@]}" +do + for m in "${large_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if 
[[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" do - for m in "${sizes[@]}" + for m in "${large_sizes[@]}" do - for k in "${sizes[@]}" + for k in "${small_sizes[@]}" do echo $n $m $k 1>&2 (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & From 137d11dc661f101bb703d7bd72b03c1b4b43e94c Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Tue, 4 Jan 2022 03:02:15 -0800 Subject: [PATCH 52/67] ApfpIsInitialized |-> IsInitialized Co-authored-by: definelicht --- interface/ApfpBlas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index 323730e..abc1804 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -37,7 +37,7 @@ int Finalize() { return BlasError::success; } -bool ApfpIsInitialized() { +bool IsInitialized() { return apfp.has_value(); } From 98434fcbd8ef71d93d53dbcda7f20a62435d53e5 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Thu, 6 Jan 2022 23:54:57 +0100 Subject: [PATCH 53/67] Scale back directory search for kernel --- include/Config.h.in | 1 + interface/Apfp.cpp | 21 ++------------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/include/Config.h.in b/include/Config.h.in index 957943f..6b7a358 100644 --- a/include/Config.h.in +++ b/include/Config.h.in @@ -6,6 +6,7 @@ constexpr int kMultBaseBits = ${APFP_MULT_BASE_BITS}; constexpr int kTileSizeN = ${APFP_TILE_SIZE_N}; constexpr int kTileSizeM = ${APFP_TILE_SIZE_M}; constexpr auto kBuildDir = "${CMAKE_BINARY_DIR}"; +constexpr auto kInstallPrefix = "${CMAKE_INSTALL_PREFIX}"; static_assert(kBits % 8 == 0, "Number of bits must be 
byte-aligned."); #define APFP_${APFP_INTERFACE_TYPE}_INTERFACE_TYPE diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 8195885..86a40bc 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -37,26 +37,9 @@ std::string Apfp::FindKernel() { auto kernel_name = std::filesystem::path(apfp_use_simulation ? "MatrixMultiplication_hw_emu.xclbin" : "MatrixMultiplication_hw.xclbin"); - { // Search for the kernel in /lib, /usr/lib, LD_LIBRARY_PATH, current directory + { std::vector search_paths; - // System dirs - search_paths.push_back(std::filesystem::path("/lib")); - search_paths.push_back(std::filesystem::path("/usr/lib")); - - // LD_LIBRARY_PATH - char* ld_library_path_env_var = std::getenv("LD_LIBRARY_PATH"); - auto ld_library_path = (ld_library_path_env_var == nullptr) ? "" : std::string(ld_library_path_env_var); - - for (std::string::iterator seg_begin = ld_library_path.begin(), seg_end; seg_begin < ld_library_path.end(); - seg_begin = seg_end + 1) { - seg_end = std::find(seg_begin, ld_library_path.end(), ':'); - - std::string candidate_path(seg_begin, seg_end); - search_paths.push_back(std::filesystem::path(candidate_path)); - } - - // Current working directory - search_paths.push_back(std::filesystem::current_path()); + search_paths.push_back(std::filesystem::path(kInstallPrefix)); // Search for (auto candidate_dir : search_paths) { From 4e239181a470e32dee6e3441099c5aa9395edccc Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:05:30 +0100 Subject: [PATCH 54/67] Missing function renames --- interface/ApfpBlas.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index abc1804..e978a3d 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -114,11 +114,11 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne } template -int ApfpSyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a 
A, +int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { try { // ==== library input validation stuff ==== - if (!ApfpIsInitialized()) { + if (!IsInitialized()) { return BlasError::uninitialized; } @@ -188,12 +188,12 @@ int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, inter interface::Ptr C, unsigned long LDC) { auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; - return ApfpSyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); + return SyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC); } int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC) { - return ApfpSyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); + return SyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } } // namespace apfp \ No newline at end of file From 7f1fa90690b2aefe852a363f6de31834639b96bb Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:07:48 +0100 Subject: [PATCH 55/67] Add cwd to kernel search path --- interface/Apfp.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 86a40bc..4236d32 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -39,6 +39,7 @@ std::string Apfp::FindKernel() { { std::vector search_paths; + search_paths.push_back(std::filesystem::current_path()); search_paths.push_back(std::filesystem::path(kInstallPrefix)); // Search From 6fd095e9b5e0b8c475208b4fcb4ff8788ba3bacf Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:08:45 +0100 Subject: [PATCH 56/67] Set INTERFACE_TYPE to SEMANTICS --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
5117a82..c9dfb62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,10 @@ set(APFP_DEBUGGING OFF CACHE BOOL "Enable debugging in generated kernels.") set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in generated kernels.") set(APFP_SAVE_TEMPS OFF CACHE BOOL "Save temporary files from kernel builds.") set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR) -set(APFP_INTERFACE_TYPE "MPFR" CACHE STRING "Which data types to use for the interface [GMP/MPFR].") -set_property(CACHE APFP_INTERFACE_TYPE PROPERTY STRINGS GMP MPFR) + +# One day we might accept both +set(APFP_INTERFACE_TYPE ${APFP_SEMANTICS}) +# but not today # Validation and derived numbers math(EXPR APFP_ALIGNED "${APFP_BITS} % 512") From 9d1173c305eb8903d5ff696fd7583fee5d21002a Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:28:37 +0100 Subject: [PATCH 57/67] BlasError is scoped enum --- host/BlasUnitTests.cpp | 4 ++-- interface/ApfpBlas.cpp | 24 ++++++++++++++---------- interface/ApfpBlas.h | 9 +++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 0dade4b..42c18fb 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -17,7 +17,7 @@ void ApfpSetup() { mpfr_set_default_prec(kMantissaBits); #endif auto apfp_error_code = apfp::Init(kMantissaBits); - REQUIRE(apfp_error_code == apfp::BlasError::success); + REQUIRE(apfp_error_code); } void ApfpTeardown() { @@ -119,7 +119,7 @@ TEST_CASE("SYRK") { auto error_code = apfp::Syrk( uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == apfp::BlasTrans::normal ? 
N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); - REQUIRE(error_code == apfp::BlasError::success); + REQUIRE(error_code); // Check all entries are sufficiently close apfp::interface::Wrapper diff; diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index e978a3d..e9f1182 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -16,25 +16,25 @@ int Init(unsigned long precision) { if (precision > kBits) { // Requested bit width too large last_error_message = "Requested bitwidth too large"; - return BlasError::bitwidth; + return static_cast(BlasError::bitwidth); } apfp.emplace(); - return BlasError::success; + return static_cast(BlasError::success); } catch (const KernelNotFoundException& e) { last_error_message = e.what(); - return BlasError::kernel_not_found; + return static_cast(BlasError::kernel_not_found); } catch (const std::exception& e) { // Unknown exception last_error_message = e.what(); - return BlasError::unknown; + return static_cast(BlasError::unknown); } } int Finalize() { apfp.reset(); - return BlasError::success; + return static_cast(BlasError::success); } bool IsInitialized() { @@ -45,6 +45,10 @@ const char* ErrorDescription() { return last_error_message.c_str(); } +BlasError InterpretError(int a) { + return a < 0 ? 
BlasError::argument_error : static_cast(a); +} + /// Copy the upper or lower triangle from an NxN matrix A to a full size buffer template void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA, @@ -119,7 +123,7 @@ int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, p try { // ==== library input validation stuff ==== if (!IsInitialized()) { - return BlasError::uninitialized; + return static_cast(BlasError::uninitialized); } // A is NxK if 'N', KxN if 'T' @@ -142,10 +146,10 @@ int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, p // Empty matrix no-op if (N == 0) { - return BlasError::success; + return static_cast(BlasError::success); } if (K == 0) { - return BlasError::success; + return static_cast(BlasError::success); } // ==== setup ==== @@ -177,10 +181,10 @@ int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, p CopyToMatrixUplo(uplo, N, C, LDC, host_c.data()); } catch (const std::exception& e) { last_error_message = e.what(); - return BlasError::unknown; + return static_cast(BlasError::unknown); } - return BlasError::success; + return static_cast(BlasError::success); } /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index bc2c7cc..8badfa5 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -7,13 +7,14 @@ namespace apfp { -enum BlasError : int { +enum class BlasError : int { success = 0, unknown = 1, unimplemented = 2, bitwidth = 3, uninitialized = 4, kernel_not_found = 5, + argument_error = 6, }; enum class BlasUplo : char { upper = 'U', lower = 'L' }; @@ -23,13 +24,17 @@ enum class BlasTrans : char { transpose = 'T', }; + using IndexFunction = std::function; using ConstIndexFunction = std::function; - /// Null terminated string describing the most recent library error if available /// Pointer is only guaranteed to live until the next library call const char* ErrorDescription(); +/// Convert a return code to a BlasError type +/// Negative return codes are converted to BlasError::argument_error +BlasError InterpretError(int a); + int Init(unsigned long precision); int Finalize(); From d0ab102ffbc9f06a5082218b3a978f5ef7155266 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:35:13 +0100 Subject: [PATCH 58/67] class Apfp -> Context --- interface/Apfp.cpp | 16 ++++++++-------- interface/Apfp.h | 8 ++++---- interface/ApfpBlas.cpp | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 4236d32..fda1c8a 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -11,13 +11,13 @@ namespace apfp { -Apfp::Apfp() { +Context::Context() { auto kernel_path = FindKernel(); program_.emplace(context_.MakeProgram(kernel_path)); lines_per_number_ = kLinesPerNumber; } -std::string Apfp::FindKernel() { +std::string Context::FindKernel() { { // Specify a path to the APFP kernel manually char* apfp_kernel_env_var = std::getenv("APFP_KERNEL"); if (apfp_kernel_env_var != nullptr) { @@ -54,7 +54,7 @@ std::string Apfp::FindKernel() { throw KernelNotFoundException("Unable to find FPGA kernel"); } -DeviceMatrix 
Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { +DeviceMatrix Context::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { // This seems like poor encapsulation, is there a better way? DeviceMatrix matrix; @@ -64,13 +64,13 @@ DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { return matrix; } -DeviceMatrix Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { +DeviceMatrix Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { auto result = AllocateDeviceMatrix(a.rows(), b.cols()); MatrixMultiplication(a, b, &result); return result; } -void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { +void Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { if (a.cols() != b.rows() || result->rows() != a.rows() || result->cols() != b.cols()) { throw std::logic_error("Matrix dimension mismatch"); } @@ -80,15 +80,15 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De kernel.ExecuteTask(); } -void Apfp::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { +void Context::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { throw UnimplementedException(); } -void Apfp::TransposeInPlace(DeviceMatrix*) { +void Context::TransposeInPlace(DeviceMatrix*) { throw UnimplementedException(); } -DeviceMatrix Apfp::Transpose(const DeviceMatrix&) { +DeviceMatrix Context::Transpose(const DeviceMatrix&) { throw UnimplementedException(); } diff --git a/interface/Apfp.h b/interface/Apfp.h index 2c12205..6be5555 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -14,7 +14,7 @@ namespace apfp { class DeviceMatrix; /// Object oriented interface for Apfp -class Apfp { +class Context { hlslib::ocl::Context context_; std::optional program_; @@ -23,7 +23,7 @@ class Apfp { static std::string FindKernel(); public: - Apfp(); + Context(); /// 
Allocate a buffer on the device DeviceMatrix AllocateDeviceMatrix(std::size_t rows, std::size_t cols); @@ -45,13 +45,13 @@ class Apfp { }; /// Helper class to track matrices on the device -/// We should probably refactor the interface to Apfp to something more controlled? +/// We should probably refactor the interface to Context to something more controlled? class DeviceMatrix { std::size_t num_rows_; std::size_t num_cols_; hlslib::ocl::Buffer buffer_; - friend Apfp; + friend Context; DeviceMatrix() = default; diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index e9f1182..b201bb5 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -8,7 +8,7 @@ namespace apfp { -static std::optional apfp; +static std::optional apfp; static std::string last_error_message; int Init(unsigned long precision) { From 155d0b87ffb42a23b5bccd2e654adb6347732edd Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:55:19 +0100 Subject: [PATCH 59/67] Use RNDZ MPFR rounding mode everywhere --- host/BlasUnitTests.cpp | 7 +++---- interface/ApfpInterfaceType.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 42c18fb..395fadd 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -46,10 +46,9 @@ bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { long exp; mpf_get_d_2exp(&exp, ratio.get()); #else - auto rounding_mode = mpfr_get_default_rounding_mode(); - mpfr_sub(diff.get(), a, b, rounding_mode); - mpfr_add(sum.get(), a, b, rounding_mode); - mpfr_div(ratio.get(), diff.get(), sum.get(), rounding_mode); + mpfr_sub(diff.get(), a, b, kRoundingMode); + mpfr_add(sum.get(), a, b, kRoundingMode); + mpfr_div(ratio.get(), diff.get(), sum.get(), kRoundingMode); auto exp = mpfr_get_exp(ratio.get()); #endif // Require the numbers to match to the first 90% decimal places diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp 
index d86a7fc..6862444 100644 --- a/interface/ApfpInterfaceType.cpp +++ b/interface/ApfpInterfaceType.cpp @@ -1,4 +1,5 @@ #include "ApfpInterfaceType.h" +#include "DeviceTypes.h" namespace apfp::interface { @@ -39,7 +40,7 @@ void Set(Ptr dest, ConstPtr source) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_set(dest, source); #else - mpfr_set(dest, source, mpfr_get_default_rounding_mode()); + mpfr_set(dest, source, kRoundingMode); #endif } @@ -47,7 +48,7 @@ void Set(Ptr dest, long int source) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_set_ui(dest, source); #else - mpfr_set_si(dest, source, mpfr_get_default_rounding_mode()); + mpfr_set_si(dest, source, kRoundingMode); #endif } @@ -55,7 +56,7 @@ void Add(Ptr dest, ConstPtr a, ConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_add(dest, a, b); #else - mpfr_add(dest, a, b, mpfr_get_default_rounding_mode()); + mpfr_add(dest, a, b, kRoundingMode); #endif } @@ -63,7 +64,7 @@ void Mul(Ptr dest, ConstPtr a, ConstPtr b) { #ifdef APFP_GMP_INTERFACE_TYPE mpf_mul(dest, a, b); #else - mpfr_mul(dest, a, b, mpfr_get_default_rounding_mode()); + mpfr_mul(dest, a, b, kRoundingMode); #endif } From 55c9fe8c72cd835ed159ffbd49c802328f496d19 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Fri, 7 Jan 2022 00:59:29 +0100 Subject: [PATCH 60/67] Throw KernelNotFoundException if APFP_KERNEL misset --- interface/Apfp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index fda1c8a..e88585e 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -24,7 +24,7 @@ std::string Context::FindKernel() { auto kernel_override_path = std::filesystem::path(apfp_kernel_env_var); if (!std::filesystem::exists(kernel_override_path)) { - throw std::runtime_error( + throw KernelNotFoundException( "APFP kernel path specified with APFP_KERNEL environment variable does not exist"); } return kernel_override_path.string(); From 4e5c34d3e014daee4ed03ed6f1724df525f9d1ea Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: 
Fri, 7 Jan 2022 09:26:52 +0100 Subject: [PATCH 61/67] Add comment about memory layout --- device/MatrixMultiplication.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index 5252757..a7aa725 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -6,6 +6,11 @@ #include "ArithmeticOperations.h" +// All memory accesses are column-major! +// I.e. a(i,j) = a[i + LDA * j] +// AB = sum_k a(i,k) b(k, j) = sum_k a[i + LDA * k] * b[k + LDA * j] +// LDA (leading dimension of A) = stride + // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. template From 8b22b71c0883ee3447e6afb6aafb52dd8d5b9500 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 01:47:14 +0100 Subject: [PATCH 62/67] More descriptive SYRK unit tests --- host/BlasUnitTests.cpp | 51 ++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index 395fadd..d99fb01 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -17,7 +17,7 @@ void ApfpSetup() { mpfr_set_default_prec(kMantissaBits); #endif auto apfp_error_code = apfp::Init(kMantissaBits); - REQUIRE(apfp_error_code); + REQUIRE(!apfp_error_code); } void ApfpTeardown() { @@ -67,7 +67,8 @@ TEST_CASE("SYRK") { unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); - auto mode = GENERATE(apfp::BlasTrans::normal, apfp::BlasTrans::transpose); + + auto mode = GENERATE(apfp::BlasTrans::transpose); auto uplo_mode = GENERATE(apfp::BlasUplo::upper, apfp::BlasUplo::lower); // Test SYRK // In 'N' mode, we perform AA^T + C @@ -75,6 +76,7 @@ TEST_CASE("SYRK") { // C is NxN // Matrices are stored column major because BLAS { + 
CAPTURE(N, K, mode, uplo_mode); std::vector a_matrix; a_matrix.resize(N * K); for (auto& v : a_matrix) { @@ -90,25 +92,27 @@ TEST_CASE("SYRK") { std::vector ref_result; ref_result.resize(N * N); + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + // Compute reference result apfp::interface::Wrapper prod_temp; for (unsigned long j = 0; j < N; ++j) { // lower half for (unsigned long i = 0; i < N; ++i) { - auto r_idx = i + j * N; + auto r_idx = mode == apfp::BlasTrans::normal ? i + j * N : j + i * N; apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); for (unsigned long k = 0; k < K; ++k) { - // A is NxK if N, KxN if T - if (mode == apfp::BlasTrans::normal) { - // (AB)_ij = sum_k A(i,k)B(k,j) - apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * N).get(), - a_matrix.at(j + k * N).get()); - } else { - // (AB)_ij = sum_k A(i,k) B(k,j) - apfp::interface::Mul(prod_temp.get(), a_matrix.at(k + i * K).get(), - a_matrix.at(k + j * K).get()); - } + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * N).get(), a_matrix.at(j + k * N).get()); apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); } } @@ -118,17 +122,30 @@ TEST_CASE("SYRK") { auto error_code = apfp::Syrk( uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, mode == apfp::BlasTrans::normal ? 
N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); - REQUIRE(error_code); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); // Check all entries are sufficiently close apfp::interface::Wrapper diff; for (unsigned long j = 0; j < N; ++j) { - // lower half + // upper half for (unsigned long i = 0; i < j; ++i) { - auto ref_value = uplo_mode == apfp::BlasUplo::lower ? ref_result.at(i + j * N).get() + auto ref_value = uplo_mode == apfp::BlasUplo::upper ? ref_result.at(i + j * N).get() : ref_result.at(j + i * N).get(); auto test_value = - uplo_mode == apfp::BlasUplo::lower ? c_matrix.at(i + j * N).get() : c_matrix.at(j + i * N).get(); + uplo_mode == apfp::BlasUplo::upper ? 
c_matrix.at(i + j * N).get() : c_matrix.at(j + i * N).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); REQUIRE(IsClose(ref_value, test_value)); } } From ad5d63fcec4bad0fe02382383898cf5289c00f92 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 01:50:06 +0100 Subject: [PATCH 63/67] Add GEMM --- interface/ApfpBlas.cpp | 77 ++++++++++++++++++++++++++++++++++++++++-- interface/ApfpBlas.h | 6 ++++ 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index b201bb5..a727d1a 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -117,9 +117,21 @@ void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigne } } +/// Do all the intermediate work to get a device matrix out of a pointer function +/// TODO: we can make this handle the tranpose argument and LDA check (raise exception or option type) +template +DeviceMatrix MakeDeviceMatrix(unsigned long N, unsigned long M, ptr_function_type A, unsigned long LDA) { + std::vector host_a; + host_a.resize(N * M); + CopyFromMatrix(N, M, A, LDA, host_a.data()); + auto device_a = apfp->AllocateDeviceMatrix(N, M); + device_a.TransferToDevice(host_a.data(), host_a.size()); + return device_a; +} + template -int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, - unsigned long LDA, ptr_function_type_c C, unsigned long LDC) { +int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, + ptr_function_type_c C, unsigned long LDC) { try { // ==== library input validation stuff ==== if (!IsInitialized()) { @@ -200,4 +212,65 @@ int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, Const return SyrkImpl(uplo, trans, N, K, A, LDA, C, LDC); } +template +int GemmImpl(BlasTrans trans_a, 
BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, + ptr_function_type_a A, unsigned long LDA, ptr_function_type_b B, unsigned long LDB, ptr_function_type_c C, + unsigned long LDC) { + try { + // ==== library input validation stuff ==== + if (!IsInitialized()) { + return static_cast(BlasError::uninitialized); + } + + // Implement the transposed versions later + if (trans_a != BlasTrans::normal || trans_b != BlasTrans::normal) { + return static_cast(BlasError::unimplemented); + } + + // Empty matrix + if (N == 0 || M == 0 || K == 0) { + return static_cast(BlasError::success); + } + + // Validate leading dimensions are sane + if (LDA < M) { + return -7; + } + if (LDB < K) { + return -9; + } + + // ==== setup ==== + auto device_a = MakeDeviceMatrix(N, K, A, LDA); + auto device_b = MakeDeviceMatrix(K, M, B, LDB); + auto device_c = MakeDeviceMatrix(N, M, C, LDC); + + // ==== compute and teardown ==== + apfp->MatrixMultiplication(device_a, device_b, &device_c); + std::vector host_c; + host_c.resize(N * M); + device_c.TransferToHost(host_c.data(), host_c.size()); + CopyToMatrix(N, M, C, LDC, host_c.data()); + + } catch (const std::exception& e) { + last_error_message = e.what(); + return static_cast(BlasError::unknown); + } + + return static_cast(BlasError::success); +} +/// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, interface::ConstPtr A, + unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC) { + auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; + auto b_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return B + i; }; + auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; + return GemmImpl(trans_a, trans_b, N, M, K, a_ptr_function, LDA, b_ptr_function, LDB, c_ptr_function, LDC); +} + +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, ConstIndexFunction A, + unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC) { + return GemmImpl(trans_a, trans_b, N, M, K, A, LDA, B, LDB, C, LDC); +} + } // namespace apfp \ No newline at end of file diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index 8badfa5..fc4bf43 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -45,4 +45,10 @@ int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, inter int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction C, unsigned long LDC); +/// See netlib's documentation on Gemm for usage. 
Alpha and beta unsupported +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, interface::ConstPtr A, unsigned long LDA, + interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC); +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, ConstIndexFunction A, unsigned long LDA, + IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC); + } // namespace apfp \ No newline at end of file From 40a61a6c10779c4c50e389872bf011d2460a6a4b Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 01:54:47 +0100 Subject: [PATCH 64/67] Missing syrk test case --- host/BlasUnitTests.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index d99fb01..d878027 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -68,7 +68,7 @@ TEST_CASE("SYRK") { unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); - auto mode = GENERATE(apfp::BlasTrans::transpose); + auto mode = GENERATE(apfp::BlasTrans::transpose, apfp::BlasTrans::normal); auto uplo_mode = GENERATE(apfp::BlasUplo::upper, apfp::BlasUplo::lower); // Test SYRK // In 'N' mode, we perform AA^T + C @@ -152,4 +152,16 @@ TEST_CASE("SYRK") { } ApfpTeardown(); +} + +TEST_CASE("GEMM") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long M = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + + } \ No newline at end of file From 66ecc7e591131b378e88e1c18dc3ff2d9ceb86fd Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 02:01:36 +0100 Subject: [PATCH 65/67] Fix M and N in GEMM --- interface/ApfpBlas.cpp | 20 ++++++++++---------- interface/ApfpBlas.h | 9 ++++----- 2 files changed, 14 insertions(+), 15 
deletions(-) diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp index a727d1a..0e757d5 100644 --- a/interface/ApfpBlas.cpp +++ b/interface/ApfpBlas.cpp @@ -213,7 +213,7 @@ int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, Const } template -int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, +int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA, ptr_function_type_b B, unsigned long LDB, ptr_function_type_c C, unsigned long LDC) { try { @@ -241,16 +241,16 @@ int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned lon } // ==== setup ==== - auto device_a = MakeDeviceMatrix(N, K, A, LDA); - auto device_b = MakeDeviceMatrix(K, M, B, LDB); - auto device_c = MakeDeviceMatrix(N, M, C, LDC); + auto device_a = MakeDeviceMatrix(M, K, A, LDA); + auto device_b = MakeDeviceMatrix(K, N, B, LDB); + auto device_c = MakeDeviceMatrix(M, N, C, LDC); // ==== compute and teardown ==== apfp->MatrixMultiplication(device_a, device_b, &device_c); std::vector host_c; - host_c.resize(N * M); + host_c.resize(M * N); device_c.TransferToHost(host_c.data(), host_c.size()); - CopyToMatrix(N, M, C, LDC, host_c.data()); + CopyToMatrix(M, N, C, LDC, host_c.data()); } catch (const std::exception& e) { last_error_message = e.what(); @@ -260,17 +260,17 @@ int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned lon return static_cast(BlasError::success); } /// See netlib's documentation on Syrk for usage. 
Alpha and beta unsupported -int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, interface::ConstPtr A, +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC) { auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; }; auto b_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return B + i; }; auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; }; - return GemmImpl(trans_a, trans_b, N, M, K, a_ptr_function, LDA, b_ptr_function, LDB, c_ptr_function, LDC); + return GemmImpl(trans_a, trans_b, M, N, K, a_ptr_function, LDA, b_ptr_function, LDB, c_ptr_function, LDC); } -int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, ConstIndexFunction A, +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC) { - return GemmImpl(trans_a, trans_b, N, M, K, A, LDA, B, LDB, C, LDC); + return GemmImpl(trans_a, trans_b, M, N, K, A, LDA, B, LDB, C, LDC); } } // namespace apfp \ No newline at end of file diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h index fc4bf43..89d7593 100644 --- a/interface/ApfpBlas.h +++ b/interface/ApfpBlas.h @@ -24,7 +24,6 @@ enum class BlasTrans : char { transpose = 'T', }; - using IndexFunction = std::function; using ConstIndexFunction = std::function; /// Null terminated string describing the most recent library error if available @@ -46,9 +45,9 @@ int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, Const IndexFunction C, unsigned long LDC); /// See netlib's documentation on Gemm for usage. 
Alpha and beta unsupported -int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, interface::ConstPtr A, unsigned long LDA, - interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC); -int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long N, unsigned long M, unsigned long K, ConstIndexFunction A, unsigned long LDA, - IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC); +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A, + unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC); +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A, + unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC); } // namespace apfp \ No newline at end of file From f9c7c3c3097c41714bbc80d697772381e331359e Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 02:19:24 +0100 Subject: [PATCH 66/67] GEMM unit tests --- host/BlasUnitTests.cpp | 87 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index d878027..aee3eae 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -52,7 +52,8 @@ bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { auto exp = mpfr_get_exp(ratio.get()); #endif // Require the numbers to match to the first 90% decimal places - return exp < -((kMantissaBits * 3 * 9) / 10); + // Also demand double precision downcast exactly equal + return (exp < -((kMantissaBits * 3 * 9) / 10)) && (mpfr_get_d(a, kRoundingMode) == mpfr_get_d(b, kRoundingMode)); } TEST_CASE("Init_Teardown") { @@ -159,9 +160,91 @@ TEST_CASE("GEMM") { auto rng = RandomNumberGenerator(); - unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); unsigned long M = 
GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + { + CAPTURE(M, N, K); + std::vector a_matrix; + a_matrix.resize(M * K); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector b_matrix; + b_matrix.resize(K * N); + for (auto& v : b_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(M * N); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(M * N); + + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + + // Compute reference result + apfp::interface::Wrapper prod_temp; + for (unsigned long j = 0; j < N; ++j) { + // lower half + for (unsigned long i = 0; i < M; ++i) { + auto r_idx = i + j * M; + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + + for (unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * M).get(), b_matrix.at(k + j * K).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + } + } + } + // Use APFP BLAS library + auto error_code = apfp::Gemm( + apfp::BlasTrans::normal, apfp::BlasTrans::normal, M, N, K, + [&](unsigned long i) { return a_matrix.at(i).get(); }, M, + [&](unsigned long i) { return b_matrix.at(i).get(); }, K, + [&](unsigned long i) { return c_matrix.at(i).get(); }, M); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + 
c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); + + // Check all entries are sufficiently close + apfp::interface::Wrapper diff; + for (unsigned long j = 0; j < N; ++j) { + // upper half + for (unsigned long i = 0; i < M; ++i) { + auto ref_value = ref_result.at(i + j * M).get(); + auto test_value = ref_result.at(i + j * M).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); + REQUIRE(IsClose(ref_value, test_value)); + } + } + } + + ApfpTeardown(); } \ No newline at end of file From 1685fd2279dd3923cc99e9ee430cb67c8501a166 Mon Sep 17 00:00:00 2001 From: Chris Pattison Date: Sat, 8 Jan 2022 02:22:40 +0100 Subject: [PATCH 67/67] Go fast and break things - just not the unit tests! --- host/BlasUnitTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp index aee3eae..420925a 100644 --- a/host/BlasUnitTests.cpp +++ b/host/BlasUnitTests.cpp @@ -237,7 +237,7 @@ TEST_CASE("GEMM") { // upper half for (unsigned long i = 0; i < M; ++i) { auto ref_value = ref_result.at(i + j * M).get(); - auto test_value = ref_result.at(i + j * M).get(); + auto test_value = c_matrix.at(i + j * M).get(); CAPTURE(i, j); CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode));