diff --git a/CMakeLists.txt b/CMakeLists.txt index c54aee8..f742b42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,10 @@ set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in generated kernels.") set(APFP_SAVE_TEMPS OFF CACHE BOOL "Save temporary files from kernel builds.") set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR) +# One day we might accept both +set(APFP_INTERFACE_TYPE ${APFP_SEMANTICS}) +# but not today + # Validation and derived numbers math(EXPR APFP_ALIGNED "${APFP_BITS} % 512") if(NOT APFP_ALIGNED EQUAL 0) @@ -30,7 +34,7 @@ find_package(GMP REQUIRED) find_package(Threads REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -Wno-unknown-pragmas -Wno-class-memaccess -DAPFP_${APFP_SEMANTICS}_SEMANTICS") -include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} ) +include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} interface) configure_file(include/Config.h.in Config.h) @@ -40,7 +44,7 @@ set(APFP_KERNEL_FILES device/MatrixMultiplication.cpp # Setup FPGA kernel targets add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES} - INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} + INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} ${GMP_INCLUDES} HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS" HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16" DEPENDS ${CMAKE_BINARY_DIR}/Config.h @@ -66,9 +70,9 @@ add_library(simulation ${APFP_KERNEL_FILES}) target_compile_options(simulation PRIVATE -Wno-unknown-pragmas -DAP_INT_MAX_W=${APFP_MAX_BITS}) target_link_libraries(simulation ${CMAKE_THREAD_LIBS_INIT}) -add_library(ApfpHostlib SHARED interface/Apfp.cpp) -target_link_libraries(ApfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) -target_compile_definitions(ApfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) +add_library(apfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp interface/ApfpInterfaceType.cpp) +target_link_libraries(apfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) +target_compile_definitions(apfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) # Executable used to run in simulation mode, calling the kernel as a C++ function directly add_executable(TestSimulation host/TestProgram.cpp) @@ -84,7 +88,20 @@ enable_testing() add_test(TestSimulation TestSimulation 4 4 4) add_library(Catch host/Catch.cpp) add_executable(UnitTests host/UnitTests.cpp) -target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp simulation) +target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) add_test(UnitTests UnitTests) -install(TARGETS ApfpHostlib) +add_executable(BlasUnitTests host/BlasUnitTests.cpp) +target_link_libraries(BlasUnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) + +install(TARGETS apfpHostlib) +install(FILES + interface/Apfp.h + interface/ApfpBlas.h + interface/ApfpInterfaceType.h + ${CMAKE_BINARY_DIR}/Config.h + DESTINATION include/apfp) +install(FILES + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw.xclbin + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw_emu.xclbin + DESTINATION lib) diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index ec53647..a7aa725 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -6,10 +6,15 @@ #include "ArithmeticOperations.h" +// All memory accesses are column-major! +// I.e. 
a(i,j) = a[i + LDA * j] +// AB = sum_k a(i,k) b(k, j) = sum_k a[i + LDA * k] * b[k + LDA * j] +// LDA (leading dimension of A) = stride + // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. template -void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -19,7 +24,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { a_to_feeder.Push(PackedFloat(num)); } @@ -28,7 +33,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee } template <> -void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE ReadA_N: @@ -36,7 +41,7 @@ void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[(n0 * kTileSizeN + n1) * size_k + k]; + num[0] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber]; a_to_feeder.Push(PackedFloat(num)); } } @@ -51,7 +56,7 @@ void ReadA(DramLine const *const mem, hlslib::Stream &a_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadA_K: for (int k = 0; k < size_k; ++k) { - ReadAInner(mem, a_to_feeder, size_k, n0, k); + ReadAInner(mem, a_to_feeder, size_n, n0, k); } } } @@ -90,7 +95,7 @@ void FeedA(hlslib::Stream &a_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int m0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -100,7 +105,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { b_to_feeder.Push(PackedFloat(num)); } @@ -109,7 +114,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee } template <> -void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int m0, const int k) { #pragma HLS INLINE ReadB_M: @@ -117,7 +122,7 @@ void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[k * size_m + m0 * kTileSizeM + m1]; + num[0] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber]; b_to_feeder.Push(PackedFloat(num)); } } @@ -132,7 +137,7 @@ void 
ReadB(DramLine const *const mem, hlslib::Stream &b_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadB_K: for (int k = 0; k < size_k; ++k) { - ReadBInner(mem, b_to_feeder, size_m, m0, k); + ReadBInner(mem, b_to_feeder, size_k, m0, k); } } } @@ -169,7 +174,7 @@ void FeedB(hlslib::Stream &b_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: @@ -179,7 +184,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { c_to_feeder.Push(PackedFloat(num)); } @@ -188,7 +193,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee } template <> -void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: @@ -196,7 +201,7 @@ void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1]; + num[0] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber]; c_to_feeder.Push(PackedFloat(num)); } } @@ -210,7 +215,7 @@ void ReadC(DramLine const *const mem, hlslib::Stream &c_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadC_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { - ReadCInner(mem, c_to_feeder, size_m, n0, m0, n1); + ReadCInner(mem, c_to_feeder, size_n, n0, m0, n1); } } } @@ -290,7 +295,7 @@ void WriteCInner(hlslib::Stream &from_kernel, DramLine *const mem, } const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i] = num[i]; + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i] = num[i]; } } } @@ -308,7 +313,7 @@ void WriteCInner<1>(hlslib::Stream &from_kernel, DramLine *const me from_kernel.Pop().UnpackFlits(num); const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1] = num[0]; + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber] = num[0]; } } } @@ -354,7 +359,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const PackedFloat c_read = c_in.Pop(); const PackedFloat a = (m1 == 0) ? a_read : a_buffer; const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; - const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; + const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 + m1 * kTileSizeN]; a_buffer = a; b_buffer[m1] = b; // Ignore contributions from out-of-bound indices @@ -363,7 +368,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const auto res = MultiplyAccumulate(in_bounds ? 
a : PackedFloat::Zero(), in_bounds ? b : PackedFloat::Zero(), c); // Write back to buffer - c_buffer[n1 * kTileSizeM + m1] = res; + c_buffer[n1 + m1 * kTileSizeN] = res; c_out.Push(res); } } diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp new file mode 100644 index 0000000..420925a --- /dev/null +++ b/host/BlasUnitTests.cpp @@ -0,0 +1,250 @@ +#include +#include +#include + +#include "Config.h" + +// #include "ArithmeticOperations.h" +// #include "Karatsuba.h" +// #include "PackedFloat.h" +#include "ApfpBlas.h" +#include "Random.h" + +void ApfpSetup() { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set_default_prec(kMantissaBits); +#else + mpfr_set_default_prec(kMantissaBits); +#endif + auto apfp_error_code = apfp::Init(kMantissaBits); + REQUIRE(!apfp_error_code); +} + +void ApfpTeardown() { + apfp::Finalize(); +} + +bool IsZero(apfp::interface::ConstPtr a) { +#ifdef APFP_GMP_INTERFACE_TYPE + return mpf_sgn(a) == 0; +#else + return mpfr_sgn(a) == 0; +#endif +} + +bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { + // Avoids divide by zero if a = b = 0 + if (IsZero(a) && IsZero(b)) { + return true; + } + + apfp::interface::Wrapper diff, sum, ratio; +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_sub(diff.get(), a, b); + mpf_add(sum.get(), a, b); + mpf_div(ratio.get(), diff.get(), sum.get()); + long exp; + mpf_get_d_2exp(&exp, ratio.get()); +#else + mpfr_sub(diff.get(), a, b, kRoundingMode); + mpfr_add(sum.get(), a, b, kRoundingMode); + mpfr_div(ratio.get(), diff.get(), sum.get(), kRoundingMode); + auto exp = mpfr_get_exp(ratio.get()); +#endif + // Require the numbers to match to the first 90% decimal places + // Also demand double precision downcast exactly equal + return (exp < -((kMantissaBits * 3 * 9) / 10)) && (mpfr_get_d(a, kRoundingMode) == mpfr_get_d(b, kRoundingMode)); +} + +TEST_CASE("Init_Teardown") { + ApfpSetup(); + ApfpTeardown(); +} + +TEST_CASE("SYRK") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + + auto mode = GENERATE(apfp::BlasTrans::transpose, apfp::BlasTrans::normal); + auto uplo_mode = GENERATE(apfp::BlasUplo::upper, apfp::BlasUplo::lower); + // Test SYRK + // In 'N' mode, we perform AA^T + C + // A is NxK (A : R^K -> R^N) + // C is NxN + // Matrices are stored column major because BLAS + { + CAPTURE(N, K, mode, uplo_mode); + std::vector a_matrix; + a_matrix.resize(N * K); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(N * N); + for (auto& v : c_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(N * N); + + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + + // Compute reference result + apfp::interface::Wrapper prod_temp; + for (unsigned long j = 0; j < N; ++j) { + // lower half + for (unsigned long i = 0; i < N; ++i) { + auto r_idx = mode == apfp::BlasTrans::normal ? 
i + j * N : j + i * N; + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + + for (unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * N).get(), a_matrix.at(j + k * N).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + } + } + } + + // Use APFP BLAS library + auto error_code = apfp::Syrk( + uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, + mode == apfp::BlasTrans::normal ? N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); + + // Check all entries are sufficiently close + apfp::interface::Wrapper diff; + for (unsigned long j = 0; j < N; ++j) { + // upper half + for (unsigned long i = 0; i < j; ++i) { + auto ref_value = uplo_mode == apfp::BlasUplo::upper ? ref_result.at(i + j * N).get() + : ref_result.at(j + i * N).get(); + auto test_value = + uplo_mode == apfp::BlasUplo::upper ? c_matrix.at(i + j * N).get() : c_matrix.at(j + i * N).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); + REQUIRE(IsClose(ref_value, test_value)); + } + } + } + + ApfpTeardown(); +} + +TEST_CASE("GEMM") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + + unsigned long M = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + + { + CAPTURE(M, N, K); + std::vector a_matrix; + a_matrix.resize(M * K); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector b_matrix; + b_matrix.resize(K * N); + for (auto& v : b_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(M * N); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(M * N); + + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + + // Compute reference result + apfp::interface::Wrapper prod_temp; + for (unsigned long j = 0; j < N; ++j) { + // lower half + for (unsigned long i = 0; i < M; ++i) { + auto r_idx = i + j * M; + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + + for (unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * M).get(), b_matrix.at(k + j * K).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + } + } + } + + // Use 
APFP BLAS library + auto error_code = apfp::Gemm( + apfp::BlasTrans::normal, apfp::BlasTrans::normal, M, N, K, + [&](unsigned long i) { return a_matrix.at(i).get(); }, M, + [&](unsigned long i) { return b_matrix.at(i).get(); }, K, + [&](unsigned long i) { return c_matrix.at(i).get(); }, M); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); + + // Check all entries are sufficiently close + apfp::interface::Wrapper diff; + for (unsigned long j = 0; j < N; ++j) { + // upper half + for (unsigned long i = 0; i < M; ++i) { + auto ref_value = ref_result.at(i + j * M).get(); + auto test_value = c_matrix.at(i + j * M).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); + REQUIRE(IsClose(ref_value, test_value)); + } + } + } + + ApfpTeardown(); +} \ No newline at end of file diff --git a/host/MatrixMultiplicationReference.cpp b/host/MatrixMultiplicationReference.cpp index 42d2b98..3c0d180 100644 --- a/host/MatrixMultiplicationReference.cpp +++ b/host/MatrixMultiplicationReference.cpp @@ -8,8 +8,9 @@ void MatrixMultiplicationReference(mpfr_t const *a, mpfr_t const *b, mpfr_t *c, for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_mul(tmp, a[n * size_k + k], b[k * size_m + m], kRoundingMode); - mpfr_t &_c = c[n * size_m + m]; + // C(n, m) = sum_k A(n, k) B(k, m) + mpfr_mul(tmp, a[n + k * size_n], b[k + m * size_k], kRoundingMode); + mpfr_t &_c = c[n + m * size_n]; mpfr_add(_c, _c, tmp, kRoundingMode); } } diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp index b3312d7..c38f300 100644 --- a/host/TestProgram.cpp +++ b/host/TestProgram.cpp @@ -114,8 +114,8 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, // Verify results for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - const PackedFloat res = c_host[n * size_m + m]; - const PackedFloat ref(c_mpfr[n * size_m + m]); + const PackedFloat res = c_host[n + m * size_n]; + const PackedFloat ref(c_mpfr[n + m * size_n]); if (ref != res) { std::cerr << "Verification failed at (" << n << ", " << m << "):\n\t" << res << "\n\t" << ref << "\n"; return false; @@ -127,17 +127,17 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, // Clean up for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { - mpfr_clear(a_mpfr[n * size_k + k]); + mpfr_clear(a_mpfr[n + k * size_n]); } } for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(b_mpfr[k * size_m + m]); + mpfr_clear(b_mpfr[k + m * size_k]); } } for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(c_mpfr[n * size_m + m]); + mpfr_clear(c_mpfr[n + m * size_n]); } } diff --git a/include/Config.h.in b/include/Config.h.in index e9e0ab1..6b7a358 100644 --- a/include/Config.h.in +++ b/include/Config.h.in @@ -6,4 +6,7 @@ constexpr int kMultBaseBits = ${APFP_MULT_BASE_BITS}; constexpr int kTileSizeN = 
${APFP_TILE_SIZE_N}; constexpr int kTileSizeM = ${APFP_TILE_SIZE_M}; constexpr auto kBuildDir = "${CMAKE_BINARY_DIR}"; +constexpr auto kInstallPrefix = "${CMAKE_INSTALL_PREFIX}"; static_assert(kBits % 8 == 0, "Number of bits must be byte-aligned."); + +#define APFP_${APFP_INTERFACE_TYPE}_INTERFACE_TYPE diff --git a/include/PackedFloat.h b/include/PackedFloat.h index 0673fdf..d3e7290 100644 --- a/include/PackedFloat.h +++ b/include/PackedFloat.h @@ -150,7 +150,7 @@ class PackedFloat { return *this; } - inline void ToGmp(mpf_ptr num) { + inline void ToGmp(mpf_ptr num) const { const size_t gmp_limbs = (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); constexpr size_t kNumLimbs = kMantissaBytes / sizeof(Limb); // GMP does not allow graceful rounding, so we cannot handle having insufficient bits in the target GMP number diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 9c9cd07..e88585e 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -2,16 +2,59 @@ #include +#include +#include #include +#include #include "Config.h" -Apfp::Apfp() { - program_.emplace(context_.MakeProgram(kernel_path_)); +namespace apfp { + +Context::Context() { + auto kernel_path = FindKernel(); + program_.emplace(context_.MakeProgram(kernel_path)); lines_per_number_ = kLinesPerNumber; } -DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { +std::string Context::FindKernel() { + { // Specify a path to the APFP kernel manually + char* apfp_kernel_env_var = std::getenv("APFP_KERNEL"); + if (apfp_kernel_env_var != nullptr) { + auto kernel_override_path = std::filesystem::path(apfp_kernel_env_var); + + if (!std::filesystem::exists(kernel_override_path)) { + throw KernelNotFoundException( + "APFP kernel path specified with APFP_KERNEL environment variable does not exist"); + } + return kernel_override_path.string(); + } + } + + char* apfp_use_simulation_env_var = std::getenv("APFP_USE_SIMULATION"); + auto apfp_use_simulation = + apfp_use_simulation_env_var != nullptr && !std::string(apfp_use_simulation_env_var).empty(); + auto kernel_name = std::filesystem::path(apfp_use_simulation ? "MatrixMultiplication_hw_emu.xclbin" + : "MatrixMultiplication_hw.xclbin"); + + { + std::vector search_paths; + search_paths.push_back(std::filesystem::current_path()); + search_paths.push_back(std::filesystem::path(kInstallPrefix)); + + // Search + for (auto candidate_dir : search_paths) { + auto candidate_kernel_path = candidate_dir / kernel_name; + if (std::filesystem::exists(candidate_kernel_path)) { + return candidate_kernel_path.string(); + } + } + } + + throw KernelNotFoundException("Unable to find FPGA kernel"); +} + +DeviceMatrix Context::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { // This seems like poor encapsulation, is there a better way? 
DeviceMatrix matrix; @@ -21,13 +64,13 @@ DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { return matrix; } -DeviceMatrix Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { +DeviceMatrix Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { auto result = AllocateDeviceMatrix(a.rows(), b.cols()); MatrixMultiplication(a, b, &result); return result; } -void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { +void Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { if (a.cols() != b.rows() || result->rows() != a.rows() || result->cols() != b.cols()) { throw std::logic_error("Matrix dimension mismatch"); } @@ -37,15 +80,20 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De kernel.ExecuteTask(); } -void Apfp::TransposeInPlace(DeviceMatrix*) { - throw std::exception(); +void Context::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { + throw UnimplementedException(); } -DeviceMatrix Apfp::Transpose(const DeviceMatrix&) { - throw std::exception(); +void Context::TransposeInPlace(DeviceMatrix*) { + throw UnimplementedException(); } -void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size) { +DeviceMatrix Context::Transpose(const DeviceMatrix&) { + throw UnimplementedException(); +} + +template +void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Source host buffer size smaller than destination device matrix size"); } @@ -54,17 +102,53 @@ void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_ std::vector host_buffer; host_buffer.resize(cols() * rows()); - std::transform(buffer_ptr, buffer_ptr + host_buffer.size(), host_buffer.begin(), - [](const mpf_t& a) { return PackedFloat(a); }); + for (std::size_t i = 0; i < host_buffer.size(); ++i) { + host_buffer[i] = PackedFloat(buffer_ptr_func(i)); + } buffer_.CopyFromHost(0, host_buffer.size() * kLinesPerNumber, reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size) { - if (rows() * cols() >= buffer_size) { +void DeviceMatrix::TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr + i; }, buffer_size); +} + +void DeviceMatrix::TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpfr_ptr dest) { + packed.ToMpfr(dest); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_ptr dest) { + packed.ToGmp(dest); +} + +template +void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { + if (rows() * cols() > buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source device matrix size"); } - buffer_.CopyToHost(0, kLinesPerNumber * rows() * cols(), reinterpret_cast(buffer_ptr)); + std::vector host_buffer; + host_buffer.resize(cols() * rows()); + + buffer_.CopyToHost(0, kLinesPerNumber * host_buffer.size(), reinterpret_cast(host_buffer.data())); + + interface::Wrapper scratch; + for (std::size_t i = 0; i < host_buffer.size(); ++i) { + 
PackedFloatToInterfaceType(host_buffer[i], buffer_ptr_func(i)); + } +} + +void DeviceMatrix::TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr + i; }, buffer_size); } + +void DeviceMatrix::TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr[i].get(); }, buffer_size); +} + +} // namespace apfp \ No newline at end of file diff --git a/interface/Apfp.h b/interface/Apfp.h index f17d39e..6be5555 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -2,23 +2,28 @@ #include #include +#include #include +#include "ApfpInterfaceType.h" #include "MatrixMultiplication.h" #include "PackedFloat.h" +namespace apfp { + class DeviceMatrix; /// Object oriented interface for Apfp -class Apfp { +class Context { hlslib::ocl::Context context_; std::optional program_; std::size_t lines_per_number_; - const std::string kernel_path_ = ""; + + static std::string FindKernel(); public: - Apfp(); + Context(); /// Allocate a buffer on the device DeviceMatrix AllocateDeviceMatrix(std::size_t rows, std::size_t cols); @@ -29,6 +34,9 @@ class Apfp { /// Three argument matrix multiply with supplied output buffer void MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + /// Three argument matrix addition with supplied output buffer + void MatrixAddition(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + // Transpose a matrix in place void TransposeInPlace(DeviceMatrix* a); @@ -37,13 +45,13 @@ class Apfp { }; /// Helper class to track matrices on the device -/// We should probably refactor the interface to Apfp to something more controlled? +/// We should probably refactor the interface to Context to something more controlled? 
 class DeviceMatrix {
     std::size_t num_rows_;
     std::size_t num_cols_;
     hlslib::ocl::Buffer buffer_;
-    friend Apfp;
+    friend Context;
 
     DeviceMatrix() = default;
@@ -58,9 +66,45 @@ class DeviceMatrix {
 
     /// Transfer from the host to the device
     /// TODO: Make this take input iterators
-    void TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size);
+    void TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size);
+    void TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size);
 
     /// Transfer from the device to the host
     /// TODO: Make this take output iterators
-    void TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size);
+    void TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size);
+    void TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size);
+
+   private:
+    template <typename ptr_function_type>
+    void TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size);
+
+    template <typename ptr_function_type>
+    void TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size);
+};
+
+// === Custom exception types ===
+struct ApfpException : public std::exception {
+    std::string e;
+
+    ApfpException() {
+        e = "";
+    }
+
+    ApfpException(const std::string& what_arg) {
+        e = what_arg;
+    }
+
+    virtual const char* what() const noexcept {
+        return e.c_str();
+    }
 };
+
+struct KernelNotFoundException : public ApfpException {
+    using ApfpException::ApfpException;
+};
+
+struct UnimplementedException : public ApfpException {
+    using ApfpException::ApfpException;
+};
+
+} // namespace apfp
\ No newline at end of file
diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp
new file mode 100644
index 0000000..0e757d5
--- /dev/null
+++ b/interface/ApfpBlas.cpp
@@ -0,0 +1,276 @@
+#include "ApfpBlas.h"
+
+#include
+#include
+
+#include "Apfp.h"
+#include "Config.h"
+
+namespace apfp {
+
+static std::optional<Context> apfp;
+static std::string last_error_message;
+
+int Init(unsigned long precision) {
+    try {
+        if (precision > kBits) {
+            // Requested bit width too large
+            last_error_message = "Requested bitwidth too large";
+            return static_cast<int>(BlasError::bitwidth);
+        }
+        apfp.emplace();
+        return static_cast<int>(BlasError::success);
+
+    } catch (const KernelNotFoundException& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::kernel_not_found);
+
+    } catch (const std::exception& e) {
+        // Unknown exception
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+}
+
+int Finalize() {
+    apfp.reset();
+    return static_cast<int>(BlasError::success);
+}
+
+bool IsInitialized() {
+    return apfp.has_value();
+}
+
+const char* ErrorDescription() {
+    return last_error_message.c_str();
+}
+
+BlasError InterpretError(int a) {
+    return a < 0 ? BlasError::argument_error : static_cast<BlasError>(a);
+}
+
+/// Copy the upper or lower triangle from an NxN matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA,
+                        interface::Wrapper* buffer) {
+    auto dest_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < N; ++j) {
+        for (unsigned long i = 0; i <= j; ++i) {
+            auto source = uplo == BlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA);
+            interface::Set(buffer[i + j * dest_lda].get(), source);
+            interface::Set(buffer[j + i * dest_lda].get(), source);
+        }
+    }
+}
+
+/// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A
+template <typename ptr_function_type>
+void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA,
+                      interface::Wrapper* buffer) {
+    auto source_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < N; ++j) {
+        for (unsigned long i = 0; i <= j; ++i) {
+            auto dest = uplo == BlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA);
+            interface::Set(dest, buffer[i + j * source_lda].get());
+        }
+    }
+}
+
+/// Copy from an NxK matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                    interface::Wrapper* buffer) {
+    auto dest_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(buffer[i + j * dest_lda].get(), A(i + j * LDA));
+        }
+    }
+}
+
+/// Copy the transpose of a NxK matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                             interface::Wrapper* buffer) {
+    auto dest_lda = K;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(buffer[i * dest_lda + j].get(), A(i + j * LDA));
+        }
+    }
+}
+
+/// Copy to an NxK matrix A from a full size buffer
+template <typename ptr_function_type>
+void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                  interface::Wrapper* buffer) {
+    auto source_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(A(i + j * LDA), buffer[i + j * source_lda].get());
+        }
+    }
+}
+
+/// Do all the intermediate work to get a device matrix out of a pointer function
+/// TODO: we can make this handle the transpose argument and LDA check (raise exception or option type)
+template <typename ptr_function_type>
+DeviceMatrix MakeDeviceMatrix(unsigned long N, unsigned long M, ptr_function_type A, unsigned long LDA) {
+    std::vector<interface::Wrapper> host_a;
+    host_a.resize(N * M);
+    CopyFromMatrix(N, M, A, LDA, host_a.data());
+    auto device_a = apfp->AllocateDeviceMatrix(N, M);
+    device_a.TransferToDevice(host_a.data(), host_a.size());
+    return device_a;
+}
+
+template <typename ptr_function_type_a, typename ptr_function_type_c>
+int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA,
+             ptr_function_type_c C, unsigned long LDC) {
+    try {
+        // ==== library input validation stuff ====
+        if (!IsInitialized()) {
+            return static_cast<int>(BlasError::uninitialized);
+        }
+
+        // A is NxK if 'N', KxN if 'T'
+        // C is always NxN
+        // N mode
+        // A A^T + C
+        // T mode
+        // A^T A + C
+        bool use_transpose = trans == BlasTrans::transpose;
+
+        unsigned long A_rows = use_transpose ? K : N;
+        unsigned long A_cols = use_transpose ? N : K;
+
+        if (LDA < (use_transpose ? K : N)) {
+            return -6;
+        }
+        if (LDC < N) {
+            return -8;
+        }
+
+        // Empty matrix no-op
+        if (N == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+        if (K == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+
+        // ==== setup ====
+        std::vector<interface::Wrapper> host_a, host_a_transpose, host_c;
+        host_a.resize(N * K);
+        CopyFromMatrix(A_rows, A_cols, A, LDA, host_a.data());
+        auto device_a = apfp->AllocateDeviceMatrix(A_rows, A_cols);
+        device_a.TransferToDevice(host_a.data(), host_a.size());
+
+        host_a_transpose.resize(K * N);
+        CopyTransposeFromMatrix(A_rows, A_cols, A, LDA, host_a_transpose.data());
+        auto device_a_transpose = apfp->AllocateDeviceMatrix(A_cols, A_rows);
+        device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size());
+
+        host_c.resize(N * N);
+        CopyFromMatrixUplo(uplo, N, C, LDC, host_c.data());
+        auto device_c = apfp->AllocateDeviceMatrix(N, N);
+        device_c.TransferToDevice(host_c.data(), host_c.size());
+
+        // ==== compute and teardown ====
+        auto mul_result = apfp->AllocateDeviceMatrix(N, N);
+        if (use_transpose) {
+            apfp->MatrixMultiplication(device_a_transpose, device_a, &device_c);
+        } else {
+            apfp->MatrixMultiplication(device_a, device_a_transpose, &device_c);
+        }
+
+        device_c.TransferToHost(host_c.data(), host_c.size());
+        CopyToMatrixUplo(uplo, N, C, LDC, host_c.data());
+    } catch (const std::exception& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+
+    return static_cast<int>(BlasError::success);
+}
+
+/// See netlib's documentation on Syrk for usage. Alpha and beta unsupported
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA,
+         interface::Ptr C, unsigned long LDC) {
+    auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; };
+    auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; };
+    return SyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC);
+}
+
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA,
+         IndexFunction C, unsigned long LDC) {
+    return SyrkImpl(uplo, trans, N, K, A, LDA, C, LDC);
+}
+
+template <typename ptr_function_type_a, typename ptr_function_type_b, typename ptr_function_type_c>
+int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K,
+             ptr_function_type_a A, unsigned long LDA, ptr_function_type_b B, unsigned long LDB, ptr_function_type_c C,
+             unsigned long LDC) {
+    try {
+        // ==== library input validation stuff ====
+        if (!IsInitialized()) {
+            return static_cast<int>(BlasError::uninitialized);
+        }
+
+        // Implement the transposed versions later
+        if (trans_a != BlasTrans::normal || trans_b != BlasTrans::normal) {
+            return static_cast<int>(BlasError::unimplemented);
+        }
+
+        // Empty matrix
+        if (N == 0 || M == 0 || K == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+
+        // Validate leading dimensions are sane
+        if (LDA < M) {
+            return -7;
+        }
+        if (LDB < K) {
+            return -9;
+        }
+
+        // ==== setup ====
+        auto device_a = MakeDeviceMatrix(M, K, A, LDA);
+        auto device_b = MakeDeviceMatrix(K, N, B, LDB);
+        auto device_c = MakeDeviceMatrix(M, N, C, LDC);
+
+        // ==== compute and teardown ====
+        apfp->MatrixMultiplication(device_a, device_b, &device_c);
+        std::vector<interface::Wrapper> host_c;
+        host_c.resize(M * N);
+        device_c.TransferToHost(host_c.data(), host_c.size());
+        CopyToMatrix(M, N, C, LDC, host_c.data());
+
+    } catch (const std::exception& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+
+    return static_cast<int>(BlasError::success);
+}
+/// See netlib's documentation on Gemm for usage. Alpha and beta unsupported
+int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A,
+         unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC) {
+    auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; };
+    auto b_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return B + i; };
+    auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; };
+    return GemmImpl(trans_a, trans_b, M, N, K, a_ptr_function, LDA, b_ptr_function, LDB, c_ptr_function, LDC);
+}
+
+int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A,
+         unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC) {
+    return GemmImpl(trans_a, trans_b, M, N, K, A, LDA, B, LDB, C, LDC);
+}
+
+} // namespace apfp
\ No newline at end of file
diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h
new file mode 100644
index 0000000..89d7593
--- /dev/null
+++ b/interface/ApfpBlas.h
@@ -0,0 +1,53 @@
+#pragma once
+#include
+
+#include
+
+#include "ApfpInterfaceType.h"
+
+namespace apfp {
+
+enum class BlasError : int {
+    success = 0,
+    unknown = 1,
+    unimplemented = 2,
+    bitwidth = 3,
+    uninitialized = 4,
+    kernel_not_found = 5,
+    argument_error = 6,
+};
+
+enum class BlasUplo : char { upper = 'U', lower = 'L' };
+
+enum class BlasTrans : char {
+    normal = 'N',
+    transpose = 'T',
+};
+
+using IndexFunction = std::function<interface::Ptr(unsigned long)>;
+using ConstIndexFunction = std::function<interface::ConstPtr(unsigned long)>;
+/// Null terminated string describing the most recent library error if available
+/// Pointer is only guaranteed to live until the next library call
+const char* ErrorDescription();
+
+/// Convert a return code to a BlasError type
+/// Negative return codes are converted to BlasError::argument_error
+BlasError InterpretError(int a);
+
+int Init(unsigned long precision);
+
+int Finalize();
+
+/// See netlib's documentation on Syrk for usage. Alpha and beta unsupported
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA,
+         interface::Ptr C, unsigned long LDC);
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA,
+         IndexFunction C, unsigned long LDC);
+
+/// See netlib's documentation on Gemm for usage. 
Alpha and beta unsupported +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A, + unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC); +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A, + unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC); + +} // namespace apfp \ No newline at end of file diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp new file mode 100644 index 0000000..6862444 --- /dev/null +++ b/interface/ApfpInterfaceType.cpp @@ -0,0 +1,94 @@ +#include "ApfpInterfaceType.h" +#include "DeviceTypes.h" + +namespace apfp::interface { + +void Init(Ptr value) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_init(value); +#else + mpfr_init(value); +#endif +} + +void Init2(Ptr value, unsigned long precision) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_init2(value, precision); +#else + mpfr_init(value); + mpfr_set_prec(value, precision); +#endif +} + +void Clear(Ptr value) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_clear(value); +#else + mpfr_clear(value); +#endif +} + +void Swap(Ptr a, Ptr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_swap(a, b); +#else + mpfr_swap(a, b); +#endif +} + +void Set(Ptr dest, ConstPtr source) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set(dest, source); +#else + mpfr_set(dest, source, kRoundingMode); +#endif +} + +void Set(Ptr dest, long int source) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set_ui(dest, source); +#else + mpfr_set_si(dest, source, kRoundingMode); +#endif +} + +void Add(Ptr dest, ConstPtr a, ConstPtr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_add(dest, a, b); +#else + mpfr_add(dest, a, b, kRoundingMode); +#endif +} + +void Mul(Ptr dest, ConstPtr a, ConstPtr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_mul(dest, a, b); +#else + mpfr_mul(dest, a, b, kRoundingMode); +#endif +} + +Wrapper::~Wrapper() { + Clear(data_); +} + +Wrapper::Wrapper() { + Init(data_); +} + +Wrapper::Wrapper(unsigned long precision) { + Init2(data_, precision); +} + +Wrapper::Wrapper(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); +} + +Wrapper& Wrapper::operator=(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); + return *this; +} + +} // namespace apfp::interface \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h new file mode 100644 index 0000000..57ed2bf --- /dev/null +++ b/interface/ApfpInterfaceType.h @@ -0,0 +1,70 @@ +#pragma once +#include +#include + +#include "Config.h" + +/* This header abstracts away the choice of MPFR or GMP in the interface + * It defines four types: Value, Ptr, ConstPtr, Wrapper + * The first three directly correspond to MPFR/GMP types + * The last one is a wrapper that manages the memory footprint with RAII + */ +namespace apfp::interface { + +#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types +using Value = mpf_t; +using Ptr = mpf_ptr; +using ConstPtr = mpf_srcptr; +#else +#include +using Value = mpfr_t; +using Ptr = mpfr_ptr; +using ConstPtr = mpfr_srcptr; +#endif + +void Init(Ptr value); + +void Init2(Ptr value, unsigned long precision); + +void Clear(Ptr value); + +void Swap(Ptr a, Ptr b); + +void Set(Ptr dest, ConstPtr source); + +void Set(Ptr dest, long int source); + +void Add(Ptr dest, ConstPtr a, ConstPtr b); + +void Mul(Ptr dest, ConstPtr a, ConstPtr b); + +/// Smart pointer-like wrapper class for GMP/MPFR types +class 
Wrapper { + Value data_; + + public: + ~Wrapper(); + + Wrapper(); + + Wrapper(unsigned long precision); + + Wrapper(Wrapper&&); + + Wrapper(Wrapper&) = delete; + + Wrapper& operator=(const Wrapper&) = delete; + + Wrapper& operator=(Wrapper&&); + + // This decays to the pointer type + Ptr get() { + return data_; + } + + ConstPtr get() const { + return data_; + } +}; + +} // namespace apfp::interface \ No newline at end of file diff --git a/scripts/run_simulation.sh b/scripts/run_simulation.sh index cee9fbd..f2725f4 100755 --- a/scripts/run_simulation.sh +++ b/scripts/run_simulation.sh @@ -1,13 +1,65 @@ #!/bin/bash sizes=(1 3 4 7 9 15 16 17 31 33 41) +small_sizes=(1 3 4 7 9 15 16 17) +large_sizes=(31 33 41) batch_size=12 -for n in "${sizes[@]}" + +for n in "${small_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${small_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + + +for n in "${small_sizes[@]}" +do + for m in "${large_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" do - for m in "${sizes[@]}" + for m in "${large_sizes[@]}" do - for k in "${sizes[@]}" + for k in "${small_sizes[@]}" do echo $n $m $k 1>&2 (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &