diff --git a/CMakeLists.txt b/CMakeLists.txt index 02ca0c2..ab2566f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,12 +44,19 @@ set(APFP_PORT_MAPPING MatrixMultiplication_1.m_axi_a:DDR[1] MatrixMultiplication_1.m_axi_b:DDR[1] MatrixMultiplication_1.m_axi_c_read:DDR[1] MatrixMultiplication_1.m_axi_c_write:DDR[1]) +set(APFP_CONNECTIVITY MatrixMultiplication_1.a_to_kernel:FreeRunningMultiplication_1.a_to_kernel + MatrixMultiplication_1.b_to_kernel:FreeRunningMultiplication_1.b_to_kernel + FreeRunningMultiplication_1.ab_from_kernel:MatrixMultiplication_1.ab_from_kernel) if(${APFP_COMPUTE_UNITS} GREATER 1) set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING} MatrixMultiplication_2.m_axi_a:DDR[0] MatrixMultiplication_2.m_axi_b:DDR[0] MatrixMultiplication_2.m_axi_c_read:DDR[0] MatrixMultiplication_2.m_axi_c_write:DDR[0]) + set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY} + MatrixMultiplication_2.a_to_kernel:FreeRunningMultiplication_2.a_to_kernel + MatrixMultiplication_2.b_to_kernel:FreeRunningMultiplication_2.b_to_kernel + FreeRunningMultiplication_2.ab_from_kernel:MatrixMultiplication_2.ab_from_kernel) endif() if(${APFP_COMPUTE_UNITS} GREATER 2) set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING} @@ -57,6 +64,10 @@ if(${APFP_COMPUTE_UNITS} GREATER 2) MatrixMultiplication_3.m_axi_b:DDR[2] MatrixMultiplication_3.m_axi_c_read:DDR[2] MatrixMultiplication_3.m_axi_c_write:DDR[2]) + set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY} + MatrixMultiplication_3.a_to_kernel:FreeRunningMultiplication_3.a_to_kernel + MatrixMultiplication_3.b_to_kernel:FreeRunningMultiplication_3.b_to_kernel + FreeRunningMultiplication_3.ab_from_kernel:MatrixMultiplication_3.ab_from_kernel) endif() if(${APFP_COMPUTE_UNITS} GREATER 3) set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING} @@ -64,6 +75,10 @@ if(${APFP_COMPUTE_UNITS} GREATER 3) MatrixMultiplication_4.m_axi_b:DDR[3] MatrixMultiplication_4.m_axi_c_read:DDR[3] MatrixMultiplication_4.m_axi_c_write:DDR[3]) + set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY} + MatrixMultiplication_4.a_to_kernel:FreeRunningMultiplication_4.a_to_kernel + MatrixMultiplication_4.b_to_kernel:FreeRunningMultiplication_4.b_to_kernel + FreeRunningMultiplication_4.ab_from_kernel:MatrixMultiplication_4.ab_from_kernel) endif() if(${APFP_COMPUTE_UNITS} GREATER 4) set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING} @@ -98,20 +113,32 @@ if(${APFP_COMPUTE_UNITS} GREATER 8) endif() # Setup FPGA kernel targets +set(APFP_HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS") +set(APFP_HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16") +set(APFP_INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR}) +set(APFP_DEPENDS ${CMAKE_BINARY_DIR}/Config.h + include/ArithmeticOperations.h + include/DeviceTypes.h + include/Karatsuba.h + include/MatrixMultiplication.h + include/PackedFloat.h + include/PipelinedAdd.h) add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES} COMPUTE_UNITS ${APFP_COMPUTE_UNITS} - INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} - HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS" - HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16" - DEPENDS ${CMAKE_BINARY_DIR}/Config.h - include/ArithmeticOperations.h - include/DeviceTypes.h - include/Karatsuba.h - include/MatrixMultiplication.h - include/PackedFloat.h - include/PipelinedAdd.h + INCLUDE_DIRS ${APFP_INCLUDE_DIRS} + HLS_FLAGS ${APFP_HLS_FLAGS} + HLS_CONFIG ${APFP_HLS_CONFIG} + DEPENDS ${APFP_DEPENDS} PORT_MAPPING ${APFP_PORT_MAPPING}) +add_vitis_kernel(FreeRunningMultiplication FILES ${APFP_KERNEL_FILES} + COMPUTE_UNITS ${APFP_COMPUTE_UNITS} + INCLUDE_DIRS ${APFP_INCLUDE_DIRS} + HLS_FLAGS ${APFP_HLS_FLAGS} + HLS_CONFIG ${APFP_HLS_CONFIG} + DEPENDS ${APFP_DEPENDS}) add_vitis_program(MatrixMultiplication ${APFP_PLATFORM} + KERNELS MatrixMultiplication FreeRunningMultiplication + CONNECTIVITY ${APFP_CONNECTIVITY} PROFILING ${APFP_PROFILING} DEBUGGING ${APFP_DEBUGGING} SAVE_TEMPS ${APFP_SAVE_TEMPS}) diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index 594cf69..5e9c00e 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -330,42 +330,66 @@ void WriteC(hlslib::Stream &from_kernel, DramLine *const mem, const //////////////////////////////////////////////////////////////////////////////// -void Compute(hlslib::Stream &a_in, hlslib::Stream &b_in, hlslib::Stream &c_in, - hlslib::Stream &c_out, int const size_n, int const size_k, int const size_m) { - PackedFloat a_buffer; // Just to make A symmetric to B and C +void ComputeEntry(hlslib::Stream &a_in, hlslib::Stream &b_in, + hlslib::Stream &a_out, hlslib::Stream &b_out, int const size_n, + int const size_k, int const size_m) { + PackedFloat a_buffer; PackedFloat b_buffer[kTileSizeM]; - PackedFloat c_buffer[kTileSizeN * kTileSizeM]; const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); -Compute_TilesN: +ComputeEntry_TilesN: for (int n0 = 0; n0 < tiles_n; ++n0) { - Compute_TilesM: + ComputeEntry_TilesM: for (int m0 = 0; m0 < tiles_m; ++m0) { - Compute_K: + ComputeEntry_K: for (int k = 0; k < size_k; ++k) { - Compute_N: + ComputeEntry_N: for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) { - Compute_M: + ComputeEntry_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN const PackedFloat a_read = a_in.Pop(); const PackedFloat b_read = b_in.Pop(); - const PackedFloat c_read = c_in.Pop(); const PackedFloat a = (m1 == 0) ? a_read : a_buffer; const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; - const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; a_buffer = a; b_buffer[m1] = b; // Ignore contributions from out-of-bound indices const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); - // Meat of the computation - const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(), - in_bounds ? b : PackedFloat::Zero(), c); - // Write back to buffer + a_out.Push(in_bounds ? a : PackedFloat::Zero()); + b_out.Push(in_bounds ? b : PackedFloat::Zero()); + } + } + } + } + } +} + +void ComputeExit(hlslib::Stream &ab_in, hlslib::Stream &c_in, + hlslib::Stream &c_out, int const size_n, int const size_k, int const size_m) { + PackedFloat c_buffer[kTileSizeN * kTileSizeM]; + const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +ComputeExit_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + ComputeExit_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + ComputeExit_K: + for (int k = 0; k < size_k; ++k) { + ComputeExit_N: + for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) { + ComputeExit_M: + for (int m1 = 0; m1 < kTileSizeM; ++m1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS LOOP_FLATTEN + const PackedFloat ab = ab_in.Pop(); + const PackedFloat c_read = c_in.Pop(); + const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; + const PackedFloat res = Add(ab, c); + c_out.Push(res); c_buffer[n1 * kTileSizeM + m1] = res; #pragma HLS DEPENDENCE variable = c_buffer false - c_out.Push(res); } } } @@ -375,8 +399,22 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i //////////////////////////////////////////////////////////////////////////////// +void FreeRunningMultiplication(hlslib::Stream &a_to_kernel, hlslib::Stream &b_to_kernel, + hlslib::Stream &ab_from_kernel) { +#pragma HLS INTERFACE axis port = a_to_kernel +#pragma HLS INTERFACE axis port = b_to_kernel +#pragma HLS INTERFACE axis port = ab_from_kernel +#pragma HLS interface ap_ctrl_none port = return +#pragma HLS PIPELINE II = 1 + ab_from_kernel.Push(Multiply(a_to_kernel.Pop(), b_to_kernel.Pop())); +} + +//////////////////////////////////////////////////////////////////////////////// + void MatrixMultiplication(DramLine const *const a, DramLine const *const b, DramLine const *const c_read, - DramLine *const c_write, const int size_n, const int size_k, int const size_m) { + DramLine *const c_write, const int size_n, const int size_k, int const size_m, + hlslib::Stream &a_to_kernel, hlslib::Stream &b_to_kernel, + hlslib::Stream &ab_from_kernel) { #pragma HLS INTERFACE m_axi offset = slave port = a bundle = a #pragma HLS INTERFACE m_axi offset = slave port = b bundle = b // Even though they actually point to the same memory location, we use two separate interfaces for reading and writing @@ -390,6 +428,9 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram #pragma HLS INTERFACE s_axilite port = size_n #pragma HLS INTERFACE s_axilite port = size_k #pragma HLS INTERFACE s_axilite port = size_m +#pragma HLS INTERFACE axis port = a_to_kernel +#pragma HLS INTERFACE axis port = b_to_kernel +#pragma HLS INTERFACE axis port = ab_from_kernel #pragma HLS STABLE variable = a #pragma HLS STABLE variable = b #pragma HLS STABLE variable = c_read @@ -399,21 +440,23 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram #pragma HLS STABLE variable = size_m #pragma HLS DATAFLOW hlslib::Stream a_to_feeder("a_to_feeder"); - hlslib::Stream a_to_kernel("a_to_kernel"); + hlslib::Stream a_to_entry("a_to_entry"); hlslib::Stream b_to_feeder("b_to_feeder"); - hlslib::Stream b_to_kernel("b_to_kernel"); + hlslib::Stream b_to_entry("b_to_entry"); hlslib::Stream c_to_feeder("c_to_feeder"); hlslib::Stream c_to_kernel("c_to_kernel"); hlslib::Stream c_from_kernel("c_from_kernel"); + hlslib::Stream c_from_exit("c_from_exit"); hlslib::Stream c_from_drainer("c_from_drainer"); HLSLIB_DATAFLOW_INIT(); HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_feeder, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_entry, size_n, size_k, size_m); HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_feeder, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_entry, size_n, size_k, size_m); HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_feeder, size_n, size_m); HLSLIB_DATAFLOW_FUNCTION(FeedC, c_to_feeder, c_to_kernel, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(ComputeEntry, a_to_entry, b_to_entry, a_to_kernel, b_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(ComputeExit, ab_from_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m); HLSLIB_DATAFLOW_FUNCTION(DrainC, c_from_kernel, c_from_drainer, size_n, size_k, size_m); HLSLIB_DATAFLOW_FUNCTION(WriteC, c_from_drainer, c_write, size_n, size_m); HLSLIB_DATAFLOW_FINALIZE(); diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp index 4655a58..271b329 100644 --- a/host/TestProgram.cpp +++ b/host/TestProgram.cpp @@ -1,4 +1,5 @@ #include +#include #include #include // putenv @@ -22,6 +23,13 @@ struct MpfrWrapper { }; #ifdef HLSLIB_SIMULATE_OPENCL +void RunFreeRunningKernel(hlslib::Stream &a_in, hlslib::Stream &b_in, + hlslib::Stream &ab_out) { + while (true) { + FreeRunningMultiplication(a_in, b_in, ab_out); + } +} + bool RunTestSimulation(int size_n, int size_k, int size_m, bool verify) { const std::string kernel_path(""); #else @@ -111,9 +119,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, // In simulation mode, this will call the function "MatrixMultiplication" and run it in software. // Otherwise, the provided path to a kernel binary will be loaded and executed. std::vector kernels; + hlslib::Stream a_to_kernel[kComputeUnits]; + hlslib::Stream b_to_kernel[kComputeUnits]; + hlslib::Stream ab_from_kernel[kComputeUnits]; for (int i = 0; i < kComputeUnits; ++i) { - kernels.emplace_back(program.MakeKernel(MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i], - c_device[i], c_device[i], n_partition_size[i], size_k, size_m)); + kernels.emplace_back(program.MakeKernel( + MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i], c_device[i], c_device[i], + n_partition_size[i], size_k, size_m, hlslib::ocl::SimulationOnly(a_to_kernel[i]), + hlslib::ocl::SimulationOnly(b_to_kernel[i]), hlslib::ocl::SimulationOnly(ab_from_kernel[i]))); } const float expected_runtime = expected_cycles / 0.3e9; @@ -126,6 +139,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, << bandwidth << " GB/s.\n"; std::cout << "Executing kernel...\n"; +#ifdef HLSLIB_SIMULATE_OPENCL + for (int i = 0; i < kComputeUnits; ++i) { + std::thread free_running(RunFreeRunningKernel, std::ref(a_to_kernel[i]), std::ref(b_to_kernel[i]), + std::ref(ab_from_kernel[i])); + // Will be killed when the program exits + free_running.detach(); + } +#endif std::vector events; auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < kComputeUnits; ++i) { diff --git a/include/MatrixMultiplication.h b/include/MatrixMultiplication.h index 43198fe..d2ced76 100644 --- a/include/MatrixMultiplication.h +++ b/include/MatrixMultiplication.h @@ -1,7 +1,16 @@ #pragma once +#include + #include "Config.h" #include "DeviceTypes.h" +#include "PackedFloat.h" extern "C" void MatrixMultiplication(DramLine const *a, DramLine const *b, DramLine const *c_read, DramLine *c_write, - int n, int m, int k); + int n, int m, int k, hlslib::Stream &a_to_kernel, + hlslib::Stream &b_to_kernel, + hlslib::Stream &ab_from_kernel); + +extern "C" void FreeRunningMultiplication(hlslib::Stream &a_to_kernel, + hlslib::Stream &b_to_kernel, + hlslib::Stream &ab_from_kernel);