spcl · definelicht · Jan 14, 2022 · Jan 14, 2022 · Jan 14, 2022 · Jan 16, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(APFP_PLATFORM "xilinx_u250_gen3x16_xdma_3_1_202020_1" CACHE STRING "Platform string for Vitis.")
 set(APFP_BITS 1024 CACHE STRING "Number of bits to use for a floating point number, including mantissa, exponent, and sign.")
 set(APFP_MULT_BASE_BITS 18 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
+set(APFP_STREAMING_BASE_BITS 256 CACHE STRING "Bit width where Karatsuba will be implemented as a single pipeline.")
 set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running matrix-matrix multiplication.")
 set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.")
 set(APFP_COMPUTE_UNITS 1 CACHE STRING "Number of replications of the kernel to instantiate.")
@@ -36,82 +37,124 @@ include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_IN
 configure_file(include/Config.h.in Config.h)
 
 set(APFP_KERNEL_FILES device/MatrixMultiplication.cpp
-                      device/ArithmeticOperations.cpp
-                      device/Karatsuba.cpp)
+                      device/ArithmeticOperations.cpp)
 
 # Mapping to DDR ports
 set(APFP_PORT_MAPPING MatrixMultiplication_1.m_axi_a:DDR[1]
                       MatrixMultiplication_1.m_axi_b:DDR[1]
                       MatrixMultiplication_1.m_axi_c_read:DDR[1]
                       MatrixMultiplication_1.m_axi_c_write:DDR[1])
+set(APFP_CONNECTIVITY MatrixMultiplication_1.a_to_kernel:FreeRunningMultiplication_1.a_to_kernel
+                      MatrixMultiplication_1.b_to_kernel:FreeRunningMultiplication_1.b_to_kernel
+                      FreeRunningMultiplication_1.ab_from_kernel:MatrixMultiplication_1.ab_from_kernel)
 if(${APFP_COMPUTE_UNITS} GREATER 1)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_2.m_axi_a:DDR[0]
                           MatrixMultiplication_2.m_axi_b:DDR[0]
                           MatrixMultiplication_2.m_axi_c_read:DDR[0]
                           MatrixMultiplication_2.m_axi_c_write:DDR[0])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_2.a_to_kernel:FreeRunningMultiplication_2.a_to_kernel
+                          MatrixMultiplication_2.b_to_kernel:FreeRunningMultiplication_2.b_to_kernel
+                          FreeRunningMultiplication_2.ab_from_kernel:MatrixMultiplication_2.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 2)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_3.m_axi_a:DDR[2]
                           MatrixMultiplication_3.m_axi_b:DDR[2]
                           MatrixMultiplication_3.m_axi_c_read:DDR[2]
                           MatrixMultiplication_3.m_axi_c_write:DDR[2])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_3.a_to_kernel:FreeRunningMultiplication_3.a_to_kernel
+                          MatrixMultiplication_3.b_to_kernel:FreeRunningMultiplication_3.b_to_kernel
+                          FreeRunningMultiplication_3.ab_from_kernel:MatrixMultiplication_3.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 3)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_4.m_axi_a:DDR[3]
                           MatrixMultiplication_4.m_axi_b:DDR[3]
                           MatrixMultiplication_4.m_axi_c_read:DDR[3]
                           MatrixMultiplication_4.m_axi_c_write:DDR[3])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_4.a_to_kernel:FreeRunningMultiplication_4.a_to_kernel
+                          MatrixMultiplication_4.b_to_kernel:FreeRunningMultiplication_4.b_to_kernel
+                          FreeRunningMultiplication_4.ab_from_kernel:MatrixMultiplication_4.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 4)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_5.m_axi_a:DDR[1]
                           MatrixMultiplication_5.m_axi_b:DDR[1]
                           MatrixMultiplication_5.m_axi_c_read:DDR[1]
                           MatrixMultiplication_5.m_axi_c_write:DDR[1])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_5.a_to_kernel:FreeRunningMultiplication_5.a_to_kernel
+                          MatrixMultiplication_5.b_to_kernel:FreeRunningMultiplication_5.b_to_kernel
+                          FreeRunningMultiplication_5.ab_from_kernel:MatrixMultiplication_5.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 5)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_6.m_axi_a:DDR[0]
                           MatrixMultiplication_6.m_axi_b:DDR[0]
                           MatrixMultiplication_6.m_axi_c_read:DDR[0]
                           MatrixMultiplication_6.m_axi_c_write:DDR[0])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_6.a_to_kernel:FreeRunningMultiplication_6.a_to_kernel
+                          MatrixMultiplication_6.b_to_kernel:FreeRunningMultiplication_6.b_to_kernel
+                          FreeRunningMultiplication_6.ab_from_kernel:MatrixMultiplication_6.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 6)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_7.m_axi_a:DDR[2]
                           MatrixMultiplication_7.m_axi_b:DDR[2]
                           MatrixMultiplication_7.m_axi_c_read:DDR[2]
                           MatrixMultiplication_7.m_axi_c_write:DDR[2])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_7.a_to_kernel:FreeRunningMultiplication_7.a_to_kernel
+                          MatrixMultiplication_7.b_to_kernel:FreeRunningMultiplication_7.b_to_kernel
+                          FreeRunningMultiplication_7.ab_from_kernel:MatrixMultiplication_7.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 7)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_8.m_axi_a:DDR[3]
                           MatrixMultiplication_8.m_axi_b:DDR[3]
                           MatrixMultiplication_8.m_axi_c_read:DDR[3]
                           MatrixMultiplication_8.m_axi_c_write:DDR[3])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_8.a_to_kernel:FreeRunningMultiplication_8.a_to_kernel
+                          MatrixMultiplication_8.b_to_kernel:FreeRunningMultiplication_8.b_to_kernel
+                          FreeRunningMultiplication_8.ab_from_kernel:MatrixMultiplication_8.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 8)
     message(FATAL_ERROR "More than 8 compute units is not supported.")
 endif()
 
 # Setup FPGA kernel targets
+set(APFP_HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
+set(APFP_HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16")
+set(APFP_INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR})
+set(APFP_DEPENDS ${CMAKE_BINARY_DIR}/Config.h
+                 include/ArithmeticOperations.h
+                 include/DeviceTypes.h
+                 include/Karatsuba.h
+                 include/MatrixMultiplication.h
+                 include/PackedFloat.h
+                 include/PipelinedAdd.h)
 add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
                  COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
-                 INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR}
-                 HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS"
-                 HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16"
-                 DEPENDS ${CMAKE_BINARY_DIR}/Config.h
-                         include/ArithmeticOperations.h
-                         include/DeviceTypes.h
-                         include/Karatsuba.h
-                         include/MatrixMultiplication.h
-                         include/PackedFloat.h
-                         include/PipelinedAdd.h
+                 INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
+                 HLS_FLAGS ${APFP_HLS_FLAGS}
+                 HLS_CONFIG ${APFP_HLS_CONFIG}
+                 DEPENDS ${APFP_DEPENDS}
                 PORT_MAPPING ${APFP_PORT_MAPPING})
+add_vitis_kernel(FreeRunningMultiplication FILES ${APFP_KERNEL_FILES}
+                 COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
+                 INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
+                 HLS_FLAGS ${APFP_HLS_FLAGS}
+                 HLS_CONFIG ${APFP_HLS_CONFIG}
+                 DEPENDS ${APFP_DEPENDS})
 add_vitis_program(MatrixMultiplication ${APFP_PLATFORM}
+                  KERNELS MatrixMultiplication FreeRunningMultiplication
+                  CONNECTIVITY ${APFP_CONNECTIVITY}
                   PROFILING ${APFP_PROFILING}
                   DEBUGGING ${APFP_DEBUGGING}
                   SAVE_TEMPS ${APFP_SAVE_TEMPS})

diff --git a/device/ArithmeticOperations.cpp b/device/ArithmeticOperations.cpp
@@ -6,12 +6,6 @@
 #include "Karatsuba.h"
 #include "PipelinedAdd.h"
 
-template <int bits>
-inline bool IsMostSignificantBitSet(ap_uint<bits> const &num) {
-#pragma HLS INLINE
-    return num.test(bits - 1);
-}
-
 template <int bits>
 inline int CountLeadingZeros(ap_uint<bits> const &num) {
 #pragma HLS INLINE
@@ -28,14 +22,6 @@ PackedFloat Multiply(PackedFloat const &a, PackedFloat const &b) {
     // Pad mantissas to avoid passing awkward sizes to Karatsuba
     const ap_uint<kBits> a_mantissa_padded(a.GetMantissa());
     const ap_uint<kBits> b_mantissa_padded(b.GetMantissa());
-#ifdef APFP_GMP_SEMANTICS  // Use GMP semantics
-    constexpr auto kLimbBits = 8 * sizeof(mp_limb_t);
-    // Meat of the computation. Only keep the top bits of the computation and throw away the rest
-    const ap_uint<(2 * kMantissaBits)> _m_mantissa = Karatsuba(a_mantissa_padded, b_mantissa_padded);
-    const bool limb_zero = _m_mantissa.range(kMantissaBits + kLimbBits - 1, kMantissaBits) == 0;
-    ap_uint<kMantissaBits + kLimbBits> m_mantissa = _m_mantissa;  // Truncate
-    const Exponent m_exponent = a.GetExponent() + b.GetExponent() - limb_zero;
-#else  // Otherwise use MPFR semantics
     const ap_uint<kMantissaBits + 1> _m_mantissa =
         Karatsuba(a_mantissa_padded, b_mantissa_padded) >> (kMantissaBits - 1);
     // We need to shift the mantissa forward if the most significant bit is not set
@@ -44,7 +30,6 @@ PackedFloat Multiply(PackedFloat const &a, PackedFloat const &b) {
     // Add up exponents. If the most significant bit was 1, we're done. Otherwise subtract 1 due to
     // the shift.
     const Exponent m_exponent = a.GetExponent() + b.GetExponent() - (should_be_shifted ? 1 : 0);
-#endif
     // The sign is just the XOR of the existing signs
     PackedFloat result;
     result.SetMantissa(m_mantissa);

diff --git a/device/Karatsuba.cpp b/device/Karatsuba.cpp