Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streaming Karatsuba #20

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 55 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD 17)
# User-configurable build parameters. Each one is a STRING cache entry with the
# default shown here and a help string describing it.
set(APFP_PLATFORM
    "xilinx_u250_gen3x16_xdma_3_1_202020_1"
    CACHE STRING
    "Platform string for Vitis.")
set(APFP_BITS
    1024
    CACHE STRING
    "Number of bits to use for a floating point number, including mantissa, exponent, and sign.")
set(APFP_MULT_BASE_BITS
    18
    CACHE STRING
    "Number of bits to bottom out the multiplication at and use native multiplication.")
set(APFP_STREAMING_BASE_BITS
    256
    CACHE STRING
    "Bit width where Karatsuba will be implemented as a single pipeline.")
set(APFP_TILE_SIZE_N
    32
    CACHE STRING
    "Tile size in the N-dimension when running matrix-matrix multiplication.")
set(APFP_TILE_SIZE_M
    32
    CACHE STRING
    "Tile size in the M-dimension when running matrix-matrix multiplication.")
set(APFP_COMPUTE_UNITS
    1
    CACHE STRING
    "Number of replications of the kernel to instantiate.")
Expand Down Expand Up @@ -36,82 +37,124 @@ include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_IN
# Generate Config.h in the build tree from the include/Config.h.in template.
# The paths are spelled out in full; they are the same locations CMake resolves
# the relative forms to (input: current source dir, output: current binary dir).
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/include/Config.h.in"
               "${CMAKE_CURRENT_BINARY_DIR}/Config.h")

set(APFP_KERNEL_FILES device/MatrixMultiplication.cpp
device/ArithmeticOperations.cpp
device/Karatsuba.cpp)
device/ArithmeticOperations.cpp)

# Mapping to DDR ports and kernel-to-kernel stream connectivity.
#
# Every compute unit <i> contributes:
#   - four AXI port mappings for MatrixMultiplication_<i> (a, b, c_read,
#     c_write), all placed on the same DDR bank, appended to APFP_PORT_MAPPING;
#   - three stream connections between MatrixMultiplication_<i> and
#     FreeRunningMultiplication_<i> (a_to_kernel, b_to_kernel, ab_from_kernel),
#     appended to APFP_CONNECTIVITY.
#
# APFP_DDR_BANKS holds the bank used by compute units 1, 2, 3 and 4 in that
# order; compute units 5-8 cycle through the same sequence again.

# Reject unsupported configurations before building any of the lists.
if(${APFP_COMPUTE_UNITS} GREATER 8)
  message(FATAL_ERROR "More than 8 compute units is not supported.")
endif()

set(APFP_DDR_BANKS 1 0 2 3)
list(LENGTH APFP_DDR_BANKS APFP_DDR_BANK_COUNT)

# Compute unit 1 is always configured, even when APFP_COMPUTE_UNITS is not
# greater than 1; larger values extend the range up to that unit.
set(APFP_LAST_COMPUTE_UNIT 1)
if(${APFP_COMPUTE_UNITS} GREATER 1)
  set(APFP_LAST_COMPUTE_UNIT ${APFP_COMPUTE_UNITS})
endif()

set(APFP_PORT_MAPPING "")
set(APFP_CONNECTIVITY "")
foreach(APFP_UNIT RANGE 1 ${APFP_LAST_COMPUTE_UNIT})
  # Pick the DDR bank for this unit: zero-based unit index modulo table size.
  math(EXPR APFP_BANK_INDEX "(${APFP_UNIT} - 1) % ${APFP_DDR_BANK_COUNT}")
  list(GET APFP_DDR_BANKS ${APFP_BANK_INDEX} APFP_BANK)

  list(APPEND APFP_PORT_MAPPING
       MatrixMultiplication_${APFP_UNIT}.m_axi_a:DDR[${APFP_BANK}]
       MatrixMultiplication_${APFP_UNIT}.m_axi_b:DDR[${APFP_BANK}]
       MatrixMultiplication_${APFP_UNIT}.m_axi_c_read:DDR[${APFP_BANK}]
       MatrixMultiplication_${APFP_UNIT}.m_axi_c_write:DDR[${APFP_BANK}])

  list(APPEND APFP_CONNECTIVITY
       MatrixMultiplication_${APFP_UNIT}.a_to_kernel:FreeRunningMultiplication_${APFP_UNIT}.a_to_kernel
       MatrixMultiplication_${APFP_UNIT}.b_to_kernel:FreeRunningMultiplication_${APFP_UNIT}.b_to_kernel
       FreeRunningMultiplication_${APFP_UNIT}.ab_from_kernel:MatrixMultiplication_${APFP_UNIT}.ab_from_kernel)
endforeach()

# Setup FPGA kernel targets
#
# Settings collected into variables so they can be passed to more than one
# add_vitis_kernel() call.

# Preprocessor definitions handed to HLS: the maximum ap_int width and the
# selected semantics macro.
set(APFP_HLS_FLAGS
    "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS")

# Two HLS configuration commands, separated by a newline escape.
set(APFP_HLS_CONFIG
    "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16")

# Include search paths for the kernels.
set(APFP_INCLUDE_DIRS
    include
    hlslib/include
    ${CMAKE_BINARY_DIR})

# Headers listed as dependencies of the kernels, including the generated
# Config.h.
set(APFP_DEPENDS
    ${CMAKE_BINARY_DIR}/Config.h
    include/ArithmeticOperations.h
    include/DeviceTypes.h
    include/Karatsuba.h
    include/MatrixMultiplication.h
    include/PackedFloat.h
    include/PipelinedAdd.h)
add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR}
HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS"
HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16"
DEPENDS ${CMAKE_BINARY_DIR}/Config.h
include/ArithmeticOperations.h
include/DeviceTypes.h
include/Karatsuba.h
include/MatrixMultiplication.h
include/PackedFloat.h
include/PipelinedAdd.h
INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
HLS_FLAGS ${APFP_HLS_FLAGS}
HLS_CONFIG ${APFP_HLS_CONFIG}
DEPENDS ${APFP_DEPENDS}
PORT_MAPPING ${APFP_PORT_MAPPING})
# Declare the FreeRunningMultiplication kernel from the shared source list and
# HLS settings. No PORT_MAPPING is passed for this kernel.
add_vitis_kernel(
  FreeRunningMultiplication
  FILES ${APFP_KERNEL_FILES}
  COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
  INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
  HLS_FLAGS ${APFP_HLS_FLAGS}
  HLS_CONFIG ${APFP_HLS_CONFIG}
  DEPENDS ${APFP_DEPENDS})

# Declare the MatrixMultiplication program for the selected platform from both
# kernels, passing the kernel-to-kernel connections in APFP_CONNECTIVITY.
add_vitis_program(
  MatrixMultiplication
  ${APFP_PLATFORM}
  KERNELS MatrixMultiplication FreeRunningMultiplication
  CONNECTIVITY ${APFP_CONNECTIVITY}
  PROFILING ${APFP_PROFILING}
  DEBUGGING ${APFP_DEBUGGING}
  SAVE_TEMPS ${APFP_SAVE_TEMPS})
Expand Down
15 changes: 0 additions & 15 deletions device/ArithmeticOperations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include "Karatsuba.h"
#include "PipelinedAdd.h"

// Returns num.test(bits - 1), i.e. the result of testing the highest-indexed
// bit of a `bits`-wide ap_uint.
// NOTE(review): ap_uint::test is defined outside this file; presumably it
// returns true when the addressed bit is 1 -- confirm against the HLS headers.
template <int bits>
inline bool IsMostSignificantBitSet(ap_uint<bits> const &num) {
// Ask the HLS tool to inline this helper into its callers.
#pragma HLS INLINE
return num.test(bits - 1);
}

template <int bits>
inline int CountLeadingZeros(ap_uint<bits> const &num) {
#pragma HLS INLINE
Expand All @@ -28,14 +22,6 @@ PackedFloat Multiply(PackedFloat const &a, PackedFloat const &b) {
// Pad mantissas to avoid passing awkward sizes to Karatsuba
const ap_uint<kBits> a_mantissa_padded(a.GetMantissa());
const ap_uint<kBits> b_mantissa_padded(b.GetMantissa());
#ifdef APFP_GMP_SEMANTICS // Use GMP semantics
constexpr auto kLimbBits = 8 * sizeof(mp_limb_t);
// Meat of the computation. Only keep the top bits of the computation and throw away the rest
const ap_uint<(2 * kMantissaBits)> _m_mantissa = Karatsuba(a_mantissa_padded, b_mantissa_padded);
const bool limb_zero = _m_mantissa.range(kMantissaBits + kLimbBits - 1, kMantissaBits) == 0;
ap_uint<kMantissaBits + kLimbBits> m_mantissa = _m_mantissa; // Truncate
const Exponent m_exponent = a.GetExponent() + b.GetExponent() - limb_zero;
#else // Otherwise use MPFR semantics
const ap_uint<kMantissaBits + 1> _m_mantissa =
Karatsuba(a_mantissa_padded, b_mantissa_padded) >> (kMantissaBits - 1);
// We need to shift the mantissa forward if the most significant bit is not set
Expand All @@ -44,7 +30,6 @@ PackedFloat Multiply(PackedFloat const &a, PackedFloat const &b) {
// Add up exponents. If the most significant bit was 1, we're done. Otherwise subtract 1 due to
// the shift.
const Exponent m_exponent = a.GetExponent() + b.GetExponent() - (should_be_shifted ? 1 : 0);
#endif
// The sign is just the XOR of the existing signs
PackedFloat result;
result.SetMantissa(m_mantissa);
Expand Down
59 changes: 0 additions & 59 deletions device/Karatsuba.cpp

This file was deleted.

Loading