Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiplication as Free-Running Kernel #19

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,41 @@ set(APFP_PORT_MAPPING MatrixMultiplication_1.m_axi_a:DDR[1]
MatrixMultiplication_1.m_axi_b:DDR[1]
MatrixMultiplication_1.m_axi_c_read:DDR[1]
MatrixMultiplication_1.m_axi_c_write:DDR[1])
# Stream connections for the first compute unit. Each entry pairs a port of
# MatrixMultiplication_1 with the same-named port of FreeRunningMultiplication_1:
# a_to_kernel and b_to_kernel are listed with MatrixMultiplication_1 first, and
# ab_from_kernel with FreeRunningMultiplication_1 first (presumably
# "source:destination" -- confirm against the linker's connectivity syntax).
# The list is handed to add_vitis_program through its CONNECTIVITY argument.
set(APFP_CONNECTIVITY MatrixMultiplication_1.a_to_kernel:FreeRunningMultiplication_1.a_to_kernel
MatrixMultiplication_1.b_to_kernel:FreeRunningMultiplication_1.b_to_kernel
FreeRunningMultiplication_1.ab_from_kernel:MatrixMultiplication_1.ab_from_kernel)
# With more than one compute unit, append the settings for the second instance:
# its four memory interfaces are mapped to DDR[0], and its three streams are
# connected to FreeRunningMultiplication_2.
if(${APFP_COMPUTE_UNITS} GREATER 1)
set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
MatrixMultiplication_2.m_axi_a:DDR[0]
MatrixMultiplication_2.m_axi_b:DDR[0]
MatrixMultiplication_2.m_axi_c_read:DDR[0]
MatrixMultiplication_2.m_axi_c_write:DDR[0])
set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
MatrixMultiplication_2.a_to_kernel:FreeRunningMultiplication_2.a_to_kernel
MatrixMultiplication_2.b_to_kernel:FreeRunningMultiplication_2.b_to_kernel
FreeRunningMultiplication_2.ab_from_kernel:MatrixMultiplication_2.ab_from_kernel)
endif()
# With more than two compute units, append the settings for the third instance:
# its four memory interfaces are mapped to DDR[2], and its three streams are
# connected to FreeRunningMultiplication_3.
if(${APFP_COMPUTE_UNITS} GREATER 2)
set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
MatrixMultiplication_3.m_axi_a:DDR[2]
MatrixMultiplication_3.m_axi_b:DDR[2]
MatrixMultiplication_3.m_axi_c_read:DDR[2]
MatrixMultiplication_3.m_axi_c_write:DDR[2])
set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
MatrixMultiplication_3.a_to_kernel:FreeRunningMultiplication_3.a_to_kernel
MatrixMultiplication_3.b_to_kernel:FreeRunningMultiplication_3.b_to_kernel
FreeRunningMultiplication_3.ab_from_kernel:MatrixMultiplication_3.ab_from_kernel)
endif()
# With more than three compute units, append the settings for the fourth
# instance: its four memory interfaces are mapped to DDR[3], and its three
# streams are connected to FreeRunningMultiplication_4.
if(${APFP_COMPUTE_UNITS} GREATER 3)
set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
MatrixMultiplication_4.m_axi_a:DDR[3]
MatrixMultiplication_4.m_axi_b:DDR[3]
MatrixMultiplication_4.m_axi_c_read:DDR[3]
MatrixMultiplication_4.m_axi_c_write:DDR[3])
set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
MatrixMultiplication_4.a_to_kernel:FreeRunningMultiplication_4.a_to_kernel
MatrixMultiplication_4.b_to_kernel:FreeRunningMultiplication_4.b_to_kernel
FreeRunningMultiplication_4.ab_from_kernel:MatrixMultiplication_4.ab_from_kernel)
endif()
if(${APFP_COMPUTE_UNITS} GREATER 4)
set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
Expand Down Expand Up @@ -98,20 +113,32 @@ if(${APFP_COMPUTE_UNITS} GREATER 8)
endif()

# Setup FPGA kernel targets
# Build settings kept in variables so that several kernel targets can share
# one definition instead of repeating the values per target.
# Preprocessor definitions for HLS: AP_INT_MAX_W is set from APFP_MAX_BITS and
# an APFP_<semantics>_SEMANTICS macro is defined from APFP_SEMANTICS.
set(APFP_HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
# Two HLS configuration commands separated by a newline: the pipeline style is
# set to "frp" and the dataflow FIFO depth to 16.
set(APFP_HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16")
# Include search paths; CMAKE_BINARY_DIR is included because Config.h is
# listed below as living in the build directory.
set(APFP_INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR})
# Headers listed as dependencies of the kernel targets.
set(APFP_DEPENDS ${CMAKE_BINARY_DIR}/Config.h
include/ArithmeticOperations.h
include/DeviceTypes.h
include/Karatsuba.h
include/MatrixMultiplication.h
include/PackedFloat.h
include/PipelinedAdd.h)
add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR}
HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS"
HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16"
DEPENDS ${CMAKE_BINARY_DIR}/Config.h
include/ArithmeticOperations.h
include/DeviceTypes.h
include/Karatsuba.h
include/MatrixMultiplication.h
include/PackedFloat.h
include/PipelinedAdd.h
INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
HLS_FLAGS ${APFP_HLS_FLAGS}
HLS_CONFIG ${APFP_HLS_CONFIG}
DEPENDS ${APFP_DEPENDS}
PORT_MAPPING ${APFP_PORT_MAPPING})
# Kernel target for FreeRunningMultiplication. It is built from the same source
# list (APFP_KERNEL_FILES) and with the same number of compute units as the
# MatrixMultiplication kernel, using the shared include/flag/config/dependency
# variables. No PORT_MAPPING is passed for this kernel.
add_vitis_kernel(FreeRunningMultiplication FILES ${APFP_KERNEL_FILES}
COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
HLS_FLAGS ${APFP_HLS_FLAGS}
HLS_CONFIG ${APFP_HLS_CONFIG}
DEPENDS ${APFP_DEPENDS})
# Program target named MatrixMultiplication for platform APFP_PLATFORM. It
# contains both the MatrixMultiplication and FreeRunningMultiplication kernels
# and passes the stream connections collected in APFP_CONNECTIVITY. Profiling,
# debugging and temp-file retention are taken from the corresponding APFP_*
# variables.
add_vitis_program(MatrixMultiplication ${APFP_PLATFORM}
KERNELS MatrixMultiplication FreeRunningMultiplication
CONNECTIVITY ${APFP_CONNECTIVITY}
PROFILING ${APFP_PROFILING}
DEBUGGING ${APFP_DEBUGGING}
SAVE_TEMPS ${APFP_SAVE_TEMPS})
Expand Down
87 changes: 65 additions & 22 deletions device/MatrixMultiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,42 +330,66 @@ void WriteC(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, const

////////////////////////////////////////////////////////////////////////////////

void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in, hlslib::Stream<PackedFloat> &c_in,
hlslib::Stream<PackedFloat> &c_out, int const size_n, int const size_k, int const size_m) {
PackedFloat a_buffer; // Just to make A symmetric to B and C
void ComputeEntry(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in,
hlslib::Stream<PackedFloat> &a_out, hlslib::Stream<PackedFloat> &b_out, int const size_n,
int const size_k, int const size_m) {
PackedFloat a_buffer;
PackedFloat b_buffer[kTileSizeM];
PackedFloat c_buffer[kTileSizeN * kTileSizeM];
const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
Compute_TilesN:
ComputeEntry_TilesN:
for (int n0 = 0; n0 < tiles_n; ++n0) {
Compute_TilesM:
ComputeEntry_TilesM:
for (int m0 = 0; m0 < tiles_m; ++m0) {
Compute_K:
ComputeEntry_K:
for (int k = 0; k < size_k; ++k) {
Compute_N:
ComputeEntry_N:
for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) {
Compute_M:
ComputeEntry_M:
for (int m1 = 0; m1 < kTileSizeM; ++m1) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
const PackedFloat a_read = a_in.Pop();
const PackedFloat b_read = b_in.Pop();
const PackedFloat c_read = c_in.Pop();
const PackedFloat a = (m1 == 0) ? a_read : a_buffer;
const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1];
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
a_buffer = a;
b_buffer[m1] = b;
// Ignore contributions from out-of-bound indices
const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m);
// Meat of the computation
const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(),
in_bounds ? b : PackedFloat::Zero(), c);
// Write back to buffer
a_out.Push(in_bounds ? a : PackedFloat::Zero());
b_out.Push(in_bounds ? b : PackedFloat::Zero());
}
}
}
}
}
}

void ComputeExit(hlslib::Stream<PackedFloat> &ab_in, hlslib::Stream<PackedFloat> &c_in,
hlslib::Stream<PackedFloat> &c_out, int const size_n, int const size_k, int const size_m) {
PackedFloat c_buffer[kTileSizeN * kTileSizeM];
const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
ComputeExit_TilesN:
for (int n0 = 0; n0 < tiles_n; ++n0) {
ComputeExit_TilesM:
for (int m0 = 0; m0 < tiles_m; ++m0) {
ComputeExit_K:
for (int k = 0; k < size_k; ++k) {
ComputeExit_N:
for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) {
ComputeExit_M:
for (int m1 = 0; m1 < kTileSizeM; ++m1) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
const PackedFloat ab = ab_in.Pop();
const PackedFloat c_read = c_in.Pop();
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
const PackedFloat res = Add(ab, c);
c_out.Push(res);
c_buffer[n1 * kTileSizeM + m1] = res;
#pragma HLS DEPENDENCE variable = c_buffer false
c_out.Push(res);
}
}
}
Expand All @@ -375,8 +399,22 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i

////////////////////////////////////////////////////////////////////////////////

// Stand-alone multiplier kernel: each invocation consumes one operand from
// each input stream and emits their product on the output stream.
//
// a_to_kernel     Input stream of left-hand operands.
// b_to_kernel     Input stream of right-hand operands.
// ab_from_kernel  Output stream receiving Multiply(a, b).
//
// All three streams are exposed as AXI-Stream ("axis") interfaces, the return
// port is declared ap_ctrl_none, and the body is pipelined with II = 1.
void FreeRunningMultiplication(hlslib::Stream<PackedFloat> &a_to_kernel, hlslib::Stream<PackedFloat> &b_to_kernel,
                               hlslib::Stream<PackedFloat> &ab_from_kernel) {
#pragma HLS INTERFACE axis port = a_to_kernel
#pragma HLS INTERFACE axis port = b_to_kernel
#pragma HLS INTERFACE axis port = ab_from_kernel
#pragma HLS interface ap_ctrl_none port = return
#pragma HLS PIPELINE II = 1
    // Compute the product first, then forward it downstream.
    const PackedFloat product = Multiply(a_to_kernel.Pop(), b_to_kernel.Pop());
    ab_from_kernel.Push(product);
}

////////////////////////////////////////////////////////////////////////////////

void MatrixMultiplication(DramLine const *const a, DramLine const *const b, DramLine const *const c_read,
DramLine *const c_write, const int size_n, const int size_k, int const size_m) {
DramLine *const c_write, const int size_n, const int size_k, int const size_m,
hlslib::Stream<PackedFloat> &a_to_kernel, hlslib::Stream<PackedFloat> &b_to_kernel,
hlslib::Stream<PackedFloat> &ab_from_kernel) {
#pragma HLS INTERFACE m_axi offset = slave port = a bundle = a
#pragma HLS INTERFACE m_axi offset = slave port = b bundle = b
// Even though they actually point to the same memory location, we use two separate interfaces for reading and writing
Expand All @@ -390,6 +428,9 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram
#pragma HLS INTERFACE s_axilite port = size_n
#pragma HLS INTERFACE s_axilite port = size_k
#pragma HLS INTERFACE s_axilite port = size_m
#pragma HLS INTERFACE axis port = a_to_kernel
#pragma HLS INTERFACE axis port = b_to_kernel
#pragma HLS INTERFACE axis port = ab_from_kernel
#pragma HLS STABLE variable = a
#pragma HLS STABLE variable = b
#pragma HLS STABLE variable = c_read
Expand All @@ -399,21 +440,23 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram
#pragma HLS STABLE variable = size_m
#pragma HLS DATAFLOW
hlslib::Stream<PackedFloat, 16> a_to_feeder("a_to_feeder");
hlslib::Stream<PackedFloat, 16> a_to_kernel("a_to_kernel");
hlslib::Stream<PackedFloat, 16> a_to_entry("a_to_entry");
hlslib::Stream<PackedFloat, 16> b_to_feeder("b_to_feeder");
hlslib::Stream<PackedFloat, 16> b_to_kernel("b_to_kernel");
hlslib::Stream<PackedFloat, 16> b_to_entry("b_to_entry");
hlslib::Stream<PackedFloat, 16> c_to_feeder("c_to_feeder");
hlslib::Stream<PackedFloat, 16> c_to_kernel("c_to_kernel");
hlslib::Stream<PackedFloat, 16> c_from_kernel("c_from_kernel");
hlslib::Stream<PackedFloat, 16> c_from_exit("c_from_exit");
hlslib::Stream<PackedFloat, 16> c_from_drainer("c_from_drainer");
HLSLIB_DATAFLOW_INIT();
HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_feeder, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_entry, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_feeder, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_entry, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_feeder, size_n, size_m);
HLSLIB_DATAFLOW_FUNCTION(FeedC, c_to_feeder, c_to_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(ComputeEntry, a_to_entry, b_to_entry, a_to_kernel, b_to_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(ComputeExit, ab_from_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(DrainC, c_from_kernel, c_from_drainer, size_n, size_k, size_m);
HLSLIB_DATAFLOW_FUNCTION(WriteC, c_from_drainer, c_write, size_n, size_m);
HLSLIB_DATAFLOW_FINALIZE();
Expand Down
25 changes: 23 additions & 2 deletions host/TestProgram.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <hlslib/xilinx/OpenCL.h>
#include <hlslib/xilinx/Stream.h>
#include <hlslib/xilinx/Utility.h>

#include <cstdlib> // putenv
Expand All @@ -22,6 +23,13 @@ struct MpfrWrapper {
};

#ifdef HLSLIB_SIMULATE_OPENCL
// Software stand-in for the free-running kernel: invokes
// FreeRunningMultiplication on the given streams over and over.
//
// a_in, b_in  Operand streams forwarded to the kernel.
// ab_out      Stream that receives the kernel's results.
//
// This function never returns; it has no exit condition.
void RunFreeRunningKernel(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in,
                          hlslib::Stream<PackedFloat> &ab_out) {
    for (;;) {
        FreeRunningMultiplication(a_in, b_in, ab_out);
    }
}

bool RunTestSimulation(int size_n, int size_k, int size_m, bool verify) {
const std::string kernel_path("");
#else
Expand Down Expand Up @@ -111,9 +119,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
// In simulation mode, this will call the function "MatrixMultiplication" and run it in software.
// Otherwise, the provided path to a kernel binary will be loaded and executed.
std::vector<hlslib::ocl::Kernel> kernels;
hlslib::Stream<PackedFloat> a_to_kernel[kComputeUnits];
hlslib::Stream<PackedFloat> b_to_kernel[kComputeUnits];
hlslib::Stream<PackedFloat> ab_from_kernel[kComputeUnits];
for (int i = 0; i < kComputeUnits; ++i) {
kernels.emplace_back(program.MakeKernel(MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i],
c_device[i], c_device[i], n_partition_size[i], size_k, size_m));
kernels.emplace_back(program.MakeKernel(
MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i], c_device[i], c_device[i],
n_partition_size[i], size_k, size_m, hlslib::ocl::SimulationOnly(a_to_kernel[i]),
hlslib::ocl::SimulationOnly(b_to_kernel[i]), hlslib::ocl::SimulationOnly(ab_from_kernel[i])));
}

const float expected_runtime = expected_cycles / 0.3e9;
Expand All @@ -126,6 +139,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
<< bandwidth << " GB/s.\n";

std::cout << "Executing kernel...\n";
#ifdef HLSLIB_SIMULATE_OPENCL
for (int i = 0; i < kComputeUnits; ++i) {
std::thread free_running(RunFreeRunningKernel, std::ref(a_to_kernel[i]), std::ref(b_to_kernel[i]),
std::ref(ab_from_kernel[i]));
// Will be killed when the program exits
free_running.detach();
}
#endif
std::vector<hlslib::ocl::Event> events;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < kComputeUnits; ++i) {
Expand Down
11 changes: 10 additions & 1 deletion include/MatrixMultiplication.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
#pragma once

#include <hlslib/xilinx/Stream.h>

#include "Config.h"
#include "DeviceTypes.h"
#include "PackedFloat.h"

extern "C" void MatrixMultiplication(DramLine const *a, DramLine const *b, DramLine const *c_read, DramLine *c_write,
int n, int m, int k);
int n, int m, int k, hlslib::Stream<PackedFloat> &a_to_kernel,
hlslib::Stream<PackedFloat> &b_to_kernel,
hlslib::Stream<PackedFloat> &ab_from_kernel);

// Kernel entry point for the stand-alone multiplier, declared with C linkage.
// Takes two PackedFloat operand streams (a_to_kernel, b_to_kernel) and one
// PackedFloat result stream (ab_from_kernel).
extern "C" void FreeRunningMultiplication(hlslib::Stream<PackedFloat> &a_to_kernel,
hlslib::Stream<PackedFloat> &b_to_kernel,
hlslib::Stream<PackedFloat> &ab_from_kernel);