Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch kernel to column major ordering #12

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions device/MatrixMultiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
// Annoyingly, we have to specialize the innermost loop on whether a number spans multiple DRAM flits or just one,
// because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the single-flit case.
template <int lines_per_number>
void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0,
void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_n, const int n0,
const int k) {
#pragma HLS INLINE
DramLine num[kLinesPerNumber];
Expand All @@ -19,7 +19,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_fee
for (int i = 0; i < kLinesPerNumber; ++i) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i];
num[i] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber + i];
if (i == kLinesPerNumber - 1) {
a_to_feeder.Push(PackedFloat(num));
}
Expand All @@ -28,15 +28,15 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_fee
}

template <>
void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0,
void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_n, const int n0,
const int k) {
#pragma HLS INLINE
ReadA_N:
for (int n1 = 0; n1 < kTileSizeN; ++n1) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
DramLine num[1];
num[0] = mem[(n0 * kTileSizeN + n1) * size_k + k];
num[0] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber];
a_to_feeder.Push(PackedFloat(num));
}
}
Expand All @@ -51,7 +51,7 @@ void ReadA(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder,
for (int m0 = 0; m0 < tiles_m; ++m0) {
ReadA_K:
for (int k = 0; k < size_k; ++k) {
ReadAInner<kLinesPerNumber>(mem, a_to_feeder, size_k, n0, k);
ReadAInner<kLinesPerNumber>(mem, a_to_feeder, size_n, n0, k);
}
}
}
Expand Down Expand Up @@ -90,7 +90,7 @@ void FeedA(hlslib::Stream<PackedFloat> &a_to_feeder, hlslib::Stream<PackedFloat>
////////////////////////////////////////////////////////////////////////////////

template <int lines_per_number>
void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0,
void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_k, const int m0,
const int k) {
#pragma HLS INLINE
DramLine num[kLinesPerNumber];
Expand All @@ -100,7 +100,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_fee
for (int i = 0; i < kLinesPerNumber; ++i) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i];
num[i] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber + i];
if (i == kLinesPerNumber - 1) {
b_to_feeder.Push(PackedFloat(num));
}
Expand All @@ -109,15 +109,15 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_fee
}

template <>
void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0,
void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_k, const int m0,
const int k) {
#pragma HLS INLINE
ReadB_M:
for (int m1 = 0; m1 < kTileSizeM; ++m1) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
DramLine num[1];
num[0] = mem[k * size_m + m0 * kTileSizeM + m1];
num[0] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber];
b_to_feeder.Push(PackedFloat(num));
}
}
Expand All @@ -132,7 +132,7 @@ void ReadB(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder,
for (int m0 = 0; m0 < tiles_m; ++m0) {
ReadB_K:
for (int k = 0; k < size_k; ++k) {
ReadBInner<kLinesPerNumber>(mem, b_to_feeder, size_m, m0, k);
ReadBInner<kLinesPerNumber>(mem, b_to_feeder, size_k, m0, k);
}
}
}
Expand Down Expand Up @@ -169,7 +169,7 @@ void FeedB(hlslib::Stream<PackedFloat> &b_to_feeder, hlslib::Stream<PackedFloat>
////////////////////////////////////////////////////////////////////////////////

template <int lines_per_number>
void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0,
void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_n, const int n0,
const int m0, const int n1) {
#pragma HLS INLINE
ReadC_M:
Expand All @@ -179,7 +179,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_fee
for (int i = 0; i < kLinesPerNumber; ++i) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i];
num[i] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i];
if (i == kLinesPerNumber - 1) {
c_to_feeder.Push(PackedFloat(num));
}
Expand All @@ -188,15 +188,15 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_fee
}

template <>
void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0,
void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_n, const int n0,
const int m0, const int n1) {
#pragma HLS INLINE
ReadC_M:
for (int m1 = 0; m1 < kTileSizeM; ++m1) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_FLATTEN
DramLine num[1];
num[0] = mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1];
num[0] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber];
c_to_feeder.Push(PackedFloat(num));
}
}
Expand All @@ -210,7 +210,7 @@ void ReadC(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder,
for (int m0 = 0; m0 < tiles_m; ++m0) {
ReadC_N:
for (int n1 = 0; n1 < kTileSizeN; ++n1) {
ReadCInner<kLinesPerNumber>(mem, c_to_feeder, size_m, n0, m0, n1);
ReadCInner<kLinesPerNumber>(mem, c_to_feeder, size_n, n0, m0, n1);
}
}
}
Expand Down Expand Up @@ -290,7 +290,7 @@ void WriteCInner(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem,
}
const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m);
if (in_bounds) {
mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i] = num[i];
mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i] = num[i];
}
}
}
Expand All @@ -308,7 +308,7 @@ void WriteCInner<1>(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const me
from_kernel.Pop().UnpackFlits(num);
const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m);
if (in_bounds) {
mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1] = num[0];
mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber] = num[0];
}
}
}
Expand Down Expand Up @@ -354,7 +354,7 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i
const PackedFloat c_read = c_in.Pop();
const PackedFloat a = (m1 == 0) ? a_read : a_buffer;
const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1];
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 + m1 * kTileSizeN];
a_buffer = a;
b_buffer[m1] = b;
// Ignore contributions from out-of-bound indices
Expand All @@ -363,7 +363,7 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i
const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(),
in_bounds ? b : PackedFloat::Zero(), c);
// Write back to buffer
c_buffer[n1 * kTileSizeM + m1] = res;
c_buffer[n1 + m1 * kTileSizeN] = res;
c_out.Push(res);
}
}
Expand Down
5 changes: 3 additions & 2 deletions host/MatrixMultiplicationReference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ void MatrixMultiplicationReference(mpfr_t const *a, mpfr_t const *b, mpfr_t *c,
for (int n = 0; n < size_n; ++n) {
for (int k = 0; k < size_k; ++k) {
for (int m = 0; m < size_m; ++m) {
mpfr_mul(tmp, a[n * size_k + k], b[k * size_m + m], kRoundingMode);
mpfr_t &_c = c[n * size_m + m];
// C(n, m) = sum_k A(n, k) B(k, m)
mpfr_mul(tmp, a[n + k * size_n], b[k + m * size_k], kRoundingMode);
mpfr_t &_c = c[n + m * size_n];
mpfr_add(_c, _c, tmp, kRoundingMode);
}
}
Expand Down
10 changes: 5 additions & 5 deletions host/TestProgram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
// Verify results
for (int n = 0; n < size_n; ++n) {
for (int m = 0; m < size_m; ++m) {
const PackedFloat res = c_host[n * size_m + m];
const PackedFloat ref(c_mpfr[n * size_m + m]);
const PackedFloat res = c_host[n + m * size_n];
const PackedFloat ref(c_mpfr[n + m * size_n]);
if (ref != res) {
std::cerr << "Verification failed at (" << n << ", " << m << "):\n\t" << res << "\n\t" << ref << "\n";
return false;
Expand All @@ -127,17 +127,17 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
// Clean up
for (int n = 0; n < size_n; ++n) {
for (int k = 0; k < size_k; ++k) {
mpfr_clear(a_mpfr[n * size_k + k]);
mpfr_clear(a_mpfr[n + k * size_n]);
}
}
for (int k = 0; k < size_k; ++k) {
for (int m = 0; m < size_m; ++m) {
mpfr_clear(b_mpfr[k * size_m + m]);
mpfr_clear(b_mpfr[k + m * size_k]);
}
}
for (int n = 0; n < size_n; ++n) {
for (int m = 0; m < size_m; ++m) {
mpfr_clear(c_mpfr[n * size_m + m]);
mpfr_clear(c_mpfr[n + m * size_n]);
}
}

Expand Down
58 changes: 55 additions & 3 deletions scripts/run_simulation.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,65 @@
#!/bin/bash

sizes=(1 3 4 7 9 15 16 17 31 33 41)
small_sizes=(1 3 4 7 9 15 16 17)
large_sizes=(31 33 41)
batch_size=12

for n in "${sizes[@]}"

for n in "${small_sizes[@]}"
do
for m in "${small_sizes[@]}"
do
for k in "${small_sizes[@]}"
do
echo $n $m $k 1>&2
(./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &

if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then
wait -n
fi
done
done
done


for n in "${small_sizes[@]}"
do
for m in "${large_sizes[@]}"
do
for k in "${large_sizes[@]}"
do
echo $n $m $k 1>&2
(./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &

if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then
wait -n
fi
done
done
done

for n in "${large_sizes[@]}"
do
for m in "${small_sizes[@]}"
do
for k in "${large_sizes[@]}"
do
echo $n $m $k 1>&2
(./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &

if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then
wait -n
fi
done
done
done

for n in "${large_sizes[@]}"
do
for m in "${sizes[@]}"
for m in "${large_sizes[@]}"
do
for k in "${sizes[@]}"
for k in "${small_sizes[@]}"
do
echo $n $m $k 1>&2
(./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &
Expand Down