18 accum mm as well as loop support in single and array matmul (#1096)
jackl-xilinx authored Mar 6, 2024
1 parent 8368c00 commit 315a9f3
Showing 5 changed files with 515 additions and 133 deletions.
reference_designs/ipu-xrt/matrix_multiplication/aie2.py (4 changes: 2 additions & 2 deletions)
@@ -16,7 +16,7 @@ def my_matmul():
K = 256
N = 256
m = 64
k = 32
k = 64
n = 64
r = 4
s = 8
@@ -26,7 +26,7 @@ def my_matmul():

vectorized = True
enable_tracing = False
trace_size = 8192
trace_size = 16384

A_sz_in_i32s = M * K * word_size_in // 4
B_sz_in_i32s = K * N * word_size_in // 4
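
Not part of the diff: a minimal sketch of the buffer-size arithmetic visible at the end of this hunk (A_sz_in_i32s = M * K * word_size_in // 4), written in C++ for consistency with mm.cc below. M and word_size_in are not shown in this excerpt; M = 256 and word_size_in = 2 (bf16 inputs) are assumptions used only for illustration.

#include <cstdio>

int main() {
  // Assumptions for illustration only: M mirrors the K and N shown in the
  // hunk, and word_size_in = 2 bytes corresponds to bf16 inputs.
  constexpr unsigned M = 256, K = 256, N = 256;
  constexpr unsigned word_size_in = 2;

  // Same arithmetic as aie2.py: total bytes divided by 4 bytes per i32 word,
  // i.e. the host-side transfer sizes are expressed in 32-bit words.
  constexpr unsigned A_sz_in_i32s = M * K * word_size_in / 4;
  constexpr unsigned B_sz_in_i32s = K * N * word_size_in / 4;

  std::printf("A: %u i32 words, B: %u i32 words\n", A_sz_in_i32s, B_sz_in_i32s);
}
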
reference_designs/ipu-xrt/matrix_multiplication/mm.cc (210 changes: 153 additions & 57 deletions)
@@ -64,7 +64,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,

for (unsigned j = 0; j < colB; j += 2)
// chess_loop_range(2, ) {
chess_prepare_for_pipelining chess_loop_range(16, ) {
chess_prepare_for_pipelining chess_loop_range(8, ) {
const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
@@ -103,6 +103,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,

for (unsigned i = 1; i < colA; ++i)
chess_prepare_for_pipelining chess_loop_range(7, ) {
// chess_unroll_loop() {
A0 = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
A1 = aie::load_v<MMUL::size_A>(pA2);
@@ -133,8 +134,8 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,

template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
unsigned colB, unsigned r, unsigned s, unsigned t>
void matmul_vectorized_unroll(const T_in *__restrict pA,
const T_in *__restrict pB, T_out *__restrict pC) {
void matmul_vectorized_2x2(const T_in *__restrict pA, const T_in *__restrict pB,
T_out *__restrict pC) {
using MMUL = aie::mmul<r, s, t, T_in, T_in, accfloat>;

event0();
@@ -149,105 +150,199 @@ void matmul_vectorized_unroll(const T_in *__restrict pA,
// is laid out contiguously in row-major). An element in row 0, column 4
// would be stored at offset 16 in the same example.

for (unsigned z = 0; z < rowA; z += 2)
for (unsigned z = 0; z < rowA; z += 4)
chess_loop_range(2, ) {
T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
T_out *__restrict pC1b =
pC + (z * colB + 0) * MMUL::size_C + MMUL::size_C;
T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
T_out *__restrict pC2b =
pC + ((z + 1) * colB + 0) * MMUL::size_C + MMUL::size_C;
T_out *__restrict pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C;
T_out *__restrict pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C;

for (unsigned j = 0; j < colB; j += 2)
// chess_modulo_scheduling_budget_ratio(19000)
// chess_peel_pipelined_loop(1)
for (unsigned j = 0; j < colB; j += 4)
// chess_loop_range(2, ) {
chess_prepare_for_pipelining chess_loop_range(8, ) {
const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
const T_in *__restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A;
const T_in *__restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A;

const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B;
const T_in *__restrict pB3 = pB + (0 * colB + (j + 2)) * MMUL::size_B;
const T_in *__restrict pB4 = pB + (0 * colB + (j + 3)) * MMUL::size_B;

aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
pA2 += MMUL::size_A;
aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
pA3 += MMUL::size_A;
aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
pA4 += MMUL::size_A;
aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
pB1 += MMUL::size_B * colB;
aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
pB2 += MMUL::size_B * colB;
aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
pB3 += MMUL::size_B * colB;
aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
pB4 += MMUL::size_B * colB;

// We modify the library documentation implementation to accumulate
// in the C dimension, since this vectorized kernel will be called
// multiple times as we further tile the input at a higher level.
aie::vector<T_out, MMUL::size_C> acc_C00 =
aie::load_v<MMUL::size_C>(pC1);
aie::vector<T_out, MMUL::size_C> acc_C01 =
aie::load_v<MMUL::size_C>(pC1b);
aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C02 =
aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C03 =
aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C);

aie::vector<T_out, MMUL::size_C> acc_C10 =
aie::load_v<MMUL::size_C>(pC2);
aie::vector<T_out, MMUL::size_C> acc_C11 =
aie::load_v<MMUL::size_C>(pC2b);
aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C12 =
aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C13 =
aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C);

aie::vector<T_out, MMUL::size_C> acc_C20 =
aie::load_v<MMUL::size_C>(pC3);
aie::vector<T_out, MMUL::size_C> acc_C21 =
aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C22 =
aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C23 =
aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C);

aie::vector<T_out, MMUL::size_C> acc_C30 =
aie::load_v<MMUL::size_C>(pC4);
aie::vector<T_out, MMUL::size_C> acc_C31 =
aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C32 =
aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C);
aie::vector<T_out, MMUL::size_C> acc_C33 =
aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C);

MMUL C00(acc_C00);
MMUL C01(acc_C01);
MMUL C02(acc_C02);
MMUL C03(acc_C03);

MMUL C10(acc_C10);
MMUL C11(acc_C11);
MMUL C12(acc_C12);
MMUL C13(acc_C13);

MMUL C20(acc_C20);
MMUL C21(acc_C21);
MMUL C22(acc_C22);
MMUL C23(acc_C23);

MMUL C30(acc_C30);
MMUL C31(acc_C31);
MMUL C32(acc_C32);
MMUL C33(acc_C33);

C00.mac(A0, B0);
C01.mac(A0, B1);
C10.mac(A1, B0);
C11.mac(A1, B1);

aie::vector<T_in, MMUL::size_A> A0b = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
aie::vector<T_in, MMUL::size_A> A1b = aie::load_v<MMUL::size_A>(pA2);
pA2 += MMUL::size_A;
aie::vector<T_in, MMUL::size_B> B0b = aie::load_v<MMUL::size_B>(pB1);
pB1 += MMUL::size_B * colB;
aie::vector<T_in, MMUL::size_B> B1b = aie::load_v<MMUL::size_B>(pB2);
pB2 += MMUL::size_B * colB;
C02.mac(A0, B2);
C03.mac(A0, B3);
C12.mac(A1, B2);
C13.mac(A1, B3);

C00.mac(A0b, B0b);
C01.mac(A0b, B1b);
C10.mac(A1b, B0b);
C11.mac(A1b, B1b);
C20.mac(A2, B0);
C21.mac(A2, B1);
C30.mac(A3, B0);
C31.mac(A3, B1);

A0 = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
A1 = aie::load_v<MMUL::size_A>(pA2);
pA2 += MMUL::size_A;
B0 = aie::load_v<MMUL::size_B>(pB1);
pB1 += MMUL::size_B * colB;
B1 = aie::load_v<MMUL::size_B>(pB2);
pB2 += MMUL::size_B * colB;
C22.mac(A2, B2);
C23.mac(A2, B3);
C32.mac(A3, B2);
C33.mac(A3, B3);

C00.mac(A0, B0);
C01.mac(A0, B1);
C10.mac(A1, B0);
C11.mac(A1, B1);
for (unsigned i = 1; i < colA; ++i)
chess_prepare_for_pipelining chess_loop_range(7, ) {
// chess_unroll_loop() {
A0 = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
A1 = aie::load_v<MMUL::size_A>(pA2);
pA2 += MMUL::size_A;
A2 = aie::load_v<MMUL::size_A>(pA3);
pA3 += MMUL::size_A;
A3 = aie::load_v<MMUL::size_A>(pA4);
pA4 += MMUL::size_A;

A0b = aie::load_v<MMUL::size_A>(pA1);
pA1 += MMUL::size_A;
A1b = aie::load_v<MMUL::size_A>(pA2);
pA2 += MMUL::size_A;
B0b = aie::load_v<MMUL::size_B>(pB1);
pB1 += MMUL::size_B * colB;
B1b = aie::load_v<MMUL::size_B>(pB2);
pB2 += MMUL::size_B * colB;
B0 = aie::load_v<MMUL::size_B>(pB1);
pB1 += MMUL::size_B * colB;
B1 = aie::load_v<MMUL::size_B>(pB2);
pB2 += MMUL::size_B * colB;
B2 = aie::load_v<MMUL::size_B>(pB3);
pB3 += MMUL::size_B * colB;
B3 = aie::load_v<MMUL::size_B>(pB4);
pB4 += MMUL::size_B * colB;

C00.mac(A0, B0);
C01.mac(A0, B1);
C10.mac(A1, B0);
C11.mac(A1, B1);

C02.mac(A0, B2);
C03.mac(A0, B3);
C12.mac(A1, B2);
C13.mac(A1, B3);

C00.mac(A0b, B0b);
C01.mac(A0b, B1b);
C10.mac(A1b, B0b);
C11.mac(A1b, B1b);
C20.mac(A2, B0);
C21.mac(A2, B1);
C30.mac(A3, B0);
C31.mac(A3, B1);

C22.mac(A2, B2);
C23.mac(A2, B3);
C32.mac(A3, B2);
C33.mac(A3, B3);
}

aie::store_v(pC1, C00.template to_vector<T_out>());
pC1 += 2 * MMUL::size_C;
aie::store_v(pC1b, C01.template to_vector<T_out>());
pC1b += 2 * MMUL::size_C;
pC1 += MMUL::size_C;
aie::store_v(pC1, C01.template to_vector<T_out>());
pC1 += MMUL::size_C;
aie::store_v(pC1, C02.template to_vector<T_out>());
pC1 += MMUL::size_C;
aie::store_v(pC1, C03.template to_vector<T_out>());
pC1 += MMUL::size_C;

aie::store_v(pC2, C10.template to_vector<T_out>());
pC2 += 2 * MMUL::size_C;
aie::store_v(pC2b, C11.template to_vector<T_out>());
pC2b += 2 * MMUL::size_C;
pC2 += MMUL::size_C;
aie::store_v(pC2, C11.template to_vector<T_out>());
pC2 += MMUL::size_C;
aie::store_v(pC2, C12.template to_vector<T_out>());
pC2 += MMUL::size_C;
aie::store_v(pC2, C13.template to_vector<T_out>());
pC2 += MMUL::size_C;

aie::store_v(pC3, C20.template to_vector<T_out>());
pC3 += MMUL::size_C;
aie::store_v(pC3, C21.template to_vector<T_out>());
pC3 += MMUL::size_C;
aie::store_v(pC3, C22.template to_vector<T_out>());
pC3 += MMUL::size_C;
aie::store_v(pC3, C23.template to_vector<T_out>());
pC3 += MMUL::size_C;

aie::store_v(pC4, C30.template to_vector<T_out>());
pC4 += MMUL::size_C;
aie::store_v(pC4, C31.template to_vector<T_out>());
pC4 += MMUL::size_C;
aie::store_v(pC4, C32.template to_vector<T_out>());
pC4 += MMUL::size_C;
aie::store_v(pC4, C33.template to_vector<T_out>());
pC4 += MMUL::size_C;
}
}
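
Not part of the commit: to make the accumulate-in-C comment above concrete, here is a minimal scalar reference model (illustrative names and shapes) of what matmul_vectorized_2x2 computes. It uses the same block-major layout the comments describe and adds each partial product into the value already stored in C, so calling it once per K tile gives the same result as a single pass over the full reduction dimension.

#include <cstdio>
#include <vector>

// Scalar reference for the block-major, accumulate-into-C layout described
// in the kernel comments. A is a rowA x colA grid of (r x s) blocks, B a
// colA x colB grid of (s x t) blocks, C a rowA x colB grid of (r x t) blocks;
// each block is stored contiguously in row-major order, so the element in
// block row 0, block column 1 of C starts at offset r * t.
template <typename Tin, typename Tout>
void matmul_scalar_blocked(const Tin *A, const Tin *B, Tout *C,
                           unsigned rowA, unsigned colA, unsigned colB,
                           unsigned r, unsigned s, unsigned t) {
  for (unsigned z = 0; z < rowA; ++z)        // C block row
    for (unsigned j = 0; j < colB; ++j)      // C block column
      for (unsigned i = 0; i < colA; ++i) {  // reduction over A/B block pairs
        const Tin *a = A + (z * colA + i) * (r * s);
        const Tin *b = B + (i * colB + j) * (s * t);
        Tout *c = C + (z * colB + j) * (r * t);
        for (unsigned x = 0; x < r; ++x)
          for (unsigned y = 0; y < t; ++y) {
            Tout acc = c[x * t + y];         // start from the existing C value
            for (unsigned w = 0; w < s; ++w)
              acc += a[x * s + w] * b[w * t + y];
            c[x * t + y] = acc;              // accumulate, do not overwrite
          }
      }
}

int main() {
  // Tiny smoke test with a 1x1 grid of 2x2 blocks (r = s = t = 2): calling
  // the kernel twice over the same C accumulates, mimicking repeated K tiles.
  std::vector<float> A = {1, 2, 3, 4}, B = {1, 0, 0, 1}, C(4, 0.0f);
  matmul_scalar_blocked(A.data(), B.data(), C.data(), 1, 1, 1, 2, 2, 2);
  matmul_scalar_blocked(A.data(), B.data(), C.data(), 1, 1, 1, 2, 2, 2);
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); // prints 2 4 6 8
}

This accumulate-into-C semantics is why the vectorized kernel loads acc_C00 through acc_C33 from pC1..pC4 and wraps them in MMUL accumulators before the mac chain, rather than starting from zeroed accumulators.
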

@@ -281,8 +376,9 @@ void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
return matmul_vectorized_unroll<bfloat16, bfloat16, m / r, k / s, n / t, r, s,
t>(pA, pB, pC);
// return matmul_vectorized<bfloat16, bfloat16, m / r, k / s, n / t, r, s, t>(
return matmul_vectorized_2x2<bfloat16, bfloat16, m / r, k / s, n / t, r, s,
t>(pA, pB, pC);
}

template <unsigned m, unsigned k, unsigned n>
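
Not part of the commit: a hedged compile-time sketch of the template parameters the wrapper above forwards to matmul_vectorized_2x2, using the tile sizes from the aie2.py hunk (m = 64, the updated k = 64, n = 64) and assuming r = 4, s = 8, t = 4 from the wrapper's 4x8x4 name (t is not shown in this excerpt). The new kernel steps its block indices by 4, so the sketch also checks the divisibility that the 4x4 register blocking relies on, which these sizes satisfy.

#include <cstdio>

// Illustrative only: m, k, n come from the aie2.py hunk in this commit;
// r = 4, s = 8, t = 4 is an assumption taken from the wrapper's name.
constexpr unsigned m = 64, k = 64, n = 64;
constexpr unsigned r = 4, s = 8, t = 4;

// Template parameters the wrapper passes to matmul_vectorized_2x2.
constexpr unsigned rowA = m / r; // 16 block rows of A and C
constexpr unsigned colA = k / s; //  8 blocks along the reduction dimension
constexpr unsigned colB = n / t; // 16 block columns of B and C

// The wrapper's static_asserts check divisibility by 2 * r, 2 * s, 2 * t;
// the z += 4 / j += 4 stepping additionally needs the block grid to be
// divisible by 4, which holds here.
static_assert(rowA % 4 == 0 && colB % 4 == 0, "4x4 register blocking covers C");

int main() {
  std::printf("rowA=%u colA=%u colB=%u\n", rowA, colA, colB);
}
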