-
Notifications
You must be signed in to change notification settings - Fork 274
Matrix Multiplication Example #495
Comments
The simplest change that should be made is to transpose the j/k loops:
before:
for i
for j
t = 0
for k
t += a[i,k] * b[k,j]
end k
c[i,j] = t
end j
end i
after:
for i
c[i,*] = 0
for k
ta = a[i,k]
for j
c[i,j] += ta * b[k,j]
end j
end k
end i
this makes it both cache-friendly and easily vectorized because it iterates
over B[] in row-major order and accesses A as scalars.
if the rows are short enough, you can tile the k loop on top of this, so
you get some temporal re-use out of rows in B, but it has diminishing
returns.
if the rows are super-long, then you need to tile across j as well.
g
…On Thu, Jun 11, 2020 at 6:09 AM JamesKenneyImperas ***@***.***> wrote:
I've attached an attempt at a new example that could be included in the
example directory here - matrix multiplication of matrices containing
single-precision floats. I think something like this might be useful to
show, because it is a bit more complicated than the simple memcpy-style
examples but less inscrutable than the partial sgemm.S one. I've also
included a C test harness to demonstrate use.
I'm sure there are much better or more correct ways to implement matrix
multiplication, but hopefully this is a useful starting point. The example
is:
.text
.balign 4
.global matmulFloat_v
# void matmulFloat_v(
# size_t n, // a0: A columns, B rows
# size_t m, // a1: A rows
# size_t p, // a2: B columns
# const float *A, // a3: A is m x n matrix
# const float *B, // a4: B is n x p matrix
# float *C // a5: B is m x p matrix
# ) {
# int i, j, k;
#
# for (i = 0; i < m; i++) {
# for (j = 0; j < p; j++) {
# float c = 0;
# for (k = 0; k < n; k++) {
# c += A[i*n+k] * B[k*p+j];
# }
# C[i*p+j] = c;
# }
# }
# }
#define n a0
#define m a1
#define p a2
#define A a3
#define B a4
#define C a5
#define i t0
#define j t1
#define k t2
#define ksub t3
#define bstride t4
#define tmp t5
matmulFloat_v:
li t0, 0
slli t4, a2, 2 # calculate B block stride
1:
li t1, 0
2:
li t2, 0
li t5, 1
vsetvli t5, t5, e32,m1,ta,ma # set vector length 1
vxor.vv v0, v0, v0 # initialize c
3:
sub t3, a0, t2 # calculate remaining k
vsetvli t3, t3, e32,m8,ta,ma # prepare for next k block
mul t5, t0, a0 # calculate A block base address
add t5, t5, t2
slli t5, t5, 2
add t5, t5, a3
vle32.v v8, (t5); # load A block
mul t5, t2, a2 # calculate B block base address
add t5, t5, t1
slli t5, t5, 2
add t5, t5, a4
vlse32.v v16, (t5), t4; # load B block
vfmul.vv v8, v8, v16 # multiply N elements
vfredsum.vs v0, v8, v0 # accumulate result in element 0 of v0
add t2, t2, t3
bne t2, a0, 3b # do next k block
li t5, 1
vsetvli t5, t5, e32,m1,ta,ma # set vector length 1
mul t5, t0, a2 # calculate C block address
add t5, t5, t1
slli t5, t5, 2
add t5, t5, a5
vse32.v v0, (t5); # store C block element
addi t1, t1, 1
bne t1, a2, 2b # do next j
addi t0, t0, 1
bne t0, a1, 1b # do next i
ret
The algorithm implemented is shown in the C-style comment. It multiplies m
x n matrix A and n x p matrix B, writing the result to m x p matrix C. The
vector code uses unit stride and strided loads, register grouping, floating
point multiply and floating point reduction operations. The C test harness
is this:
#include <stdio.h>
#include <stdlib.h>
//
// Enable vector extension and floating point
//
extern void init_v(void);
//
// Vector implementation
//
extern void matmulFloat_v(
size_t n, // A columns, B rows
size_t m, // A rows
size_t p, // B columns
const float *A, // A is m x n matrix
const float *B, // B is n x p matrix
float *C // B is m x p matrix
);
//
// Non-vector implementation
//
static void matmulFloat(
size_t n, // A columns, B rows
size_t m, // A rows
size_t p, // B columns
const float *A, // A is m x n matrix
const float *B, // B is n x p matrix
float *C // B is m x p matrix
) {
int i, j, k;
for (i = 0; i < m; i++) {
for (j = 0; j < p; j++) {
float c = 0;
for (k = 0; k < n; k++) {
c += A[i*n+k] * B[k*p+j];
}
C[i*p+j] = c;
}
}
}
//
// Exercise matrix multiplication
//
void testMatMul (int n, int m, int p) {
float A[m][n];
float B[n][p];
float C[m][p];
float aVal = 0.0;
float bVal = 0.0;
int r, c;
printf("\n-----------------------------------------\n");
printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p);
printf("-----------------------------------------\n");
// fill A with known values
printf("\n--\nA:\n--\n");
for (r = 0; r < m; r++) {
printf(" ");
for (c = 0; c < n; c++) {
A[r][c] = aVal;
aVal += 0.25;
printf(" %7.2f", A[r][c]);
}
printf("\n");
}
// fill B with known values
printf("\n--\nB:\n--\n");
for (r = 0; r < n; r++) {
printf(" ");
for (c = 0; c < p; c++) {
B[r][c] = bVal;
bVal += 1;
printf(" %7.2f", B[r][c]);
}
printf("\n");
}
// do matrix multiplication
if(1) {
matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
} else {
matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
}
// show result
printf("\n--\nC:\n--\n");
for (r = 0; r < m; r++) {
printf(" ");
for (c = 0; c < p; c++) {
printf(" %9.2f", C[r][c]);
}
printf("\n");
}
}
//
// Main routine
//
int main(int argc, char ** argv) {
init_v();
testMatMul(2, 4, 6);
testMatMul(18, 31, 23);
return -1;
}
(The init_v function enables floating point and vector instructions. This
may or may not be needed depending on the simulation environment:
init_v:
li t0, 1<<9+1<<13
csrs mstatus, t0
ret
)
The C code will perform a multiply using either the vector implementation
or a traditional non-vector one so the results can be compared (change the
if(1) line in the obvious way). The test function testMatMul defines and
fills appropriately-sized local matrixes with a simple pattern of values so
the results can be seen. For example the testMatMul(2, 4, 6) call produces
this output:
-----------------------------------------
TEST MATRIX MULTIPLICATION N=2 M=4 P=6
-----------------------------------------
--
A:
--
0.00 0.25
0.50 0.75
1.00 1.25
1.50 1.75
--
B:
--
0.00 1.00 2.00 3.00 4.00 5.00
6.00 7.00 8.00 9.00 10.00 11.00
--
C:
--
1.50 1.75 2.00 2.25 2.50 2.75
4.50 5.75 7.00 8.25 9.50 10.75
7.50 9.75 12.00 14.25 16.50 18.75
10.50 13.75 17.00 20.25 23.50 26.75
The test function will accept any values of n, m and p sizes, so this can
be used to investigate the performance of the algorithm on any shape input.
The algorithm is not optimized in any way, so I am sure it can be improved.
—
You are receiving this because you are subscribed to this thread.
Reply to this email directly, view it on GitHub
<#495>, or unsubscribe
<https://github.com/notifications/unsubscribe-auth/ABMAPWMIFSVNVMQDPFWF62LRWDJQLANCNFSM4N3MX6ZA>
.
|
Thanks Guy - I'll transpose the loops and see what that looks like. I wasn't really trying to make a fully-optimized algorithm but simply an example and test harness for new users to play with to help gain understanding of how the instructions work, to supplement the existing examples in this repository. I think having two candidate implementations is even better. Is there a better forum for this thread? Perhaps an existing set of standard examples somewhere else that I am unaware of? |
Just to finish off here, I've attached a second version of
|
there is a problem in sgemm.S, when I try to compile the test_sgemm.c and sgemm.S to elf file, there are errors about "operand must be a symbol with %tprel_add modifier", I notice that this is your diy_sgemm.S, did you meet with this error when compiling?
|
Hi William, As you've noticed on this thread and in your earlier thread #450, the You'll also notice that @JamesKenneyImperas 's example sidesteps this problem by not using any saved registers. Best, EDIT: I'll give you a hint. For the prologue code, you could do:
and for the epilogue code, you could do:
EDIT 2: I'll leave it as a challenge to you to figure out how to deal with the
|
I've attached an attempt at a new example that could be included in the
example
directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simplememcpy
-style examples but less inscrutable than the partialsgemm.S
one. I've also included a C test harness to demonstrate use.I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:
The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:
(The
init_v
function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:)
The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:
The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.
The algorithm is not optimized in any way, so I am sure it can be improved.
The text was updated successfully, but these errors were encountered: