Matrix Multiplication Example #495

JamesKenneyImperas · 2020-06-11T13:09:08Z

I've attached an attempt at a new example that could be included in the example directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simple memcpy-style examples but less inscrutable than the partial sgemm.S one. I've also included a C test harness to demonstrate use.

I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         for (j = 0; j < p; j++) {
#             float c = 0;
#             for (k = 0; k < n; k++) {
#                 c += A[i*n+k] * B[k*p+j];
#             }
#             C[i*p+j] = c;
#         }
#     }
# }

#define n       a0
#define m       a1
#define p       a2
#define A       a3
#define B       a4
#define C       a5

#define i       t0
#define j       t1
#define k       t2
#define ksub    t3
#define bstride t4
#define tmp     t5

matmulFloat_v:
    li t0, 0
    slli t4, a2, 2                  # calculate B block stride
1:
    li t1, 0
2:
    li t2, 0
    li t5, 1
  vsetvli t5, t5, e32,m1,ta,ma      # set vector length 1
  vxor.vv v0, v0, v0                # initialize c
3:
    sub t3, a0, t2                  # calculate remaining k
  vsetvli t3, t3, e32,m8,ta,ma      # prepare for next k block

    mul t5, t0, a0                  # calculate A block base address
    add t5, t5, t2
    slli t5, t5, 2
    add t5, t5, a3

  vle32.v v8, (t5);                 # load A block

    mul t5, t2, a2                  # calculate B block base address
    add t5, t5, t1
    slli t5, t5, 2
    add t5, t5, a4

  vlse32.v v16, (t5), t4;           # load B block

  vfmul.vv v8, v8, v16              # multiply N elements
  vfredsum.vs v0, v8, v0            # accumulate result in element 0 of v0

    add t2, t2, t3
    bne t2, a0, 3b                  # do next k block

    li t5, 1
  vsetvli t5, t5, e32,m1,ta,ma      # set vector length 1

  vse32.v v0, (a5);                 # store C block element

    addi a5, a5, 4                  # increment C block address

    addi t1, t1, 1
    bne t1, a2, 2b                  # do next j

    addi t0, t0, 1
    bne t0, a1, 1b                  # do next i

    ret

The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:

#include <stdio.h>
#include <stdlib.h>

//
// Enable vector extension and floating point
//
extern void init_v(void);

//
// Vector implementation
//
extern void matmulFloat_v(
    size_t       n,     // A columns, B rows
    size_t       m,     // A rows
    size_t       p,     // B columns
    const float *A,     // A is m x n matrix
    const float *B,     // B is n x p matrix
    float       *C      // C is m x p matrix
);

//
// Non-vector implementation
//
static void matmulFloat(
    size_t       n,     // A columns, B rows
    size_t       m,     // A rows
    size_t       p,     // B columns
    const float *A,     // A is m x n matrix
    const float *B,     // B is n x p matrix
    float       *C      // C is m x p matrix
) {
    int i, j, k;

    for (i = 0; i < m; i++) {
        for (j = 0; j < p; j++) {
            float c = 0;
            for (k = 0; k < n; k++) {
                c += A[i*n+k] * B[k*p+j];
            }
            C[i*p+j] = c;
        }
    }
}

//
// Exercise matrix multiplication
//
void testMatMul (int n, int m, int p) {

    float A[m][n];
    float B[n][p];
    float C[m][p];
    float aVal = 0.0;
    float bVal = 0.0;
    int   r, c;

    printf("\n-----------------------------------------\n");
    printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p);
    printf("-----------------------------------------\n");

    // fill A with known values
    printf("\n--\nA:\n--\n");
    for (r = 0; r < m; r++) {
        printf("   ");
        for (c = 0; c < n; c++) {
            A[r][c] = aVal;
            aVal += 0.25;
            printf(" %7.2f", A[r][c]);
        }
        printf("\n");
    }

    // fill B with known values
    printf("\n--\nB:\n--\n");
    for (r = 0; r < n; r++) {
        printf("   ");
        for (c = 0; c < p; c++) {
            B[r][c] = bVal;
            bVal += 1;
            printf(" %7.2f", B[r][c]);
        }
        printf("\n");
    }

    // do matrix multiplication
    if(1) {
        matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
    } else {
        matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
    }

    // show result
    printf("\n--\nC:\n--\n");
    for (r = 0; r < m; r++) {
        printf("   ");
        for (c = 0; c < p; c++) {
            printf(" %9.2f", C[r][c]);
        }
        printf("\n");
    }
}

//
// Main routine
//
int main(int argc, char ** argv) {

    init_v();

    testMatMul(2, 4, 6);
    testMatMul(18, 31, 23);

    return -1;
}

(The init_v function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:

init_v:
    li t0, 1<<9+1<<13
    csrs mstatus, t0
    ret

)

The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:

-----------------------------------------
TEST MATRIX MULTIPLICATION N=2 M=4 P=6
-----------------------------------------

--
A:
--
       0.00    0.25
       0.50    0.75
       1.00    1.25
       1.50    1.75

--
B:
--
       0.00    1.00    2.00    3.00    4.00    5.00
       6.00    7.00    8.00    9.00   10.00   11.00

--
C:
--
         1.50      1.75      2.00      2.25      2.50      2.75
         4.50      5.75      7.00      8.25      9.50     10.75
         7.50      9.75     12.00     14.25     16.50     18.75
        10.50     13.75     17.00     20.25     23.50     26.75

The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.

The algorithm is not optimized in any way, so I am sure it can be improved.

The text was updated successfully, but these errors were encountered:

vbx-glemieux · 2020-06-11T22:26:58Z

The simplest change that should be made is to transpose the j/k loops: before: for i for j t = 0 for k t += a[i,k] * b[k,j] end k c[i,j] = t end j end i after: for i c[i,*] = 0 for k ta = a[i,k] for j c[i,j] += ta * b[k,j] end j end k end i this makes it both cache-friendly and easily vectorized because it iterates over B[] in row-major order and accesses A as scalars. if the rows are short enough, you can tile the k loop on top of this, so you get some temporal re-use out of rows in B, but it has diminishing returns. if the rows are super-long, then you need to tile across j as well. g

…

On Thu, Jun 11, 2020 at 6:09 AM JamesKenneyImperas ***@***.***> wrote: I've attached an attempt at a new example that could be included in the example directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simple memcpy-style examples but less inscrutable than the partial sgemm.S one. I've also included a C test harness to demonstrate use. I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is: .text .balign 4 .global matmulFloat_v # void matmulFloat_v( # size_t n, // a0: A columns, B rows # size_t m, // a1: A rows # size_t p, // a2: B columns # const float *A, // a3: A is m x n matrix # const float *B, // a4: B is n x p matrix # float *C // a5: B is m x p matrix # ) { # int i, j, k; # # for (i = 0; i < m; i++) { # for (j = 0; j < p; j++) { # float c = 0; # for (k = 0; k < n; k++) { # c += A[i*n+k] * B[k*p+j]; # } # C[i*p+j] = c; # } # } # } #define n a0 #define m a1 #define p a2 #define A a3 #define B a4 #define C a5 #define i t0 #define j t1 #define k t2 #define ksub t3 #define bstride t4 #define tmp t5 matmulFloat_v: li t0, 0 slli t4, a2, 2 # calculate B block stride 1: li t1, 0 2: li t2, 0 li t5, 1 vsetvli t5, t5, e32,m1,ta,ma # set vector length 1 vxor.vv v0, v0, v0 # initialize c 3: sub t3, a0, t2 # calculate remaining k vsetvli t3, t3, e32,m8,ta,ma # prepare for next k block mul t5, t0, a0 # calculate A block base address add t5, t5, t2 slli t5, t5, 2 add t5, t5, a3 vle32.v v8, (t5); # load A block mul t5, t2, a2 # calculate B block base address add t5, t5, t1 slli t5, t5, 2 add t5, t5, a4 vlse32.v v16, (t5), t4; # load B block vfmul.vv v8, v8, v16 # multiply N elements vfredsum.vs v0, v8, v0 # accumulate result in element 0 of v0 add t2, t2, t3 bne t2, a0, 3b # do next k block li t5, 1 vsetvli t5, t5, e32,m1,ta,ma # set vector length 1 mul t5, t0, a2 # calculate C block address add t5, t5, t1 slli t5, t5, 2 add t5, t5, a5 vse32.v v0, (t5); # store C block element addi t1, t1, 1 bne t1, a2, 2b # do next j addi t0, t0, 1 bne t0, a1, 1b # do next i ret The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this: #include <stdio.h> #include <stdlib.h> // // Enable vector extension and floating point // extern void init_v(void); // // Vector implementation // extern void matmulFloat_v( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float *A, // A is m x n matrix const float *B, // B is n x p matrix float *C // B is m x p matrix ); // // Non-vector implementation // static void matmulFloat( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float *A, // A is m x n matrix const float *B, // B is n x p matrix float *C // B is m x p matrix ) { int i, j, k; for (i = 0; i < m; i++) { for (j = 0; j < p; j++) { float c = 0; for (k = 0; k < n; k++) { c += A[i*n+k] * B[k*p+j]; } C[i*p+j] = c; } } } // // Exercise matrix multiplication // void testMatMul (int n, int m, int p) { float A[m][n]; float B[n][p]; float C[m][p]; float aVal = 0.0; float bVal = 0.0; int r, c; printf("\n-----------------------------------------\n"); printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p); printf("-----------------------------------------\n"); // fill A with known values printf("\n--\nA:\n--\n"); for (r = 0; r < m; r++) { printf(" "); for (c = 0; c < n; c++) { A[r][c] = aVal; aVal += 0.25; printf(" %7.2f", A[r][c]); } printf("\n"); } // fill B with known values printf("\n--\nB:\n--\n"); for (r = 0; r < n; r++) { printf(" "); for (c = 0; c < p; c++) { B[r][c] = bVal; bVal += 1; printf(" %7.2f", B[r][c]); } printf("\n"); } // do matrix multiplication if(1) { matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]); } else { matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]); } // show result printf("\n--\nC:\n--\n"); for (r = 0; r < m; r++) { printf(" "); for (c = 0; c < p; c++) { printf(" %9.2f", C[r][c]); } printf("\n"); } } // // Main routine // int main(int argc, char ** argv) { init_v(); testMatMul(2, 4, 6); testMatMul(18, 31, 23); return -1; } (The init_v function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment: init_v: li t0, 1<<9+1<<13 csrs mstatus, t0 ret ) The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output: ----------------------------------------- TEST MATRIX MULTIPLICATION N=2 M=4 P=6 ----------------------------------------- -- A: -- 0.00 0.25 0.50 0.75 1.00 1.25 1.50 1.75 -- B: -- 0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 -- C: -- 1.50 1.75 2.00 2.25 2.50 2.75 4.50 5.75 7.00 8.25 9.50 10.75 7.50 9.75 12.00 14.25 16.50 18.75 10.50 13.75 17.00 20.25 23.50 26.75 The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input. The algorithm is not optimized in any way, so I am sure it can be improved. — You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub <#495>, or unsubscribe <https://github.com/notifications/unsubscribe-auth/ABMAPWMIFSVNVMQDPFWF62LRWDJQLANCNFSM4N3MX6ZA> .

JamesKenneyImperas · 2020-06-12T06:34:13Z

Thanks Guy - I'll transpose the loops and see what that looks like. I wasn't really trying to make a fully-optimized algorithm but simply an example and test harness for new users to play with to help gain understanding of how the instructions work, to supplement the existing examples in this repository. I think having two candidate implementations is even better.

Is there a better forum for this thread? Perhaps an existing set of standard examples somewhere else that I am unaware of?

JamesKenneyImperas · 2020-06-13T05:57:45Z

Just to finish off here, I've attached a second version of matmulFloat_v with transposed j/k loops as Guy suggested. It works for values of p<=VL when LMUL=8 (e.g. p<=64 when VLEN=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed to matmulFloat_slow). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions.

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         c[i,*] = 0;
#         for (k = 0; k < n; k++) {
#             float tmpf = A[i*n+k];
#             for (j = 0; j < p; j++) {
#                 c[i,j] += tmpf * B[k*p+j];
#             }
#         }
#     }
# }
#
#   n       a0
#   m       a1
#   p       a2
#   A       a3
#   B       a4
#   C       a5
#   i       t0
#   j       t1
#   k       t2
#   tmp     t3
#   tmpf    ft0

matmulFloat_v:

  vsetvli t3, a2, e32,m8,ta,ma          # set vector length p
    bne t3, a2, matmulFloat_slow        # use slow algorithm if p>VL

    li t0, 0

1:
  vxor.vv v0, v0, v0                    # c[i,*] = 0

    li t2, 0

2:

    flw ft0, (a3);                      # load A value
    addi a3, a3, 4                      # increment A base

    mul t3, t2, a2                      # calculate B block base address
    slli t3, t3, 2
    add t3, t3, a4

  vle32.v v8, (t3);                     # load B block

  vfmacc.vf v0, ft0, v8                 # c[i,*] += ta * b[k,*]

    addi t2, t2, 1
    bne t2, a0, 2b                      # do next k

    mul t3, t0, a2                      # calculate C block base address
    slli t3, t3, 2
    add t3, t3, a5

  vse32.v v0, (t3);                     # store C block

    addi t0, t0, 1
    bne t0, a1, 1b                      # do next i

    ret

WilliamWangPeng · 2020-06-26T05:18:09Z

Just to finish off here, I've attached a second version of matmulFloat_v with transposed j/k loops as Guy suggested. It works for values of p<=VL when LMUL=8 (e.g. p<=64 when VLEN=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed to matmulFloat_slow). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions.

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         c[i,*] = 0;
#         for (k = 0; k < n; k++) {
#             float tmpf = A[i*n+k];
#             for (j = 0; j < p; j++) {
#                 c[i,j] += tmpf * B[k*p+j];
#             }
#         }
#     }
# }
#
#   n       a0
#   m       a1
#   p       a2
#   A       a3
#   B       a4
#   C       a5
#   i       t0
#   j       t1
#   k       t2
#   tmp     t3
#   tmpf    ft0

matmulFloat_v:

  vsetvli t3, a2, e32,m8,ta,ma          # set vector length p
    bne t3, a2, matmulFloat_slow        # use slow algorithm if p>VL

    li t0, 0

1:
  vxor.vv v0, v0, v0                    # c[i,*] = 0

    li t2, 0

2:

    flw ft0, (a3);                      # load A value
    addi a3, a3, 4                      # increment A base

    mul t3, t2, a2                      # calculate B block base address
    slli t3, t3, 2
    add t3, t3, a4

  vle32.v v8, (t3);                     # load B block

  vfmacc.vf v0, ft0, v8                 # c[i,*] += ta * b[k,*]

    addi t2, t2, 1
    bne t2, a0, 2b                      # do next k

    mul t3, t0, a2                      # calculate C block base address
    slli t3, t3, 2
    add t3, t3, a5

  vse32.v v0, (t3);                     # store C block

    addi t0, t0, 1
    bne t0, a1, 1b                      # do next i

    ret

there is a problem in sgemm.S, when I try to compile the test_sgemm.c and sgemm.S to elf file, there are errors about "operand must be a symbol with %tprel_add modifier", I notice that this is your diy_sgemm.S, did you meet with this error when compiling?

sgemm_nn:
#    addi sp, sp, -FRAMESIZE
#    sd s0, OFFSET(sp)
#    sd s1, OFFSET(sp)
#    sd s2, OFFSET(sp)

nick-knight · 2020-06-26T05:29:05Z

Hi William,

As you've noticed on this thread and in your earlier thread #450, the sgemm.S example is incomplete in several ways. One of which is the logic for saving and restoring the callee-saved registers (s0, s1, etc.). In particular, as you've noticed, the example expects you, the programmer, to determine the correct values of FRAMESIZE and OFFSET (actually, you'd need three different OFFSETs...). To learn about the correct choices, you should study the (System V) RISC-V psABI.

You'll also notice that @JamesKenneyImperas 's example sidesteps this problem by not using any saved registers.

Best,
Nick Knight

EDIT: I'll give you a hint. For the prologue code, you could do:

    addi sp, sp, -16
    sd s0, 0(sp)
    sd s1, 4(sp)
    sd s2, 8(sp)

and for the epilogue code, you could do:

exit:
    ld s0, 0(sp)
    ld s1, 4(sp)
    ld s2, 8(sp)
    addi sp, sp, 16
    ret

EDIT 2: I'll leave it as a challenge to you to figure out how to deal with the OFFSET in the line

    # Convert elements strides to byte strides.
    ld cstride, OFFSET(sp)   # Get arg from stack frame

nick-knight mentioned this issue Jun 13, 2020

Question about sgemm.S test #450

Closed

kasanovic added the Resolve after v1.0 Does not need to be resolved for v1.0 draft label Aug 7, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Matrix Multiplication Example #495

Matrix Multiplication Example #495

JamesKenneyImperas commented Jun 11, 2020 •

edited

Loading

vbx-glemieux commented Jun 11, 2020 via email

JamesKenneyImperas commented Jun 12, 2020

JamesKenneyImperas commented Jun 13, 2020 •

edited

Loading

WilliamWangPeng commented Jun 26, 2020

nick-knight commented Jun 26, 2020 •

edited

Loading

Matrix Multiplication Example #495

Matrix Multiplication Example #495

Comments

JamesKenneyImperas commented Jun 11, 2020 • edited Loading

vbx-glemieux commented Jun 11, 2020 via email

JamesKenneyImperas commented Jun 12, 2020

JamesKenneyImperas commented Jun 13, 2020 • edited Loading

WilliamWangPeng commented Jun 26, 2020

nick-knight commented Jun 26, 2020 • edited Loading

JamesKenneyImperas commented Jun 11, 2020 •

edited

Loading

JamesKenneyImperas commented Jun 13, 2020 •

edited

Loading

nick-knight commented Jun 26, 2020 •

edited

Loading