Skip to content

Commit

Permalink
Merge branch 'develop' into gemv_t
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-frbg authored Jan 25, 2025
2 parents c0318ce + 9b11fd5 commit 6e393a5
Show file tree
Hide file tree
Showing 52 changed files with 889 additions and 173 deletions.
9 changes: 7 additions & 2 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,12 @@ In chronological order:
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems

* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <[email protected]>
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel

* Marek Michalowski <https://github.com/michalowski-arm>
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`

4 changes: 2 additions & 2 deletions Makefile.install
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,8 @@ endif

endif
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
endif
endif

3 changes: 3 additions & 0 deletions cmake/kernel.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ macro(SetDefaultL1)
SetFallback(CROTKERNEL zrot.S)
SetFallback(ZROTKERNEL zrot.S)
SetFallback(XROTKERNEL zrot.S)
SetFallback(SROTMKERNEL rotm.S)
SetFallback(DROTMKERNEL rotm.S)
SetFallback(QROTMKERNEL rotm.S)
SetFallback(SSCALKERNEL scal.S)
SetFallback(DSCALKERNEL scal.S)
SetFallback(CSCALKERNEL zscal.S)
Expand Down
10 changes: 9 additions & 1 deletion cmake/system.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@ endif()
# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet?
# It seems we are meant to use TARGET as input and CORE internally as kernel.
if(NOT DEFINED CORE AND DEFINED TARGET)
set(CORE ${TARGET})
# Translate the user-facing LoongArch TARGET names into the CORE names used
# internally; every other TARGET is passed through unchanged.
if (${TARGET} STREQUAL "LOONGSON3R5")
  set(CORE "LA464")
elseif (${TARGET} STREQUAL "LOONGSON2K1000")
  set(CORE "LA264")
elseif (${TARGET} STREQUAL "LOONGSONGENERIC")
  # The value previously carried a stray ')' inside the quotes, producing the
  # bogus core name "LA64_GENERIC)".
  set(CORE "LA64_GENERIC")
else ()
  set(CORE ${TARGET})
endif()
endif()

# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
Expand Down
8 changes: 8 additions & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ endfunction ()
macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
set (C_COMPILER ${CMAKE_C_COMPILER_ID})
set (OSNAME ${CMAKE_SYSTEM_NAME})
if (${C_COMPILER} MATCHES Clang)
set (C_COMPILER CLANG)
endif ()
if (${OSNAME} STREQUAL Windows)
set (OSNAME WINNT)
endif ()
message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER})
set (IfElse 0)
set (ElseSeen 0)
set (SkipIfs 0)
Expand Down
2 changes: 2 additions & 0 deletions common_d.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
#define DROTM_K drotm_k

#define DGEMV_N dgemv_n
#define DGEMV_T dgemv_t
Expand Down Expand Up @@ -180,6 +181,7 @@
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k
#define DROTM_K gotoblas -> drotm_k

#define DGEMV_N gotoblas -> dgemv_n
#define DGEMV_T gotoblas -> dgemv_t
Expand Down
6 changes: 3 additions & 3 deletions common_level1.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *);
int drotmg_k(double *, double *, double *, double *, double *);
int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *);

int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float);
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double);
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble);
int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);


int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
Expand Down
3 changes: 3 additions & 0 deletions common_macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
#define ROTM_K QROTM_K

#define GEMV_N QGEMV_N
#define GEMV_T QGEMV_T
Expand Down Expand Up @@ -361,6 +362,7 @@
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
#define ROTM_K DROTM_K

#define GEMV_N DGEMV_N
#define GEMV_T DGEMV_T
Expand Down Expand Up @@ -977,6 +979,7 @@
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
#define ROTM_K SROTM_K

#define GEMV_N SGEMV_N
#define GEMV_T SGEMV_T
Expand Down
3 changes: 3 additions & 0 deletions common_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);

int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
Expand Down Expand Up @@ -330,6 +331,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
#endif
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
Expand Down Expand Up @@ -439,6 +441,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);

int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
Expand Down
2 changes: 2 additions & 0 deletions common_q.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
#define QROTM_K qrotm_k

#define QGEMV_N qgemv_n
#define QGEMV_T qgemv_t
Expand Down Expand Up @@ -165,6 +166,7 @@
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k
#define QROTM_K gotoblas -> qrotm_k

#define QGEMV_N gotoblas -> qgemv_n
#define QGEMV_T gotoblas -> qgemv_t
Expand Down
2 changes: 2 additions & 0 deletions common_s.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define SSCAL_K sscal_k
#define SSWAP_K sswap_k
#define SROT_K srot_k
#define SROTM_K srotm_k

#define SGEMV_N sgemv_n
#define SGEMV_T sgemv_t
Expand Down Expand Up @@ -189,6 +190,7 @@
#define SSCAL_K gotoblas -> sscal_k
#define SSWAP_K gotoblas -> sswap_k
#define SROT_K gotoblas -> srot_k
#define SROTM_K gotoblas -> srotm_k

#define SGEMV_N gotoblas -> sgemv_n
#define SGEMV_T gotoblas -> sgemv_t
Expand Down
6 changes: 3 additions & 3 deletions docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a
4. Navigate to the OpenBLAS source code directory and start building OpenBLAS
by invoking Ninja:
```cmd
cd OpenBLAS
mkdir build
cd build
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new
ninja -j16
```
Expand Down
57 changes: 47 additions & 10 deletions interface/gemm.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*********************************************************************/
/* Copyright 2024 The OpenBLAS Project */
/* Copyright 2024, 2025 The OpenBLAS Project */
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
Expand Down Expand Up @@ -177,6 +177,49 @@ static int init_amxtile_permission() {
}
#endif

#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-throttling profile for GEMM selected for the NEOVERSEV1 core.
 *
 *   MNK   problem size measure passed in by the caller
 *   ncpu  number of threads available; upper bound for every capped step
 *
 * Returns 1 for MNK below the first cutoff, then a step-wise growing thread
 * count (each step limited to ncpu via MIN), and ncpu itself once MNK
 * reaches the last cutoff.
 */
static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) {
  /* Cutoffs are tested in ascending order; the first match wins. */
  if (MNK < 262144L)    return 1;
  if (MNK < 1124864L)   return MIN(ncpu, 6);
  if (MNK < 7880599L)   return MIN(ncpu, 12);
  if (MNK < 17173512L)  return MIN(ncpu, 16);
  if (MNK < 33386248L)  return MIN(ncpu, 20);
  if (MNK < 57066625L)  return MIN(ncpu, 24);
  if (MNK < 91733851L)  return MIN(ncpu, 32);
  if (MNK < 265847707L) return MIN(ncpu, 40);
  if (MNK < 458314011L) return MIN(ncpu, 48);
  if (MNK < 729000000L) return MIN(ncpu, 56);

  /* Large enough to use everything that is available. */
  return ncpu;
}
#endif

/*
 * Choose the number of threads for a GEMM call from its problem size MNK.
 *
 * Single-precision real builds for NEOVERSEV1 (or DYNAMIC_ARCH builds whose
 * runtime core name is "neoversev1") use the dedicated NEOVERSEV1 profile.
 * All other cases use the generic rule:
 *   - MNK at or below SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD: 1 thread
 *   - otherwise, if an even split over all available threads would leave
 *     each with less than that threshold: MNK / threshold threads (truncated)
 *   - otherwise: all available threads.
 */
static inline int get_gemm_optimal_nthreads(double MNK) {
  /* Queried up front, before any size check, exactly as the callers expect. */
  int ncpu = num_cpu_avail(3);

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  if (!strcmp(gotoblas_corename(), "neoversev1")) {
    return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
  }
#endif

  /* Minimum amount of work that justifies one thread. */
  const double work_per_thread = SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD;

  if (MNK <= work_per_thread) {
    return 1;
  }

  if (MNK / ncpu < work_per_thread) {
    /* Not enough work for every thread: use only as many as are worthwhile. */
    return (int) (MNK / work_per_thread);
  }

  return ncpu;
}

#ifndef CBLAS

void NAME(char *TRANSA, char *TRANSB,
Expand Down Expand Up @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FLOAT *beta = (FLOAT*) vbeta;
FLOAT *a = (FLOAT*) va;
FLOAT *b = (FLOAT*) vb;
FLOAT *c = (FLOAT*) vc;
FLOAT *c = (FLOAT*) vc;
#endif

blas_arg_t args;
Expand Down Expand Up @@ -352,7 +395,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#ifdef DYNAMIC_ARCH
if (support_avx512() )
#endif
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
Expand Down Expand Up @@ -604,13 +647,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#endif

MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else {
args.nthreads = num_cpu_avail(3);
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
}
args.nthreads = get_gemm_optimal_nthreads(MNK);

args.common = NULL;

Expand Down
36 changes: 31 additions & 5 deletions interface/gemv.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,36 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT
};
#endif

#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-throttling profile for GEMV selected for the NEOVERSEV1 core.
 *
 *   MN    problem size measure passed in by the caller
 *   ncpu  number of threads available; upper bound for every capped step
 *
 * Returns 1 below the first cutoff, at most 4 and then at most 16 threads
 * for the two intermediate ranges, and ncpu from the last cutoff upwards.
 */
static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) {
  /* Cutoffs are tested in ascending order; the first match wins. */
  if (MN < 25600L)  return 1;
  if (MN < 63001L)  return MIN(ncpu, 4);
  if (MN < 459684L) return MIN(ncpu, 16);

  return ncpu;
}
#endif

/*
 * Choose the number of threads for a GEMV call from its problem size MN.
 *
 * Single-precision real builds for NEOVERSEV1 (or DYNAMIC_ARCH builds whose
 * runtime core name is "neoversev1") use the dedicated NEOVERSEV1 profile.
 * All other cases run single-threaded below
 * 115200 * GEMM_MULTITHREAD_THRESHOLD and otherwise use num_cpu_avail(2).
 *
 * NOTE(review): ncpu is obtained with level 3 while the generic fallback
 * queries again with level 2 -- confirm that the two different levels (and
 * the second query) are intended.
 */
static inline int get_gemv_optimal_nthreads(BLASLONG MN) {
  int ncpu = num_cpu_avail(3);

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  if (!strcmp(gotoblas_corename(), "neoversev1")) {
    return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
  }
#endif

  /* Generic rule: small problems stay serial, everything else goes wide. */
  return (MN < 115200L * GEMM_MULTITHREAD_THRESHOLD) ? 1 : num_cpu_avail(2);
}

#ifndef CBLAS

void NAME(char *TRANS, blasint *M, blasint *N,
Expand Down Expand Up @@ -225,11 +255,7 @@ void CNAME(enum CBLAS_ORDER order,
STACK_ALLOC(buffer_size, FLOAT, buffer);

#ifdef SMP

if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = num_cpu_avail(2);
nthreads = get_gemv_optimal_nthreads(1L * m * n);

if (nthreads == 1) {
#endif
Expand Down
Loading

0 comments on commit 6e393a5

Please sign in to comment.