From c8cd8da496406b83d9c171fe572719bbfcaf2105 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Mon, 13 Jan 2025 15:43:08 +0000 Subject: [PATCH 1/9] Add thread throttling profile for SGEMM on NEOVERSEV1 --- CONTRIBUTORS.md | 3 +++ interface/gemm.c | 57 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 508dbcd0e6..d97eb3bccd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -232,3 +232,6 @@ In chronological order: * Aniket P. Garade Sushil Pratap Singh Juliya James * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE + +* Annop Wongwathanarat + * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 \ No newline at end of file diff --git a/interface/gemm.c b/interface/gemm.c index c9f810faa2..8a806cfb47 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -1,5 +1,5 @@ /*********************************************************************/ -/* Copyright 2024 The OpenBLAS Project */ +/* Copyright 2024, 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -177,6 +177,49 @@ static int init_amxtile_permission() { } #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { + return + MNK < 262144L ? 1 + : MNK < 1124864L ? MIN(ncpu, 6) + : MNK < 7880599L ? MIN(ncpu, 12) + : MNK < 17173512L ? MIN(ncpu, 16) + : MNK < 33386248L ? MIN(ncpu, 20) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91733851L ? MIN(ncpu, 32) + : MNK < 265847707L ? MIN(ncpu, 40) + : MNK < 458314011L ? MIN(ncpu, 48) + : MNK < 729000000L ? MIN(ncpu, 56) + : ncpu; +} +#endif + +static inline int get_gemm_optimal_nthreads(double MNK) { + int ncpu = num_cpu_avail(3); +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); + } +#endif + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { + return 1; + } + else { + if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { + return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); + } + else { + return ncpu; + } + } +} + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FLOAT *beta = (FLOAT*) vbeta; FLOAT *a = (FLOAT*) va; FLOAT *b = (FLOAT*) vb; - FLOAT *c = (FLOAT*) vc; + FLOAT *c = (FLOAT*) vc; #endif blas_arg_t args; @@ -352,7 +395,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) -#endif +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; @@ -604,13 +647,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) - args.nthreads = 1; - else { - args.nthreads = num_cpu_avail(3); - if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) - args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); - } + args.nthreads = get_gemm_optimal_nthreads(MNK); args.common = NULL; From 3c8df6358f1b2537b983af4b2f93df87cf91e2c9 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Wed, 22 Jan 2025 11:41:12 +0800 Subject: [PATCH 2/9] Further rearranged the rotm kernel for the different architectures. Signed-off-by: tingbo.liao --- cmake/kernel.cmake | 3 + common_d.h | 2 + common_level1.h | 6 +- common_macro.h | 3 + common_param.h | 3 + common_q.h | 2 + common_s.h | 2 + interface/rotm.c | 140 +------------- kernel/CMakeLists.txt | 3 + kernel/Makefile.L1 | 22 ++- kernel/alpha/KERNEL | 12 ++ kernel/arm/KERNEL | 10 + kernel/arm64/KERNEL | 10 + kernel/arm64/KERNEL.generic | 12 ++ kernel/csky/KERNEL | 10 + kernel/e2k/KERNEL | 10 + kernel/generic/rotm.c | 159 ++++++++++++++++ kernel/ia64/KERNEL | 12 ++ kernel/loongarch64/KERNEL | 12 ++ kernel/loongarch64/KERNEL.generic | 12 ++ kernel/mips/KERNEL | 10 + kernel/mips/KERNEL.generic | 12 ++ kernel/mips64/KERNEL | 12 ++ kernel/mips64/KERNEL.generic | 12 ++ kernel/power/KERNEL | 12 ++ kernel/riscv64/KERNEL | 10 + kernel/riscv64/KERNEL.C910V | 4 + kernel/riscv64/KERNEL.RISCV64_GENERIC | 4 + kernel/riscv64/KERNEL.RISCV64_ZVL128B | 4 + kernel/riscv64/KERNEL.RISCV64_ZVL256B | 4 + kernel/riscv64/KERNEL.x280 | 4 + kernel/riscv64/rotm_rvv.c | 260 ++++++++++++++++++++++++++ kernel/setparam-ref.c | 5 +- kernel/sparc/KERNEL | 11 ++ kernel/x86/KERNEL | 11 ++ kernel/x86/KERNEL.generic | 12 ++ kernel/x86_64/KERNEL | 12 ++ kernel/x86_64/KERNEL.generic | 12 ++ kernel/zarch/KERNEL | 10 + kernel/zarch/KERNEL.ZARCH_GENERIC | 9 + utest/test_rot.c | 36 ++++ 41 files changed, 770 insertions(+), 141 deletions(-) create mode 100644 kernel/generic/rotm.c create mode 100644 kernel/riscv64/rotm_rvv.c diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index efededcf36..2cea6d9e6e 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -79,6 +79,9 @@ macro(SetDefaultL1) SetFallback(CROTKERNEL zrot.S) SetFallback(ZROTKERNEL zrot.S) SetFallback(XROTKERNEL zrot.S) + SetFallback(SROTMKERNEL rotm.S) + SetFallback(DROTMKERNEL rotm.S) + SetFallback(QROTMKERNEL rotm.S) SetFallback(SSCALKERNEL scal.S) SetFallback(DSCALKERNEL scal.S) SetFallback(CSCALKERNEL zscal.S) diff --git a/common_d.h b/common_d.h index 6f4bb2dedc..1e8c33d7a3 100644 --- a/common_d.h +++ b/common_d.h @@ -22,6 +22,7 @@ #define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k +#define DROTM_K drotm_k #define DGEMV_N dgemv_n #define DGEMV_T dgemv_t @@ -180,6 +181,7 @@ #define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k +#define DROTM_K gotoblas -> drotm_k #define DGEMV_N gotoblas -> dgemv_n #define DGEMV_T gotoblas -> dgemv_t diff --git a/common_level1.h b/common_level1.h index d2ed47e567..85b39f7a7c 100644 --- a/common_level1.h +++ b/common_level1.h @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); int drotmg_k(double *, double *, double *, double *, double *); int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); -int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); -int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); -int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); +int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index a924651de2..820cb472a6 100644 --- a/common_macro.h +++ b/common_macro.h @@ -70,6 +70,7 @@ #define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K +#define ROTM_K QROTM_K #define GEMV_N QGEMV_N #define GEMV_T QGEMV_T @@ -361,6 +362,7 @@ #define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K +#define ROTM_K DROTM_K #define GEMV_N DGEMV_N #define GEMV_T DGEMV_T @@ -977,6 +979,7 @@ #define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K +#define ROTM_K SROTM_K #define GEMV_N SGEMV_N #define GEMV_T SGEMV_T diff --git a/common_param.h b/common_param.h index c082d248e8..a3e4cea6b3 100644 --- a/common_param.h +++ b/common_param.h @@ -197,6 +197,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -330,6 +331,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); #endif #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -439,6 +441,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_q.h b/common_q.h index b4ace3a628..1d976f1e8d 100644 --- a/common_q.h +++ b/common_q.h @@ -22,6 +22,7 @@ #define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k +#define QROTM_K qrotm_k #define QGEMV_N qgemv_n #define QGEMV_T qgemv_t @@ -165,6 +166,7 @@ #define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k +#define QROTM_K gotoblas -> qrotm_k #define QGEMV_N gotoblas -> qgemv_n #define QGEMV_T gotoblas -> qgemv_t diff --git a/common_s.h b/common_s.h index fdd80b62f6..7c73902596 100644 --- a/common_s.h +++ b/common_s.h @@ -24,6 +24,7 @@ #define SSCAL_K sscal_k #define SSWAP_K sswap_k #define SROT_K srot_k +#define SROTM_K srotm_k #define SGEMV_N sgemv_n #define SGEMV_T sgemv_t @@ -189,6 +190,7 @@ #define SSCAL_K gotoblas -> sscal_k #define SSWAP_K gotoblas -> sswap_k #define SROT_K gotoblas -> srot_k +#define SROTM_K gotoblas -> srotm_k #define SGEMV_N gotoblas -> sgemv_n #define SGEMV_T gotoblas -> sgemv_t diff --git a/interface/rotm.c b/interface/rotm.c index 9dc08354ac..9ef87da329 100644 --- a/interface/rotm.c +++ b/interface/rotm.c @@ -7,149 +7,21 @@ void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ - blasint n = *N; - blasint incx = *INCX; - blasint incy = *INCY; + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + PRINT_DEBUG_NAME #else void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ -#endif - - blasint i__1, i__2; + PRINT_DEBUG_CNAME; - blasint i__; - FLOAT w, z__; - blasint kx, ky; - FLOAT dh11, dh12, dh22, dh21, dflag; - blasint nsteps; - -#ifndef CBLAS - PRINT_DEBUG_CNAME; -#else - PRINT_DEBUG_CNAME; #endif - --dparam; - --dy; - --dx; - - dflag = dparam[1]; - if (n <= 0 || dflag == - 2.0) goto L140; - - if (! (incx == incy && incx > 0)) goto L70; - - nsteps = n * incx; - if (dflag < 0.) { - goto L50; - } else if (dflag == 0) { - goto L10; - } else { - goto L30; - } -L10: - dh12 = dparam[4]; - dh21 = dparam[3]; - i__1 = nsteps; - i__2 = incx; - for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w + z__ * dh12; - dy[i__] = w * dh21 + z__; -/* L20: */ - } - goto L140; -L30: - dh11 = dparam[2]; - dh22 = dparam[5]; - i__2 = nsteps; - i__1 = incx; - for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w * dh11 + z__; - dy[i__] = -w + dh22 * z__; -/* L40: */ - } - goto L140; -L50: - dh11 = dparam[2]; - dh12 = dparam[4]; - dh21 = dparam[3]; - dh22 = dparam[5]; - i__1 = nsteps; - i__2 = incx; - for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w * dh11 + z__ * dh12; - dy[i__] = w * dh21 + z__ * dh22; -/* L60: */ - } - goto L140; -L70: - kx = 1; - ky = 1; - if (incx < 0) { - kx = (1 - n) * incx + 1; - } - if (incy < 0) { - ky = (1 - n) * incy + 1; - } + ROTM_K(n, dx, incx, dy, incy, dparam); - if (dflag < 0.) { - goto L120; - } else if (dflag == 0) { - goto L80; - } else { - goto L100; - } -L80: - dh12 = dparam[4]; - dh21 = dparam[3]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w + z__ * dh12; - dy[ky] = w * dh21 + z__; - kx += incx; - ky += incy; -/* L90: */ - } - goto L140; -L100: - dh11 = dparam[2]; - dh22 = dparam[5]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w * dh11 + z__; - dy[ky] = -w + dh22 * z__; - kx += incx; - ky += incy; -/* L110: */ - } - goto L140; -L120: - dh11 = dparam[2]; - dh12 = dparam[4]; - dh21 = dparam[3]; - dh22 = dparam[5]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w * dh11 + z__ * dh12; - dy[ky] = w * dh21 + z__ * dh22; - kx += incx; - ky += incy; -/* L130: */ - } -L140: return; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 74e6760c27..bc713e6033 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -125,6 +125,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") endif () if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") @@ -148,6 +149,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") @@ -1105,6 +1107,7 @@ endif () GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 09337363da..6e864e3d8e 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -336,6 +336,18 @@ ifndef XROTKERNEL XROTKERNEL = zrot.S endif +ifndef SROTMKERNEL +SROTMKERNEL = rotm.S +endif + +ifndef DROTMKERNEL +DROTMKERNEL = rotm.S +endif + +ifndef QROTMKERNEL +QROTMKERNEL = rotm.S +endif + ### SCAL ### ifndef SSCALKERNEL @@ -504,14 +516,14 @@ SBLASOBJS += \ sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ - saxpby_k$(TSUFFIX).$(SUFFIX) + saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ @@ -841,6 +853,12 @@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ +$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL index 01734bf9c5..42ae595aa9 100644 --- a/kernel/alpha/KERNEL +++ b/kernel/alpha/KERNEL @@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL index aeccfbf4c8..a6ad0bf028 100644 --- a/kernel/arm/KERNEL +++ b/kernel/arm/KERNEL @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index 7d7e648c48..05d95683dc 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -45,4 +45,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm64/KERNEL.generic b/kernel/arm64/KERNEL.generic index 838adb05ab..65c301e686 100644 --- a/kernel/arm64/KERNEL.generic +++ b/kernel/arm64/KERNEL.generic @@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL index afa8a08817..0302057a2a 100644 --- a/kernel/csky/KERNEL +++ b/kernel/csky/KERNEL @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL index afa8a08817..0302057a2a 100644 --- a/kernel/e2k/KERNEL +++ b/kernel/e2k/KERNEL @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/generic/rotm.c b/kernel/generic/rotm.c new file mode 100644 index 0000000000..e151aa5f88 --- /dev/null +++ b/kernel/generic/rotm.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) +{ + BLASLONG i__1, i__2; + BLASLONG i__; + FLOAT w, z__; + BLASLONG kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + BLASLONG nsteps; + + --dparam; + --dy; + --dx; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (! (incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += incx; + ky += incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += incx; + ky += incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += incx; + ky += incy; +/* L130: */ + } +L140: + return(0); +} diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 870aac473e..bbfec7d556 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -142,3 +142,15 @@ ZTRSMKERNEL_RT = ztrsm_kernel_RT.S CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL index e5d145a718..46d8daaa96 100644 --- a/kernel/loongarch64/KERNEL +++ b/kernel/loongarch64/KERNEL @@ -236,3 +236,15 @@ ZGEMM3MKERNEL = zgemm3m_kernel.S endif DSDOTKERNEL = dot.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index 213add9ee5..b2e4cb44ad 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -169,3 +169,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips/KERNEL b/kernel/mips/KERNEL index aeccfbf4c8..a6ad0bf028 100644 --- a/kernel/mips/KERNEL +++ b/kernel/mips/KERNEL @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic index 17f2ef976b..1f03c65942 100644 --- a/kernel/mips/KERNEL.generic +++ b/kernel/mips/KERNEL.generic @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 54939a9efe..2ebd8a5bda 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -199,3 +199,15 @@ endif ifndef IQMAXKERNEL IQMAXKERNEL = imax.S endif + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips64/KERNEL.generic b/kernel/mips64/KERNEL.generic index 17f2ef976b..1f03c65942 100644 --- a/kernel/mips64/KERNEL.generic +++ b/kernel/mips64/KERNEL.generic @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index 9070450f4b..45fe0dd292 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -73,3 +73,15 @@ endif ifndef IQMAXKERNEL IQMAXKERNEL = imax.S endif + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 68d68b5f86..cd94052035 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 2798a870ed..666b3cc5e9 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -71,6 +71,10 @@ DROTKERNEL = rot_vector.c CROTKERNEL = zrot_vector.c ZROTKERNEL = zrot_vector.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_vector.c DSCALKERNEL = scal_vector.c CSCALKERNEL = zscal_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 67f81cacda..cf7d15d36f 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -71,6 +71,10 @@ DROTKERNEL = ../riscv64/rot.c CROTKERNEL = ../riscv64/zrot.c ZROTKERNEL = ../riscv64/zrot.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = ../riscv64/scal.c DSCALKERNEL = ../riscv64/scal.c CSCALKERNEL = ../riscv64/zscal.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index fec69ee094..7fbc26d213 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -71,6 +71,10 @@ DROTKERNEL = rot_rvv.c CROTKERNEL = zrot_rvv.c ZROTKERNEL = zrot_rvv.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_rvv.c DSCALKERNEL = scal_rvv.c CSCALKERNEL = zscal_rvv.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index d8690682f4..9915fd9496 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -66,6 +66,10 @@ DROTKERNEL = rot_vector.c CROTKERNEL = zrot_vector.c ZROTKERNEL = zrot_vector.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_vector.c DSCALKERNEL = scal_vector.c CSCALKERNEL = zscal_vector.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index e909ca9599..18515e812f 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -98,6 +98,10 @@ DROTKERNEL = rot_rvv.c CROTKERNEL = zrot_rvv.c ZROTKERNEL = zrot_rvv.c +SROTMKERNEL = rotm_rvv.c +DROTMKERNEL = rotm_rvv.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_rvv.c DSCALKERNEL = scal_rvv.c CSCALKERNEL = zscal_rvv.c diff --git a/kernel/riscv64/rotm_rvv.c b/kernel/riscv64/rotm_rvv.c new file mode 100644 index 0000000000..49605666fd --- /dev/null +++ b/kernel/riscv64/rotm_rvv.c @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) +{ + BLASLONG i__1, i__2; + BLASLONG kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + BLASLONG nsteps; + + --dparam; + --dy; + --dx; + + FLOAT_V_T v_w, v_z__, v_dx, v_dy; + BLASLONG stride, stride_x, stride_y, offset; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (!(incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + if(i__2 < 0){ + offset = i__1 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__2 * sizeof(FLOAT); + n = i__1 / i__2; + for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); + v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); + + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + if(i__1 < 0){ + offset = i__2 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__1 * sizeof(FLOAT); + n = i__2 / i__1; + for (size_t vl; n > 0; n -= vl, dx += vl*i__1, dy += vl*i__1) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); + v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); + + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + if(i__2 < 0){ + offset = i__1 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__2 * sizeof(FLOAT); + n = i__1 / i__2; + for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMULVF_FLOAT(v_w, dh11, vl); + v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + + v_dy = VFMULVF_FLOAT(v_w, dh21, vl); + v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); + v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); + + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); + v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); + + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMULVF_FLOAT(v_w, dh11, vl); + v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + + v_dy = VFMULVF_FLOAT(v_w, dh21, vl); + v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } +L140: + return(0); +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fa61a209e1..09b148b3e5 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, - srot_kTS, saxpy_kTS, + srot_kTS, saxpy_kTS, srotm_kTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) sscal_kTS, @@ -260,6 +260,7 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) drot_kTS, + drotm_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, @@ -334,7 +335,7 @@ gotoblas_t TABLE_NAME = { qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, - + qrotm_kTS, qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qgemm_incopyTS, qgemm_itcopyTS, diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index a8c958bb45..d6580609bd 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -75,3 +75,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 83b51db13f..1095c15286 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -189,3 +189,14 @@ ZGEMM_BETA = ../generic/zgemm_beta.c QGEMM_BETA = ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 0aac0ce996..ada3ff42d4 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -162,3 +162,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 2deb5a864c..c270ff0771 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -290,6 +290,18 @@ ifndef QROTKERNEL QROTKERNEL = rot.S endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif + ifndef CROTKERNEL CROTKERNEL = zrot_sse.S endif diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 7cb0cb836c..36dc9f43d2 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -168,3 +168,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/zarch/KERNEL b/kernel/zarch/KERNEL index 68d68b5f86..cd94052035 100644 --- a/kernel/zarch/KERNEL +++ b/kernel/zarch/KERNEL @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 33850d0f7d..6321cf6e39 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -135,5 +135,14 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/utest/test_rot.c b/utest/test_rot.c index 03776586b0..e4ba44a034 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -70,6 +70,24 @@ CTEST(rot,drot_inc_1) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +CTEST(rot,drotm_inc_1) +{ + blasint i = 0; + blasint N = 12, incX = 1, incY = 1; + double param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + double x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; + double y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; + + //OpenBLAS + BLASFUNC(drotm)(&N, x_actual, &incX, y_actual, &incY, param); + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], DOUBLE_EPS); + } +} #endif #ifdef BUILD_COMPLEX16 @@ -130,6 +148,24 @@ CTEST(rot,srot_inc_1) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +CTEST(rot,srotm_inc_1) +{ + blasint i = 0; + blasint N = 12, incX = 1, incY = 1; + float param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + float x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + float y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + float x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; + float y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; + + //OpenBLAS + BLASFUNC(srotm)(&N, x_actual, &incX, y_actual, &incY, param); + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], SINGLE_EPS); + } +} #endif #ifdef BUILD_COMPLEX From b58cba9eb6e32f3abae6c2f5a712039c6cca54de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 15:51:49 +0100 Subject: [PATCH 3/9] fix qrotm build rules --- kernel/Makefile.L1 | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 6e864e3d8e..0fc6720944 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -530,7 +530,7 @@ QBLASOBJS += \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ - qsum_k$(TSUFFIX).$(SUFFIX) + qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ @@ -853,14 +853,17 @@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ +$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + $(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ +$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ From 4924319c508bbf72bd0ff9d56a5deff5fb58f31b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 16:07:35 +0100 Subject: [PATCH 4/9] fix position of srotm, qrotm --- kernel/setparam-ref.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 09b148b3e5..3ed45697ee 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -72,9 +72,9 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, - srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + srot_kTS, srotm_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sbgemv_nTS, sbgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, @@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, - srot_kTS, saxpy_kTS, srotm_kTS, + srot_kTS, srotm_kTS, saxpy_kTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) sscal_kTS, @@ -332,10 +332,9 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, - qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, + qrot_kTS, qrotm_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, - qrotm_kTS, qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qgemm_incopyTS, qgemm_itcopyTS, From 111c9b0733008175bcfdccc5e3329ac96b314c69 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 19:51:43 +0100 Subject: [PATCH 5/9] Add translations for C_COMPILER and OSNAME --- cmake/utils.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9befc9a3c4..a93f21686f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -16,6 +16,14 @@ endfunction () macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (OSNAME ${CMAKE_SYSTEM_NAME}) + if (${C_COMPILER} MATCHES Clang) + set (C_COMPILER CLANG) + endif () + if (${OSNAME} STREQUAL Windows) + set (OSNAME WINNT) + endif () +message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) set (IfElse 0) set (ElseSeen 0) set (SkipIfs 0) From 1a6a9fb22f66cacabe620bd5be897833ecfaaded Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jan 2025 00:17:04 +0100 Subject: [PATCH 6/9] add another generator line for rotm --- kernel/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index bc713e6033..b43cda2c14 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) From d1bfa979f7830ddf799cde43cfc6ae22000c4c52 Mon Sep 17 00:00:00 2001 From: Deeksha Goplani Date: Thu, 23 Jan 2025 09:41:45 +0530 Subject: [PATCH 7/9] small gemm kernel packing modifications --- kernel/arm64/dgemm_small_kernel_tn_sve.c | 2 +- kernel/arm64/dgemm_small_kernel_tt_sve.c | 2 +- kernel/arm64/sgemm_small_kernel_tn_sve.c | 2 +- kernel/arm64/sgemm_small_kernel_tt_sve.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/dgemm_small_kernel_tn_sve.c b/kernel/arm64/dgemm_small_kernel_tn_sve.c index 2ef23d7ee4..8419e50655 100644 --- a/kernel/arm64/dgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c @@ -213,7 +213,7 @@ CNAME(BLASLONG M, const BLASLONG n2 = N & -2; const BLASLONG n8 = N & -8; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/dgemm_small_kernel_tt_sve.c b/kernel/arm64/dgemm_small_kernel_tt_sve.c index efe11a9f9b..0f06b4ecbd 100644 --- a/kernel/arm64/dgemm_small_kernel_tt_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tt_sve.c @@ -219,7 +219,7 @@ CNAME(BLASLONG M, const BLASLONG n4 = N & -4; const BLASLONG n2 = N & -2; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/sgemm_small_kernel_tn_sve.c b/kernel/arm64/sgemm_small_kernel_tn_sve.c index 1146409504..c874af4005 100644 --- a/kernel/arm64/sgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tn_sve.c @@ -222,7 +222,7 @@ CNAME(BLASLONG M, const BLASLONG n8 = N & -8; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/sgemm_small_kernel_tt_sve.c b/kernel/arm64/sgemm_small_kernel_tt_sve.c index 731c9861b8..b29e3e46b5 100644 --- a/kernel/arm64/sgemm_small_kernel_tt_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tt_sve.c @@ -223,7 +223,7 @@ CNAME(BLASLONG M, const BLASLONG n8 = N & -8; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; From 1ebcbdbab35e0b027e06347c241a38fb61adbf82 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 23 Jan 2025 09:08:42 +0000 Subject: [PATCH 8/9] LoongArch64: Fixed the issue of using the old-style TARGET in cmake builds --- cmake/system.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7413c88c80..9c437fc995 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -21,7 +21,15 @@ endif() # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? # It seems we are meant to use TARGET as input and CORE internally as kernel. if(NOT DEFINED CORE AND DEFINED TARGET) - set(CORE ${TARGET}) + if (${TARGET} STREQUAL "LOONGSON3R5") + set(CORE "LA464") + elseif (${TARGET} STREQUAL "LOONGSON2K1000") + set(CORE "LA264") + elseif (${TARGET} STREQUAL "LOONGSONGENERIC") + set(CORE "LA64_GENERIC)") + else () + set(CORE ${TARGET}) + endif() endif() # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. From 9faebb3c974ce3665f879e5af8df0b6016e140fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jan 2025 17:59:45 +0100 Subject: [PATCH 9/9] fix lost indentation in the rules for the thread safety test --- Makefile.install | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index cd1dcdabcb..10e6425cce 100644 --- a/Makefile.install +++ b/Makefile.install @@ -315,8 +315,8 @@ endif endif ifeq ($(CPP_THREAD_SAFETY_TEST), 1) -@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) -@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) endif endif