diff --git a/kernel/loongarch64/dsymv_L_lasx.S b/kernel/loongarch64/dsymv_L_lasx.S
index 2259966d86..a36cff9a93 100644
--- a/kernel/loongarch64/dsymv_L_lasx.S
+++ b/kernel/loongarch64/dsymv_L_lasx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M      $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2     $r28
 #define T3     $r29
 #define T4     $r30
+#define T5     $r17
+#define T6     $r16
 
 /* LSX vectors */
 #define U0     $xr31
@@ -87,10 +90,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8     $f8
 #define a9     $f9
 
+.macro LOAD_Y_8
+    beqz       T5,    .L01_Y_0
+    add.d      T2,    IY,    INCY
+    fldx.d     $f4,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f5,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f6,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f7,   Y,     T2
 
-    PROLOGUE
+    add.d      T2,    T2,    INCY
+    fldx.d     $f8,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f9,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f10,  Y,     T2
+    add.d      T2,    T2,    INCY
+    fldx.d     $f11,  Y,     T2
+
+    vextrins.d $vr4,  $vr5,  0x10
+    vextrins.d $vr6,  $vr7,  0x10
+    xvpermi.q  U4,    U6,    0x02
 
-    LDARG BUFFER, $sp, 0
+    vextrins.d $vr8,  $vr9,  0x10
+    vextrins.d $vr10, $vr11, 0x10
+    xvpermi.q  U8,    U10,   0x02
+    b          .L01_Y_1
+.L01_Y_0:
+    add.d      T3,    IY,    INCY
+    xvldx      U4,    Y,     T3
+    alsl.d     T4,    INCY,  T3,   2
+    xvldx      U8,    Y,     T4
+.L01_Y_1:
+.endm
+
+.macro LOAD_X_8
+    beqz       T6,    .L01_X_0
+    add.d      T2,    IX,    INCX
+    fldx.d     $f4,   X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f5,   X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f6,   X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f7,   X,     T2
+
+    add.d      T2,    T2,    INCX
+    fldx.d     $f8,   X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f9,   X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f10,  X,     T2
+    add.d      T2,    T2,    INCX
+    fldx.d     $f11,  X,     T2
+
+    vextrins.d $vr4,  $vr5,  0x10
+    vextrins.d $vr6,  $vr7,  0x10
+    xvpermi.q  U4,    U6,    0x02
+
+    vextrins.d $vr8,  $vr9,  0x10
+    vextrins.d $vr10, $vr11, 0x10
+    xvpermi.q  U8,    U10,   0x02
+    b          .L01_X_1
+.L01_X_0:
+    add.d      T3,    IX,    INCX
+    xvldx      U4,    X,     T3
+    alsl.d     T2,    INCX,  T3,   2
+    xvldx      U8,    X,     T2
+.L01_X_1:
+.endm
+
+.macro STORE_Y_8
+    beqz       T5,    .L01_Y_2
+    xvpermi.d  U6,    U4,    0xee
+    vextrins.d $vr5,  $vr4,  0x01
+    vextrins.d $vr7,  $vr6,  0x01
+
+    xvpermi.d  U10,   U8,    0xee
+    vextrins.d $vr9,  $vr8,  0x01
+    vextrins.d $vr11, $vr10, 0x01
+
+    add.d      T2,    IY,    INCY
+    fstx.d     $f4,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f5,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f6,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f7,   Y,     T2
+
+    add.d      T2,    T2,    INCY
+    fstx.d     $f8,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f9,   Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f10,  Y,     T2
+    add.d      T2,    T2,    INCY
+    fstx.d     $f11,  Y,     T2
+    b          .L01_Y_3
+.L01_Y_2:
+    xvstx      U4,    Y,     T3
+    xvstx      U8,    Y,     T4
+.L01_Y_3:
+.endm
+
+    PROLOGUE
 
     addi.d     $sp,   $sp,   -88
@@ -107,6 +213,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d VALPHA, $sp,  80
 
+    addi.d     T5,    INCY,  -1
+    addi.d     T6,    INCX,  -1
     slli.d     LDA,   LDA,   BASE_SHIFT
     slli.d     INCX,  INCX,  BASE_SHIFT
     slli.d     INCY,  INCY,  BASE_SHIFT
@@ -122,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq        J,     N,     .L999
 
 .L01:
-    MTC        a2,    $r0          //temp2
+    xvxor.v    U2,    U2,    U2
     fldx.d     a6,    X,     JX
     fmul.d     a3,    ALPHA, a6    //temp1
     xvreplve0.d  U3,  U3
-    xvreplve0.d  U2,  U2
 
     mul.d      T0,    J,     LDA
     slli.d     T1,    J,     BASE_SHIFT
@@ -147,126 +254,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     srai.d     T0,    T0,    3
     add.d      T0,    T0,    J
     addi.d     T0,    T0,    1
-    beq        I,     T0,    .L03
-    bge        I,     T0,    .L03
+    beq        I,     T0,    .L03
+    bge        I,     T0,    .L03
 
     mul.d      T1,    J,     LDA
     add.d      T1,    T1,    II
 
 .L02: /* /8 */
     xvldx      U1,    AO1,   T1
-    addi.d     T1,    T1,    32
-    xvldx      U14,   AO1,   T1
-    addi.d     T1,    T1,    32
+    addi.d     T2,    T1,    32
+    xvldx      U14,   AO1,   T2
 
-    add.d      T2,    IY,    INCY
-    fldx.d     $f4,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f5,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f6,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f7,   Y,     T2
-
-    add.d      T2,    T2,    INCY
-    fldx.d     $f8,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f9,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f10,  Y,     T2
-    add.d      T2,    T2,    INCY
-    fldx.d     $f11,  Y,     T2
-
-    vextrins.d $vr4,  $vr5,  0x10
-    vextrins.d $vr6,  $vr7,  0x10
-    xvpermi.q  U4,    U6,    0x02
-
-    vextrins.d $vr8,  $vr9,  0x10
-    vextrins.d $vr10, $vr11, 0x10
-    xvpermi.q  U8,    U10,   0x02
+    LOAD_Y_8
 
     xvfmadd.d  U4,    U3,    U1,    U4
     xvfmadd.d  U8,    U3,    U14,   U8
 
-    xvpermi.d  U6,    U4,    0xee
-    vextrins.d $vr5,  $vr4,  0x01
-    vextrins.d $vr7,  $vr6,  0x01
-
-    xvpermi.d  U10,   U8,    0xee
-    vextrins.d $vr9,  $vr8,  0x01
-    vextrins.d $vr11, $vr10, 0x01
-
-    add.d      T2,    IY,    INCY
-    fstx.d     $f4,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f5,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f6,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f7,   Y,     T2
-
-    add.d      T2,    T2,    INCY
-    fstx.d     $f8,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f9,   Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f10,  Y,     T2
-    add.d      T2,    T2,    INCY
-    fstx.d     $f11,  Y,     T2
-
-    slli.d     T2,    INCY,  3
-    add.d      IY,    IY,    T2
-
-    add.d      T2,    IX,    INCX
-    fldx.d     $f4,   X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f5,   X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f6,   X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f7,   X,     T2
-
-    add.d      T2,    T2,    INCX
-    fldx.d     $f8,   X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f9,   X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f10,  X,     T2
-    add.d      T2,    T2,    INCX
-    fldx.d     $f11,  X,     T2
-
-    vextrins.d $vr4,  $vr5,  0x10
-    vextrins.d $vr6,  $vr7,  0x10
-    xvpermi.q  U4,    U6,    0x02
-
-    vextrins.d $vr8,  $vr9,  0x10
-    vextrins.d $vr10, $vr11, 0x10
-    xvpermi.q  U8,    U10,   0x02
-
-    xvand.v    $xr12, $xr2,  $xr2
-
-    xvfmadd.d  U2,    U1,    U4,    U2
-    xvfsub.d   U2,    U2,    $xr12
-    xvfmadd.d  U2,    U14,   U8,    U2
+    STORE_Y_8
 
-    xvpermi.d  U4,    U2,    0x01
-    xvpermi.d  U5,    U2,    0x02
-    xvpermi.d  U6,    U2,    0x03
+    alsl.d     IY,    INCY,  IY,    3
 
-    fadd.d     $f2,   $f2,   $f4
-    fadd.d     $f2,   $f2,   $f5
-    fadd.d     $f2,   $f2,   $f6
-    fadd.d     $f2,   $f2,   $f12
+    LOAD_X_8
 
-    xvreplve0.d  U2,  U2
+    xvfmadd.d  U2,    U1,    U4,    U2
+    xvfmadd.d  U2,    U14,   U8,    U2
 
-    slli.d     T2,    INCX,  3
-    add.d      IX,    IX,    T2
+    alsl.d     IX,    INCX,  IX,    3
 
+    addi.d     T1,    T1,    64
     addi.d     II,    II,    64
     addi.d     I,     I,     1
     blt        I,     T0,    .L02
+    //Acc U2
+    GACC       xvf, d, U4,   U2
+    fmov.d     $f2,   $f4
 
 .L03: /* &4 */
     sub.d      T0,    M,     J
     addi.d     T0,    T0,    -1
@@ -437,4 +459,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d     $sp,   $sp,   88
 
     jirl       $r0,   $r1,   0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/dsymv_U_lasx.S b/kernel/loongarch64/dsymv_U_lasx.S
index 57eb90aaef..892c5ed2fa 100644
--- a/kernel/loongarch64/dsymv_U_lasx.S
+++ b/kernel/loongarch64/dsymv_U_lasx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M      $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2     $r28
 #define T3     $r29
 #define T4     $r30
+#define T5     $r17
+#define T6     $r16
 
 /* LSX vectors */
 #define U0     $xr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8     $f8
 #define a9     $f9
-
-    PROLOGUE
-
-    LDARG BUFFER, $sp, 0
-
-    addi.d     $sp,   $sp,   -88
-
-    SDARG      $r23,  $sp,   0
-    SDARG      $r24,  $sp,   8
-    SDARG      $r25,  $sp,   16
-    SDARG      $r26,  $sp,   32
-    SDARG      $r27,  $sp,   40
-    SDARG      $r28,  $sp,   48
-    SDARG      $r29,  $sp,   56
-    SDARG      $r30,  $sp,   64
-    SDARG      $r31,  $sp,   72
-    ST         ALPHA, $sp,   80
-
-    xvldrepl.d VALPHA, $sp,  80
-
-    slli.d     LDA,   LDA,   BASE_SHIFT
-    slli.d     INCX,  INCX,  BASE_SHIFT
-    slli.d     INCY,  INCY,  BASE_SHIFT
-
-    bge        $r0,   M,     .L999
-    bge        $r0,   N,     .L999
-
-    sub.d      M1,    M,     N
-
-    mul.d      JY,    M1,    INCY
-    mul.d      JX,    M1,    INCX
-
-    move       J,     M1
-    move       AO1,   A
-
-    beq        J,     M,     .L999
-
-.L01:
-    MTC        $f2,   $r0          //temp2
-    fldx.d     $f6,   X,     JX
-    fmul.d     $f3,   ALPHA, $f6   //temp1
-    xvreplve0.d  U3,  U3
-    xvreplve0.d  U2,  U2
-
-    move       IY,    $r0
-    move       IX,    $r0
-    move       II,    $r0
-    move       I,     $r0
-
-    srai.d     T0,    J,     3
-    beq        I,     T0,    .L03
-
-    mul.d      T1,    J,     LDA
-    add.d      T1,    T1,    II
-
-.L02: /* /8 */
-    xvldx      U1,    AO1,   T1
-    addi.d     T1,    T1,    32
-    xvldx      U14,   AO1,   T1
-    addi.d     T1,    T1,    32
-
+.macro LOAD_Y_8
+    beqz       T5,    .L01_Y_0
     fldx.d     $f4,   Y,     IY
     add.d      T2,    IY,    INCY
     fldx.d     $f5,   Y,     T2
@@ -167,20 +111,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     vextrins.d $vr4,  $vr5,  0x10
     vextrins.d $vr6,  $vr7,  0x10
-    xvpermi.q  U4,    U6,    0x02
+    xvpermi.q  U4,    U6,    0x02
 
     vextrins.d $vr8,  $vr9,  0x10
     vextrins.d $vr10, $vr11, 0x10
-    xvpermi.q  U8,    U10,   0x02
-
-    xvfmadd.d  U4,    U3,    U1,    U4
-    xvfmadd.d  U8,    U3,    U14,   U8
-
-    xvpermi.d  U6,    U4,    0xee
+    xvpermi.q  U8,    U10,   0x02
+    b          .L01_Y_1
+.L01_Y_0:
+    xvldx      U4,    Y,     IY
+    alsl.d     T4,    INCY,  IY,   2
+    xvldx      U8,    Y,     T4
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz       T5,    .L01_Y_2
+    xvpermi.d  U6,    U4,    0xee
     vextrins.d $vr5,  $vr4,  0x01
     vextrins.d $vr7,  $vr6,  0x01
 
-    xvpermi.d  U10,   U8,    0xee
+    xvpermi.d  U10,   U8,    0xee
     vextrins.d $vr9,  $vr8,  0x01
     vextrins.d $vr11, $vr10, 0x01
@@ -200,10 +150,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fstx.d     $f10,  Y,     T2
     add.d      T2,    T2,    INCY
     fstx.d     $f11,  Y,     T2
-
-    slli.d     T2,    INCY,  3
-    add.d      IY,    IY,    T2
-
+    b          .L01_Y_3
+.L01_Y_2:
+    xvstx      U4,    Y,     IY
+    xvstx      U8,    Y,     T4
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz       T6,    .L01_X_0
     fldx.d     $f4,   X,     IX
     add.d      T2,    IX,    INCX
     fldx.d     $f5,   X,     T2
@@ -223,36 +178,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.d $vr4,  $vr5,  0x10
     vextrins.d $vr6,  $vr7,  0x10
-    xvpermi.q  U4,    U6,    0x02
+    xvpermi.q  U4,    U6,    0x02
 
     vextrins.d $vr8,  $vr9,  0x10
     vextrins.d $vr10, $vr11, 0x10
-    xvpermi.q  U8,    U10,   0x02
+    xvpermi.q  U8,    U10,   0x02
+    b          .L01_X_1
+.L01_X_0:
+    xvldx      U4,    X,     IX
+    alsl.d     T2,    INCX,  IX,   2
+    xvldx      U8,    X,     T2
+.L01_X_1:
+.endm
 
-    xvand.v    $xr12, $xr2,  $xr2
+    PROLOGUE
 
-    xvfmadd.d  U2,    U1,    U4,    U2
-    xvfsub.d   U2,    U2,    $xr12
-    xvfmadd.d  U2,    U14,   U8,    U2
+    addi.d     $sp,   $sp,   -88
 
-    xvpermi.d  U4,    U2,    0x01
-    xvpermi.d  U5,    U2,    0x02
-    xvpermi.d  U6,    U2,    0x03
+    SDARG      $r23,  $sp,   0
+    SDARG      $r24,  $sp,   8
+    SDARG      $r25,  $sp,   16
+    SDARG      $r26,  $sp,   32
+    SDARG      $r27,  $sp,   40
+    SDARG      $r28,  $sp,   48
+    SDARG      $r29,  $sp,   56
+    SDARG      $r30,  $sp,   64
+    SDARG      $r31,  $sp,   72
+    ST         ALPHA, $sp,   80
 
-    fadd.d     $f2,   $f2,   $f4
-    fadd.d     $f2,   $f2,   $f5
-    fadd.d     $f2,   $f2,   $f6
-    fadd.d     $f2,   $f2,   $f12
+    xvldrepl.d VALPHA, $sp,  80
 
-    xvreplve0.d  U2,  U2
+    addi.d     T5,    INCY,  -1
+    addi.d     T6,    INCX,  -1
+    slli.d     LDA,   LDA,   BASE_SHIFT
+    slli.d     INCX,  INCX,  BASE_SHIFT
+    slli.d     INCY,  INCY,  BASE_SHIFT
 
-    slli.d     T2,    INCX,  3
-    add.d      IX,    IX,    T2
+    bge        $r0,   M,     .L999
+    bge        $r0,   N,     .L999
+
+    sub.d      M1,    M,     N
+
+    mul.d      JY,    M1,    INCY
+    mul.d      JX,    M1,    INCX
+
+    move       J,     M1
+    move       AO1,   A
+    beq        J,     M,     .L999
+
+.L01:
+    xvxor.v    U2,    U2,    U2
+    fldx.d     $f6,   X,     JX
+    fmul.d     $f3,   ALPHA, $f6   //temp1
+    xvreplve0.d  U3,  U3
+
+    move       IY,    $r0
+    move       IX,    $r0
+    move       II,    $r0
+    move       I,     $r0
+
+    srai.d     T0,    J,     3
+    beq        I,     T0,    .L03
+
+    mul.d      T1,    J,     LDA
+    add.d      T1,    T1,    II
+
+.L02: /* /8 */
+    xvldx      U1,    AO1,   T1
+    addi.d     T2,    T1,    32
+    xvldx      U14,   AO1,   T2
+
+    LOAD_Y_8
+
+    xvfmadd.d  U4,    U3,    U1,    U4
+    xvfmadd.d  U8,    U3,    U14,   U8
+
+    STORE_Y_8
+
+    alsl.d     IY,    INCY,  IY,    3
+
+    LOAD_X_8
+
+    xvfmadd.d  U2,    U1,    U4,    U2
+    xvfmadd.d  U2,    U14,   U8,    U2
+
+    alsl.d     IX,    INCX,  IX,    3
+
+    addi.d     T1,    T1,    64
     addi.d     II,    II,    64
     addi.d     I,     I,     1
     blt        I,     T0,    .L02
+    //Acc U2
+    GACC       xvf, d, U4,   U2
+    fmov.d     $f2,   $f4
+
 .L03: /* &4 */
     andi       T0,    J,     4
     beq        $r0,   T0,    .L04
@@ -425,4 +446,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d     $sp,   $sp,   88
 
     jirl       $r0,   $r1,   0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/ssymv_L_lasx.S b/kernel/loongarch64/ssymv_L_lasx.S
index 980c10fd74..81796883d7 100644
--- a/kernel/loongarch64/ssymv_L_lasx.S
+++ b/kernel/loongarch64/ssymv_L_lasx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M      $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2     $r28
 #define T3     $r29
 #define T4     $r30
+#define T5     $r17
+#define T6     $r16
 
 /* LSX vectors */
 #define U0     $xr31
@@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8     $f8
 #define a9     $f9
-
-    PROLOGUE
-
-    LDARG BUFFER, $sp, 0
-
-    addi.d     $sp,   $sp,   -88
-
-    SDARG      $r23,  $sp,   0
-    SDARG      $r24,  $sp,   8
-    SDARG      $r25,  $sp,   16
-    SDARG      $r26,  $sp,   32
-    SDARG      $r27,  $sp,   40
-    SDARG      $r28,  $sp,   48
-    SDARG      $r29,  $sp,   56
-    SDARG      $r30,  $sp,   64
-    SDARG      $r31,  $sp,   72
-    ST         ALPHA, $sp,   80
-
-    xvldrepl.w VALPHA, $sp,  80
-
-    slli.d     LDA,   LDA,   BASE_SHIFT
-    slli.d     INCX,  INCX,  BASE_SHIFT
-    slli.d     INCY,  INCY,  BASE_SHIFT
-
-    bge        $r0,   M,     .L999
-    bge        $r0,   N,     .L999
-
-    move       J,     $r0
-    move       JY,    $r0
-    move       JX,    $r0
-    move       AO1,   A
-
-    beq        J,     N,     .L999
-
-.L01:
-    MTC        a2,    $r0          //temp2
-    fldx.s     a6,    X,     JX
-    fmul.s     a3,    ALPHA, a6    //temp1
-    xvreplve0.w  U3,  U3
-    xvreplve0.w  U2,  U2
-
-    mul.w      T0,    J,     LDA
-    slli.d     T1,    J,     BASE_SHIFT
-    add.w      T0,    T0,    T1
-    fldx.s     a6,    AO1,   T0
-    fldx.s     a4,    Y,     JY
-    fmadd.s    a4,    a3,    a6,    a4
-    fstx.s     a4,    Y,     JY
-
-    move       IY,    JY
-    move       IX,    JX
-    addi.d     II,    J,     1
-    move       I,     II
-    slli.d     II,    II,    BASE_SHIFT
-
-    sub.d      T0,    M,     J
-    addi.d     T0,    T0,    -1
-    srai.d     T0,    T0,    3
-    add.d      T0,    T0,    J
-    addi.d     T0,    T0,    1
-    beq        I,     T0,    .L03
-    bge        I,     T0,    .L03
-
-    mul.w      T1,    J,     LDA
-    add.d      T1,    T1,    II
-
-.L02: /* /8 */
-    xvldx      U1,    AO1,   T1
-
+.macro LOAD_Y_8
+    beqz       T5,    .L01_Y_0
     add.d      T2,    IY,    INCY
     fldx.s     $f4,   Y,     T2
     add.d      T2,    T2,    INCY
@@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8,  $vr9,  0x10
     vextrins.w $vr8,  $vr10, 0x20
     vextrins.w $vr8,  $vr11, 0x30
-    xvpermi.q  U4,    U8,    0x02
-
-    xvfmadd.s  U4,    U3,    U1,    U4
-
-    xvpermi.d  U8,    U4,    0xee
+    xvpermi.q  U4,    U8,    0x02
+    b          .L01_Y_1
+.L01_Y_0:
+    add.d      T3,    IY,    INCY
+    xvldx      U4,    Y,     T3
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz       T5,    .L01_Y_2
+    xvpermi.d  U8,    U4,    0xee
     vextrins.w $vr5,  $vr4,  0x01
     vextrins.w $vr6,  $vr4,  0x02
     vextrins.w $vr7,  $vr4,  0x03
@@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fstx.s     $f10,  Y,     T2
     add.d      T2,    T2,    INCY
     fstx.s     $f11,  Y,     T2
-
-    slli.d     T2,    INCY,  3
-    add.d      IY,    IY,    T2
-
+    b          .L01_Y_3
+.L01_Y_2:
+    xvstx      U4,    Y,     T3
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz       T6,    .L01_X_0
     add.d      T2,    IX,    INCX
     fldx.s     $f4,   X,     T2
     add.d      T2,    T2,    INCX
@@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8,  $vr10, 0x20
     vextrins.w $vr8,  $vr11, 0x30
     xvpermi.q  U4,    U8,    0x02
+    b          .L01_X_1
+.L01_X_0:
+    add.d      T3,    IX,    INCX
+    xvldx      U4,    X,     T3
+.L01_X_1:
+.endm
+
+    PROLOGUE
 
-    xvand.v    $xr12, $xr2,  $xr2
+    addi.d     $sp,   $sp,   -88
 
-    xvfmadd.s  U2,    U1,    U4,    U2
-    xvfsub.s   U2,    U2,    $xr12
+    SDARG      $r23,  $sp,   0
+    SDARG      $r24,  $sp,   8
+    SDARG      $r25,  $sp,   16
+    SDARG      $r26,  $sp,   32
+    SDARG      $r27,  $sp,   40
+    SDARG      $r28,  $sp,   48
+    SDARG      $r29,  $sp,   56
+    SDARG      $r30,  $sp,   64
+    SDARG      $r31,  $sp,   72
+    ST         ALPHA, $sp,   80
 
-    xvpickve.w U4,    U2,    0x01
-    xvpickve.w U5,    U2,    0x02
-    xvpickve.w U6,    U2,    0x03
-    xvpickve.w U7,    U2,    0x04
-    xvpickve.w U8,    U2,    0x05
-    xvpickve.w U9,    U2,    0x06
-    xvpickve.w U10,   U2,    0x07
+    xvldrepl.w VALPHA, $sp,  80
 
-    fadd.s     $f2,   $f2,   $f4
-    fadd.s     $f2,   $f2,   $f5
-    fadd.s     $f2,   $f2,   $f6
-    fadd.s     $f2,   $f2,   $f7
-    fadd.s     $f2,   $f2,   $f8
-    fadd.s     $f2,   $f2,   $f9
-    fadd.s     $f2,   $f2,   $f10
-    fadd.s     $f2,   $f2,   $f12
+    addi.d     T5,    INCY,  -1
+    addi.d     T6,    INCX,  -1
+    slli.d     LDA,   LDA,   BASE_SHIFT
+    slli.d     INCX,  INCX,  BASE_SHIFT
+    slli.d     INCY,  INCY,  BASE_SHIFT
 
-    xvreplve0.d  U2,  U2
+    bge        $r0,   M,     .L999
+    bge        $r0,   N,     .L999
+
+    move       J,     $r0
+    move       JY,    $r0
+    move       JX,    $r0
+    move       AO1,   A
 
-    slli.d     T2,    INCX,  3
-    add.d      IX,    IX,    T2
+    beq        J,     N,     .L999
+
+.L01:
+    xvxor.v    U2,    U2,    U2
+    fldx.s     a6,    X,     JX
+    fmul.s     a3,    ALPHA, a6    //temp1
+    xvreplve0.w  U3,  U3
+
+    mul.w      T0,    J,     LDA
+    slli.d     T1,    J,     BASE_SHIFT
+    add.w      T0,    T0,    T1
+    fldx.s     a6,    AO1,   T0
+    fldx.s     a4,    Y,     JY
+    fmadd.s    a4,    a3,    a6,    a4
+    fstx.s     a4,    Y,     JY
+
+    move       IY,    JY
+    move       IX,    JX
+    addi.d     II,    J,     1
+    move       I,     II
+    slli.d     II,    II,    BASE_SHIFT
+
+    sub.d      T0,    M,     J
+    addi.d     T0,    T0,    -1
+    srai.d     T0,    T0,    3
+    add.d      T0,    T0,    J
+    addi.d     T0,    T0,    1
+    beq        I,     T0,    .L03
+    bge        I,     T0,    .L03
+
+    mul.w      T1,    J,     LDA
+    add.d      T1,    T1,    II
+
+.L02: /* /8 */
+    xvldx      U1,    AO1,   T1
+
+    LOAD_Y_8
+
+    xvfmadd.s  U4,    U3,    U1,    U4
+
+    STORE_Y_8
+
+    alsl.d     IY,    INCY,  IY,    3
+
+    LOAD_X_8
+
+    xvfmadd.s  U2,    U1,    U4,    U2
+
+    alsl.d     IX,    INCX,  IX,    3
     addi.d     II,    II,    32
     addi.d     T1,    T1,    32
     addi.d     I,     I,     1
     blt        I,     T0,    .L02
+    //Acc U2
+    GACC       xvf, s, U4,   U2
+    fmov.d     $f2,   $f4
+
 .L03: /* &4 */
     sub.d      T0,    M,     J
     addi.d     T0,    T0,    -1
@@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d     $sp,   $sp,   88
 
     jirl       $r0,   $r1,   0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/ssymv_U_lasx.S b/kernel/loongarch64/ssymv_U_lasx.S
index bd6fd3dd7a..ff68723e1b 100644
--- a/kernel/loongarch64/ssymv_U_lasx.S
+++ b/kernel/loongarch64/ssymv_U_lasx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M      $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2     $r28
 #define T3     $r29
 #define T4     $r30
+#define T5     $r17
+#define T6     $r16
 
 /* LSX vectors */
 #define U0     $xr31
@@ -87,64 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8     $f8
 #define a9     $f9
-
-    PROLOGUE
-
-    LDARG BUFFER, $sp, 0
-
-    addi.d     $sp,   $sp,   -88
-
-    SDARG      $r23,  $sp,   0
-    SDARG      $r24,  $sp,   8
-    SDARG      $r25,  $sp,   16
-    SDARG      $r26,  $sp,   32
-    SDARG      $r27,  $sp,   40
-    SDARG      $r28,  $sp,   48
-    SDARG      $r29,  $sp,   56
-    SDARG      $r30,  $sp,   64
-    SDARG      $r31,  $sp,   72
-    ST         ALPHA, $sp,   80
-
-    xvldrepl.w VALPHA, $sp,  80
-
-    slli.d     LDA,   LDA,   BASE_SHIFT
-    slli.d     INCX,  INCX,  BASE_SHIFT
-    slli.d     INCY,  INCY,  BASE_SHIFT
-
-    bge        $r0,   M,     .L999
-    bge        $r0,   N,     .L999
-
-    sub.d      M1,    M,     N
-
-    mul.d      JY,    M1,    INCY
-    mul.d      JX,    M1,    INCX
-
-    move       J,     M1
-    move       AO1,   A
-
-    beq        J,     M,     .L999
-
-.L01:
-    MTC        $f2,   $r0          //temp2
-    fldx.s     $f6,   X,     JX
-    fmul.s     $f3,   ALPHA, $f6   //temp1
-    xvreplve0.w  U3,  U3
-    xvreplve0.w  U2,  U2
-
-    move       IY,    $r0
-    move       IX,    $r0
-    move       II,    $r0
-    move       I,     $r0
-
-    srai.d     T0,    J,     3
-    beq        I,     T0,    .L03
-
-    mul.w      T1,    J,     LDA
-    add.d      T1,    T1,    II
-
-.L02: /* /8 */
-    xvldx      U1,    AO1,   T1
-
+.macro LOAD_Y_8
+    beqz       T5,    .L01_Y_0
     fldx.s     $f4,   Y,     IY
     add.d      T2,    IY,    INCY
     fldx.s     $f5,   Y,     T2
@@ -168,10 +115,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8,  $vr9,  0x10
     vextrins.w $vr8,  $vr10, 0x20
     vextrins.w $vr8,  $vr11, 0x30
-    xvpermi.q  U4,    U8,    0x02
-
-    xvfmadd.s  U4,    U3,    U1,    U4
-
+    xvpermi.q  U4,    U8,    0x02
+    b          .L01_Y_1
+.L01_Y_0:
+    xvldx      U4,    Y,     IY
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz       T5,    .L01_Y_2
     xvpermi.d  U8,    U4,    0xee
     vextrins.w $vr5,  $vr4,  0x01
     vextrins.w $vr6,  $vr4,  0x02
@@ -196,10 +148,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fstx.s     $f10,  Y,     T2
     add.d      T2,    T2,    INCY
     fstx.s     $f11,  Y,     T2
-
-    slli.d     T2,    INCY,  3
-    add.d      IY,    IY,    T2
-
+    b          .L01_Y_3
+.L01_Y_2:
+    xvstx      U4,    Y,     IY
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz       T6,    .L01_X_0
     fldx.s     $f4,   X,     IX
     add.d      T2,    IX,    INCX
     fldx.s     $f5,   X,     T2
@@ -224,39 +180,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8,  $vr10, 0x20
     vextrins.w $vr8,  $vr11, 0x30
     xvpermi.q  U4,    U8,    0x02
+    b          .L01_X_1
+.L01_X_0:
+    xvldx      U4,    X,     IX
+.L01_X_1:
+.endm
+
+    PROLOGUE
 
-    xvand.v    $xr12, $xr2,  $xr2
+    addi.d     $sp,   $sp,   -88
 
-    xvfmadd.s  U2,    U1,    U4,    U2
-    xvfsub.s   U2,    U2,    $xr12
+    SDARG      $r23,  $sp,   0
+    SDARG      $r24,  $sp,   8
+    SDARG      $r25,  $sp,   16
+    SDARG      $r26,  $sp,   32
+    SDARG      $r27,  $sp,   40
+    SDARG      $r28,  $sp,   48
+    SDARG      $r29,  $sp,   56
+    SDARG      $r30,  $sp,   64
+    SDARG      $r31,  $sp,   72
+    ST         ALPHA, $sp,   80
 
-    xvpickve.w U4,    U2,    0x01
-    xvpickve.w U5,    U2,    0x02
-    xvpickve.w U6,    U2,    0x03
-    xvpickve.w U7,    U2,    0x04
-    xvpickve.w U8,    U2,    0x05
-    xvpickve.w U9,    U2,    0x06
-    xvpickve.w U10,   U2,    0x07
+    xvldrepl.w VALPHA, $sp,  80
 
-    fadd.s     $f2,   $f2,   $f4
-    fadd.s     $f2,   $f2,   $f5
-    fadd.s     $f2,   $f2,   $f6
-    fadd.s     $f2,   $f2,   $f7
-    fadd.s     $f2,   $f2,   $f8
-    fadd.s     $f2,   $f2,   $f9
-    fadd.s     $f2,   $f2,   $f10
-    fadd.s     $f2,   $f2,   $f12
+    addi.d     T5,    INCY,  -1
+    addi.d     T6,    INCX,  -1
+    slli.d     LDA,   LDA,   BASE_SHIFT
+    slli.d     INCX,  INCX,  BASE_SHIFT
+    slli.d     INCY,  INCY,  BASE_SHIFT
 
-    xvreplve0.d  U2,  U2
+    bge        $r0,   M,     .L999
+    bge        $r0,   N,     .L999
+
+    sub.d      M1,    M,     N
+
+    mul.d      JY,    M1,    INCY
+    mul.d      JX,    M1,    INCX
+
+    move       J,     M1
+    move       AO1,   A
+
+    beq        J,     M,     .L999
+
+.L01:
+    xvxor.v    U2,    U2,    U2
+    fldx.s     $f6,   X,     JX
+    fmul.s     $f3,   ALPHA, $f6   //temp1
+    xvreplve0.w  U3,  U3
 
-    slli.d     T2,    INCX,  3
-    add.d      IX,    IX,    T2
+    move       IY,    $r0
+    move       IX,    $r0
+    move       II,    $r0
+    move       I,     $r0
+
+    srai.d     T0,    J,     3
+    beq        I,     T0,    .L03
+
+    mul.w      T1,    J,     LDA
+    add.d      T1,    T1,    II
+
+.L02: /* /8 */
+    xvldx      U1,    AO1,   T1
+
+    LOAD_Y_8
+
+    xvfmadd.s  U4,    U3,    U1,    U4
+
+    STORE_Y_8
+
+    alsl.d     IY,    INCY,  IY,    3
+
+    LOAD_X_8
+
+    xvfmadd.s  U2,    U1,    U4,    U2
+
+    alsl.d     IX,    INCX,  IX,    3
     addi.d     II,    II,    32
     addi.d     T1,    T1,    32
     addi.d     I,     I,     1
     blt        I,     T0,    .L02
+    //Acc U2
+    GACC       xvf, s, U4,   U2
+    fmov.d     $f2,   $f4
+
 .L03: /* &4 */
     andi       T0,    J,     4
     beq        $r0,   T0,    .L04
@@ -421,4 +429,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d     $sp,   $sp,   88
 
     jirl       $r0,   $r1,   0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
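
Review note: the sketch below is only an illustration of what the new LOAD_X_8/LOAD_Y_8/STORE_Y_8 macros and the deferred GACC reduction amount to; the function name, simplified indexing, and structure are hypothetical and are not part of the patch or of the OpenBLAS API. The patch branches once per 8-element block on unit stride (T5 = INCY-1, T6 = INCX-1) so that full 256-bit xvldx/xvstx accesses replace eight scalar fldx/fstx accesses, and it keeps the temp2 accumulator (U2) in vector form across the whole loop, reducing it to a scalar only once after the loop exits.

/* Illustrative C sketch only -- not OpenBLAS code. */
#include <stddef.h>

static double symv_block_loop(const double *a, const double *x, double *y,
                              ptrdiff_t m8, ptrdiff_t incx, ptrdiff_t incy,
                              double temp1)
{
    double acc[8] = {0};                          /* plays the role of U2     */

    for (ptrdiff_t i = 0; i < m8; i += 8) {
        double xv[8], yv[8];

        /* LOAD_Y_8: contiguous vector load when incy == 1, scalar gather else */
        for (int k = 0; k < 8; k++)
            yv[k] = (incy == 1) ? y[i + k] : y[(i + k) * incy];
        /* LOAD_X_8: same dispatch for x */
        for (int k = 0; k < 8; k++)
            xv[k] = (incx == 1) ? x[i + k] : x[(i + k) * incx];

        for (int k = 0; k < 8; k++) {
            yv[k]  += temp1 * a[i + k];           /* xvfmadd.d U4/U8          */
            acc[k] += a[i + k] * xv[k];           /* per-lane partial sums    */
        }

        /* STORE_Y_8: contiguous vector store when incy == 1, scatter else */
        for (int k = 0; k < 8; k++) {
            if (incy == 1) y[i + k] = yv[k];
            else           y[(i + k) * incy] = yv[k];
        }
    }

    /* One horizontal reduction after the loop (GACC + fmov.d),
     * instead of re-reducing the accumulator on every iteration. */
    double temp2 = 0.0;
    for (int k = 0; k < 8; k++) temp2 += acc[k];
    return temp2;
}

The scalar gather/scatter path mirrors what the old code did unconditionally; the unit-stride fast path and the single end-of-loop reduction are where the patch removes work from the inner loop.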