diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c index 8c016ce4d6..4d62354e5b 100644 --- a/kernel/arm/iamax.c +++ b/kernel/arm/iamax.c @@ -56,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); - + if (n==1) return(1); + if (x[0]!=x[0]) return(1); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) > maxf ) { max = i; diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c index 155292bd59..73369f6c19 100644 --- a/kernel/arm/iamin.c +++ b/kernel/arm/iamin.c @@ -56,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); + if (n==1) return(1); + if (x[0]!=x[0]) return(1); + minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) < ABS(minf) ) { min = i; diff --git a/kernel/mips/iamax.c b/kernel/mips/iamax.c index fcc0b821e9..4d62354e5b 100644 --- a/kernel/mips/iamax.c +++ b/kernel/mips/iamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project +Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,6 +25,15 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + #include "common.h" #include @@ -47,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); - + if (n==1) return(1); + if (x[0]!=x[0]) return(1); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) > maxf ) { max = i; diff --git a/kernel/mips/iamin.c b/kernel/mips/iamin.c index 7f1c4d9057..73369f6c19 100644 --- a/kernel/mips/iamin.c +++ b/kernel/mips/iamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project +Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,6 +25,15 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + #include "common.h" #include @@ -47,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); + if (n==1) return(1); + if (x[0]!=x[0]) return(1); + minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) < ABS(minf) ) { min = i; diff --git a/kernel/power/iamax.S b/kernel/power/iamax.S index 45461ae857..147a60f0f3 100644 --- a/kernel/power/iamax.S +++ b/kernel/power/iamax.S @@ -520,6 +520,19 @@ LL(1000): .align 4 LL(1010): + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) + fabs f8, f24 fabs f9, f25 fabs f10, f26 @@ -529,6 +542,20 @@ LL(1010): LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) + subi RET, RET, 8 fabs f12, f28 fabs f13, f29 @@ -577,6 +604,32 @@ LL(1010): .align 4 LL(1020): + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f28, f28 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f29, f29 + 
bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f30, f30 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f31, f31 + bun cr0, LL(9999) + subi RET, RET, 8 + fabs f8, f24 fabs f9, f25 fabs f10, f26 @@ -631,8 +684,12 @@ LL(1050): LL(1060): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE + addi RET, RET, 1 + fcmpu cr0, f8, f8 /* a value compared with itself is unordered only when it is NaN */ + bun cr0, LL(9999) /* NaN: leave with the RET value just incremented for this element */ + fabs f8, f8 - addi RET, RET, 1 + //addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) @@ -658,6 +715,18 @@ LL(1100): .align 4 LL(1110): + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 @@ -667,7 +736,19 @@ LL(1110): LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX - + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) + subi RET, RET, 8 fabs f12, f28 fabs f13, f29 fabs f14, f30 @@ -714,6 +795,31 @@ LL(1110): .align 4 LL(1120): + addi RET, RET, 1 + fcmpu cr0, f24, f24 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f25, f25 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f26, f26 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f27, f27 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f28, f28 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f29, f29 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f30, f30 + bun cr0, LL(9999) + addi RET, RET, 1 + fcmpu cr0, f31, f31 + bun cr0, LL(9999) /* eighth lane: act on the compare like the seven lanes before it */ + subi RET, RET, 8 fabs f8, f24 fabs f9, f25 fabs f10, f26 @@ -765,8 +871,11 @@ LL(1150): LL(1160): LFDUX f8, XX, INCX - fabs f8, f8 addi RET, RET, 1 + fcmpu cr0, f8, f8 + bun cr0, LL(9999) /* condition register spelled out, matching every other NaN check */ + fabs f8, f8 +// addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) diff --git
a/kernel/power/idamax.c b/kernel/power/idamax.c index f1ef00066a..8a8471c0f2 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -327,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG max = 0; if (n <= 0 || inc_x <= 0) return (max); + if (n == 1) return(1); if (inc_x == 1) { @@ -335,7 +336,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - + for (int ii=i;ii maxf) { max = i; maxf = ABS(x[i]); @@ -356,6 +358,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -4; while (j < n1) { + if (x[i] != x[i]) return(j+1); /* NaN: report the 1-based element position; j counts elements, i is the strided offset */ + if (x[i+inc_x] != x[i+inc_x]) return(j+2); /* lane 1 is element j+1, hence position j+2 */ + if (x[i+2*inc_x] != x[i+2*inc_x]) return(j+3); + if (x[i+3*inc_x] != x[i+3*inc_x]) return(j+4); if (ABS(x[i]) > maxf) { max = j; maxf = ABS(x[i]); @@ -381,6 +387,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { while (j < n) { + if (x[i] != x[i]) return(j+1); if (ABS(x[i]) > maxf) { max = j; maxf = ABS(x[i]); diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index fb2dafec0f..1a116ca640 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -58,6 +58,78 @@ static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { register __vector float quadruple_values={0,0,0,0}; register __vector float * v_ptrx=(__vector float *)x; for(; i maxf) { max = i; maxf = ABS(x[i]); @@ -251,18 +324,22 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -4; while (j < n1) { + if (x[i] != x[i]) return(j+1); if (ABS(x[i]) > maxf) { max = j; maxf = ABS(x[i]); } + if (x[i+inc_x] != x[i+inc_x]) return(j+2); /* NaN in lane 1: element j+1, 1-based position j+2 */ if (ABS(x[i + inc_x]) > maxf) { max = j + 1; maxf = ABS(x[i + inc_x]); } + if (x[i+2*inc_x] != x[i+2*inc_x]) return(j+3); if (ABS(x[i + 2 * inc_x]) > maxf) { max = j + 2; maxf = ABS(x[i + 2 * inc_x]); } + if (x[i+3*inc_x] != x[i+3*inc_x]) return(j+4); if (ABS(x[i + 3 * inc_x]) > maxf) { max = j + 3; maxf = ABS(x[i + 3 * inc_x]); @@ -276,6 +353,7 @@ BLASLONG
CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { while (j < n) { + if (x[i] != x[i]) return(j+1); if (ABS(x[i]) > maxf) { max = j; maxf = ABS(x[i]); diff --git a/kernel/riscv64/iamax.c b/kernel/riscv64/iamax.c index 8c016ce4d6..4d62354e5b 100644 --- a/kernel/riscv64/iamax.c +++ b/kernel/riscv64/iamax.c @@ -56,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); - + if (n==1) return(1); + if (x[0]!=x[0]) return(1); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) > maxf ) { max = i; diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c index 155292bd59..73369f6c19 100644 --- a/kernel/riscv64/iamin.c +++ b/kernel/riscv64/iamin.c @@ -56,13 +56,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); + if (n==1) return(1); + if (x[0]!=x[0]) return(1); + minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { + if (x[ix]!=x[ix]) return(i+1); if( ABS(x[ix]) < ABS(minf) ) { min = i; diff --git a/kernel/x86/iamax.S b/kernel/x86/iamax.S index 1a7378474c..49d75d3f9b 100644 --- a/kernel/x86/iamax.S +++ b/kernel/x86/iamax.S @@ -100,6 +100,8 @@ movl $1, RET FLD (X) + fcomi %st, %st + jp .L999 #ifdef USE_ABS fabs #endif @@ -121,6 +123,8 @@ #endif FLD 0 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -131,6 +135,8 @@ incl NUM FLD 1 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -141,6 +147,8 @@ incl NUM FLD 2 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -151,6 +159,8 @@ incl NUM FLD 3 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -161,6 +171,8 @@ incl NUM FLD 4 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -171,6 +183,8 @@ incl NUM FLD 5 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -181,6 +195,8 @@ incl NUM FLD 6 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -191,6 +207,8 @@ incl NUM FLD 7 * 
SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -215,6 +233,8 @@ .L21: FLD 0 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -238,6 +258,8 @@ .L50: FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -249,6 +271,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -260,6 +284,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -271,6 +297,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -282,6 +310,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -293,6 +323,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -304,6 +336,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -315,6 +349,8 @@ incl NUM FLD 0 * SIZE(X) + fucomi + jp .L998 addl INCX, X #ifdef USE_ABS fabs @@ -338,6 +374,8 @@ .L61: FLD 0 * SIZE(X) + fucomi + jp .L998 #ifdef USE_ABS fabs #endif @@ -361,4 +399,7 @@ popl %ebp ret +.L998: mov NUM, RET + jmp .L999 + EPILOGUE diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 14c7f43ec7..6195e07f0b 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -93,6 +93,10 @@ addq INCX, X decq M shufps $0, %xmm0, %xmm0 + incq RET + comiss %xmm0, %xmm0 + jp .L999 + decq RET #ifdef USE_ABS andps %xmm15, %xmm0 #endif @@ -254,6 +258,10 @@ decq M addq $SIZE, X + incq RET + comiss %xmm1, %xmm1 + jp .L998 + decq RET #ifdef USE_ABS andps %xmm15, %xmm1 #endif @@ -268,6 +276,14 @@ movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + decq RET + decq RET subq $2, M addq $2 * SIZE, X @@ -332,6 +348,31 @@ movss 5 * SIZE(X), %xmm6 movss 6 * SIZE(X), %xmm7 movss 7 * SIZE(X), %xmm8 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + 
incq RET + comiss %xmm4, %xmm4 + jp .L998 + incq RET + comiss %xmm5, %xmm5 + jp .L998 + incq RET + comiss %xmm6, %xmm6 + jp .L998 + incq RET + comiss %xmm7, %xmm7 + jp .L998 + incq RET + comiss %xmm8, %xmm8 + jp .L998 + subq $8, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -378,6 +419,19 @@ movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + incq RET + comiss %xmm4, %xmm4 + jp .L998 + subq $4, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -405,6 +459,13 @@ movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + subq $2, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -593,6 +654,31 @@ movss 5 * SIZE(X), %xmm6 movss 6 * SIZE(X), %xmm7 movss 7 * SIZE(X), %xmm8 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + incq RET + comiss %xmm4, %xmm4 + jp .L998 + incq RET + comiss %xmm5, %xmm5 + jp .L998 + incq RET + comiss %xmm6, %xmm6 + jp .L998 + incq RET + comiss %xmm7, %xmm7 + jp .L998 + incq RET + comiss %xmm8, %xmm8 + jp .L998 + subq $8, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -639,6 +725,19 @@ movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + incq RET + comiss %xmm4, %xmm4 + jp .L998 + subq $4, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -666,6 +765,13 @@ movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + subq $2, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -885,6 +991,31 @@ movss 0 * SIZE(X), %xmm2 subq 
INCX, X movss 0 * SIZE(X), %xmm1 + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + incq RET + comiss %xmm4, %xmm4 + jp .L998 + incq RET + comiss %xmm5, %xmm5 + jp .L998 + incq RET + comiss %xmm6, %xmm6 + jp .L998 + incq RET + comiss %xmm7, %xmm7 + jp .L998 + incq RET + comiss %xmm8, %xmm8 + jp .L998 + subq $8, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -932,7 +1063,19 @@ addq INCX, X movss 0 * SIZE(X), %xmm4 addq INCX, X - + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + incq RET + comiss %xmm3, %xmm3 + jp .L998 + incq RET + comiss %xmm4, %xmm4 + jp .L998 + subq $4, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -961,6 +1104,13 @@ addq INCX, X movss 0 * SIZE(X), %xmm2 addq INCX, X + incq RET + comiss %xmm1, %xmm1 + jp .L998 + incq RET + comiss %xmm2, %xmm2 + jp .L998 + subq $2, RET #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 @@ -982,5 +1132,9 @@ ret +.L998: +// incq RET + jmp .L999 + EPILOGUE diff --git a/kernel/x86_64/iamax_sse2.S b/kernel/x86_64/iamax_sse2.S index 6808f191ba..698bb11e88 100644 --- a/kernel/x86_64/iamax_sse2.S +++ b/kernel/x86_64/iamax_sse2.S @@ -79,6 +79,8 @@ movsd (X), %xmm0 addq INCX, X decq M + comisd %xmm0, %xmm0 + jp .L987 #ifdef USE_ABS andpd %xmm15, %xmm0 #endif @@ -269,6 +271,11 @@ je .L21 movsd 0 * SIZE(X), %xmm1 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + decq RET + #ifdef USE_ABS andpd %xmm15, %xmm1 #endif @@ -342,6 +349,32 @@ movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 7 * SIZE(X), %xmm8 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + incq RET + comisd %xmm5, %xmm5 + jp .L987 + incq RET + comisd %xmm6, %xmm6 + jp .L987 + incq RET + comisd %xmm7, %xmm7 + jp .L987 + incq RET + comisd %xmm8, %xmm8 + jp .L987 + subq $8, RET + 
#ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -374,9 +407,9 @@ comisd %xmm0, %xmm6 je .L999 incq RET - comisd %xmm0, %xmm7 - je .L999 - incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET jmp .L999 ALIGN_3 @@ -388,6 +421,21 @@ movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 + + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + subq $4, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -415,6 +463,14 @@ movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + subq $2, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -670,6 +726,32 @@ movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 7 * SIZE(X), %xmm8 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + incq RET + comisd %xmm5, %xmm5 + jp .L987 + incq RET + comisd %xmm6, %xmm6 + jp .L987 + incq RET + comisd %xmm7, %xmm7 + jp .L987 + incq RET + comisd %xmm8, %xmm8 + jp .L987 + subq $8, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -716,6 +798,20 @@ movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + subq $4, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -743,11 +839,21 @@ movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + subq $2, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 #endif addq $2 * SIZE, X + comisd %xmm0, %xmm0 + jp .L987 incq RET comisd %xmm0, %xmm1 je .L999 @@ -962,6 
+1068,7 @@ ALIGN_4 .L92: + movsd 0 * SIZE(X), %xmm1 addq INCX, X movhpd 0 * SIZE(X), %xmm1 @@ -1033,6 +1140,32 @@ movsd 0 * SIZE(X), %xmm2 subq INCX, X movsd 0 * SIZE(X), %xmm1 + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + incq RET + comisd %xmm5, %xmm5 + jp .L987 + incq RET + comisd %xmm6, %xmm6 + jp .L987 + incq RET + comisd %xmm7, %xmm7 + jp .L987 + incq RET + comisd %xmm8, %xmm8 + jp .L987 + subq $8, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -1083,6 +1216,20 @@ addq INCX, X movsd 0 * SIZE(X), %xmm4 addq INCX, X + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + incq RET + comisd %xmm3, %xmm3 + jp .L987 + incq RET + comisd %xmm4, %xmm4 + jp .L987 + subq $4, RET + #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -1111,6 +1258,14 @@ addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X + incq RET + comisd %xmm1, %xmm1 + jp .L987 + incq RET + comisd %xmm2, %xmm2 + jp .L987 + decq RET + decq RET #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 @@ -1122,7 +1277,6 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - .L98: incq RET ALIGN_3 @@ -1132,5 +1286,8 @@ ret - EPILOGUE +.L987: + incq RET //count from xmm0 + jmp .L999 + EPILOGUE diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c858..b13403955b 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -58,3 +58,29 @@ CTEST(amax, damax){ } #endif +#ifdef BUILD_SINGLE +CTEST(amax, isamax_nan){ /* checks the index isamax reports for a vector containing NaN */ + blasint N=3, inc=1; + int te_max=0, tr_max=2; /* te_max receives the isamax result, tr_max is the value it is asserted against */ + float x[]={1., 0./0., 0./0. }; /* 0./0. is intended to produce NaN; the first one sits at 1-based position 2, matching tr_max */ + //float x[]={ 0./0., 2., 3. }; + + + te_max=BLASFUNC(isamax)(&N, x, &inc); + + ASSERT_EQUAL(tr_max, te_max); +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(amax, idamax_nan){ /* checks the index idamax reports when NaN is the leading element */ + blasint N=4, inc=1; + int te_max=0, tr_max=1; /* te_max receives the idamax result, tr_max is the value it is asserted against */ + //float x[]={1., 0./0., 0./0. }; + double x[]={ 0./0., 1.,2. 
,3.}; /* NaN is the first element, so 1-based position 1 is expected */ + + te_max=BLASFUNC(idamax)(&N, x, &inc); + + ASSERT_EQUAL(tr_max, te_max); +} +#endif