From 29b51f92a786759241f669adc9cb1ea2c7d18478 Mon Sep 17 00:00:00 2001 From: Nicola Torracca Date: Wed, 30 Oct 2024 18:58:09 +0100 Subject: [PATCH 1/4] Use SIMD instructions to update pointers. Signed-off-by: Nicola Torracca --- sha1_mb/sha1_mb_x16_avx512.asm | 15 +++++---------- sha1_mb/sha1_mb_x4_avx.asm | 15 +++++++-------- sha1_mb/sha1_mb_x8_avx2.asm | 23 +++++++---------------- sha256_mb/sha256_mb_x16_avx512.asm | 15 +++++---------- sha256_mb/sha256_mb_x8_avx2.asm | 23 +++++++---------------- sha512_mb/sha512_mb_x2_avx.asm | 9 +++++---- sha512_mb/sha512_mb_x4_avx2.asm | 13 +++++-------- sha512_mb/sha512_mb_x8_avx512.asm | 13 +++---------- 8 files changed, 44 insertions(+), 82 deletions(-) diff --git a/sha1_mb/sha1_mb_x16_avx512.asm b/sha1_mb/sha1_mb_x16_avx512.asm index 84eb0de5..fa3a0194 100644 --- a/sha1_mb/sha1_mb_x16_avx512.asm +++ b/sha1_mb/sha1_mb_x16_avx512.asm @@ -490,16 +490,11 @@ lastLoop: vpaddd E,E,EE ;; update into data pointers -%assign I 0 -%rep 8 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP1, IDX + vpaddq TMP0, TMP1, [IN] + vpaddq TMP1, TMP1, [IN+64] + vmovdqu64 [IN], TMP0 + vmovdqu64 [IN+64], TMP1 ; Write out digest ; Do we need to untranspose digests??? diff --git a/sha1_mb/sha1_mb_x4_avx.asm b/sha1_mb/sha1_mb_x4_avx.asm index eb67309d..2077c949 100644 --- a/sha1_mb/sha1_mb_x4_avx.asm +++ b/sha1_mb/sha1_mb_x4_avx.asm @@ -389,14 +389,13 @@ lloop: vmovdqa [ARG1 + 4*16], E ; update input pointers - add inp0, IDX - mov [ARG1 + _data_ptr + 0*8], inp0 - add inp1, IDX - mov [ARG1 + _data_ptr + 1*8], inp1 - add inp2, IDX - mov [ARG1 + _data_ptr + 2*8], inp2 - add inp3, IDX - mov [ARG1 + _data_ptr + 3*8], inp3 + vmovq xmm1, IDX + vpbroadcastq xmm1, xmm1 + lea IDX, [ARG1 + _data_ptr] + vpaddq xmm0, xmm1, [IDX] + vpaddq xmm1, xmm1, [IDX+16] + vmovdqu [IDX], xmm0 + vmovdqu [IDX+16], xmm1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha1_mb/sha1_mb_x8_avx2.asm b/sha1_mb/sha1_mb_x8_avx2.asm index edcba6d3..6a9a685b 100644 --- a/sha1_mb/sha1_mb_x8_avx2.asm +++ b/sha1_mb/sha1_mb_x8_avx2.asm @@ -475,22 +475,13 @@ lloop: vmovdqu [arg1 + 4*32], E ;; update input pointers - add inp0, IDX - add inp1, IDX - add inp2, IDX - add inp3, IDX - add inp4, IDX - add inp5, IDX - add inp6, IDX - add inp7, IDX - mov [arg1+_data_ptr+0*8], inp0 - mov [arg1+_data_ptr+1*8], inp1 - mov [arg1+_data_ptr+2*8], inp2 - mov [arg1+_data_ptr+3*8], inp3 - mov [arg1+_data_ptr+4*8], inp4 - mov [arg1+_data_ptr+5*8], inp5 - mov [arg1+_data_ptr+6*8], inp6 - mov [arg1+_data_ptr+7*8], inp7 + vmovq xmm1, IDX + vpbroadcastq ymm1, xmm1 + lea IDX, [arg1+_data_ptr] + vpaddq ymm0, ymm1, [IDX] + vpaddq ymm1, ymm1, [IDX+32] + vmovdqu [IDX], ymm0 + vmovdqu [IDX+32], ymm1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha256_mb/sha256_mb_x16_avx512.asm b/sha256_mb/sha256_mb_x16_avx512.asm index 66beff06..c069150b 100644 --- a/sha256_mb/sha256_mb_x16_avx512.asm +++ b/sha256_mb/sha256_mb_x16_avx512.asm @@ -607,16 +607,11 @@ lastLoop: vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] ;; update into data pointers -%assign I 0 -%rep 8 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP1, IDX + vpaddq TMP0, TMP1, [IN] + vpaddq TMP1, TMP1, [IN+64] + vmovdqu64 [IN], TMP0 + vmovdqu64 [IN+64], TMP1 ; Write out digest ; Do we need to untranspose digests??? diff --git a/sha256_mb/sha256_mb_x8_avx2.asm b/sha256_mb/sha256_mb_x8_avx2.asm index dbd9db1b..3d86396c 100644 --- a/sha256_mb/sha256_mb_x8_avx2.asm +++ b/sha256_mb/sha256_mb_x8_avx2.asm @@ -463,22 +463,13 @@ Lrounds_16_xx: vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h ; update input pointers - add inp0, IDX - mov [STATE + _args_data_ptr + 0*8], inp0 - add inp1, IDX - mov [STATE + _args_data_ptr + 1*8], inp1 - add inp2, IDX - mov [STATE + _args_data_ptr + 2*8], inp2 - add inp3, IDX - mov [STATE + _args_data_ptr + 3*8], inp3 - add inp4, IDX - mov [STATE + _args_data_ptr + 4*8], inp4 - add inp5, IDX - mov [STATE + _args_data_ptr + 5*8], inp5 - add inp6, IDX - mov [STATE + _args_data_ptr + 6*8], inp6 - add inp7, IDX - mov [STATE + _args_data_ptr + 7*8], inp7 + vmovq XWORD(TMP0), IDX + vpbroadcastq TMP1, XWORD(TMP0) + lea IDX, [STATE + _args_data_ptr] + vpaddq TMP0, TMP1, [IDX] + vpaddq TMP1, TMP1, [IDX + 32] + vmovdqu [IDX], TMP0 + vmovdqu [IDX+32], TMP1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x2_avx.asm b/sha512_mb/sha512_mb_x2_avx.asm index 5d443faf..e0edd596 100644 --- a/sha512_mb/sha512_mb_x2_avx.asm +++ b/sha512_mb/sha512_mb_x2_avx.asm @@ -337,10 +337,11 @@ Lrounds_16_xx: vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h ; update input pointers - add inp0, IDX - mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 - add inp1, IDX - mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + vmovq xmm0, IDX + vpbroadcastq xmm0, xmm0 + lea IDX, [STATE + _data_ptr_sha512] + vpaddq xmm0, xmm0, [IDX] + vmovdqu32 [IDX], xmm0 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x4_avx2.asm b/sha512_mb/sha512_mb_x4_avx2.asm index 0058f33a..7e6c3b55 100644 --- a/sha512_mb/sha512_mb_x4_avx2.asm +++ b/sha512_mb/sha512_mb_x4_avx2.asm @@ -379,14 +379,11 @@ Lrounds_16_xx: vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h ;; update input data pointers - add inp0, IDX - mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 - add inp1, IDX - mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 - add inp2, IDX - mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 - add inp3, IDX - mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 + vmovq xmm0, IDX + lea IDX, [STATE + _data_ptr_sha512] + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, [IDX] + vmovdqu [IDX], ymm0 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x8_avx512.asm b/sha512_mb/sha512_mb_x8_avx512.asm index e2735109..7864c14b 100644 --- a/sha512_mb/sha512_mb_x8_avx512.asm +++ b/sha512_mb/sha512_mb_x8_avx512.asm @@ -494,16 +494,9 @@ lastLoop: vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] ;; update into data pointers -%assign I 0 -%rep 4 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP0, IDX + vpaddq TMP0, TMP0, [IN] + vmovdqu64 [IN], TMP0 VMOVDQ32 [DIGEST + 0*8*8], A VMOVDQ32 [DIGEST + 1*8*8], B From 6563d48220bef7ed9833d9e27de15543e15dd1af Mon Sep 17 00:00:00 2001 From: Nicola Torracca Date: Mon, 4 Nov 2024 16:34:21 +0100 Subject: [PATCH 2/4] Use vpunpcklqdq to broadcast scalar register for AVX targets. Signed-off-by: Nicola Torracca --- sha1_mb/sha1_mb_x4_avx.asm | 2 +- sha256_mb/sha256_mb_x4_avx.asm | 15 +++++++-------- sha512_mb/sha512_mb_x2_avx.asm | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sha1_mb/sha1_mb_x4_avx.asm b/sha1_mb/sha1_mb_x4_avx.asm index 2077c949..61c57abe 100644 --- a/sha1_mb/sha1_mb_x4_avx.asm +++ b/sha1_mb/sha1_mb_x4_avx.asm @@ -390,7 +390,7 @@ lloop: ; update input pointers vmovq xmm1, IDX - vpbroadcastq xmm1, xmm1 + vpunpcklqdq xmm1, xmm1, xmm1 lea IDX, [ARG1 + _data_ptr] vpaddq xmm0, xmm1, [IDX] vpaddq xmm1, xmm1, [IDX+16] diff --git a/sha256_mb/sha256_mb_x4_avx.asm b/sha256_mb/sha256_mb_x4_avx.asm index 7f8f8829..e0381958 100644 --- a/sha256_mb/sha256_mb_x4_avx.asm +++ b/sha256_mb/sha256_mb_x4_avx.asm @@ -345,14 +345,13 @@ Lrounds_16_xx: vmovdqa [arg1+7*SZ4],h ; update input pointers - add inp0, IDX - mov [arg1 + _data_ptr + 0*8], inp0 - add inp1, IDX - mov [arg1 + _data_ptr + 1*8], inp1 - add inp2, IDX - mov [arg1 + _data_ptr + 2*8], inp2 - add inp3, IDX - mov [arg1 + _data_ptr + 3*8], inp3 + vmovq xmm1, IDX + vpunpcklqdq xmm1, xmm1, xmm1 + lea IDX, [arg1 + _data_ptr] + vpaddq xmm0, xmm1, [IDX] + vpaddq xmm1, xmm1, [IDX+16] + vmovdqu [IDX], xmm0 + vmovdqu [IDX+16], xmm1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x2_avx.asm b/sha512_mb/sha512_mb_x2_avx.asm index e0edd596..ed3f298d 100644 --- a/sha512_mb/sha512_mb_x2_avx.asm +++ b/sha512_mb/sha512_mb_x2_avx.asm @@ -338,7 +338,7 @@ Lrounds_16_xx: ; update input pointers vmovq xmm0, IDX - vpbroadcastq xmm0, xmm0 + vpunpcklqdq xmm0, xmm0 lea IDX, [STATE + _data_ptr_sha512] vpaddq xmm0, xmm0, [IDX] vmovdqu32 [IDX], xmm0 From 934c640cbcc2e6fad3c599f18e381692ca892a62 Mon Sep 17 00:00:00 2001 From: Shark64 <6097259+Shark64@users.noreply.github.com> Date: Mon, 4 Nov 2024 19:31:37 +0100 Subject: [PATCH 3/4] Update sha512_mb_x2_avx.asm Fixed wrong vmov instruction type. --- sha512_mb/sha512_mb_x2_avx.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sha512_mb/sha512_mb_x2_avx.asm b/sha512_mb/sha512_mb_x2_avx.asm index ed3f298d..d50853e0 100644 --- a/sha512_mb/sha512_mb_x2_avx.asm +++ b/sha512_mb/sha512_mb_x2_avx.asm @@ -341,7 +341,7 @@ Lrounds_16_xx: vpunpcklqdq xmm0, xmm0 lea IDX, [STATE + _data_ptr_sha512] vpaddq xmm0, xmm0, [IDX] - vmovdqu32 [IDX], xmm0 + vmovdqu [IDX], xmm0 ;;;;;;;;;;;;;;;; ;; Postamble From 2ccae01039c80b12dac81608ccd3fcc1e84f60e7 Mon Sep 17 00:00:00 2001 From: Nicola Torracca Date: Wed, 6 Nov 2024 13:30:37 +0100 Subject: [PATCH 4/4] Revert changes for _avx functions. Signed-off-by: Nicola Torracca --- sha1_mb/sha1_mb_x4_avx.asm | 15 ++++++++------- sha256_mb/sha256_mb_x4_avx.asm | 15 ++++++++------- sha512_mb/sha512_mb_x2_avx.asm | 9 ++++----- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/sha1_mb/sha1_mb_x4_avx.asm b/sha1_mb/sha1_mb_x4_avx.asm index 61c57abe..eb67309d 100644 --- a/sha1_mb/sha1_mb_x4_avx.asm +++ b/sha1_mb/sha1_mb_x4_avx.asm @@ -389,13 +389,14 @@ lloop: vmovdqa [ARG1 + 4*16], E ; update input pointers - vmovq xmm1, IDX - vpunpcklqdq xmm1, xmm1, xmm1 - lea IDX, [ARG1 + _data_ptr] - vpaddq xmm0, xmm1, [IDX] - vpaddq xmm1, xmm1, [IDX+16] - vmovdqu [IDX], xmm0 - vmovdqu [IDX+16], xmm1 + add inp0, IDX + mov [ARG1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [ARG1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [ARG1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [ARG1 + _data_ptr + 3*8], inp3 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha256_mb/sha256_mb_x4_avx.asm b/sha256_mb/sha256_mb_x4_avx.asm index e0381958..7f8f8829 100644 --- a/sha256_mb/sha256_mb_x4_avx.asm +++ b/sha256_mb/sha256_mb_x4_avx.asm @@ -345,13 +345,14 @@ Lrounds_16_xx: vmovdqa [arg1+7*SZ4],h ; update input pointers - vmovq xmm1, IDX - vpunpcklqdq xmm1, xmm1, xmm1 - lea IDX, [arg1 + _data_ptr] - vpaddq xmm0, xmm1, [IDX] - vpaddq xmm1, xmm1, [IDX+16] - vmovdqu [IDX], xmm0 - vmovdqu [IDX+16], xmm1 + add inp0, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [arg1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [arg1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [arg1 + _data_ptr + 3*8], inp3 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x2_avx.asm b/sha512_mb/sha512_mb_x2_avx.asm index d50853e0..5d443faf 100644 --- a/sha512_mb/sha512_mb_x2_avx.asm +++ b/sha512_mb/sha512_mb_x2_avx.asm @@ -337,11 +337,10 @@ Lrounds_16_xx: vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h ; update input pointers - vmovq xmm0, IDX - vpunpcklqdq xmm0, xmm0 - lea IDX, [STATE + _data_ptr_sha512] - vpaddq xmm0, xmm0, [IDX] - vmovdqu [IDX], xmm0 + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 ;;;;;;;;;;;;;;;; ;; Postamble