Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use SIMD instructions to update data pointers. #153

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions sha1_mb/sha1_mb_x16_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -490,16 +490,11 @@ lastLoop:
vpaddd E,E,EE

;; update input data pointers
%assign I 0
%rep 8
mov inp0, [IN + (2*I)*8]
mov inp1, [IN + (2*I +1)*8]
add inp0, IDX
add inp1, IDX
mov [IN + (2*I)*8], inp0
mov [IN + (2*I+1)*8], inp1
%assign I (I+1)
%endrep
vpbroadcastq TMP1, IDX
vpaddq TMP0, TMP1, [IN]
vpaddq TMP1, TMP1, [IN+64]
vmovdqu64 [IN], TMP0
vmovdqu64 [IN+64], TMP1

; Write out digest
; Do we need to untranspose digests???
Expand Down
15 changes: 7 additions & 8 deletions sha1_mb/sha1_mb_x4_avx.asm
Original file line number Diff line number Diff line change
Expand Up @@ -389,14 +389,13 @@ lloop:
vmovdqa [ARG1 + 4*16], E

; update input pointers
add inp0, IDX
mov [ARG1 + _data_ptr + 0*8], inp0
add inp1, IDX
mov [ARG1 + _data_ptr + 1*8], inp1
add inp2, IDX
mov [ARG1 + _data_ptr + 2*8], inp2
add inp3, IDX
mov [ARG1 + _data_ptr + 3*8], inp3
vmovq xmm1, IDX
vpunpcklqdq xmm1, xmm1, xmm1
lea IDX, [ARG1 + _data_ptr]
vpaddq xmm0, xmm1, [IDX]
vpaddq xmm1, xmm1, [IDX+16]
vmovdqu [IDX], xmm0
vmovdqu [IDX+16], xmm1

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
23 changes: 7 additions & 16 deletions sha1_mb/sha1_mb_x8_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -475,22 +475,13 @@ lloop:
vmovdqu [arg1 + 4*32], E

;; update input pointers
add inp0, IDX
add inp1, IDX
add inp2, IDX
add inp3, IDX
add inp4, IDX
add inp5, IDX
add inp6, IDX
add inp7, IDX
mov [arg1+_data_ptr+0*8], inp0
mov [arg1+_data_ptr+1*8], inp1
mov [arg1+_data_ptr+2*8], inp2
mov [arg1+_data_ptr+3*8], inp3
mov [arg1+_data_ptr+4*8], inp4
mov [arg1+_data_ptr+5*8], inp5
mov [arg1+_data_ptr+6*8], inp6
mov [arg1+_data_ptr+7*8], inp7
vmovq xmm1, IDX
vpbroadcastq ymm1, xmm1
lea IDX, [arg1+_data_ptr]
vpaddq ymm0, ymm1, [IDX]
vpaddq ymm1, ymm1, [IDX+32]
vmovdqu [IDX], ymm0
vmovdqu [IDX+32], ymm1

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
15 changes: 5 additions & 10 deletions sha256_mb/sha256_mb_x16_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -607,16 +607,11 @@ lastLoop:
vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]

;; update input data pointers
%assign I 0
%rep 8
mov inp0, [IN + (2*I)*8]
mov inp1, [IN + (2*I +1)*8]
add inp0, IDX
add inp1, IDX
mov [IN + (2*I)*8], inp0
mov [IN + (2*I+1)*8], inp1
%assign I (I+1)
%endrep
vpbroadcastq TMP1, IDX
vpaddq TMP0, TMP1, [IN]
vpaddq TMP1, TMP1, [IN+64]
vmovdqu64 [IN], TMP0
vmovdqu64 [IN+64], TMP1

; Write out digest
; Do we need to untranspose digests???
Expand Down
15 changes: 7 additions & 8 deletions sha256_mb/sha256_mb_x4_avx.asm
Original file line number Diff line number Diff line change
Expand Up @@ -345,14 +345,13 @@ Lrounds_16_xx:
vmovdqa [arg1+7*SZ4],h

; update input pointers
add inp0, IDX
mov [arg1 + _data_ptr + 0*8], inp0
add inp1, IDX
mov [arg1 + _data_ptr + 1*8], inp1
add inp2, IDX
mov [arg1 + _data_ptr + 2*8], inp2
add inp3, IDX
mov [arg1 + _data_ptr + 3*8], inp3
vmovq xmm1, IDX
vpunpcklqdq xmm1, xmm1, xmm1
lea IDX, [arg1 + _data_ptr]
vpaddq xmm0, xmm1, [IDX]
vpaddq xmm1, xmm1, [IDX+16]
vmovdqu [IDX], xmm0
vmovdqu [IDX+16], xmm1

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
23 changes: 7 additions & 16 deletions sha256_mb/sha256_mb_x8_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -463,22 +463,13 @@ Lrounds_16_xx:
vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h

; update input pointers
add inp0, IDX
mov [STATE + _args_data_ptr + 0*8], inp0
add inp1, IDX
mov [STATE + _args_data_ptr + 1*8], inp1
add inp2, IDX
mov [STATE + _args_data_ptr + 2*8], inp2
add inp3, IDX
mov [STATE + _args_data_ptr + 3*8], inp3
add inp4, IDX
mov [STATE + _args_data_ptr + 4*8], inp4
add inp5, IDX
mov [STATE + _args_data_ptr + 5*8], inp5
add inp6, IDX
mov [STATE + _args_data_ptr + 6*8], inp6
add inp7, IDX
mov [STATE + _args_data_ptr + 7*8], inp7
vmovq XWORD(TMP0), IDX
vpbroadcastq TMP1, XWORD(TMP0)
lea IDX, [STATE + _args_data_ptr]
vpaddq TMP0, TMP1, [IDX]
vpaddq TMP1, TMP1, [IDX + 32]
vmovdqu [IDX], TMP0
vmovdqu [IDX+32], TMP1

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
9 changes: 5 additions & 4 deletions sha512_mb/sha512_mb_x2_avx.asm
Original file line number Diff line number Diff line change
Expand Up @@ -337,10 +337,11 @@ Lrounds_16_xx:
vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h

; update input pointers
add inp0, IDX
mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
add inp1, IDX
mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
vmovq xmm0, IDX
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After giving it some extra thought, I don't think this change is worth making, for two reasons:
1 - It's replacing 4 instructions with 5 instructions
2 - We are removing the AVX code in the next few months (as I already mentioned).
So could you drop the changes in the AVX implementations and leave just the other ones?
Many thanks for the work!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I've just reverted the _avx functions to the old version.

vpunpcklqdq xmm0, xmm0
lea IDX, [STATE + _data_ptr_sha512]
vpaddq xmm0, xmm0, [IDX]
vmovdqu32 [IDX], xmm0
Shark64 marked this conversation as resolved.
Show resolved Hide resolved

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
13 changes: 5 additions & 8 deletions sha512_mb/sha512_mb_x4_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -379,14 +379,11 @@ Lrounds_16_xx:
vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h

;; update input data pointers
add inp0, IDX
mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
add inp1, IDX
mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
add inp2, IDX
mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
add inp3, IDX
mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
vmovq xmm0, IDX
lea IDX, [STATE + _data_ptr_sha512]
vpbroadcastq ymm0, xmm0
vpaddq ymm0, ymm0, [IDX]
vmovdqu [IDX], ymm0

;;;;;;;;;;;;;;;;
;; Postamble
Expand Down
13 changes: 3 additions & 10 deletions sha512_mb/sha512_mb_x8_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -494,16 +494,9 @@ lastLoop:
vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]

;; update input data pointers
%assign I 0
%rep 4
mov inp0, [IN + (2*I)*8]
mov inp1, [IN + (2*I +1)*8]
add inp0, IDX
add inp1, IDX
mov [IN + (2*I)*8], inp0
mov [IN + (2*I+1)*8], inp1
%assign I (I+1)
%endrep
vpbroadcastq TMP0, IDX
vpaddq TMP0, TMP0, [IN]
vmovdqu64 [IN], TMP0

VMOVDQ32 [DIGEST + 0*8*8], A
VMOVDQ32 [DIGEST + 1*8*8], B
Expand Down