Skip to content

Commit

Permalink
Optimized dataset read (#211)
Browse files Browse the repository at this point in the history
* Optimized dataset read

Reading the dataset item had a false dependency on readReg2 and readReg3, caused by the `xor rbp, rax` instruction (see design.md, section 4.6.2 "Loop execution", steps 5 and 7). This change uses the `ma` register to read the dataset item before `rbp` as a whole (both `ma` and `mx`) is modified, so superscalar, out-of-order CPUs can start executing the read earlier.

Results: https://i.imgur.com/Bpeq9mx.png

~1% speedup on modern Intel/AMD CPUs.

* ARMv8: optimized dataset read

Break the false dependency on readReg2 and readReg3.

* Fixed light mode hashing
  • Loading branch information
SChernykh authored May 22, 2021
1 parent c120974 commit 3c8c7ee
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 18 deletions.
1 change: 1 addition & 0 deletions src/asm/program_prologue_linux.inc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
mov rsi, rdx ;# uint8_t* scratchpad

mov rax, rbp
ror rbp, 32

;# zero integer registers
xor r8, r8
Expand Down
1 change: 1 addition & 0 deletions src/asm/program_prologue_win64.inc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
mov rbx, r9 ;# loop counter

mov rax, rbp
ror rbp, 32

;# zero integer registers
xor r8, r8
Expand Down
23 changes: 11 additions & 12 deletions src/asm/program_read_dataset.inc
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
mov ecx, ebp ;# ecx = ma
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
and edx, RANDOMX_DATASET_BASE_MASK
lea rcx, [rdi+rdx] ;# dataset cache line
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]
8 changes: 4 additions & 4 deletions src/asm/program_read_dataset_sshash_init.inc
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
mov qword ptr [rsp+16], r13
mov qword ptr [rsp+8], r14
mov qword ptr [rsp+0], r15
xor rbp, rax ;# modify "mx"
ror rbp, 32 ;# swap "ma" and "mx"
mov ebx, ebp ;# ecx = ma
and ebx, RANDOMX_DATASET_BASE_MASK
shr ebx, 6 ;# ebx = Dataset block number
xor rbp, rax ;# modify "mx"
mov rbx, rbp ;# ebx = ma
shr rbx, 38
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
;# add ebx, datasetOffset / 64
;# call 32768
5 changes: 3 additions & 2 deletions src/jit_compiler_a64_static.S
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0

DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
lsr x10, x9, 32

# mx ^= r[readReg2] ^ r[readReg3];
eor x9, x9, x18
Expand All @@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
# mx <-> ma
ror x9, x9, 32

# Calculate dataset pointer for dataset read
mov w10, w9
DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler
and x10, x10, 1
Expand Down

0 comments on commit 3c8c7ee

Please sign in to comment.