Skip to content

Commit

Permalink
Use lddqu instead of loadu since it may perform better in certain situations
Browse files Browse the repository at this point in the history

Signed-off-by: João Silva <[email protected]>
  • Loading branch information
vankxr committed May 19, 2024
1 parent 7a07f99 commit ea58420
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions src/random/src/scramble.avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void scramble_data(unsigned char * _x,
// apply static masks
unsigned int i;
for (i=0; i<t; i+=32) {
-x = _mm256_loadu_si256((const __m256i_u *)&_x[i]);
+x = _mm256_lddqu_si256((const __m256i_u *)&_x[i]);

#if HAVE_AVX2
x = _mm256_xor_si256(x, mask);
Expand Down Expand Up @@ -97,7 +97,7 @@ void unscramble_data_soft(unsigned char * _x,
// apply static masks
unsigned int i;
for (i=0; i<t; i+=4) {
-x = _mm256_loadu_si256((const __m256i_u *)&_x[8 * i]);
+x = _mm256_lddqu_si256((const __m256i_u *)&_x[8 * i]);
y = _mm256_sub_epi8(mask, x);
x = _mm256_blendv_epi8(x, y, mask);
_mm256_storeu_si256((__m256i_u *)&_x[8 * i], x);
Expand Down
6 changes: 3 additions & 3 deletions src/random/src/scramble.sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ void scramble_data(unsigned char * _x,
// apply static masks
unsigned int i;
for (i=0; i<t; i+=16) {
-x = _mm_loadu_si128((const __m128i_u *)&_x[i]);
+x = _mm_lddqu_si128((const __m128i_u *)&_x[i]);

x = _mm_xor_si128(x, mask);

Expand Down Expand Up @@ -96,12 +96,12 @@ void unscramble_data_soft(unsigned char * _x,
// apply static masks
unsigned int i;
for (i=0; i<t; i+=4) {
-x = _mm_loadu_si128((const __m128i_u *)&_x[8 * i]);
+x = _mm_lddqu_si128((const __m128i_u *)&_x[8 * i]);
y = _mm_sub_epi8(mask01, x);
x = _mm_blendv_epi8(x, y, mask01);
_mm_storeu_si128((__m128i_u *)&_x[8 * i], x);

-x = _mm_loadu_si128((const __m128i_u *)&_x[8 * i + 16]);
+x = _mm_lddqu_si128((const __m128i_u *)&_x[8 * i + 16]);
y = _mm_sub_epi8(mask23, x);
x = _mm_blendv_epi8(x, y, mask23);
_mm_storeu_si128((__m128i_u *)&_x[8 * i + 16], x);
Expand Down

0 comments on commit ea58420

Please sign in to comment.