Skip to content

Commit

Permalink
Use x XOR 255 to achieve 255-x to make it faster
Browse files Browse the repository at this point in the history
Signed-off-by: João Silva <[email protected]>
  • Loading branch information
vankxr committed May 20, 2024
1 parent ea58420 commit e1b83f5
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 61 deletions.
4 changes: 2 additions & 2 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ else
AX_EXT

if [ test "$ax_cv_have_avx512bw_ext" = yes ]; then
# AVX512 extensions
# AVX512-BW extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \
src/dotprod/src/dotprod_crcf.avx512f.o \
src/dotprod/src/dotprod_rrrf.avx512f.o \
Expand All @@ -200,7 +200,7 @@ else
MLIBS_RANDOM="src/random/src/scramble.avx512f.o"
ARCH_OPTION='-mavx512bw'
elif [ test "$ax_cv_have_avx512f_ext" = yes ]; then
# AVX512 extensions
# AVX512-F extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \
src/dotprod/src/dotprod_crcf.avx512f.o \
src/dotprod/src/dotprod_rrrf.avx512f.o \
Expand Down
32 changes: 15 additions & 17 deletions src/random/src/scramble.avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,10 @@ void unscramble_data(unsigned char * _x,
void unscramble_data_soft(unsigned char * _x,
unsigned int _n)
{
#if HAVE_AVX2
// t = 4*(floor(_n/4))
unsigned int t = (_n >> 2) << 2;

__m256i x;
__m256i y;
__m256i mask = _mm256_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0,
Expand All @@ -98,17 +96,17 @@ void unscramble_data_soft(unsigned char * _x,
unsigned int i;
for (i=0; i<t; i+=4) {
x = _mm256_lddqu_si256((const __m256i_u *)&_x[8 * i]);
y = _mm256_sub_epi8(mask, x);
x = _mm256_blendv_epi8(x, y, mask);

#if HAVE_AVX2
x = _mm256_xor_si256(x, mask);
#else
x = (__m256i)_mm256_xor_pd((__m256d)x, (__m256d)mask); // Same effect as _mm256_xor_si256, but maybe higher latency
#endif

_mm256_storeu_si256((__m256i_u *)&_x[8 * i], x);
}

for (; i<_n; i++) {
#else
// apply static masks
unsigned int i;
for (i=0; i<_n; i++) {
#endif
unsigned char mask;

switch ( i % 4 ) {
Expand All @@ -119,14 +117,14 @@ void unscramble_data_soft(unsigned char * _x,
default:;
}

if ( mask & 0x80 ) _x[8*i+0] = 255 - _x[8*i+0];
if ( mask & 0x40 ) _x[8*i+1] = 255 - _x[8*i+1];
if ( mask & 0x20 ) _x[8*i+2] = 255 - _x[8*i+2];
if ( mask & 0x10 ) _x[8*i+3] = 255 - _x[8*i+3];
if ( mask & 0x08 ) _x[8*i+4] = 255 - _x[8*i+4];
if ( mask & 0x04 ) _x[8*i+5] = 255 - _x[8*i+5];
if ( mask & 0x02 ) _x[8*i+6] = 255 - _x[8*i+6];
if ( mask & 0x01 ) _x[8*i+7] = 255 - _x[8*i+7];
if ( mask & 0x80 ) _x[8*i+0] ^= 255;
if ( mask & 0x40 ) _x[8*i+1] ^= 255;
if ( mask & 0x20 ) _x[8*i+2] ^= 255;
if ( mask & 0x10 ) _x[8*i+3] ^= 255;
if ( mask & 0x08 ) _x[8*i+4] ^= 255;
if ( mask & 0x04 ) _x[8*i+5] ^= 255;
if ( mask & 0x02 ) _x[8*i+6] ^= 255;
if ( mask & 0x01 ) _x[8*i+7] ^= 255;
}
}

48 changes: 27 additions & 21 deletions src/random/src/scramble.avx512f.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,32 +83,38 @@ void unscramble_data(unsigned char * _x,
void unscramble_data_soft(unsigned char * _x,
unsigned int _n)
{
#if HAVE_AVX512_BW
// t = 8*(floor(_n/8))
unsigned int t = (_n >> 3) << 3;

__m512i x;
__mmask64 mask = ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK3] << 24) |
((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK2] << 16) |
((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK1] << 8) |
(__mmask64)(liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK0] << 0);
mask |= (mask << 32);
__m512i max = _mm512_set1_epi8(255);
__m512i mask = _mm512_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0);

// apply static masks
unsigned int i;
for (i=0; i<t; i+=8) {
x = _mm512_loadu_si512((const __m512i_u *)&_x[8 * i]);
x = _mm512_mask_sub_epi8(x, mask, max, x);

x = _mm512_xor_si512(x, mask);

_mm512_storeu_si512((__m512i_u *)&_x[8 * i], x);
}

for (; i<_n; i++) {
#else
// apply static masks
unsigned int i;
for (i=0; i<_n; i++) {
#endif
unsigned char mask;

switch ( i % 4 ) {
Expand All @@ -119,14 +125,14 @@ void unscramble_data_soft(unsigned char * _x,
default:;
}

if ( mask & 0x80 ) _x[8*i+0] = 255 - _x[8*i+0];
if ( mask & 0x40 ) _x[8*i+1] = 255 - _x[8*i+1];
if ( mask & 0x20 ) _x[8*i+2] = 255 - _x[8*i+2];
if ( mask & 0x10 ) _x[8*i+3] = 255 - _x[8*i+3];
if ( mask & 0x08 ) _x[8*i+4] = 255 - _x[8*i+4];
if ( mask & 0x04 ) _x[8*i+5] = 255 - _x[8*i+5];
if ( mask & 0x02 ) _x[8*i+6] = 255 - _x[8*i+6];
if ( mask & 0x01 ) _x[8*i+7] = 255 - _x[8*i+7];
if ( mask & 0x80 ) _x[8*i+0] ^= 255;
if ( mask & 0x40 ) _x[8*i+1] ^= 255;
if ( mask & 0x20 ) _x[8*i+2] ^= 255;
if ( mask & 0x10 ) _x[8*i+3] ^= 255;
if ( mask & 0x08 ) _x[8*i+4] ^= 255;
if ( mask & 0x04 ) _x[8*i+5] ^= 255;
if ( mask & 0x02 ) _x[8*i+6] ^= 255;
if ( mask & 0x01 ) _x[8*i+7] ^= 255;
}
}

16 changes: 8 additions & 8 deletions src/random/src/scramble.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,14 @@ void unscramble_data_soft(unsigned char * _x,
_x[8*i+j] = 255 - _x[8*i+j];
}
#else
if ( mask & 0x80 ) _x[8*i+0] = 255 - _x[8*i+0];
if ( mask & 0x40 ) _x[8*i+1] = 255 - _x[8*i+1];
if ( mask & 0x20 ) _x[8*i+2] = 255 - _x[8*i+2];
if ( mask & 0x10 ) _x[8*i+3] = 255 - _x[8*i+3];
if ( mask & 0x08 ) _x[8*i+4] = 255 - _x[8*i+4];
if ( mask & 0x04 ) _x[8*i+5] = 255 - _x[8*i+5];
if ( mask & 0x02 ) _x[8*i+6] = 255 - _x[8*i+6];
if ( mask & 0x01 ) _x[8*i+7] = 255 - _x[8*i+7];
if ( mask & 0x80 ) _x[8*i+0] ^= 255;
if ( mask & 0x40 ) _x[8*i+1] ^= 255;
if ( mask & 0x20 ) _x[8*i+2] ^= 255;
if ( mask & 0x10 ) _x[8*i+3] ^= 255;
if ( mask & 0x08 ) _x[8*i+4] ^= 255;
if ( mask & 0x04 ) _x[8*i+5] ^= 255;
if ( mask & 0x02 ) _x[8*i+6] ^= 255;
if ( mask & 0x01 ) _x[8*i+7] ^= 255;
#endif
}
}
Expand Down
23 changes: 10 additions & 13 deletions src/random/src/scramble.sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ void unscramble_data_soft(unsigned char * _x,
unsigned int t = (_n >> 2) << 2;

__m128i x;
__m128i y;
__m128i mask01 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0,
(LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0,
Expand All @@ -97,13 +96,11 @@ void unscramble_data_soft(unsigned char * _x,
unsigned int i;
for (i=0; i<t; i+=4) {
x = _mm_lddqu_si128((const __m128i_u *)&_x[8 * i]);
y = _mm_sub_epi8(mask01, x);
x = _mm_blendv_epi8(x, y, mask01);
x = _mm_xor_si128(x, mask01);
_mm_storeu_si128((__m128i_u *)&_x[8 * i], x);

x = _mm_lddqu_si128((const __m128i_u *)&_x[8 * i + 16]);
y = _mm_sub_epi8(mask23, x);
x = _mm_blendv_epi8(x, y, mask23);
x = _mm_xor_si128(x, mask23);
_mm_storeu_si128((__m128i_u *)&_x[8 * i + 16], x);
}

Expand All @@ -123,14 +120,14 @@ void unscramble_data_soft(unsigned char * _x,
default:;
}

if ( mask & 0x80 ) _x[8*i+0] = 255 - _x[8*i+0];
if ( mask & 0x40 ) _x[8*i+1] = 255 - _x[8*i+1];
if ( mask & 0x20 ) _x[8*i+2] = 255 - _x[8*i+2];
if ( mask & 0x10 ) _x[8*i+3] = 255 - _x[8*i+3];
if ( mask & 0x08 ) _x[8*i+4] = 255 - _x[8*i+4];
if ( mask & 0x04 ) _x[8*i+5] = 255 - _x[8*i+5];
if ( mask & 0x02 ) _x[8*i+6] = 255 - _x[8*i+6];
if ( mask & 0x01 ) _x[8*i+7] = 255 - _x[8*i+7];
if ( mask & 0x80 ) _x[8*i+0] ^= 255;
if ( mask & 0x40 ) _x[8*i+1] ^= 255;
if ( mask & 0x20 ) _x[8*i+2] ^= 255;
if ( mask & 0x10 ) _x[8*i+3] ^= 255;
if ( mask & 0x08 ) _x[8*i+4] ^= 255;
if ( mask & 0x04 ) _x[8*i+5] ^= 255;
if ( mask & 0x02 ) _x[8*i+6] ^= 255;
if ( mask & 0x01 ) _x[8*i+7] ^= 255;
}
}

0 comments on commit e1b83f5

Please sign in to comment.