diff --git a/configure.ac b/configure.ac index 72518d1ce..225fbd96d 100644 --- a/configure.ac +++ b/configure.ac @@ -157,6 +157,15 @@ if test "${enable_simdoverride+set}" = set; then src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="" else # Check canonical system @@ -174,12 +183,37 @@ else # AVX512 : immintrin.h AX_EXT - if [ test "$ax_cv_have_avx512f_ext" = yes ]; then - # AVX512 extensions + if [ test "$ax_cv_have_avx512bw_ext" = yes ]; then + # AVX512-BW extensions MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ src/dotprod/src/dotprod_crcf.avx512f.o \ src/dotprod/src/dotprod_rrrf.avx512f.o \ src/dotprod/src/sumsq.avx512f.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx512f.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx512f.o" + ARCH_OPTION='-mavx512bw' + elif [ test "$ax_cv_have_avx512f_ext" = yes ]; then + # AVX512-F extensions + MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ + src/dotprod/src/dotprod_crcf.avx512f.o \ + src/dotprod/src/dotprod_rrrf.avx512f.o \ + src/dotprod/src/sumsq.avx512f.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx512f.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx512f.o" ARCH_OPTION='-mavx512f' elif [ test "$ax_cv_have_avx2_ext" = yes ]; then # AVX2 extensions @@ -187,6 +221,15 @@ else src/dotprod/src/dotprod_crcf.avx.o \ src/dotprod/src/dotprod_rrrf.avx.o \ src/dotprod/src/sumsq.avx.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx.o" ARCH_OPTION='-mavx2' elif [ test "$ax_cv_have_avx_ext" = yes ]; then # AVX extensions @@ -194,6 +237,15 @@ else src/dotprod/src/dotprod_crcf.avx.o \ src/dotprod/src/dotprod_rrrf.avx.o \ src/dotprod/src/sumsq.avx.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-mavx' elif [ test "$ax_cv_have_sse41_ext" = yes ]; then # SSE4.1/2 extensions @@ -201,6 +253,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.sse.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse4.1' elif [ test "$ax_cv_have_sse3_ext" = yes ]; then # SSE3 extensions @@ -208,6 +269,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.sse.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse3' elif [ test "$ax_cv_have_sse2_ext" = yes ]; then # SSE2 extensions @@ -215,6 +285,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse2' else # portable C version @@ -222,12 +301,30 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" fi;; powerpc*) MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.o \ src/dotprod/src/dotprod_rrrf.av.o \ src/dotprod/src/dotprod_crcf.av.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="-fno-common -faltivec";; armv1*|armv2*|armv3*|armv4*|armv5*|armv6*) # assume neon instructions are NOT available @@ -235,6 +332,15 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="-ffast-math";; arm|armv7*|armv8*) # assume neon instructions are available @@ -245,6 +351,15 @@ else src/dotprod/src/dotprod_crcf.neon.o \ src/dotprod/src/dotprod_rrrf.neon.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" case $target_os in darwin*) # M1 mac, ARM architecture : use neon extensions @@ -260,21 +375,19 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="";; esac fi - -# for now all vector operations are portable C versions -MLIBS_VECTOR="src/vector/src/vectorf_add.port.o \ - src/vector/src/vectorf_norm.port.o \ - src/vector/src/vectorf_mul.port.o \ - src/vector/src/vectorf_trig.port.o \ - src/vector/src/vectorcf_add.port.o \ - src/vector/src/vectorcf_norm.port.o \ - src/vector/src/vectorcf_mul.port.o \ - src/vector/src/vectorcf_trig.port.o" - case $target_os in darwin*) AN_MAKEVAR([LIBTOOL], [AC_PROG_LIBTOOL]) @@ -302,8 +415,9 @@ esac # autoconf variable substitutions # AC_SUBST(LIBS) # shared libraries (-lc, -lm, etc.) -AC_SUBST(MLIBS_DOTPROD) # +AC_SUBST(MLIBS_DOTPROD) # AC_SUBST(MLIBS_VECTOR) # +AC_SUBST(MLIBS_RANDOM) # AC_SUBST(AR_LIB) # archive library AC_SUBST(SH_LIB) # output shared library target diff --git a/makefile.in b/makefile.in index ee3e20c83..6780e72bf 100644 --- a/makefile.in +++ b/makefile.in @@ -208,21 +208,27 @@ src/dotprod/src/sumsq.o : %.o : %.c $(include_headers) # specific machine architectures -# AltiVec -src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers) - -# MMX/SSE2 -src/dotprod/src/dotprod_rrrf.mmx.o : %.o : %.c $(include_headers) -src/dotprod/src/dotprod_crcf.mmx.o : %.o : %.c $(include_headers) -src/dotprod/src/dotprod_cccf.mmx.o : %.o : %.c $(include_headers) - -src/dotprod/src/sumsq.mmx.o : %.o : %.c $(include_headers) +# AVX512F +src/dotprod/src/dotprod_rrrf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.avx512f.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/dotprod/src/dotprod_rrrf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers) -# SSE4.1/2 -src/dotprod/src/dotprod_rrrf.sse4.o : %.o : %.c $(include_headers) +# SSE2/SSE3/SSE4.1/SSE4.2 +src/dotprod/src/dotprod_rrrf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.sse.o : %.o : %.c $(include_headers) -# AVX -src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers) +# AltiVec +src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.av.o : %.o : %.c $(include_headers) # ARM Neon src/dotprod/src/dotprod_rrrf.neon.o : %.o : %.c $(include_headers) @@ -1049,10 +1055,11 @@ quantization_benchmarks := \ src/quantization/bench/quantizer_benchmark.c \ src/quantization/bench/compander_benchmark.c \ -# +# # MODULE : random # +# main objects that only have portable builds random_objects := \ src/random/src/rand.o \ src/random/src/randn.o \ @@ -1061,11 +1068,33 @@ random_objects := \ src/random/src/randgamma.o \ src/random/src/randnakm.o \ src/random/src/randricek.o \ - src/random/src/scramble.o \ - $(random_objects) : %.o : %.c $(include_headers) +# main objects list +random_objects += \ + @MLIBS_RANDOM@ \ + +# portable builds +src/random/src/scramble.o : %.o : %.c $(include_headers) + +# specific machine architectures + +# avx512f +src/random/src/scramble.avx512f.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/random/src/scramble.avx.o : %.o : %.c $(include_headers) + +# SSE2/SSE3/SSE4.1/SSE4.2 +src/random/src/scramble.sse.o : %.o : %.c $(include_headers) + +# AltiVec +# TODO... + +# ARM Neon +# TODO... + # autotests random_autotests := \ src/random/tests/scramble_autotest.c \ @@ -1136,17 +1165,31 @@ vector_objects := \ @MLIBS_VECTOR@ \ # portable builds -src/vector/src/vectorf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c -src/vector/src/vectorf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c -src/vector/src/vectorf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c -src/vector/src/vectorf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c -src/vector/src/vectorcf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c -src/vector/src/vectorcf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c -src/vector/src/vectorcf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c -src/vector/src/vectorcf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c - -# builds for specific architectures -# ... +src/vector/src/vectorf_add.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c +src/vector/src/vectorf_norm.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c +src/vector/src/vectorf_mul.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c +src/vector/src/vectorf_trig.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c +src/vector/src/vectorcf_add.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c +src/vector/src/vectorcf_norm.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c +src/vector/src/vectorcf_mul.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c +src/vector/src/vectorcf_trig.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c + +# specific machine architectures + +# avx512f +src/vector/src/vectorcf_mul.avx512f.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/vector/src/vectorcf_mul.avx.o : %.o : %.c $(include_headers) + +# SSE2/SSE3/SSE4.1/SSE4.2 +src/vector/src/vectorcf_mul.sse.o : %.o : %.c $(include_headers) + +# AltiVec +# TODO... + +# ARM Neon +# TODO... # vector autotest scripts vector_autotests := diff --git a/src/random/src/scramble.avx.c b/src/random/src/scramble.avx.c new file mode 100644 index 000000000..f990d5a04 --- /dev/null +++ b/src/random/src/scramble.avx.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2007 - 2015 Joseph Gaeddert + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// +// Scramble (AVX SIMD version) +// + +#include + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 32*(floor(_n/32)) + unsigned int t = (_n >> 5) << 5; + + __m256i x; + __m256i mask = _mm256_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 2) << 2; + + __m256i x; + __m256i mask = _mm256_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); + + // apply static masks + unsigned int i; + for (i=0; i + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 64*(floor(_n/64)) + unsigned int t = (_n >> 6) << 6; + + __m512i x; + __m512i mask = _mm512_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 3) << 3; + + __m512i x; + __m512i mask = _mm512_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); + + // apply static masks + unsigned int i; + for (i=0; i #include @@ -84,14 +86,14 @@ void unscramble_data_soft(unsigned char * _x, _x[8*i+j] = 255 - _x[8*i+j]; } #else - if ( mask & 0x80 ) _x[8*i+0] = 255 - _x[8*i+0]; - if ( mask & 0x40 ) _x[8*i+1] = 255 - _x[8*i+1]; - if ( mask & 0x20 ) _x[8*i+2] = 255 - _x[8*i+2]; - if ( mask & 0x10 ) _x[8*i+3] = 255 - _x[8*i+3]; - if ( mask & 0x08 ) _x[8*i+4] = 255 - _x[8*i+4]; - if ( mask & 0x04 ) _x[8*i+5] = 255 - _x[8*i+5]; - if ( mask & 0x02 ) _x[8*i+6] = 255 - _x[8*i+6]; - if ( mask & 0x01 ) _x[8*i+7] = 255 - _x[8*i+7]; + if ( mask & 0x80 ) _x[8*i+0] ^= 255; + if ( mask & 0x40 ) _x[8*i+1] ^= 255; + if ( mask & 0x20 ) _x[8*i+2] ^= 255; + if ( mask & 0x10 ) _x[8*i+3] ^= 255; + if ( mask & 0x08 ) _x[8*i+4] ^= 255; + if ( mask & 0x04 ) _x[8*i+5] ^= 255; + if ( mask & 0x02 ) _x[8*i+6] ^= 255; + if ( mask & 0x01 ) _x[8*i+7] ^= 255; #endif } } diff --git a/src/random/src/scramble.sse.c b/src/random/src/scramble.sse.c new file mode 100644 index 000000000..ccbf8430c --- /dev/null +++ b/src/random/src/scramble.sse.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2007 - 2015 Joseph Gaeddert + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// +// Scramble (SSE SIMD version) +// + +#if HAVE_SSE2 +#include +#endif + +#if HAVE_SSE4_1 +#include +#endif + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 16*(floor(_n/16)) + unsigned int t = (_n >> 4) << 4; + + __m128i x; + __m128i mask = _mm_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 2) << 2; + + __m128i x; + __m128i mask01 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); + __m128i mask23 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0); + + // apply static masks + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m256 rx, ry, rz; + + // t = 8*(floor(_n/8)) + unsigned int t = (n >> 3) << 3; + + unsigned int i; + for (i=0; i> 3) << 3; + + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m512 rx, ry, rz; + __m512 one = _mm512_set1_ps(1.0f); + + // t = 16*(floor(_n/16)) + unsigned int t = (n >> 4) << 4; + + unsigned int i; + for (i=0; i> 4) << 4; + + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m128 rx, ry, rz; + + // t = 4*(floor(_n/4)) + unsigned int t = (n >> 2) << 2; + + unsigned int i; + for (i=0; i> 2) << 2; + + unsigned int i; + for (i=0; i