diff --git a/src/bitshuffle/.github/dependabot.yml b/src/bitshuffle/.github/dependabot.yml new file mode 100644 index 00000000..7bb4cf76 --- /dev/null +++ b/src/bitshuffle/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/src/bitshuffle/.github/workflows/lint.yml b/src/bitshuffle/.github/workflows/lint.yml index 6d828a1c..a0df1fc1 100644 --- a/src/bitshuffle/.github/workflows/lint.yml +++ b/src/bitshuffle/.github/workflows/lint.yml @@ -12,10 +12,10 @@ jobs: lint-code: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.10 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/src/bitshuffle/.github/workflows/main.yml b/src/bitshuffle/.github/workflows/main.yml index 8ec96b64..6bab0ebe 100644 --- a/src/bitshuffle/.github/workflows/main.yml +++ b/src/bitshuffle/.github/workflows/main.yml @@ -20,7 +20,7 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install apt dependencies if: ${{ matrix.os == 'ubuntu-latest' }} @@ -33,7 +33,7 @@ jobs: brew install hdf5 pkg-config - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/src/bitshuffle/.github/workflows/wheels.yml b/src/bitshuffle/.github/workflows/wheels.yml index def84e0b..06a5c919 100644 --- a/src/bitshuffle/.github/workflows/wheels.yml +++ b/src/bitshuffle/.github/workflows/wheels.yml @@ -17,22 +17,26 @@ jobs: steps: # Checkout bitshuffle - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 # Build wheels for linux and x86 platforms - name: Build wheels - uses: pypa/cibuildwheel@v2.3.1 + uses: pypa/cibuildwheel@v2.11.2 with: output-dir: ./wheelhouse-hdf5-${{ matrix.hdf5}} env: - CIBW_SKIP: "pp* *musllinux*" - CIBW_ARCHS_LINUX: "x86_64" + CIBW_SKIP: "pp* *musllinux* cp311-macosx*" + CIBW_ARCHS: "x86_64" CIBW_BEFORE_ALL: | chmod +x .github/workflows/install_hdf5.sh .github/workflows/install_hdf5.sh ${{ matrix.hdf5 }} git submodule update --init - CIBW_ENVIRONMENT: | - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ENABLE_ZSTD=1 + # Only build Haswell wheels on x86 for compatibility + CIBW_ENVIRONMENT: > + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + CPATH=/usr/local/include + ENABLE_ZSTD=1 + BITSHUFFLE_ARCH=haswell CIBW_TEST_REQUIRES: pytest # Install different version of HDF5 for unit tests to ensure the # wheels are independent of HDF5 installation @@ -41,9 +45,11 @@ jobs: # .github/workflows/install_hdf5.sh 1.8.11 # Run units tests but disable test_h5plugin.py CIBW_TEST_COMMAND: pytest {package}/tests + # The Github runners for macOS don't support AVX2 instructions and so the tests will fail with SIGILL, so skip them + CIBW_TEST_SKIP: "*macosx*" # Package wheels and host on CI - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: path: ./wheelhouse-hdf5-${{ matrix.hdf5 }}/*.whl @@ -55,14 +61,14 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install apt dependencies run: | sudo apt-get install -y libhdf5-serial-dev hdf5-tools pkg-config - name: Install Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -73,7 +79,7 @@ jobs: - name: Build sdist run: python setup.py sdist - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: path: dist/*.tar.gz @@ -86,12 +92,12 @@ jobs: # Alternatively, to publish when a GitHub Release is created, use the following rule: if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v3 with: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.4.2 + - uses: pypa/gh-action-pypi-publish@v1.5.1 with: user: __token__ password: ${{ secrets.pypi_password }} diff --git a/src/bitshuffle/README.rst b/src/bitshuffle/README.rst index 7e4be25f..8452e407 100644 --- a/src/bitshuffle/README.rst +++ b/src/bitshuffle/README.rst @@ -17,7 +17,7 @@ except it operates at the bit level instead of the byte level. Arranging a typed data array in to a matrix with the elements as the rows and the bits within the elements as the columns, Bitshuffle "transposes" the matrix, such that all the least-significant-bits are in a row, etc. This transpose -is performed within blocks of data roughly 8kB long [1]_. +is performed within blocks of data roughly 8 kB long [1]_. This does not in itself compress data, only rearranges it for more efficient compression. To perform the actual compression you will need a compression @@ -97,20 +97,35 @@ Comparing Bitshuffle to other compression algorithms and HDF5 filters: Installation for Python ----------------------- -Installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python -(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically -loaded HDF5 filter requires HDF5 1.8.11 or later. If ZSTD support is enabled the ZSTD -repo needs to pulled into bitshuffle before installation with:: + +In most cases bitshuffle can be installed by `pip`:: + + pip install bitshuffle + +On Linux and macOS x86_64 platforms binary wheels are available, on other platforms a +source build will be performed. The binary wheels are built with AVX2 support and will +only run processors that support these instructions (most processors from 2015 onwards, +i.e. Intel Haswell, AMD Excavator and later). On an unsupported processor these builds +of bitshuffle will crash with `SIGILL`. To run on unsupported x86_64 processors, or +target newer instructions such as AVX512, you should perform a build from source. +This can be forced by giving pip the `--no-binary=bitshuffle` option. + +Source installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python +(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically +loaded HDF5 filter requires HDF5 1.8.11 or later. + +For total control, bitshuffle can be built using `python setup.py`. If ZSTD support is +to be enabled the ZSTD repo needs to pulled into bitshuffle before installation with:: git submodule update --init -To install bitshuffle:: +To build and install bitshuffle:: python setup.py install [--h5plugin [--h5plugin-dir=spam] --zstd] -To get finer control of installation options, including whether to compile -with OpenMP multi-threading, copy the ``setup.cfg.example`` to ``setup.cfg`` -and edit the values therein. +To get finer control of installation options, including whether to compile with OpenMP +multi-threading and the target microarchitecture copy the ``setup.cfg.example`` to +``setup.cfg`` and edit the values therein. If using the dynamically loaded HDF5 filter (which gives you access to the Bitshuffle and LZF filters outside of python), set the environment variable @@ -143,9 +158,9 @@ interface or through the convenience functions provided in version 2.5.0 and later Bitshuffle can be added to new datasets through the high level interface, as in the example below. -The compression algorithm can be configured using the `filter_opts` in -`bitshuffle.h5.create_dataset()`. LZ4 is chosen with: -`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with: +The compression algorithm can be configured using the `filter_opts` in +`bitshuffle.h5.create_dataset()`. LZ4 is chosen with: +`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with: `(BLOCK_SIZE, h5.H5_COMPRESS_ZSTD, COMP_LVL)`. See `test_h5filter.py` for an example. Example h5py @@ -214,6 +229,27 @@ Then, you use them like this:: .. _`snappy-java`: https://github.com/xerial/snappy-java +Rust HDF5 plugin +---------------- + +If you wish to open HDF5 files compressed with bitshuffle in your Rust program, there is a `Rust binding`_ for it. +In your Cargo.toml:: + + [dependencies] + ... + hdf5-bitshuffle = "0.9" + ... + +To register the plugin in your code:: + + use hdf5_bitshuffle::register_bitshuffle_plugin; + + fn main() { + register_bitshuffle_plugin(); + } + +.. _`Rust binding`: https://docs.rs/hdf5-bitshuffle/latest/hdf5_bitshuffle/ + Anaconda -------- diff --git a/src/bitshuffle/bitshuffle/__init__.py b/src/bitshuffle/bitshuffle/__init__.py index 3f7c0380..896d993a 100644 --- a/src/bitshuffle/bitshuffle/__init__.py +++ b/src/bitshuffle/bitshuffle/__init__.py @@ -8,6 +8,7 @@ using_NEON using_SSE2 using_AVX2 + using_AVX512 bitshuffle bitunshuffle compress_lz4 @@ -28,6 +29,7 @@ using_NEON, using_SSE2, using_AVX2, + using_AVX512, compress_lz4, decompress_lz4, ) @@ -49,6 +51,7 @@ "using_NEON", "using_SSE2", "using_AVX2", + "using_AVX512", "compress_lz4", "decompress_lz4", ] + zstd_api diff --git a/src/bitshuffle/bitshuffle/ext.pyx b/src/bitshuffle/bitshuffle/ext.pyx index edc9c588..2d4cc4c3 100644 --- a/src/bitshuffle/bitshuffle/ext.pyx +++ b/src/bitshuffle/bitshuffle/ext.pyx @@ -24,6 +24,7 @@ cdef extern from b"bitshuffle.h": int bshuf_using_NEON() int bshuf_using_SSE2() int bshuf_using_AVX2() + int bshuf_using_AVX512() int bshuf_bitshuffle(void *A, void *B, int size, int elem_size, int block_size) nogil int bshuf_bitunshuffle(void *A, void *B, int size, int elem_size, @@ -60,7 +61,9 @@ cdef extern int bshuf_trans_bit_byte_scal(void *A, void *B, int size, int elem_s cdef extern int bshuf_trans_bit_byte_SSE(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_byte_NEON(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_byte_AVX(void *A, void *B, int size, int elem_size) +cdef extern int bshuf_trans_bit_byte_AVX512(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bitrow_eight(void *A, void *B, int size, int elem_size) +cdef extern int bshuf_trans_bit_elem_AVX512(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_elem_AVX(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_elem_SSE(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_elem_NEON(void *A, void *B, int size, int elem_size) @@ -73,9 +76,11 @@ cdef extern int bshuf_shuffle_bit_eightelem_scal(void *A, void *B, int size, int cdef extern int bshuf_shuffle_bit_eightelem_SSE(void *A, void *B, int size, int elem_size) cdef extern int bshuf_shuffle_bit_eightelem_NEON(void *A, void *B, int size, int elem_size) cdef extern int bshuf_shuffle_bit_eightelem_AVX(void *A, void *B, int size, int elem_size) +cdef extern int bshuf_shuffle_bit_eightelem_AVX512(void *A, void *B, int size, int elem_size) cdef extern int bshuf_untrans_bit_elem_SSE(void *A, void *B, int size, int elem_size) cdef extern int bshuf_untrans_bit_elem_NEON(void *A, void *B, int size, int elem_size) cdef extern int bshuf_untrans_bit_elem_AVX(void *A, void *B, int size, int elem_size) +cdef extern int bshuf_untrans_bit_elem_AVX512(void *A, void *B, int size, int elem_size) cdef extern int bshuf_untrans_bit_elem_scal(void *A, void *B, int size, int elem_size) cdef extern int bshuf_trans_bit_elem(void *A, void *B, int size, int elem_size) cdef extern int bshuf_untrans_bit_elem(void *A, void *B, int size, int elem_size) @@ -108,6 +113,14 @@ def using_AVX2(): return False +def using_AVX512(): + """Whether compiled using AVX512 instructions.""" + if bshuf_using_AVX512(): + return True + else: + return False + + def _setup_arr(arr): shape = tuple(arr.shape) if not arr.flags['C_CONTIGUOUS']: @@ -188,10 +201,18 @@ def trans_bit_byte_AVX(np.ndarray arr not None): return _wrap_C_fun(&bshuf_trans_bit_byte_AVX, arr) +def trans_bit_byte_AVX512(np.ndarray arr not None): + return _wrap_C_fun(&bshuf_trans_bit_byte_AVX512, arr) + + def trans_bitrow_eight(np.ndarray arr not None): return _wrap_C_fun(&bshuf_trans_bitrow_eight, arr) +def trans_bit_elem_AVX512(np.ndarray arr not None): + return _wrap_C_fun(&bshuf_trans_bit_elem_AVX512, arr) + + def trans_bit_elem_AVX(np.ndarray arr not None): return _wrap_C_fun(&bshuf_trans_bit_elem_AVX, arr) @@ -240,6 +261,10 @@ def shuffle_bit_eightelem_AVX(np.ndarray arr not None): return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX, arr) +def shuffle_bit_eightelem_AVX512(np.ndarray arr not None): + return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX512, arr) + + def untrans_bit_elem_SSE(np.ndarray arr not None): return _wrap_C_fun(&bshuf_untrans_bit_elem_SSE, arr) @@ -252,6 +277,10 @@ def untrans_bit_elem_AVX(np.ndarray arr not None): return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX, arr) +def untrans_bit_elem_AVX512(np.ndarray arr not None): + return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX512, arr) + + def untrans_bit_elem_scal(np.ndarray arr not None): return _wrap_C_fun(&bshuf_untrans_bit_elem_scal, arr) diff --git a/src/bitshuffle/setup.cfg.example b/src/bitshuffle/setup.cfg.example index 6bd2ccfb..2cdf0c70 100644 --- a/src/bitshuffle/setup.cfg.example +++ b/src/bitshuffle/setup.cfg.example @@ -4,7 +4,7 @@ h5plugin = 0 h5plugin-dir = /usr/local/hdf5/lib/plugin [build_ext] -# Whether to compile with OpenMP multi-threading. Default is system dependant: +# Whether to compile with OpenMP multi-threading. Default is system dependent: # False on OSX (since the clang compiler does not yet support OpenMP) and True # otherwise. omp = 1 diff --git a/src/bitshuffle/setup.py b/src/bitshuffle/setup.py index ff99b8ef..b8ca9cf1 100644 --- a/src/bitshuffle/setup.py +++ b/src/bitshuffle/setup.py @@ -18,8 +18,8 @@ VERSION_MAJOR = 0 -VERSION_MINOR = 4 -VERSION_POINT = 2 +VERSION_MINOR = 5 +VERSION_POINT = 1 # Define ZSTD macro for cython compilation default_options["compile_time_env"] = {"ZSTD_SUPPORT": False} @@ -45,14 +45,21 @@ H5PLUGINS_DEFAULT = "/usr/local/hdf5/lib/plugin" -MARCH_DEFAULT = "native" -# OSX's clang compliler does not support OpenMP. +# OSX's clang compiler does not support OpenMP. if sys.platform == "darwin": OMP_DEFAULT = False else: OMP_DEFAULT = True +# Build against the native architecture unless overridden by an environment variable +# This can also be overridden by a direct command line argument, or a `setup.cfg` entry +# This option is needed for the cibuildwheel action +if "BITSHUFFLE_ARCH" in os.environ: + MARCH_DEFAULT = os.environ["BITSHUFFLE_ARCH"] +else: + MARCH_DEFAULT = "native" + FALLBACK_CONFIG = { "include_dirs": [], "library_dirs": [], @@ -201,9 +208,32 @@ def pkgconfig(*packages, **kw): ) -EXTENSIONS = [ext_bshuf, h5filter] +EXTENSIONS = [ + ext_bshuf, +] + +# Check for HDF5 support +HDF5_FILTER_SUPPORT = False +CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else [] +for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS: + if os.path.exists(os.path.join(p, "hdf5.h")): + HDF5_FILTER_SUPPORT = True + +if HDF5_FILTER_SUPPORT: + EXTENSIONS.append(h5filter) + +# Check for plugin hdf5 plugin support (hdf5 >= 1.8.11) +HDF5_PLUGIN_SUPPORT = False +CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else [] +for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS: + if os.path.exists(os.path.join(p, "H5PLextern.h")): + HDF5_PLUGIN_SUPPORT = True + +if HDF5_PLUGIN_SUPPORT: + EXTENSIONS.extend([filter_plugin, lzf_plugin]) # For enabling ZSTD support when building wheels +# This needs to be done after all Extensions have been added to EXTENSIONS if "ENABLE_ZSTD" in os.environ: default_options["compile_time_env"] = {"ZSTD_SUPPORT": True} for ext in EXTENSIONS: @@ -217,16 +247,6 @@ def pkgconfig(*packages, **kw): ext.depends += zstd_headers ext.define_macros += [("ZSTD_SUPPORT", 1)] -# Check for plugin hdf5 plugin support (hdf5 >= 1.8.11) -HDF5_PLUGIN_SUPPORT = False -CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else [] -for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS: - if os.path.exists(os.path.join(p, "H5PLextern.h")): - HDF5_PLUGIN_SUPPORT = True - -if HDF5_PLUGIN_SUPPORT: - EXTENSIONS.extend([filter_plugin, lzf_plugin]) - class develop(develop_): def run(self): @@ -344,10 +364,25 @@ def finalize_options(self): def build_extensions(self): c = self.compiler.compiler_type + # Set compiler flags including architecture + if self.compiler.compiler_type == "msvc": + openmpflag = "/openmp" + compileflags = COMPILE_FLAGS_MSVC + else: + openmpflag = "-fopenmp" + archi = platform.machine() + if archi in ("i386", "x86_64"): + compileflags = COMPILE_FLAGS + ["-march=%s" % self.march] + else: + compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march] + if archi == "ppc64le": + compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"] + if self.omp not in ("0", "1", True, False): raise ValueError("Invalid omp argument. Mut be '0' or '1'.") self.omp = int(self.omp) + # Add the appropriate OpenMP flags if needed if self.omp: if not hasattr(self, "_printed_omp_message"): self._printed_omp_message = True @@ -356,26 +391,15 @@ def build_extensions(self): print("#################################\n") # More portable to pass -fopenmp to linker. # self.libraries += ['gomp'] - if self.compiler.compiler_type == "msvc": - openmpflag = "/openmp" - compileflags = COMPILE_FLAGS_MSVC - else: - openmpflag = "-fopenmp" - archi = platform.machine() - if archi in ("i386", "x86_64"): - compileflags = COMPILE_FLAGS + ["-march=%s" % self.march] - else: - compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march] - if archi == "ppc64le": - compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"] - for e in self.extensions: - e.extra_compile_args = list( - set(e.extra_compile_args).union(compileflags) - ) - if openmpflag not in e.extra_compile_args: - e.extra_compile_args += [openmpflag] - if openmpflag not in e.extra_link_args: - e.extra_link_args += [openmpflag] + compileflags += [openmpflag] + linkflags = [openmpflag] + else: + linkflags = [] + + # Add the compile/link options to each extension + for e in self.extensions: + e.extra_compile_args = list(set(e.extra_compile_args).union(compileflags)) + e.extra_link_args = list(set(e.extra_link_args).union(linkflags)) build_ext_.build_extensions(self) diff --git a/src/bitshuffle/src/bitshuffle_core.c b/src/bitshuffle/src/bitshuffle_core.c index ef33bf55..ba41473f 100644 --- a/src/bitshuffle/src/bitshuffle_core.c +++ b/src/bitshuffle/src/bitshuffle_core.c @@ -16,6 +16,10 @@ #include +#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__) +#define USEAVX512 +#endif + #if defined(__AVX2__) && defined (__SSE2__) #define USEAVX2 #endif @@ -79,6 +83,14 @@ int bshuf_using_AVX2(void) { } +int bshuf_using_AVX512(void) { +#ifdef USEAVX512 + return 1; +#else + return 0; +#endif +} + /* ---- Worker code not requiring special instruction sets. ---- * * The following code does not use any x86 specific vectorized instructions @@ -1384,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t */ #ifdef USEAVX2 - /* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, const size_t elem_size) { @@ -1625,6 +1636,162 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, #endif // #ifdef USEAVX2 +#ifdef USEAVX512 + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + size_t nbyte = elem_size * size; + int64_t count; + + int64_t* out_i64; + __m512i zmm; + __mmask64 bt; + if (nbyte >= 64) { + const __m512i mask = _mm512_set1_epi8(0); + + for (ii = 0; ii + 63 < nbyte; ii += 64) { + zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm512_cmp_epi8_mask(zmm, mask, 1); + zmm = _mm512_slli_epi16(zmm, 1); + out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_i64 = (int64_t)bt; + } + } + } + + __m256i ymm; + int32_t bt32; + int32_t* out_i32; + size_t start = nbyte - nbyte % 64; + for (ii = start; ii + 31 < nbyte; ii += 32) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt32 = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_i32 = bt32; + } + } + + + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 64 % 32); + + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; + +} + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + // With a bit of care, this could be written such that such that it is + // in_buf = out_buf safe. + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + __m512i zmm; + __mmask64 bt; + + if (elem_size % 8) { + return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size); + } else { + const __m512i mask = _mm512_set1_epi8(0); + for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm512_cmp_epi8_mask(zmm, mask, 1); + zmm = _mm512_slli_epi16(zmm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + * (int64_t *) &out_b[ind] = bt; + } + } + } + + } + return size * elem_size; +} + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size); + + free(tmp_buf); + return count; +} + +#else // #ifdef USEAVX512 + +int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + + return -14; +} + +int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -14; + +} + +int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -14; +} + +int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -14; +} + +#endif /* ---- Drivers selecting best instruction set at compile time. ---- */ @@ -1632,7 +1799,9 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; -#ifdef USEAVX2 +#ifdef USEAVX512 + count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size); +#elif defined USEAVX2 count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size); #elif defined(USESSE2) count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size); @@ -1649,7 +1818,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; -#ifdef USEAVX2 +#ifdef USEAVX512 + count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size); +#elif defined USEAVX2 count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size); #elif defined(USESSE2) count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size); diff --git a/src/bitshuffle/src/bitshuffle_core.h b/src/bitshuffle/src/bitshuffle_core.h index fba7301c..af09b1c4 100644 --- a/src/bitshuffle/src/bitshuffle_core.h +++ b/src/bitshuffle/src/bitshuffle_core.h @@ -19,6 +19,7 @@ * -11 : Missing SSE. * -12 : Missing AVX. * -13 : Missing Arm Neon. + * -14 : Missing AVX512. * -80 : Input size not a multiple of 8. * -81 : block_size not multiple of 8. * -91 : Decompression error, wrong number of bytes processed. @@ -91,6 +92,18 @@ int bshuf_using_NEON(void); int bshuf_using_AVX2(void); +/* ---- bshuf_using_AVX512 ---- + * + * Whether routines where compiled with the AVX512 instruction set. + * + * Returns + * ------- + * 1 if using AVX512, 0 otherwise. + * + */ +int bshuf_using_AVX512(void); + + /* ---- bshuf_default_block_size ---- * * The default block size as function of element size. diff --git a/src/bitshuffle/src/bshuf_h5filter.c b/src/bitshuffle/src/bshuf_h5filter.c index 114b91ff..54de27d1 100644 --- a/src/bitshuffle/src/bshuf_h5filter.c +++ b/src/bitshuffle/src/bshuf_h5filter.c @@ -25,7 +25,7 @@ void bshuf_write_uint32_BE(void* buf, uint32_t num); uint32_t bshuf_read_uint32_BE(const void* buf); -// Only called on compresion, not on reverse. +// Only called on compression, not on reverse. herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){ herr_t r; @@ -192,7 +192,7 @@ size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts, // Bit shuffle/compress. // Write the header, described in // http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf. - // Techincally we should be using signed integers instead of + // Technically we should be using signed integers instead of // unsigned ones, however for valid inputs (positive numbers) these // have the same representation. bshuf_write_uint64_BE(out_buf, nbytes_uncomp); diff --git a/src/bitshuffle/src/bshuf_h5filter.h b/src/bitshuffle/src/bshuf_h5filter.h index 54ee6775..0c6f153d 100644 --- a/src/bitshuffle/src/bshuf_h5filter.h +++ b/src/bitshuffle/src/bshuf_h5filter.h @@ -13,7 +13,7 @@ * * Filter Options * -------------- - * block_size (option slot 0) : interger (optional) + * block_size (option slot 0) : integer (optional) * What block size to use (in elements not bytes). Default is 0, * for which bitshuffle will pick a block size with a target of 8kb. * Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4 diff --git a/src/bitshuffle/src/iochain.c b/src/bitshuffle/src/iochain.c index baa97296..37015614 100644 --- a/src/bitshuffle/src/iochain.c +++ b/src/bitshuffle/src/iochain.c @@ -1,5 +1,5 @@ /* - * IOchain - Distribute a chain of dependant IO events amoung threads. + * IOchain - Distribute a chain of dependent IO events among threads. * * This file is part of Bitshuffle * Author: Kiyoshi Masui diff --git a/src/bitshuffle/src/iochain.h b/src/bitshuffle/src/iochain.h index 4e225d1b..8acafeae 100644 --- a/src/bitshuffle/src/iochain.h +++ b/src/bitshuffle/src/iochain.h @@ -1,5 +1,5 @@ /* - * IOchain - Distribute a chain of dependant IO events amoung threads. + * IOchain - Distribute a chain of dependent IO events among threads. * * This file is part of Bitshuffle * Author: Kiyoshi Masui diff --git a/src/bitshuffle/tests/make_regression_tdata.py b/src/bitshuffle/tests/make_regression_tdata.py index 03deb422..86086226 100644 --- a/src/bitshuffle/tests/make_regression_tdata.py +++ b/src/bitshuffle/tests/make_regression_tdata.py @@ -24,7 +24,7 @@ DTYPES = ["a1", "a2", "a3", "a4", "a6", "a8", "a10"] f = h5py.File(OUT_FILE, "w") -g_orig = f.create_group("origional") +g_orig = f.create_group("original") g_comp_lz4 = f.create_group("compressed") g_comp_zstd = f.create_group("compressed_zstd") diff --git a/src/bitshuffle/tests/test_ext.py b/src/bitshuffle/tests/test_ext.py index b2577c0d..7fbd7baf 100644 --- a/src/bitshuffle/tests/test_ext.py +++ b/src/bitshuffle/tests/test_ext.py @@ -34,7 +34,7 @@ def setUp(self): if TIME: n *= TIME # Almost random bits, but now quite. All bits exercised (to fully test - # transpose) but still slightly compresible. + # transpose) but still slightly compressible. self.data = random.randint(0, 200, n).astype(np.uint8) self.fun = ext.copy self.check = None @@ -58,6 +58,8 @@ def tearDown(self): return if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2(): return + if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512(): + return else: raise delta_t = min(delta_ts) @@ -171,6 +173,18 @@ def test_03g_trans_bit_byte_AVX_32(self): self.fun = ext.trans_bit_byte_AVX self.check = trans_bit_byte + def test_03h_trans_bit_byte_AVX512(self): + self.case = "bit T byte AVX512 64" + self.data = self.data.view(np.float64) + self.fun = ext.trans_bit_byte_AVX512 + self.check = trans_bit_byte + + def test_03g_trans_bit_byte_AVX512_32(self): + self.case = "bit T byte AVX512 32" + self.data = self.data.view(np.float32) + self.fun = ext.trans_bit_byte_AVX512 + self.check = trans_bit_byte + def test_04a_trans_bit_elem_AVX(self): self.case = "bit T elem AVX 64" self.data = self.data.view(np.float64) @@ -213,6 +227,30 @@ def test_04g_trans_bit_elem_SSE_64(self): self.fun = ext.trans_bit_elem_SSE self.check = trans_bit_elem + def test_04h_trans_bit_elem_AVX512(self): + self.case = "bit T elem AVX512 64" + self.data = self.data.view(np.float64) + self.fun = ext.trans_bit_elem_AVX512 + self.check = trans_bit_elem + + def test_04i_trans_bit_elem_AVX512(self): + self.case = "bit T elem AVX 128" + self.data = self.data.view(np.complex128) + self.fun = ext.trans_bit_elem_AVX512 + self.check = trans_bit_elem + + def test_04j_trans_bit_elem_AVX512_32(self): + self.case = "bit T elem AVX512 32" + self.data = self.data.view(np.float32) + self.fun = ext.trans_bit_elem_AVX512 + self.check = trans_bit_elem + + def test_04k_trans_bit_elem_AVX512_16(self): + self.case = "bit T elem AVX512 16" + self.data = self.data.view(np.int16) + self.fun = ext.trans_bit_elem_AVX512 + self.check = trans_bit_elem + def test_06a_untrans_bit_elem_16(self): self.case = "bit U elem SSE 16" pre_trans = self.data.view(np.int16) @@ -262,6 +300,20 @@ def test_06g_untrans_bit_elem_64(self): self.fun = ext.untrans_bit_elem_scal self.check_data = pre_trans + def test_06h_untrans_bit_elem_32(self): + self.case = "bit U elem AVX512 32" + pre_trans = self.data.view(np.float32) + self.data = trans_bit_elem(pre_trans) + self.fun = ext.untrans_bit_elem_AVX512 + self.check_data = pre_trans + + def test_06i_untrans_bit_elem_64(self): + self.case = "bit U elem AVX512 64" + pre_trans = self.data.view(np.float64) + self.data = trans_bit_elem(pre_trans) + self.fun = ext.untrans_bit_elem_AVX512 + self.check_data = pre_trans + def test_07a_trans_byte_bitrow_64(self): self.case = "byte T row scal 64" self.data = self.data.view(np.float64) @@ -314,6 +366,30 @@ def test_08f_shuffle_bit_eight_AVX_128(self): self.fun = ext.shuffle_bit_eightelem_AVX self.check = ext.shuffle_bit_eightelem_scal + def test_08g_shuffle_bit_eight_AVX512_32(self): + self.case = "bit S eight AVX 32" + self.data = self.data.view(np.float32) + self.fun = ext.shuffle_bit_eightelem_AVX512 + self.check = ext.shuffle_bit_eightelem_scal + + def test_08h_shuffle_bit_eight_AVX512_64(self): + self.case = "bit S eight AVX512 64" + self.data = self.data.view(np.float64) + self.fun = ext.shuffle_bit_eightelem_AVX512 + self.check = ext.shuffle_bit_eightelem_scal + + def test_08i_shuffle_bit_eight_AVX512_16(self): + self.case = "bit S eight AVX512 16" + self.data = self.data.view(np.int16) + self.fun = ext.shuffle_bit_eightelem_AVX512 + self.check = ext.shuffle_bit_eightelem_scal + + def test_08i_shuffle_bit_eight_AVX512_128(self): + self.case = "bit S eight AVX512 128" + self.data = self.data.view(np.complex128) + self.fun = ext.shuffle_bit_eightelem_AVX512 + self.check = ext.shuffle_bit_eightelem_scal + def test_09a_trans_bit_elem_scal_64(self): self.case = "bit T elem scal 64" self.data = self.data.view(np.float64) @@ -353,6 +429,13 @@ def test_09f_untrans_bit_elem_AVX_64(self): self.fun = ext.untrans_bit_elem_AVX self.check_data = pre_trans + def test_09g_untrans_bit_elem_AVX_64(self): + self.case = "bit U elem AVX512 64" + pre_trans = self.data.view(np.float64) + self.data = trans_bit_elem(pre_trans) + self.fun = ext.untrans_bit_elem_AVX512 + self.check_data = pre_trans + def test_10a_bitshuffle_64(self): self.case = "bitshuffle 64" self.data = self.data.view(np.float64) @@ -481,10 +564,18 @@ def test_trans_bit_elem_AVX(self): self.fun = ext.trans_bit_elem_AVX self.check = trans_bit_elem + def test_trans_bit_elem_AVX512(self): + self.fun = ext.trans_bit_elem_AVX512 + self.check = trans_bit_elem + def test_untrans_bit_elem_AVX(self): self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x)) self.check = lambda x: x + def test_untrans_bit_elem_AVX512(self): + self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x)) + self.check = lambda x: x + def test_trans_bit_elem_scal(self): self.fun = ext.trans_bit_elem_scal self.check = trans_bit_elem @@ -515,12 +606,14 @@ def tearDown(self): return if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2(): return + if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512(): + return else: raise class TestBitShuffleCircle(unittest.TestCase): - """Ensure that final filter is circularly consistant for any data type and + """Ensure that final filter is circularly consistent for any data type and any length buffer.""" def test_circle(self):