From 53d380896378d86854feb1ff7b28263d9882ecae Mon Sep 17 00:00:00 2001
From: Thomas VINCENT <thomas.vincent@esrf.fr>
Date: Fri, 6 Jan 2023 15:02:35 +0100
Subject: [PATCH] Squashed 'src/bitshuffle/' changes from a60471d3..b9a15461

b9a15461 Don't build on 3.11 for macOS as there is no h5py build
85729f43 Bump bitshuffle version number
ea8a692e Ensure zstd options are added to filter build
21ecaaea Add the CPATH to cibw
eebf617c Update README to add caveats about the binary wheels
c8f05ad5 Restrict binary wheels to x86_64 Haswell builds
71a7bebe Allow setting of architecture without requiring an OpenMP build
330cf900 Allow setting build architecture by environment variable
c1d53f6c Bump pypa/gh-action-pypi-publish from 1.4.2 to 1.5.1
a599038e Bump pypa/cibuildwheel from 2.3.1 to 2.11.2
30b52c3e Add info about the hdf5 bitsuffle Rust plugin into README
07c72656 ci: Automatically update GitHub Actions in the future
3ff41af9 ci: Update GitHub Actions
8df4c178 Fix typos (#124)
881f97a3 Fix build error C2036 on Windows
61c65ae9 Check for HDF5 headers
982f8fec Allow installation without HDF5 libraries on the system
fdfcd404 AVX512 support (#117)

git-subtree-dir: src/bitshuffle
git-subtree-split: b9a1546133959298c56eee686932dbb18ff80f7a
---
 .github/dependabot.yml         |   7 ++
 .github/workflows/lint.yml     |   4 +-
 .github/workflows/main.yml     |   4 +-
 .github/workflows/wheels.yml   |  30 +++---
 README.rst                     |  60 ++++++++---
 bitshuffle/__init__.py         |   3 +
 bitshuffle/ext.pyx             |  29 ++++++
 setup.cfg.example              |   2 +-
 setup.py                       |  94 ++++++++++-------
 src/bitshuffle.c               |   2 +-
 src/bitshuffle_core.c          | 177 ++++++++++++++++++++++++++++++++-
 src/bitshuffle_core.h          |  13 +++
 src/bshuf_h5filter.c           |   4 +-
 src/bshuf_h5filter.h           |   2 +-
 src/iochain.c                  |   2 +-
 src/iochain.h                  |   2 +-
 tests/make_regression_tdata.py |   2 +-
 tests/test_ext.py              |  97 +++++++++++++++++-
 18 files changed, 458 insertions(+), 76 deletions(-)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..7bb4cf76
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 6d828a1c..a0df1fc1 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -12,10 +12,10 @@ jobs:
   lint-code:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Set up Python 3.10
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: "3.10"
 
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8ec96b64..6bab0ebe 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,7 +20,7 @@ jobs:
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Install apt dependencies
       if: ${{ matrix.os == 'ubuntu-latest' }}
@@ -33,7 +33,7 @@ jobs:
         brew install hdf5 pkg-config
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index def84e0b..06a5c919 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -17,22 +17,26 @@ jobs:
 
     steps:
       # Checkout bitshuffle
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       # Build wheels for linux and x86 platforms
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.3.1
+        uses: pypa/cibuildwheel@v2.11.2
         with:
           output-dir: ./wheelhouse-hdf5-${{ matrix.hdf5}}
         env:
-          CIBW_SKIP: "pp* *musllinux*"
-          CIBW_ARCHS_LINUX: "x86_64"
+          CIBW_SKIP: "pp* *musllinux* cp311-macosx*"
+          CIBW_ARCHS: "x86_64"
           CIBW_BEFORE_ALL: |
             chmod +x .github/workflows/install_hdf5.sh
             .github/workflows/install_hdf5.sh ${{ matrix.hdf5 }}
             git submodule update --init
-          CIBW_ENVIRONMENT: |
-            LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ENABLE_ZSTD=1
+          # Only build Haswell wheels on x86 for compatibility
+          CIBW_ENVIRONMENT: >
+            LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
+            CPATH=/usr/local/include
+            ENABLE_ZSTD=1
+            BITSHUFFLE_ARCH=haswell
           CIBW_TEST_REQUIRES: pytest
           # Install different version of HDF5 for unit tests to ensure the
           # wheels are independent of HDF5 installation
@@ -41,9 +45,11 @@ jobs:
           #   .github/workflows/install_hdf5.sh 1.8.11
           # Run units tests but disable test_h5plugin.py
           CIBW_TEST_COMMAND: pytest {package}/tests
+          # The Github runners for macOS don't support AVX2 instructions and so the tests will fail with SIGILL, so skip them
+          CIBW_TEST_SKIP: "*macosx*"
 
       # Package wheels and host on CI
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: ./wheelhouse-hdf5-${{ matrix.hdf5 }}/*.whl
 
@@ -55,14 +61,14 @@ jobs:
 
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Install apt dependencies
         run: |
           sudo apt-get install -y libhdf5-serial-dev hdf5-tools pkg-config
 
       - name: Install Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
 
@@ -73,7 +79,7 @@ jobs:
       - name: Build sdist
         run: python setup.py sdist
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: dist/*.tar.gz
 
@@ -86,12 +92,12 @@ jobs:
     # Alternatively, to publish when a GitHub Release is created, use the following rule:
     if: github.event_name == 'release' && github.event.action == 'published'
     steps:
-      - uses: actions/download-artifact@v2
+      - uses: actions/download-artifact@v3
         with:
           name: artifact
           path: dist
 
-      - uses: pypa/gh-action-pypi-publish@v1.4.2
+      - uses: pypa/gh-action-pypi-publish@v1.5.1
         with:
           user: __token__
           password: ${{ secrets.pypi_password }}
diff --git a/README.rst b/README.rst
index 7e4be25f..8452e407 100644
--- a/README.rst
+++ b/README.rst
@@ -17,7 +17,7 @@ except it operates at the bit level instead of the byte level. Arranging a
 typed data array in to a matrix with the elements as the rows and the bits
 within the elements as the columns, Bitshuffle "transposes" the matrix,
 such that all the least-significant-bits are in a row, etc.  This transpose
-is performed within blocks of data roughly 8kB long [1]_.
+is performed within blocks of data roughly 8 kB long [1]_.
 
 This does not in itself compress data, only rearranges it for more efficient
 compression. To perform the actual compression you will need a compression
@@ -97,20 +97,35 @@ Comparing Bitshuffle to other compression algorithms and HDF5 filters:
 Installation for Python
 -----------------------
 
-Installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python
-(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically 
-loaded HDF5 filter requires HDF5 1.8.11 or later. If ZSTD support is enabled the ZSTD 
-repo needs to pulled into bitshuffle before installation with::
+
+In most cases bitshuffle can be installed by `pip`::
+
+    pip install bitshuffle
+
+On Linux and macOS x86_64 platforms binary wheels are available, on other platforms a
+source build will be performed. The binary wheels are built with AVX2 support and will
+only run processors that support these instructions (most processors from 2015 onwards,
+i.e. Intel Haswell, AMD Excavator and later). On an unsupported processor these builds
+of bitshuffle will crash with `SIGILL`. To run on unsupported x86_64 processors, or
+target newer instructions such as AVX512, you should perform a build from source.
+This can be forced by giving pip the `--no-binary=bitshuffle` option.
+
+Source installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python
+(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically
+loaded HDF5 filter requires HDF5 1.8.11 or later.
+
+For total control, bitshuffle can be built using `python setup.py`. If ZSTD support is
+to be enabled the ZSTD repo needs to pulled into bitshuffle before installation with::
 
     git submodule update --init
 
-To install bitshuffle::
+To build and install bitshuffle::
 
     python setup.py install [--h5plugin [--h5plugin-dir=spam] --zstd]
 
-To get finer control of installation options, including whether to compile
-with OpenMP multi-threading, copy the ``setup.cfg.example`` to ``setup.cfg``
-and edit the values therein.
+To get finer control of installation options, including whether to compile with OpenMP
+multi-threading and the target microarchitecture copy the ``setup.cfg.example`` to
+``setup.cfg`` and edit the values therein.
 
 If using the dynamically loaded HDF5 filter (which gives you access to the
 Bitshuffle and LZF filters outside of python), set the environment variable
@@ -143,9 +158,9 @@ interface or through the convenience functions provided in
 version 2.5.0 and later Bitshuffle can be added to new datasets through the
 high level interface, as in the example below.
 
-The compression algorithm can be configured using the `filter_opts` in 
-`bitshuffle.h5.create_dataset()`. LZ4 is chosen with: 
-`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with: 
+The compression algorithm can be configured using the `filter_opts` in
+`bitshuffle.h5.create_dataset()`. LZ4 is chosen with:
+`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with:
 `(BLOCK_SIZE, h5.H5_COMPRESS_ZSTD, COMP_LVL)`. See `test_h5filter.py` for an example.
 
 Example h5py
@@ -214,6 +229,27 @@ Then, you use them like this::
 .. _`snappy-java`: https://github.com/xerial/snappy-java
 
 
+Rust HDF5 plugin
+----------------
+
+If you wish to open HDF5 files compressed with bitshuffle in your Rust program, there is a `Rust binding`_ for it.
+In your Cargo.toml::
+
+    [dependencies]
+    ...
+    hdf5-bitshuffle = "0.9"
+    ...
+
+To register the plugin in your code::
+
+    use hdf5_bitshuffle::register_bitshuffle_plugin;
+
+    fn main() {
+        register_bitshuffle_plugin();
+    }
+
+.. _`Rust binding`: https://docs.rs/hdf5-bitshuffle/latest/hdf5_bitshuffle/
+
 Anaconda
 --------
 
diff --git a/bitshuffle/__init__.py b/bitshuffle/__init__.py
index 3f7c0380..896d993a 100644
--- a/bitshuffle/__init__.py
+++ b/bitshuffle/__init__.py
@@ -8,6 +8,7 @@
     using_NEON
     using_SSE2
     using_AVX2
+    using_AVX512
     bitshuffle
     bitunshuffle
     compress_lz4
@@ -28,6 +29,7 @@
     using_NEON,
     using_SSE2,
     using_AVX2,
+    using_AVX512,
     compress_lz4,
     decompress_lz4,
 )
@@ -49,6 +51,7 @@
     "using_NEON",
     "using_SSE2",
     "using_AVX2",
+    "using_AVX512",
     "compress_lz4",
     "decompress_lz4",
 ] + zstd_api
diff --git a/bitshuffle/ext.pyx b/bitshuffle/ext.pyx
index edc9c588..2d4cc4c3 100644
--- a/bitshuffle/ext.pyx
+++ b/bitshuffle/ext.pyx
@@ -24,6 +24,7 @@ cdef extern from b"bitshuffle.h":
     int bshuf_using_NEON()
     int bshuf_using_SSE2()
     int bshuf_using_AVX2()
+    int bshuf_using_AVX512()
     int bshuf_bitshuffle(void *A, void *B, int size, int elem_size,
                          int block_size) nogil
     int bshuf_bitunshuffle(void *A, void *B, int size, int elem_size,
@@ -60,7 +61,9 @@ cdef extern int bshuf_trans_bit_byte_scal(void *A, void *B, int size, int elem_s
 cdef extern int bshuf_trans_bit_byte_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_byte_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_byte_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_trans_bit_byte_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bitrow_eight(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_trans_bit_elem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
@@ -73,9 +76,11 @@ cdef extern int bshuf_shuffle_bit_eightelem_scal(void *A, void *B, int size, int
 cdef extern int bshuf_shuffle_bit_eightelem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_shuffle_bit_eightelem_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_shuffle_bit_eightelem_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_shuffle_bit_eightelem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_untrans_bit_elem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_scal(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem(void *A, void *B, int size, int elem_size)
@@ -108,6 +113,14 @@ def using_AVX2():
         return False
 
 
+def using_AVX512():
+    """Whether compiled using AVX512 instructions."""
+    if bshuf_using_AVX512():
+        return True
+    else:
+        return False
+
+
 def _setup_arr(arr):
     shape = tuple(arr.shape)
     if not arr.flags['C_CONTIGUOUS']:
@@ -188,10 +201,18 @@ def trans_bit_byte_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bit_byte_AVX, arr)
 
 
+def trans_bit_byte_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_trans_bit_byte_AVX512, arr)
+
+
 def trans_bitrow_eight(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bitrow_eight, arr)
 
 
+def trans_bit_elem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_trans_bit_elem_AVX512, arr)
+
+
 def trans_bit_elem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bit_elem_AVX, arr)
 
@@ -240,6 +261,10 @@ def shuffle_bit_eightelem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX, arr)
 
 
+def shuffle_bit_eightelem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX512, arr)
+
+
 def untrans_bit_elem_SSE(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_SSE, arr)
 
@@ -252,6 +277,10 @@ def untrans_bit_elem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX, arr)
 
 
+def untrans_bit_elem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX512, arr)
+
+
 def untrans_bit_elem_scal(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_scal, arr)
 
diff --git a/setup.cfg.example b/setup.cfg.example
index 6bd2ccfb..2cdf0c70 100644
--- a/setup.cfg.example
+++ b/setup.cfg.example
@@ -4,7 +4,7 @@ h5plugin = 0
 h5plugin-dir = /usr/local/hdf5/lib/plugin
 
 [build_ext]
-# Whether to compile with OpenMP multi-threading. Default is system dependant:
+# Whether to compile with OpenMP multi-threading. Default is system dependent:
 # False on OSX (since the clang compiler does not yet support OpenMP) and True
 # otherwise.
 omp = 1
diff --git a/setup.py b/setup.py
index ff99b8ef..b8ca9cf1 100644
--- a/setup.py
+++ b/setup.py
@@ -18,8 +18,8 @@
 
 
 VERSION_MAJOR = 0
-VERSION_MINOR = 4
-VERSION_POINT = 2
+VERSION_MINOR = 5
+VERSION_POINT = 1
 # Define ZSTD macro for cython compilation
 default_options["compile_time_env"] = {"ZSTD_SUPPORT": False}
 
@@ -45,14 +45,21 @@
 
 
 H5PLUGINS_DEFAULT = "/usr/local/hdf5/lib/plugin"
-MARCH_DEFAULT = "native"
 
-# OSX's clang compliler does not support OpenMP.
+# OSX's clang compiler does not support OpenMP.
 if sys.platform == "darwin":
     OMP_DEFAULT = False
 else:
     OMP_DEFAULT = True
 
+# Build against the native architecture unless overridden by an environment variable
+# This can also be overridden by a direct command line argument, or a `setup.cfg` entry
+# This option is needed for the cibuildwheel action
+if "BITSHUFFLE_ARCH" in os.environ:
+    MARCH_DEFAULT = os.environ["BITSHUFFLE_ARCH"]
+else:
+    MARCH_DEFAULT = "native"
+
 FALLBACK_CONFIG = {
     "include_dirs": [],
     "library_dirs": [],
@@ -201,9 +208,32 @@ def pkgconfig(*packages, **kw):
 )
 
 
-EXTENSIONS = [ext_bshuf, h5filter]
+EXTENSIONS = [
+    ext_bshuf,
+]
+
+# Check for HDF5 support
+HDF5_FILTER_SUPPORT = False
+CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
+for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
+    if os.path.exists(os.path.join(p, "hdf5.h")):
+        HDF5_FILTER_SUPPORT = True
+
+if HDF5_FILTER_SUPPORT:
+    EXTENSIONS.append(h5filter)
+
+# Check for plugin hdf5 plugin support (hdf5 >= 1.8.11)
+HDF5_PLUGIN_SUPPORT = False
+CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
+for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
+    if os.path.exists(os.path.join(p, "H5PLextern.h")):
+        HDF5_PLUGIN_SUPPORT = True
+
+if HDF5_PLUGIN_SUPPORT:
+    EXTENSIONS.extend([filter_plugin, lzf_plugin])
 
 # For enabling ZSTD support when building wheels
+# This needs to be done after all Extensions have been added to EXTENSIONS
 if "ENABLE_ZSTD" in os.environ:
     default_options["compile_time_env"] = {"ZSTD_SUPPORT": True}
     for ext in EXTENSIONS:
@@ -217,16 +247,6 @@ def pkgconfig(*packages, **kw):
             ext.depends += zstd_headers
             ext.define_macros += [("ZSTD_SUPPORT", 1)]
 
-# Check for plugin hdf5 plugin support (hdf5 >= 1.8.11)
-HDF5_PLUGIN_SUPPORT = False
-CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
-for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
-    if os.path.exists(os.path.join(p, "H5PLextern.h")):
-        HDF5_PLUGIN_SUPPORT = True
-
-if HDF5_PLUGIN_SUPPORT:
-    EXTENSIONS.extend([filter_plugin, lzf_plugin])
-
 
 class develop(develop_):
     def run(self):
@@ -344,10 +364,25 @@ def finalize_options(self):
     def build_extensions(self):
         c = self.compiler.compiler_type
 
+        # Set compiler flags including architecture
+        if self.compiler.compiler_type == "msvc":
+            openmpflag = "/openmp"
+            compileflags = COMPILE_FLAGS_MSVC
+        else:
+            openmpflag = "-fopenmp"
+            archi = platform.machine()
+            if archi in ("i386", "x86_64"):
+                compileflags = COMPILE_FLAGS + ["-march=%s" % self.march]
+            else:
+                compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march]
+                if archi == "ppc64le":
+                    compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"]
+
         if self.omp not in ("0", "1", True, False):
             raise ValueError("Invalid omp argument. Mut be '0' or '1'.")
         self.omp = int(self.omp)
 
+        # Add the appropriate OpenMP flags if needed
         if self.omp:
             if not hasattr(self, "_printed_omp_message"):
                 self._printed_omp_message = True
@@ -356,26 +391,15 @@ def build_extensions(self):
                 print("#################################\n")
             # More portable to pass -fopenmp to linker.
             # self.libraries += ['gomp']
-            if self.compiler.compiler_type == "msvc":
-                openmpflag = "/openmp"
-                compileflags = COMPILE_FLAGS_MSVC
-            else:
-                openmpflag = "-fopenmp"
-                archi = platform.machine()
-                if archi in ("i386", "x86_64"):
-                    compileflags = COMPILE_FLAGS + ["-march=%s" % self.march]
-                else:
-                    compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march]
-                    if archi == "ppc64le":
-                        compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"]
-            for e in self.extensions:
-                e.extra_compile_args = list(
-                    set(e.extra_compile_args).union(compileflags)
-                )
-                if openmpflag not in e.extra_compile_args:
-                    e.extra_compile_args += [openmpflag]
-                if openmpflag not in e.extra_link_args:
-                    e.extra_link_args += [openmpflag]
+            compileflags += [openmpflag]
+            linkflags = [openmpflag]
+        else:
+            linkflags = []
+
+        # Add the compile/link options to each extension
+        for e in self.extensions:
+            e.extra_compile_args = list(set(e.extra_compile_args).union(compileflags))
+            e.extra_link_args = list(set(e.extra_link_args).union(linkflags))
 
         build_ext_.build_extensions(self)
 
diff --git a/src/bitshuffle.c b/src/bitshuffle.c
index a8ef0b5c..ba5cde3a 100644
--- a/src/bitshuffle.c
+++ b/src/bitshuffle.c
@@ -182,7 +182,7 @@ int64_t bshuf_decompress_zstd_block(ioc_chain *C_ptr,
     tmp_buf = malloc(size * elem_size);
     if (tmp_buf == NULL) return -1;
 
-    nbytes = ZSTD_decompress(tmp_buf, size * elem_size, in + 4, nbytes_from_header);
+    nbytes = ZSTD_decompress(tmp_buf, size * elem_size, (void *)((char *) in + 4), nbytes_from_header);
     CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
     if (nbytes != size * elem_size) {
         free(tmp_buf);
diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c
index ef33bf55..ba41473f 100644
--- a/src/bitshuffle_core.c
+++ b/src/bitshuffle_core.c
@@ -16,6 +16,10 @@
 #include <string.h>
 
 
+#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
+#define USEAVX512
+#endif
+
 #if defined(__AVX2__) && defined (__SSE2__)
 #define USEAVX2
 #endif
@@ -79,6 +83,14 @@ int bshuf_using_AVX2(void) {
 }
 
 
+int bshuf_using_AVX512(void) {
+#ifdef USEAVX512
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 /* ---- Worker code not requiring special instruction sets. ----
  *
  * The following code does not use any x86 specific vectorized instructions
@@ -1384,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
  */
 
 #ifdef USEAVX2
-
 /* Transpose bits within bytes. */
 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
          const size_t elem_size) {
@@ -1625,6 +1636,162 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
 
 #endif // #ifdef USEAVX2
 
+#ifdef USEAVX512
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    size_t nbyte = elem_size * size;
+    int64_t count;
+
+    int64_t* out_i64;
+    __m512i zmm;
+    __mmask64 bt;
+    if (nbyte >= 64) {
+        const __m512i mask = _mm512_set1_epi8(0);
+
+       for (ii = 0; ii + 63 < nbyte; ii += 64) {
+            zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
+            for (kk = 0; kk < 8; kk++) {
+                bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                zmm = _mm512_slli_epi16(zmm, 1);
+                out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+                *out_i64 = (int64_t)bt;
+            }
+        }
+    }
+
+    __m256i ymm;
+    int32_t bt32;
+    int32_t* out_i32;
+    size_t start = nbyte - nbyte % 64;
+    for (ii = start; ii + 31 < nbyte; ii += 32) {
+        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
+        for (kk = 0; kk < 8; kk++) {
+            bt32 = _mm256_movemask_epi8(ymm);
+            ymm = _mm256_slli_epi16(ymm, 1);
+            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_i32 = bt32;
+        }
+    }
+
+
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 64 % 32);
+
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+
+}
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    __m512i zmm;
+    __mmask64 bt;
+
+    if (elem_size % 8) {
+        return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
+    } else {
+        const __m512i mask = _mm512_set1_epi8(0);
+        for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
+            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                    ii += 8 * elem_size) {
+                zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                    zmm = _mm512_slli_epi16(zmm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    * (int64_t *) &out_b[ind] = bt;
+                }
+            }
+        }
+
+    }
+    return size * elem_size;
+}
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+    return count;
+}
+
+#else // #ifdef USEAVX512
+
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return -14;
+}
+
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+
+}
+
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+#endif
 
 /* ---- Drivers selecting best instruction set at compile time. ---- */
 
@@ -1632,7 +1799,9 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
     count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
     count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
@@ -1649,7 +1818,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
     count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
     count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
diff --git a/src/bitshuffle_core.h b/src/bitshuffle_core.h
index fba7301c..af09b1c4 100644
--- a/src/bitshuffle_core.h
+++ b/src/bitshuffle_core.h
@@ -19,6 +19,7 @@
  *      -11   : Missing SSE.
  *      -12   : Missing AVX.
  *      -13   : Missing Arm Neon.
+ *      -14   : Missing AVX512.
  *      -80   : Input size not a multiple of 8.
  *      -81   : block_size not multiple of 8.
  *      -91   : Decompression error, wrong number of bytes processed.
@@ -91,6 +92,18 @@ int bshuf_using_NEON(void);
 int bshuf_using_AVX2(void);
 
 
+/* ---- bshuf_using_AVX512 ----
+ *
+ * Whether routines where compiled with the AVX512 instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using AVX512, 0 otherwise.
+ *
+ */
+int bshuf_using_AVX512(void);
+
+
 /* ---- bshuf_default_block_size ----
  *
  * The default block size as function of element size.
diff --git a/src/bshuf_h5filter.c b/src/bshuf_h5filter.c
index 114b91ff..54de27d1 100644
--- a/src/bshuf_h5filter.c
+++ b/src/bshuf_h5filter.c
@@ -25,7 +25,7 @@ void bshuf_write_uint32_BE(void* buf, uint32_t num);
 uint32_t bshuf_read_uint32_BE(const void* buf);
 
 
-// Only called on compresion, not on reverse.
+// Only called on compression, not on reverse.
 herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
 
     herr_t r;
@@ -192,7 +192,7 @@ size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
             // Bit shuffle/compress.
             // Write the header, described in
             // http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
-            // Techincally we should be using signed integers instead of
+            // Technically we should be using signed integers instead of
             // unsigned ones, however for valid inputs (positive numbers) these
             // have the same representation.
             bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
diff --git a/src/bshuf_h5filter.h b/src/bshuf_h5filter.h
index 54ee6775..0c6f153d 100644
--- a/src/bshuf_h5filter.h
+++ b/src/bshuf_h5filter.h
@@ -13,7 +13,7 @@
  *
  * Filter Options
  * --------------
- *  block_size (option slot 0) : interger (optional)
+ *  block_size (option slot 0) : integer (optional)
  *      What block size to use (in elements not bytes). Default is 0,
  *      for which bitshuffle will pick a block size with a target of 8kb.
  *  Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
diff --git a/src/iochain.c b/src/iochain.c
index baa97296..37015614 100644
--- a/src/iochain.c
+++ b/src/iochain.c
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
  *
  * This file is part of Bitshuffle
  * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
diff --git a/src/iochain.h b/src/iochain.h
index 4e225d1b..8acafeae 100644
--- a/src/iochain.h
+++ b/src/iochain.h
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
  *
  * This file is part of Bitshuffle
  * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
diff --git a/tests/make_regression_tdata.py b/tests/make_regression_tdata.py
index 03deb422..86086226 100644
--- a/tests/make_regression_tdata.py
+++ b/tests/make_regression_tdata.py
@@ -24,7 +24,7 @@
 DTYPES = ["a1", "a2", "a3", "a4", "a6", "a8", "a10"]
 
 f = h5py.File(OUT_FILE, "w")
-g_orig = f.create_group("origional")
+g_orig = f.create_group("original")
 g_comp_lz4 = f.create_group("compressed")
 g_comp_zstd = f.create_group("compressed_zstd")
 
diff --git a/tests/test_ext.py b/tests/test_ext.py
index b2577c0d..7fbd7baf 100644
--- a/tests/test_ext.py
+++ b/tests/test_ext.py
@@ -34,7 +34,7 @@ def setUp(self):
         if TIME:
             n *= TIME
         # Almost random bits, but now quite. All bits exercised (to fully test
-        # transpose) but still slightly compresible.
+        # transpose) but still slightly compressible.
         self.data = random.randint(0, 200, n).astype(np.uint8)
         self.fun = ext.copy
         self.check = None
@@ -58,6 +58,8 @@ def tearDown(self):
                 return
             if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2():
                 return
+            if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512():
+                return
             else:
                 raise
         delta_t = min(delta_ts)
@@ -171,6 +173,18 @@ def test_03g_trans_bit_byte_AVX_32(self):
         self.fun = ext.trans_bit_byte_AVX
         self.check = trans_bit_byte
 
+    def test_03h_trans_bit_byte_AVX512(self):
+        self.case = "bit T byte AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.trans_bit_byte_AVX512
+        self.check = trans_bit_byte
+
+    def test_03g_trans_bit_byte_AVX512_32(self):
+        self.case = "bit T byte AVX512 32"
+        self.data = self.data.view(np.float32)
+        self.fun = ext.trans_bit_byte_AVX512
+        self.check = trans_bit_byte
+
     def test_04a_trans_bit_elem_AVX(self):
         self.case = "bit T elem AVX 64"
         self.data = self.data.view(np.float64)
@@ -213,6 +227,30 @@ def test_04g_trans_bit_elem_SSE_64(self):
         self.fun = ext.trans_bit_elem_SSE
         self.check = trans_bit_elem
 
+    def test_04h_trans_bit_elem_AVX512(self):
+        self.case = "bit T elem AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04i_trans_bit_elem_AVX512(self):
+        self.case = "bit T elem AVX 128"
+        self.data = self.data.view(np.complex128)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04j_trans_bit_elem_AVX512_32(self):
+        self.case = "bit T elem AVX512 32"
+        self.data = self.data.view(np.float32)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04k_trans_bit_elem_AVX512_16(self):
+        self.case = "bit T elem AVX512 16"
+        self.data = self.data.view(np.int16)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
     def test_06a_untrans_bit_elem_16(self):
         self.case = "bit U elem SSE 16"
         pre_trans = self.data.view(np.int16)
@@ -262,6 +300,20 @@ def test_06g_untrans_bit_elem_64(self):
         self.fun = ext.untrans_bit_elem_scal
         self.check_data = pre_trans
 
+    def test_06h_untrans_bit_elem_32(self):
+        self.case = "bit U elem AVX512 32"
+        pre_trans = self.data.view(np.float32)
+        self.data = trans_bit_elem(pre_trans)
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans
+
+    def test_06i_untrans_bit_elem_64(self):
+        self.case = "bit U elem AVX512 64"
+        pre_trans = self.data.view(np.float64)
+        self.data = trans_bit_elem(pre_trans)
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans
+
     def test_07a_trans_byte_bitrow_64(self):
         self.case = "byte T row scal 64"
         self.data = self.data.view(np.float64)
@@ -314,6 +366,30 @@ def test_08f_shuffle_bit_eight_AVX_128(self):
         self.fun = ext.shuffle_bit_eightelem_AVX
         self.check = ext.shuffle_bit_eightelem_scal
 
+    def test_08g_shuffle_bit_eight_AVX512_32(self):
+        self.case = "bit S eight AVX 32"
+        self.data = self.data.view(np.float32)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08h_shuffle_bit_eight_AVX512_64(self):
+        self.case = "bit S eight AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08i_shuffle_bit_eight_AVX512_16(self):
+        self.case = "bit S eight AVX512 16"
+        self.data = self.data.view(np.int16)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08i_shuffle_bit_eight_AVX512_128(self):
+        self.case = "bit S eight AVX512 128"
+        self.data = self.data.view(np.complex128)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
     def test_09a_trans_bit_elem_scal_64(self):
         self.case = "bit T elem scal 64"
         self.data = self.data.view(np.float64)
@@ -353,6 +429,13 @@ def test_09f_untrans_bit_elem_AVX_64(self):
         self.fun = ext.untrans_bit_elem_AVX
         self.check_data = pre_trans
 
+    def test_09g_untrans_bit_elem_AVX_64(self):
+        self.case = "bit U elem AVX512 64"
+        pre_trans = self.data.view(np.float64)
+        self.data = trans_bit_elem(pre_trans)
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans
+
     def test_10a_bitshuffle_64(self):
         self.case = "bitshuffle 64"
         self.data = self.data.view(np.float64)
@@ -481,10 +564,18 @@ def test_trans_bit_elem_AVX(self):
         self.fun = ext.trans_bit_elem_AVX
         self.check = trans_bit_elem
 
+    def test_trans_bit_elem_AVX512(self):
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
     def test_untrans_bit_elem_AVX(self):
         self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
         self.check = lambda x: x
 
+    def test_untrans_bit_elem_AVX512(self):
+        self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
+        self.check = lambda x: x
+
     def test_trans_bit_elem_scal(self):
         self.fun = ext.trans_bit_elem_scal
         self.check = trans_bit_elem
@@ -515,12 +606,14 @@ def tearDown(self):
                 return
             if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2():
                 return
+            if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512():
+                return
             else:
                 raise
 
 
 class TestBitShuffleCircle(unittest.TestCase):
-    """Ensure that final filter is circularly consistant for any data type and
+    """Ensure that final filter is circularly consistent for any data type and
     any length buffer."""
 
     def test_circle(self):