diff --git a/src/bitshuffle/.github/dependabot.yml b/src/bitshuffle/.github/dependabot.yml
new file mode 100644
index 00000000..7bb4cf76
--- /dev/null
+++ b/src/bitshuffle/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/src/bitshuffle/.github/workflows/lint.yml b/src/bitshuffle/.github/workflows/lint.yml
index 6d828a1c..a0df1fc1 100644
--- a/src/bitshuffle/.github/workflows/lint.yml
+++ b/src/bitshuffle/.github/workflows/lint.yml
@@ -12,10 +12,10 @@ jobs:
   lint-code:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Set up Python 3.10
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: "3.10"
 
diff --git a/src/bitshuffle/.github/workflows/main.yml b/src/bitshuffle/.github/workflows/main.yml
index 8ec96b64..6bab0ebe 100644
--- a/src/bitshuffle/.github/workflows/main.yml
+++ b/src/bitshuffle/.github/workflows/main.yml
@@ -20,7 +20,7 @@ jobs:
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Install apt dependencies
       if: ${{ matrix.os == 'ubuntu-latest' }}
@@ -33,7 +33,7 @@ jobs:
         brew install hdf5 pkg-config
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/src/bitshuffle/.github/workflows/wheels.yml b/src/bitshuffle/.github/workflows/wheels.yml
index def84e0b..06a5c919 100644
--- a/src/bitshuffle/.github/workflows/wheels.yml
+++ b/src/bitshuffle/.github/workflows/wheels.yml
@@ -17,22 +17,26 @@ jobs:
 
     steps:
       # Checkout bitshuffle
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       # Build wheels for linux and x86 platforms
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.3.1
+        uses: pypa/cibuildwheel@v2.11.2
         with:
           output-dir: ./wheelhouse-hdf5-${{ matrix.hdf5}}
         env:
-          CIBW_SKIP: "pp* *musllinux*"
-          CIBW_ARCHS_LINUX: "x86_64"
+          CIBW_SKIP: "pp* *musllinux* cp311-macosx*"
+          CIBW_ARCHS: "x86_64"
           CIBW_BEFORE_ALL: |
             chmod +x .github/workflows/install_hdf5.sh
             .github/workflows/install_hdf5.sh ${{ matrix.hdf5 }}
             git submodule update --init
-          CIBW_ENVIRONMENT: |
-            LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ENABLE_ZSTD=1
+          # Only build Haswell wheels on x86 for compatibility
+          CIBW_ENVIRONMENT: >
+            LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
+            CPATH=/usr/local/include
+            ENABLE_ZSTD=1
+            BITSHUFFLE_ARCH=haswell
           CIBW_TEST_REQUIRES: pytest
           # Install different version of HDF5 for unit tests to ensure the
           # wheels are independent of HDF5 installation
@@ -41,9 +45,11 @@ jobs:
           #   .github/workflows/install_hdf5.sh 1.8.11
           # Run units tests but disable test_h5plugin.py
           CIBW_TEST_COMMAND: pytest {package}/tests
+          # The GitHub runners for macOS don't support AVX2 instructions, so the tests would fail with SIGILL; skip them.
+          CIBW_TEST_SKIP: "*macosx*"
 
       # Package wheels and host on CI
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: ./wheelhouse-hdf5-${{ matrix.hdf5 }}/*.whl
 
@@ -55,14 +61,14 @@ jobs:
 
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Install apt dependencies
         run: |
           sudo apt-get install -y libhdf5-serial-dev hdf5-tools pkg-config
 
       - name: Install Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
 
@@ -73,7 +79,7 @@ jobs:
       - name: Build sdist
         run: python setup.py sdist
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: dist/*.tar.gz
 
@@ -86,12 +92,12 @@ jobs:
     # Alternatively, to publish when a GitHub Release is created, use the following rule:
     if: github.event_name == 'release' && github.event.action == 'published'
     steps:
-      - uses: actions/download-artifact@v2
+      - uses: actions/download-artifact@v3
         with:
           name: artifact
           path: dist
 
-      - uses: pypa/gh-action-pypi-publish@v1.4.2
+      - uses: pypa/gh-action-pypi-publish@v1.5.1
         with:
           user: __token__
           password: ${{ secrets.pypi_password }}
diff --git a/src/bitshuffle/README.rst b/src/bitshuffle/README.rst
index 7e4be25f..8452e407 100644
--- a/src/bitshuffle/README.rst
+++ b/src/bitshuffle/README.rst
@@ -17,7 +17,7 @@ except it operates at the bit level instead of the byte level. Arranging a
 typed data array in to a matrix with the elements as the rows and the bits
 within the elements as the columns, Bitshuffle "transposes" the matrix,
 such that all the least-significant-bits are in a row, etc.  This transpose
-is performed within blocks of data roughly 8kB long [1]_.
+is performed within blocks of data roughly 8 kB long [1]_.
 
 This does not in itself compress data, only rearranges it for more efficient
 compression. To perform the actual compression you will need a compression
@@ -97,20 +97,35 @@ Comparing Bitshuffle to other compression algorithms and HDF5 filters:
 Installation for Python
 -----------------------
 
-Installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python
-(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically 
-loaded HDF5 filter requires HDF5 1.8.11 or later. If ZSTD support is enabled the ZSTD 
-repo needs to pulled into bitshuffle before installation with::
+
+In most cases bitshuffle can be installed by `pip`::
+
+    pip install bitshuffle
+
+On Linux and macOS x86_64 platforms binary wheels are available; on other platforms a
+source build will be performed. The binary wheels are built with AVX2 support and will
+only run on processors that support these instructions (most processors from 2015 onwards,
+i.e. Intel Haswell, AMD Excavator and later). On an unsupported processor these builds
+of bitshuffle will crash with `SIGILL`. To run on unsupported x86_64 processors, or
+target newer instructions such as AVX512, you should perform a build from source.
+This can be forced by giving pip the `--no-binary=bitshuffle` option.
+
+Source installation requires python 2.7+ or 3.3+, HDF5 1.8.4 or later, HDF5 for python
+(h5py), Numpy and Cython. Bitshuffle is linked against HDF5. To use the dynamically
+loaded HDF5 filter requires HDF5 1.8.11 or later.
+
+For total control, bitshuffle can be built using `python setup.py`. If ZSTD support is
+to be enabled, the ZSTD repo needs to be pulled into bitshuffle before installation with::
 
     git submodule update --init
 
-To install bitshuffle::
+To build and install bitshuffle::
 
     python setup.py install [--h5plugin [--h5plugin-dir=spam] --zstd]
 
-To get finer control of installation options, including whether to compile
-with OpenMP multi-threading, copy the ``setup.cfg.example`` to ``setup.cfg``
-and edit the values therein.
+To get finer control of installation options, including whether to compile with OpenMP
+multi-threading and the target microarchitecture, copy the ``setup.cfg.example`` to
+``setup.cfg`` and edit the values therein.
 
 If using the dynamically loaded HDF5 filter (which gives you access to the
 Bitshuffle and LZF filters outside of python), set the environment variable
@@ -143,9 +158,9 @@ interface or through the convenience functions provided in
 version 2.5.0 and later Bitshuffle can be added to new datasets through the
 high level interface, as in the example below.
 
-The compression algorithm can be configured using the `filter_opts` in 
-`bitshuffle.h5.create_dataset()`. LZ4 is chosen with: 
-`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with: 
+The compression algorithm can be configured using the `filter_opts` in
+`bitshuffle.h5.create_dataset()`. LZ4 is chosen with:
+`(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)` and ZSTD with:
 `(BLOCK_SIZE, h5.H5_COMPRESS_ZSTD, COMP_LVL)`. See `test_h5filter.py` for an example.
 
 Example h5py
@@ -214,6 +229,27 @@ Then, you use them like this::
 .. _`snappy-java`: https://github.com/xerial/snappy-java
 
 
+Rust HDF5 plugin
+----------------
+
+If you wish to open HDF5 files compressed with bitshuffle in your Rust program, there is a `Rust binding`_ for it.
+In your Cargo.toml::
+
+    [dependencies]
+    ...
+    hdf5-bitshuffle = "0.9"
+    ...
+
+To register the plugin in your code::
+
+    use hdf5_bitshuffle::register_bitshuffle_plugin;
+
+    fn main() {
+        register_bitshuffle_plugin();
+    }
+
+.. _`Rust binding`: https://docs.rs/hdf5-bitshuffle/latest/hdf5_bitshuffle/
+
 Anaconda
 --------
 
diff --git a/src/bitshuffle/bitshuffle/__init__.py b/src/bitshuffle/bitshuffle/__init__.py
index 3f7c0380..896d993a 100644
--- a/src/bitshuffle/bitshuffle/__init__.py
+++ b/src/bitshuffle/bitshuffle/__init__.py
@@ -8,6 +8,7 @@
     using_NEON
     using_SSE2
     using_AVX2
+    using_AVX512
     bitshuffle
     bitunshuffle
     compress_lz4
@@ -28,6 +29,7 @@
     using_NEON,
     using_SSE2,
     using_AVX2,
+    using_AVX512,
     compress_lz4,
     decompress_lz4,
 )
@@ -49,6 +51,7 @@
     "using_NEON",
     "using_SSE2",
     "using_AVX2",
+    "using_AVX512",
     "compress_lz4",
     "decompress_lz4",
 ] + zstd_api
diff --git a/src/bitshuffle/bitshuffle/ext.pyx b/src/bitshuffle/bitshuffle/ext.pyx
index edc9c588..2d4cc4c3 100644
--- a/src/bitshuffle/bitshuffle/ext.pyx
+++ b/src/bitshuffle/bitshuffle/ext.pyx
@@ -24,6 +24,7 @@ cdef extern from b"bitshuffle.h":
     int bshuf_using_NEON()
     int bshuf_using_SSE2()
     int bshuf_using_AVX2()
+    int bshuf_using_AVX512()
     int bshuf_bitshuffle(void *A, void *B, int size, int elem_size,
                          int block_size) nogil
     int bshuf_bitunshuffle(void *A, void *B, int size, int elem_size,
@@ -60,7 +61,9 @@ cdef extern int bshuf_trans_bit_byte_scal(void *A, void *B, int size, int elem_s
 cdef extern int bshuf_trans_bit_byte_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_byte_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_byte_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_trans_bit_byte_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bitrow_eight(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_trans_bit_elem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
@@ -73,9 +76,11 @@ cdef extern int bshuf_shuffle_bit_eightelem_scal(void *A, void *B, int size, int
 cdef extern int bshuf_shuffle_bit_eightelem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_shuffle_bit_eightelem_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_shuffle_bit_eightelem_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_shuffle_bit_eightelem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_SSE(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_NEON(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_AVX(void *A, void *B, int size, int elem_size)
+cdef extern int bshuf_untrans_bit_elem_AVX512(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem_scal(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_trans_bit_elem(void *A, void *B, int size, int elem_size)
 cdef extern int bshuf_untrans_bit_elem(void *A, void *B, int size, int elem_size)
@@ -108,6 +113,14 @@ def using_AVX2():
         return False
 
 
+def using_AVX512():
+    """Whether compiled using AVX512 instructions."""
+    if bshuf_using_AVX512():
+        return True
+    else:
+        return False
+
+
 def _setup_arr(arr):
     shape = tuple(arr.shape)
     if not arr.flags['C_CONTIGUOUS']:
@@ -188,10 +201,18 @@ def trans_bit_byte_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bit_byte_AVX, arr)
 
 
+def trans_bit_byte_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_trans_bit_byte_AVX512, arr)
+
+
 def trans_bitrow_eight(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bitrow_eight, arr)
 
 
+def trans_bit_elem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_trans_bit_elem_AVX512, arr)
+
+
 def trans_bit_elem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_trans_bit_elem_AVX, arr)
 
@@ -240,6 +261,10 @@ def shuffle_bit_eightelem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX, arr)
 
 
+def shuffle_bit_eightelem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_shuffle_bit_eightelem_AVX512, arr)
+
+
 def untrans_bit_elem_SSE(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_SSE, arr)
 
@@ -252,6 +277,10 @@ def untrans_bit_elem_AVX(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX, arr)
 
 
+def untrans_bit_elem_AVX512(np.ndarray arr not None):
+    return _wrap_C_fun(&bshuf_untrans_bit_elem_AVX512, arr)
+
+
 def untrans_bit_elem_scal(np.ndarray arr not None):
     return _wrap_C_fun(&bshuf_untrans_bit_elem_scal, arr)
 
diff --git a/src/bitshuffle/setup.cfg.example b/src/bitshuffle/setup.cfg.example
index 6bd2ccfb..2cdf0c70 100644
--- a/src/bitshuffle/setup.cfg.example
+++ b/src/bitshuffle/setup.cfg.example
@@ -4,7 +4,7 @@ h5plugin = 0
 h5plugin-dir = /usr/local/hdf5/lib/plugin
 
 [build_ext]
-# Whether to compile with OpenMP multi-threading. Default is system dependant:
+# Whether to compile with OpenMP multi-threading. Default is system dependent:
 # False on OSX (since the clang compiler does not yet support OpenMP) and True
 # otherwise.
 omp = 1
diff --git a/src/bitshuffle/setup.py b/src/bitshuffle/setup.py
index ff99b8ef..b8ca9cf1 100644
--- a/src/bitshuffle/setup.py
+++ b/src/bitshuffle/setup.py
@@ -18,8 +18,8 @@
 
 
 VERSION_MAJOR = 0
-VERSION_MINOR = 4
-VERSION_POINT = 2
+VERSION_MINOR = 5
+VERSION_POINT = 1
 # Define ZSTD macro for cython compilation
 default_options["compile_time_env"] = {"ZSTD_SUPPORT": False}
 
@@ -45,14 +45,21 @@
 
 
 H5PLUGINS_DEFAULT = "/usr/local/hdf5/lib/plugin"
-MARCH_DEFAULT = "native"
 
-# OSX's clang compliler does not support OpenMP.
+# OSX's clang compiler does not support OpenMP.
 if sys.platform == "darwin":
     OMP_DEFAULT = False
 else:
     OMP_DEFAULT = True
 
+# Build against the native architecture unless overridden by an environment variable
+# This can also be overridden by a direct command line argument, or a `setup.cfg` entry
+# This option is needed for the cibuildwheel action
+if "BITSHUFFLE_ARCH" in os.environ:
+    MARCH_DEFAULT = os.environ["BITSHUFFLE_ARCH"]
+else:
+    MARCH_DEFAULT = "native"
+
 FALLBACK_CONFIG = {
     "include_dirs": [],
     "library_dirs": [],
@@ -201,9 +208,32 @@ def pkgconfig(*packages, **kw):
 )
 
 
-EXTENSIONS = [ext_bshuf, h5filter]
+EXTENSIONS = [
+    ext_bshuf,
+]
+
+# Check for HDF5 support
+HDF5_FILTER_SUPPORT = False
+CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
+for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
+    if os.path.exists(os.path.join(p, "hdf5.h")):
+        HDF5_FILTER_SUPPORT = True
+
+if HDF5_FILTER_SUPPORT:
+    EXTENSIONS.append(h5filter)
+
+# Check for HDF5 plugin support (hdf5 >= 1.8.11)
+HDF5_PLUGIN_SUPPORT = False
+CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
+for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
+    if os.path.exists(os.path.join(p, "H5PLextern.h")):
+        HDF5_PLUGIN_SUPPORT = True
+
+if HDF5_PLUGIN_SUPPORT:
+    EXTENSIONS.extend([filter_plugin, lzf_plugin])
 
 # For enabling ZSTD support when building wheels
+# This needs to be done after all Extensions have been added to EXTENSIONS
 if "ENABLE_ZSTD" in os.environ:
     default_options["compile_time_env"] = {"ZSTD_SUPPORT": True}
     for ext in EXTENSIONS:
@@ -217,16 +247,6 @@ def pkgconfig(*packages, **kw):
             ext.depends += zstd_headers
             ext.define_macros += [("ZSTD_SUPPORT", 1)]
 
-# Check for plugin hdf5 plugin support (hdf5 >= 1.8.11)
-HDF5_PLUGIN_SUPPORT = False
-CPATHS = os.environ["CPATH"].split(":") if "CPATH" in os.environ else []
-for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
-    if os.path.exists(os.path.join(p, "H5PLextern.h")):
-        HDF5_PLUGIN_SUPPORT = True
-
-if HDF5_PLUGIN_SUPPORT:
-    EXTENSIONS.extend([filter_plugin, lzf_plugin])
-
 
 class develop(develop_):
     def run(self):
@@ -344,10 +364,25 @@ def finalize_options(self):
     def build_extensions(self):
         c = self.compiler.compiler_type
 
+        # Set compiler flags including architecture
+        if self.compiler.compiler_type == "msvc":
+            openmpflag = "/openmp"
+            compileflags = COMPILE_FLAGS_MSVC
+        else:
+            openmpflag = "-fopenmp"
+            archi = platform.machine()
+            if archi in ("i386", "x86_64"):
+                compileflags = COMPILE_FLAGS + ["-march=%s" % self.march]
+            else:
+                compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march]
+                if archi == "ppc64le":
+                    compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"]
+
         if self.omp not in ("0", "1", True, False):
             raise ValueError("Invalid omp argument. Mut be '0' or '1'.")
         self.omp = int(self.omp)
 
+        # Add the appropriate OpenMP flags if needed
         if self.omp:
             if not hasattr(self, "_printed_omp_message"):
                 self._printed_omp_message = True
@@ -356,26 +391,15 @@ def build_extensions(self):
                 print("#################################\n")
             # More portable to pass -fopenmp to linker.
             # self.libraries += ['gomp']
-            if self.compiler.compiler_type == "msvc":
-                openmpflag = "/openmp"
-                compileflags = COMPILE_FLAGS_MSVC
-            else:
-                openmpflag = "-fopenmp"
-                archi = platform.machine()
-                if archi in ("i386", "x86_64"):
-                    compileflags = COMPILE_FLAGS + ["-march=%s" % self.march]
-                else:
-                    compileflags = COMPILE_FLAGS + ["-mcpu=%s" % self.march]
-                    if archi == "ppc64le":
-                        compileflags = COMPILE_FLAGS + ["-DNO_WARN_X86_INTRINSICS"]
-            for e in self.extensions:
-                e.extra_compile_args = list(
-                    set(e.extra_compile_args).union(compileflags)
-                )
-                if openmpflag not in e.extra_compile_args:
-                    e.extra_compile_args += [openmpflag]
-                if openmpflag not in e.extra_link_args:
-                    e.extra_link_args += [openmpflag]
+            compileflags += [openmpflag]
+            linkflags = [openmpflag]
+        else:
+            linkflags = []
+
+        # Add the compile/link options to each extension
+        for e in self.extensions:
+            e.extra_compile_args = list(set(e.extra_compile_args).union(compileflags))
+            e.extra_link_args = list(set(e.extra_link_args).union(linkflags))
 
         build_ext_.build_extensions(self)
 
diff --git a/src/bitshuffle/src/bitshuffle_core.c b/src/bitshuffle/src/bitshuffle_core.c
index ef33bf55..ba41473f 100644
--- a/src/bitshuffle/src/bitshuffle_core.c
+++ b/src/bitshuffle/src/bitshuffle_core.c
@@ -16,6 +16,10 @@
 #include <string.h>
 
 
+#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
+#define USEAVX512
+#endif
+
 #if defined(__AVX2__) && defined (__SSE2__)
 #define USEAVX2
 #endif
@@ -79,6 +83,14 @@ int bshuf_using_AVX2(void) {
 }
 
 
+int bshuf_using_AVX512(void) {
+#ifdef USEAVX512
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 /* ---- Worker code not requiring special instruction sets. ----
  *
  * The following code does not use any x86 specific vectorized instructions
@@ -1384,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
  */
 
 #ifdef USEAVX2
-
 /* Transpose bits within bytes. */
 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
          const size_t elem_size) {
@@ -1625,6 +1636,162 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
 
 #endif // #ifdef USEAVX2
 
+#ifdef USEAVX512
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    size_t nbyte = elem_size * size;
+    int64_t count;
+
+    int64_t* out_i64;
+    __m512i zmm;
+    __mmask64 bt;
+    if (nbyte >= 64) {
+        const __m512i mask = _mm512_set1_epi8(0);
+
+       for (ii = 0; ii + 63 < nbyte; ii += 64) {
+            zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
+            for (kk = 0; kk < 8; kk++) {
+                bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                zmm = _mm512_slli_epi16(zmm, 1);
+                out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+                *out_i64 = (int64_t)bt;
+            }
+        }
+    }
+
+    __m256i ymm;
+    int32_t bt32;
+    int32_t* out_i32;
+    size_t start = nbyte - nbyte % 64;
+    for (ii = start; ii + 31 < nbyte; ii += 32) {
+        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
+        for (kk = 0; kk < 8; kk++) {
+            bt32 = _mm256_movemask_epi8(ymm);
+            ymm = _mm256_slli_epi16(ymm, 1);
+            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_i32 = bt32;
+        }
+    }
+
+
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 64 % 32);
+
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+
+}
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    __m512i zmm;
+    __mmask64 bt;
+
+    if (elem_size % 8) {
+        return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
+    } else {
+        const __m512i mask = _mm512_set1_epi8(0);
+        for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
+            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                    ii += 8 * elem_size) {
+                zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                    zmm = _mm512_slli_epi16(zmm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    * (int64_t *) &out_b[ind] = bt;
+                }
+            }
+        }
+
+    }
+    return size * elem_size;
+}
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+    return count;
+}
+
+#else // #ifdef USEAVX512
+
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return -14;
+}
+
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+
+}
+
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+#endif
 
 /* ---- Drivers selecting best instruction set at compile time. ---- */
 
@@ -1632,7 +1799,9 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
     count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
     count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
@@ -1649,7 +1818,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
     count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
     count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
diff --git a/src/bitshuffle/src/bitshuffle_core.h b/src/bitshuffle/src/bitshuffle_core.h
index fba7301c..af09b1c4 100644
--- a/src/bitshuffle/src/bitshuffle_core.h
+++ b/src/bitshuffle/src/bitshuffle_core.h
@@ -19,6 +19,7 @@
  *      -11   : Missing SSE.
  *      -12   : Missing AVX.
  *      -13   : Missing Arm Neon.
+ *      -14   : Missing AVX512.
  *      -80   : Input size not a multiple of 8.
  *      -81   : block_size not multiple of 8.
  *      -91   : Decompression error, wrong number of bytes processed.
@@ -91,6 +92,18 @@ int bshuf_using_NEON(void);
 int bshuf_using_AVX2(void);
 
 
+/* ---- bshuf_using_AVX512 ----
+ *
+ * Whether routines were compiled with the AVX512 instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using AVX512, 0 otherwise.
+ *
+ */
+int bshuf_using_AVX512(void);
+
+
 /* ---- bshuf_default_block_size ----
  *
  * The default block size as function of element size.
diff --git a/src/bitshuffle/src/bshuf_h5filter.c b/src/bitshuffle/src/bshuf_h5filter.c
index 114b91ff..54de27d1 100644
--- a/src/bitshuffle/src/bshuf_h5filter.c
+++ b/src/bitshuffle/src/bshuf_h5filter.c
@@ -25,7 +25,7 @@ void bshuf_write_uint32_BE(void* buf, uint32_t num);
 uint32_t bshuf_read_uint32_BE(const void* buf);
 
 
-// Only called on compresion, not on reverse.
+// Only called on compression, not on reverse.
 herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
 
     herr_t r;
@@ -192,7 +192,7 @@ size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
             // Bit shuffle/compress.
             // Write the header, described in
             // http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
-            // Techincally we should be using signed integers instead of
+            // Technically we should be using signed integers instead of
             // unsigned ones, however for valid inputs (positive numbers) these
             // have the same representation.
             bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
diff --git a/src/bitshuffle/src/bshuf_h5filter.h b/src/bitshuffle/src/bshuf_h5filter.h
index 54ee6775..0c6f153d 100644
--- a/src/bitshuffle/src/bshuf_h5filter.h
+++ b/src/bitshuffle/src/bshuf_h5filter.h
@@ -13,7 +13,7 @@
  *
  * Filter Options
  * --------------
- *  block_size (option slot 0) : interger (optional)
+ *  block_size (option slot 0) : integer (optional)
  *      What block size to use (in elements not bytes). Default is 0,
  *      for which bitshuffle will pick a block size with a target of 8kb.
  *  Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
diff --git a/src/bitshuffle/src/iochain.c b/src/bitshuffle/src/iochain.c
index baa97296..37015614 100644
--- a/src/bitshuffle/src/iochain.c
+++ b/src/bitshuffle/src/iochain.c
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
  *
  * This file is part of Bitshuffle
  * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
diff --git a/src/bitshuffle/src/iochain.h b/src/bitshuffle/src/iochain.h
index 4e225d1b..8acafeae 100644
--- a/src/bitshuffle/src/iochain.h
+++ b/src/bitshuffle/src/iochain.h
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
  *
  * This file is part of Bitshuffle
  * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
diff --git a/src/bitshuffle/tests/make_regression_tdata.py b/src/bitshuffle/tests/make_regression_tdata.py
index 03deb422..86086226 100644
--- a/src/bitshuffle/tests/make_regression_tdata.py
+++ b/src/bitshuffle/tests/make_regression_tdata.py
@@ -24,7 +24,7 @@
 DTYPES = ["a1", "a2", "a3", "a4", "a6", "a8", "a10"]
 
 f = h5py.File(OUT_FILE, "w")
-g_orig = f.create_group("origional")
+g_orig = f.create_group("original")
 g_comp_lz4 = f.create_group("compressed")
 g_comp_zstd = f.create_group("compressed_zstd")
 
diff --git a/src/bitshuffle/tests/test_ext.py b/src/bitshuffle/tests/test_ext.py
index b2577c0d..7fbd7baf 100644
--- a/src/bitshuffle/tests/test_ext.py
+++ b/src/bitshuffle/tests/test_ext.py
@@ -34,7 +34,7 @@ def setUp(self):
         if TIME:
             n *= TIME
         # Almost random bits, but now quite. All bits exercised (to fully test
-        # transpose) but still slightly compresible.
+        # transpose) but still slightly compressible.
         self.data = random.randint(0, 200, n).astype(np.uint8)
         self.fun = ext.copy
         self.check = None
@@ -58,6 +58,8 @@ def tearDown(self):
                 return
             if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2():
                 return
+            if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512():
+                return
             else:
                 raise
         delta_t = min(delta_ts)
@@ -171,6 +173,18 @@ def test_03g_trans_bit_byte_AVX_32(self):
         self.fun = ext.trans_bit_byte_AVX
         self.check = trans_bit_byte
 
+    def test_03h_trans_bit_byte_AVX512(self):
+        self.case = "bit T byte AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.trans_bit_byte_AVX512
+        self.check = trans_bit_byte
+
+    def test_03g_trans_bit_byte_AVX512_32(self):
+        self.case = "bit T byte AVX512 32"
+        self.data = self.data.view(np.float32)
+        self.fun = ext.trans_bit_byte_AVX512
+        self.check = trans_bit_byte
+
     def test_04a_trans_bit_elem_AVX(self):
         self.case = "bit T elem AVX 64"
         self.data = self.data.view(np.float64)
@@ -213,6 +227,30 @@ def test_04g_trans_bit_elem_SSE_64(self):
         self.fun = ext.trans_bit_elem_SSE
         self.check = trans_bit_elem
 
+    def test_04h_trans_bit_elem_AVX512(self):
+        self.case = "bit T elem AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04i_trans_bit_elem_AVX512(self):
+        self.case = "bit T elem AVX512 128"
+        self.data = self.data.view(np.complex128)  # 16-byte elements
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04j_trans_bit_elem_AVX512_32(self):
+        self.case = "bit T elem AVX512 32"
+        self.data = self.data.view(np.float32)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
+    def test_04k_trans_bit_elem_AVX512_16(self):
+        self.case = "bit T elem AVX512 16"
+        self.data = self.data.view(np.int16)
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
     def test_06a_untrans_bit_elem_16(self):
         self.case = "bit U elem SSE 16"
         pre_trans = self.data.view(np.int16)
@@ -262,6 +300,20 @@ def test_06g_untrans_bit_elem_64(self):
         self.fun = ext.untrans_bit_elem_scal
         self.check_data = pre_trans
 
+    def test_06h_untrans_bit_elem_32(self):
+        self.case = "bit U elem AVX512 32"
+        pre_trans = self.data.view(np.float32)
+        self.data = trans_bit_elem(pre_trans)
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans
+
+    def test_06i_untrans_bit_elem_64(self):
+        self.case = "bit U elem AVX512 64"
+        pre_trans = self.data.view(np.float64)
+        self.data = trans_bit_elem(pre_trans)
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans
+
     def test_07a_trans_byte_bitrow_64(self):
         self.case = "byte T row scal 64"
         self.data = self.data.view(np.float64)
@@ -314,6 +366,30 @@ def test_08f_shuffle_bit_eight_AVX_128(self):
         self.fun = ext.shuffle_bit_eightelem_AVX
         self.check = ext.shuffle_bit_eightelem_scal
 
+    def test_08g_shuffle_bit_eight_AVX512_32(self):
+        self.case = "bit S eight AVX512 32"
+        self.data = self.data.view(np.float32)  # 4-byte elements
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08h_shuffle_bit_eight_AVX512_64(self):
+        self.case = "bit S eight AVX512 64"
+        self.data = self.data.view(np.float64)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08i_shuffle_bit_eight_AVX512_16(self):
+        self.case = "bit S eight AVX512 16"
+        self.data = self.data.view(np.int16)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
+    def test_08i_shuffle_bit_eight_AVX512_128(self):
+        self.case = "bit S eight AVX512 128"
+        self.data = self.data.view(np.complex128)
+        self.fun = ext.shuffle_bit_eightelem_AVX512
+        self.check = ext.shuffle_bit_eightelem_scal
+
     def test_09a_trans_bit_elem_scal_64(self):
         self.case = "bit T elem scal 64"
         self.data = self.data.view(np.float64)
@@ -353,6 +429,13 @@ def test_09f_untrans_bit_elem_AVX_64(self):
         self.fun = ext.untrans_bit_elem_AVX
         self.check_data = pre_trans
 
+    def test_09g_untrans_bit_elem_AVX_64(self):  # NOTE(review): runs the AVX512 kernel
+        self.case = "bit U elem AVX512 64"
+        pre_trans = self.data.view(np.float64)  # keep the untransposed reference
+        self.data = trans_bit_elem(pre_trans)  # input is the bit-transposed form
+        self.fun = ext.untrans_bit_elem_AVX512
+        self.check_data = pre_trans  # NOTE(review): same body as test_06i
+
     def test_10a_bitshuffle_64(self):
         self.case = "bitshuffle 64"
         self.data = self.data.view(np.float64)
@@ -481,10 +564,18 @@ def test_trans_bit_elem_AVX(self):
         self.fun = ext.trans_bit_elem_AVX
         self.check = trans_bit_elem
 
+    def test_trans_bit_elem_AVX512(self):
+        self.fun = ext.trans_bit_elem_AVX512
+        self.check = trans_bit_elem
+
     def test_untrans_bit_elem_AVX(self):
         self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
         self.check = lambda x: x
 
+    def test_untrans_bit_elem_AVX512(self):
+        self.fun = lambda x: ext.untrans_bit_elem_AVX512(ext.trans_bit_elem(x))
+        self.check = lambda x: x  # identity: untranspose(transpose(x)) == x
+
     def test_trans_bit_elem_scal(self):
         self.fun = ext.trans_bit_elem_scal
         self.check = trans_bit_elem
@@ -515,12 +606,14 @@ def tearDown(self):
                 return
             if len(err.args) > 1 and (err.args[1] == -12) and not ext.using_AVX2():
                 return
+            if len(err.args) > 1 and (err.args[1] == -14) and not ext.using_AVX512():
+                return
             else:
                 raise
 
 
 class TestBitShuffleCircle(unittest.TestCase):
-    """Ensure that final filter is circularly consistant for any data type and
+    """Ensure that final filter is circularly consistent for any data type and
     any length buffer."""
 
     def test_circle(self):