From 841ab4ee0cd9a7698e5442abe25641cc09ccfc7e Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 25 Sep 2024 18:25:13 +0100 Subject: [PATCH] Test, improvements on SSE, AVX --- Cargo.lock | 11 -- README.md | 16 +- app/Cargo.toml | 2 +- app/benches/dilation/main.rs | 22 +-- app/src/main.rs | 13 +- src/filter.rs | 68 +++++++- src/morph_base.rs | 2 +- src/morph_rgb.rs | 9 +- src/morph_rgba.rs | 9 +- src/op.rs | 2 +- src/op_f32.rs | 2 +- src/op_impl.rs | 2 +- src/ops/avx/mod.rs | 3 + src/ops/avx/morph_op.rs | 230 ++++++++++++++++++++++++++ src/ops/mod.rs | 2 + src/ops/neon/mod.rs | 2 +- src/ops/sse/hminmax.rs | 73 -------- src/ops/sse/mod.rs | 8 +- src/ops/sse/morph_op.rs | 1 + src/ops/sse/morph_op_f32.rs | 187 +++++++++++++++++++++ src/ops/sse/morph_op_u16.rs | 186 +++++++++++++++++++++ src/packing/avx/deinterleave_rgb.rs | 128 ++++++++++++++ src/packing/avx/deinterleave_rgba.rs | 142 ++++++++++++++++ src/packing/avx/mod.rs | 5 + src/packing/avx/v_load.rs | 21 +++ src/packing/mod.rs | 10 +- src/packing/neon/deinterleave_rgba.rs | 2 +- src/packing/pack_gray_alpha.rs | 2 +- src/packing/pack_rgb.rs | 8 + src/packing/pack_rgba.rs | 11 +- src/packing/sse/deinterleave_rgb.rs | 10 ++ src/packing/sse/deinterleave_rgba.rs | 16 +- src/packing/sse/mod.rs | 5 + src/packing/sse/pack_rgb.rs | 111 +++++++++++++ src/packing/sse/pack_rgba.rs | 122 ++++++++++++++ src/packing/sse/v_store.rs | 121 ++++++++++++++ src/packing/unpack_rgb.rs | 5 + src/packing/unpack_rgba.rs | 34 +--- 38 files changed, 1418 insertions(+), 185 deletions(-) create mode 100644 src/ops/avx/mod.rs create mode 100644 src/ops/avx/morph_op.rs delete mode 100644 src/ops/sse/hminmax.rs create mode 100644 src/ops/sse/morph_op_f32.rs create mode 100644 src/ops/sse/morph_op_u16.rs create mode 100644 src/packing/avx/deinterleave_rgb.rs create mode 100644 src/packing/avx/deinterleave_rgba.rs create mode 100644 src/packing/sse/pack_rgb.rs create mode 100644 src/packing/sse/pack_rgba.rs create mode 100644 src/packing/sse/v_store.rs diff --git a/Cargo.lock b/Cargo.lock index a7b79db..d0cbfc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,7 +261,6 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", ] [[package]] @@ -679,16 +678,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "libloading" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" -dependencies = [ - "cfg-if", - "windows-targets", -] - [[package]] name = "libm" version = "0.2.8" diff --git a/README.md b/README.md index 2c31e10..fb245b9 100644 --- a/README.md +++ b/README.md @@ -50,17 +50,17 @@ M3 Pro, NEON dilation RGBA image 2731x4096 with specified kernel size SSE dilation RGB image 2731x4096 with specified kernel size -| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | -|--------|---------|----------|----------|----------|-------| -| FM | 84.19ms | 186.53ms | 254.70ms | 673.45ms | 1.37s | -| OpenCV | 28.61ms | 62.43ms | 114.80ms | 428.87ms | 1.16s | +| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | +|--------|---------|---------|----------|----------|----------| +| FM | 30.71ms | 34.87ms | 39.93ms | 81.56ms | 149.37ms | +| OpenCV | 27.36ms | 63.05ms | 112.54ms | 419.40ms | 1.08s | SSE dilation RGBA image 2731x4096 with specified kernel size -| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | -|--------|----------|----------|----------|----------|-------| -| FM | 109.37ms | 229.11ms | 329.31ms | 
981.48ms | 2.05s | -| OpenCV | 39.20ms | 76.09ms | 149.12ms | 569.36ms | 1.33s | +| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | +|--------|---------|---------|----------|----------|----------| +| FM | 45.03ms | 49.03ms | 56.40ms | 114.72ms | 206.05ms | +| OpenCV | 35.50ms | 79.60ms | 147.32ms | 556.56ms | 1.33s | This project is licensed under either of diff --git a/app/Cargo.toml b/app/Cargo.toml index bfbe51f..9da90f9 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" image = "0.25.2" fast_morphology = {path = "../"} imageproc = "0.25.0" -opencv = {version = "0.93.0", features = ["imgproc", "clang-runtime"]} +opencv = {version = "0.93.0", features = ["imgproc"]} [dev-dependencies] criterion = {version = "0.5.1", features = ["html_reports"]} diff --git a/app/benches/dilation/main.rs b/app/benches/dilation/main.rs index bfa901b..f41276d 100644 --- a/app/benches/dilation/main.rs +++ b/app/benches/dilation/main.rs @@ -316,19 +316,19 @@ pub fn criterion_benchmark(c: &mut Criterion) { opencv::core::set_use_ipp(false).expect("Failed to disable IPP"); opencv::core::set_use_optimized(false).expect("Failed to disable opts"); - // exec_bench_rgb(c, 4); - // exec_bench_rgb(c, 7); - // exec_bench_rgb(c, 10); - // exec_bench_rgb(c, 20); - // exec_bench_rgb(c, 30); + exec_bench_rgb(c, 4); + exec_bench_rgb(c, 7); + exec_bench_rgb(c, 10); + exec_bench_rgb(c, 20); + exec_bench_rgb(c, 30); // - exec_bench_rgba(c, 4); - exec_bench_rgba(c, 7); - exec_bench_rgba(c, 10); - exec_bench_rgba(c, 20); - exec_bench_rgba(c, 30); + // exec_bench_rgba(c, 4); + // exec_bench_rgba(c, 7); + // exec_bench_rgba(c, 10); + // exec_bench_rgba(c, 20); + // exec_bench_rgba(c, 30); - exec_bench_gray(c, 4); + // exec_bench_gray(c, 4); // exec_bench_gray(c, 7); // exec_bench_gray(c, 10); // exec_bench_gray(c, 20); diff --git a/app/src/main.rs b/app/src/main.rs index b6ed134..ff1c4cc 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,10 +1,9 @@ use fast_morphology::{ - dilate, dilate_rgb, BorderMode, ImageSize, KernelShape, MorphologyThreadingPolicy, + dilate, dilate_rgb, dilate_rgba, BorderMode, ImageSize, KernelShape, MorphologyThreadingPolicy, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use opencv::core::{ - Mat, MatTrait, MatTraitConstManual, Point, Scalar, - BORDER_REPLICATE, CV_8U, CV_8UC3, + Mat, MatTrait, MatTraitConstManual, Point, Scalar, BORDER_REPLICATE, CV_8U, CV_8UC3, }; use opencv::imgproc; use std::time::Instant; @@ -164,11 +163,11 @@ fn main() { } let rgba_image = transient_rgba.as_bytes(); - let mut dst = vec![0u8; saved_origin.len()]; + let mut dst = vec![0u8; rgba_image.len()]; let exec_time = Instant::now(); - dilate_rgb( - &saved_origin, + dilate_rgba( + &rgba_image, &mut dst, image_size, &structuring_element, @@ -237,7 +236,7 @@ fn main() { &dst, dimensions.0, dimensions.1, - image::ColorType::Rgb8, + image::ColorType::Rgba8, ) .unwrap(); diff --git a/src/filter.rs b/src/filter.rs index a82d5d9..079b576 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -28,15 +28,17 @@ */ use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; use crate::op_type::MorphOp; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::ops::avx::MorphOpFilterAvx2DRow; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::ops::neon::MorphOpFilterNeon2DRow; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::ops::sse::{MorphOpFilterSse2DRow}; +use 
crate::ops::sse::{MorphOpFilterSse2DRow, MorphOpFilterSse2DRowF32, MorphOpFilterSse2DRowU16}; use crate::ops::MorphFilterFlat2DRow; use crate::unsafe_slice::UnsafeSlice; use crate::ImageSize; -use crate::morph_base::MorphNativeOp; pub struct MorthFilterFlat2DRow where @@ -78,9 +80,34 @@ where } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - _result = Box::new( - MorphOpFilterSse2DRow::<{ MorphOp::Dilate as u8 }>::default(), - ); + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRow::< + { MorphOp::Dilate as u8 }, + >::default()); + } + if std::arch::is_x86_feature_detected!("avx2") { + _result = Box::new(MorphOpFilterAvx2DRow::< + { MorphOp::Dilate as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "u16" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowU16::< + { MorphOp::Dilate as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "f32" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowF32::< + { MorphOp::Dilate as u8 }, + >::default()); + } } } _result @@ -97,9 +124,34 @@ where } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - _result = Box::new( - MorphOpFilterSse2DRow::<{ MorphOp::Erode as u8 }>::default(), - ); + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new( + MorphOpFilterSse2DRow::<{ MorphOp::Erode as u8 }>::default(), + ); + } + if std::arch::is_x86_feature_detected!("avx2") { + _result = Box::new( + MorphOpFilterAvx2DRow::<{ MorphOp::Erode as u8 }>::default(), + ); + } + } + } else if std::any::type_name::() == "u16" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowU16::< + { MorphOp::Erode as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "f32" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowF32::< + { MorphOp::Erode as u8 }, + >::default()); + } } } _result diff --git a/src/morph_base.rs b/src/morph_base.rs index 4463260..0f8eb9c 100644 --- a/src/morph_base.rs +++ b/src/morph_base.rs @@ -60,4 +60,4 @@ impl MorphNativeOp for f32 { MorphOp::Erode => (*self).min(other), } } -} \ No newline at end of file +} diff --git a/src/morph_rgb.rs b/src/morph_rgb.rs index cbf40db..c5ba9fb 100644 --- a/src/morph_rgb.rs +++ b/src/morph_rgb.rs @@ -41,14 +41,7 @@ pub(crate) unsafe fn make_morphology_rgb( threading_policy: MorphologyThreadingPolicy, ) -> Result<(), String> where - T: RgbPackable - + Copy - + 'static - + Sync - + Send - + Clone - + Default - + MorphNativeOp, + T: RgbPackable + Copy + 'static + Sync + Send + Clone + Default + MorphNativeOp, { let unpacked = T::unpack(src, image_size); let mut dst_unpacked = UnpackedRgbImage::alloc(image_size); diff --git a/src/morph_rgba.rs b/src/morph_rgba.rs index a618b0f..11883a5 100644 --- a/src/morph_rgba.rs +++ b/src/morph_rgba.rs @@ -41,14 +41,7 @@ pub(crate) unsafe fn make_morphology_rgba( threading_policy: MorphologyThreadingPolicy, ) -> Result<(), String> where - T: RgbaPackable - + Default - + Copy - + Clone - + Send - + Sync - + 'static - + MorphNativeOp, + T: RgbaPackable + Default + Copy + Clone + Send + Sync + 'static + MorphNativeOp, { let 
unpacked = T::unpack(src, image_size); let mut dst_unpacked = UnpackedRgbaImage::alloc(image_size); diff --git a/src/op.rs b/src/op.rs index 70bb4b1..eb258db 100644 --- a/src/op.rs +++ b/src/op.rs @@ -27,13 +27,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::border_mode::BorderMode; +use crate::morph_gray_alpha::make_morphology_gray_alpha; use crate::morph_rgb::make_morphology_rgb; use crate::morph_rgba::make_morphology_rgba; use crate::op_impl::make_morphology; use crate::op_type::MorphOp; use crate::structuring_element::KernelShape; use crate::{ImageSize, MorphologyThreadingPolicy}; -use crate::morph_gray_alpha::make_morphology_gray_alpha; /// Dilate a gray (planar) image /// diff --git a/src/op_f32.rs b/src/op_f32.rs index 5e64740..ec42037 100644 --- a/src/op_f32.rs +++ b/src/op_f32.rs @@ -27,13 +27,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::border_mode::BorderMode; +use crate::morph_gray_alpha::make_morphology_gray_alpha; use crate::morph_rgb::make_morphology_rgb; use crate::morph_rgba::make_morphology_rgba; use crate::op_impl::make_morphology; use crate::op_type::MorphOp; use crate::structuring_element::KernelShape; use crate::{ImageSize, MorphologyThreadingPolicy}; -use crate::morph_gray_alpha::make_morphology_gray_alpha; /// Dilate a gray (planar) stored in u16 image /// diff --git a/src/op_impl.rs b/src/op_impl.rs index 78cecfa..d45596d 100644 --- a/src/op_impl.rs +++ b/src/op_impl.rs @@ -30,13 +30,13 @@ use crate::arena::make_arena; use crate::border_mode::BorderMode; use crate::filter::MorthFilterFlat2DRow; use crate::filter_op_declare::MorthOpFilterFlat2DRow; +use crate::morph_base::MorphNativeOp; use crate::op_type::MorphOp; use crate::se_scan::scan_se; use crate::structuring_element::KernelShape; use crate::unsafe_slice::UnsafeSlice; use crate::{ImageSize, MorphologyThreadingPolicy}; use std::sync::Arc; -use crate::morph_base::MorphNativeOp; pub(crate) unsafe fn make_morphology( src: &[T], diff --git a/src/ops/avx/mod.rs b/src/ops/avx/mod.rs new file mode 100644 index 0000000..8102778 --- /dev/null +++ b/src/ops/avx/mod.rs @@ -0,0 +1,3 @@ +mod morph_op; + +pub use morph_op::MorphOpFilterAvx2DRow; diff --git a/src/ops/avx/morph_op.rs b/src/ops/avx/morph_op.rs new file mode 100644 index 0000000..d4e16dc --- /dev/null +++ b/src/ops/avx/morph_op.rs @@ -0,0 +1,230 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterAvx2DRow {} + +impl Default for MorphOpFilterAvx2DRow { + fn default() -> Self { + MorphOpFilterAvx2DRow {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterAvx2DRow +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "avx2")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_epu8, + MorphOp::Erode => _mm_min_epu8, + }; + + let decision_avx = match op_type { + MorphOp::Dilate => _mm256_max_epu8, + MorphOp::Erode => _mm256_min_epu8, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 128 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + let mut row1 = _mm256_loadu_si256(ptr0.add(32) as *const __m256i); + let mut row2 = _mm256_loadu_si256(ptr0.add(64) as *const __m256i); + let mut row3 = _mm256_loadu_si256(ptr0.add(96) as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + let new_row1 = _mm256_loadu_si256(ptr_d.add(32) as *const __m256i); + let new_row2 = _mm256_loadu_si256(ptr_d.add(64) as *const __m256i); + let new_row3 = _mm256_loadu_si256(ptr_d.add(96) as *const __m256i); + row0 = decision_avx(row0, new_row0); + row1 = decision_avx(row1, new_row1); + row2 = decision_avx(row2, new_row2); + row3 = decision_avx(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + _mm256_storeu_si256(v_dst.add(32) as *mut __m256i, row1); + _mm256_storeu_si256(v_dst.add(64) as *mut __m256i, row2); + _mm256_storeu_si256(v_dst.add(96) as *mut __m256i, row3); + + _cx += 128; + } + + while _cx + 64 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + 
let mut row1 = _mm256_loadu_si256(ptr0.add(32) as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + let new_row1 = _mm256_loadu_si256(ptr_d.add(32) as *const __m256i); + row0 = decision_avx(row0, new_row0); + row1 = decision_avx(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + _mm256_storeu_si256(v_dst.add(32) as *mut __m256i, row1); + + _cx += 64; + } + + while _cx + 32 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + row0 = decision_avx(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + + _cx += 32; + } + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + _mm_storeu_si128(v_dst as *mut __m128i, row0); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si64(ptr0); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si64(ptr_d); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + std::ptr::copy_nonoverlapping(&row0 as *const _ as *const u8, v_dst, 8); + + _cx += 8; + } + + for x in (_cx..width.saturating_sub(4)).step_by(4) { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + let mut k1 = *(*offsets.get_unchecked(0)).get_unchecked(x + 1); + let mut k2 = *(*offsets.get_unchecked(0)).get_unchecked(x + 2); + let mut k3 = *(*offsets.get_unchecked(0)).get_unchecked(x + 3); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + k1 = k1.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 1)); + k2 = k2.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 2)); + k3 = k3.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 3)); + } + + let dst_offset = y * stride + x; + + dst.write(dst_offset, k0); + dst.write(dst_offset + 1, k1); + dst.write(dst_offset + 2, k2); + dst.write(dst_offset + 3, k3); + _cx = x; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/ops/mod.rs b/src/ops/mod.rs index 65800fc..7fd6373 100644 --- a/src/ops/mod.rs +++ b/src/ops/mod.rs @@ -26,6 +26,8 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub mod avx; mod morph_row_op; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon; diff --git a/src/ops/neon/mod.rs b/src/ops/neon/mod.rs index b1f550b..8deee91 100644 --- a/src/ops/neon/mod.rs +++ b/src/ops/neon/mod.rs @@ -28,4 +28,4 @@ */ mod morph_op; -pub use morph_op::MorphOpFilterNeon2DRow; \ No newline at end of file +pub use morph_op::MorphOpFilterNeon2DRow; diff --git a/src/ops/sse/hminmax.rs b/src/ops/sse/hminmax.rs deleted file mode 100644 index 00ab939..0000000 --- a/src/ops/sse/hminmax.rs +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_hmax_epu8(v: __m128i) -> u8 { - let mut vmax = v; - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<1>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<2>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<4>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<8>(vmax, vmax)); - _mm_extract_epi8::<0>(vmax) as u8 -} - -#[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_hmin_epu8(v: __m128i) -> u8 { - let mut vmax = v; - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<1>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<2>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<4>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<8>(vmax, vmax)); - _mm_extract_epi8::<0>(vmax) as u8 -} - -#[cfg(test)] -mod tests { - use crate::ops::sse::hminmax::{_mm_hmax_epu8, _mm_hmin_epu8}; - use std::arch::x86_64::{__m128i, _mm_loadu_si128}; - - #[test] - fn test_hmax() { - unsafe { - let values: [u8; 16] = [1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - let row = _mm_loadu_si128(values.as_ptr() as *const __m128i); - let max = _mm_hmax_epu8(row); - let min = _mm_hmin_epu8(row); - assert_eq!(max, 15); - assert_eq!(min, 0); - } - } -} diff --git a/src/ops/sse/mod.rs b/src/ops/sse/mod.rs index 99526c9..6a621d5 100644 --- a/src/ops/sse/mod.rs +++ b/src/ops/sse/mod.rs @@ -26,10 +26,10 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -mod hminmax; mod morph_op; -mod op; +mod morph_op_f32; +mod morph_op_u16; -pub use hminmax::{_mm_hmax_epu8, _mm_hmin_epu8}; pub use morph_op::MorphOpFilterSse2DRow; -pub use op::{fast_morph_op_1d_sse, fast_morph_op_3d_sse, fast_morph_op_4d_sse}; +pub use morph_op_f32::MorphOpFilterSse2DRowF32; +pub use morph_op_u16::MorphOpFilterSse2DRowU16; diff --git a/src/ops/sse/morph_op.rs b/src/ops/sse/morph_op.rs index 044b6f6..797c6ef 100644 --- a/src/ops/sse/morph_op.rs +++ b/src/ops/sse/morph_op.rs @@ -50,6 +50,7 @@ impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRow, { + #[target_feature(enable = "sse4.1")] unsafe fn dispatch_row( &self, arena: &Arena, diff --git a/src/ops/sse/morph_op_f32.rs b/src/ops/sse/morph_op_f32.rs new file mode 100644 index 0000000..15e92be --- /dev/null +++ b/src/ops/sse/morph_op_f32.rs @@ -0,0 +1,187 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterSse2DRowF32 {} + +impl Default for MorphOpFilterSse2DRowF32 { + fn default() -> Self { + MorphOpFilterSse2DRowF32 {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRowF32 +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "sse4.1")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_ps, + MorphOp::Erode => _mm_min_ps, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + let mut row1 = _mm_loadu_ps(ptr0.add(4)); + let mut row2 = _mm_loadu_ps(ptr0.add(8)); + let mut row3 = _mm_loadu_ps(ptr0.add(12)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); + let new_row1 = _mm_loadu_ps(ptr_d.add(4)); + let new_row2 = _mm_loadu_ps(ptr_d.add(8)); + let new_row3 = _mm_loadu_ps(ptr_d.add(12)); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + row2 = decision(row2, new_row2); + row3 = decision(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + + _mm_storeu_ps(v_dst, row0); + _mm_storeu_ps(v_dst.add(4), row1); + _mm_storeu_ps(v_dst.add(8), row2); + _mm_storeu_ps(v_dst.add(12), row3); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + let mut row1 = _mm_loadu_ps(ptr0.add(4)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); 
+ let new_row1 = _mm_loadu_ps(ptr_d.add(4)); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + + _mm_storeu_ps(v_dst, row0); + _mm_storeu_ps(v_dst.add(8), row1); + + _cx += 8; + } + + while _cx + 4 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + _mm_storeu_ps(v_dst, row0); + + _cx += 4; + } + + while _cx + 2 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_castsi128_ps(_mm_loadu_si64(ptr0 as *const u8)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_castsi128_ps(_mm_loadu_si64(ptr_d as *const u8)); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + let v0 = _mm_castps_si128(row0); + std::ptr::copy_nonoverlapping(&v0 as *const _ as *const u8, v_dst, 8); + + _cx += 2; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/ops/sse/morph_op_u16.rs b/src/ops/sse/morph_op_u16.rs new file mode 100644 index 0000000..8e0fdf6 --- /dev/null +++ b/src/ops/sse/morph_op_u16.rs @@ -0,0 +1,186 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterSse2DRowU16 {} + +impl Default for MorphOpFilterSse2DRowU16 { + fn default() -> Self { + MorphOpFilterSse2DRowU16 {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRowU16 +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "sse4.1")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_epu16, + MorphOp::Erode => _mm_min_epu16, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 32 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + let mut row1 = _mm_loadu_si128(ptr0.add(8) as *const __m128i); + let mut row2 = _mm_loadu_si128(ptr0.add(16) as *const __m128i); + let mut row3 = _mm_loadu_si128(ptr0.add(24) as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + let new_row1 = _mm_loadu_si128(ptr_d.add(8) as *const __m128i); + let new_row2 = _mm_loadu_si128(ptr_d.add(16) as *const __m128i); + let new_row3 = _mm_loadu_si128(ptr_d.add(24) as *const __m128i); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + row2 = decision(row2, new_row2); + row3 = decision(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm_storeu_si128(v_dst as *mut __m128i, row0); + _mm_storeu_si128(v_dst.add(8) as *mut __m128i, row1); + _mm_storeu_si128(v_dst.add(16) as *mut __m128i, row2); + _mm_storeu_si128(v_dst.add(24) as *mut __m128i, row3); + + _cx += 32; + } + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + let mut row1 = _mm_loadu_si128(ptr0.add(8) as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + let new_row1 = _mm_loadu_si128(ptr_d.add(8) as *const __m128i); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm_storeu_si128(v_dst as *mut __m128i, row0); + _mm_storeu_si128(v_dst.add(8) as *mut __m128i, row1); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const 
__m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + _mm_storeu_si128(v_dst as *mut __m128i, row0); + + _cx += 8; + } + + while _cx + 4 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si64(ptr0 as *const u8); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si64(ptr_d as *const u8); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + std::ptr::copy_nonoverlapping(&row0 as *const _ as *const u8, v_dst, 8); + + _cx += 4; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/packing/avx/deinterleave_rgb.rs b/src/packing/avx/deinterleave_rgb.rs new file mode 100644 index 0000000..90de228 --- /dev/null +++ b/src/packing/avx/deinterleave_rgb.rs @@ -0,0 +1,128 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::packing::avx::v_load::_mm256_load_deinterleave_rgb; +use crate::packing::sse::{_mm_load_deinterleave_half_rgb, _mm_load_deinterleave_rgb}; +use crate::packing::UnpackedRgbImage; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub fn deinterleave_rgb_avx(rgb_image: &[u8], width: usize, height: usize) -> UnpackedRgbImage { + unsafe { deinterleave_rgb_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn deinterleave_rgb_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbImage { + if rgb_image.len() != width * height * 3 { + panic!( + "Image bounds in deinterleave_rgb_sse is mismatched! Expected {} but got {}", + width * height * 3, + rgb_image.len() + ); + } + let mut r_chan = vec![0u8; width * height]; + let mut g_chan = vec![0u8; width * height]; + let mut b_chan = vec![0u8; width * height]; + + let mut r_dst = r_chan.as_mut_slice(); + let mut g_dst = g_chan.as_mut_slice(); + let mut b_dst = b_chan.as_mut_slice(); + + let src_stride = width * 3; + + let mut src = rgb_image; + unsafe { + for _ in 0..height { + let mut _cx = 0usize; + + while _cx + 32 < width { + let px = _cx * 3; + let pixels = _mm256_load_deinterleave_rgb(src.as_ptr().add(px)); + _mm256_storeu_si256(r_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.0); + _mm256_storeu_si256(g_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.1); + _mm256_storeu_si256(b_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.2); + _cx += 32; + } + + while _cx + 16 < width { + let px = _cx * 3; + let pixels = _mm_load_deinterleave_rgb(src.as_ptr().add(px)); + _mm_storeu_si128(r_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.0); + _mm_storeu_si128(g_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.1); + _mm_storeu_si128(b_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.2); + _cx += 16; + } + + while _cx + 8 < width { + let px = _cx * 3; + let pixels = _mm_load_deinterleave_half_rgb(src.as_ptr().add(px), 0); + let v0 = pixels.0; + std::ptr::copy_nonoverlapping( + &v0 as *const _ as *const u8, + r_dst.as_mut_ptr().add(_cx), + 8, + ); + let v1 = pixels.1; + std::ptr::copy_nonoverlapping( + &v1 as *const _ as *const u8, + g_dst.as_mut_ptr().add(_cx), + 8, + ); + let v2 = pixels.2; + std::ptr::copy_nonoverlapping( + &v2 as *const _ as *const u8, + b_dst.as_mut_ptr().add(_cx), + 8, + ); + _cx += 8; + } + + while _cx < width { + let px = _cx * 3; + let src_align = src.get_unchecked(px..); + *r_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(0); + *g_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(1); + *b_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(2); + _cx += 1; + } + + src = src.get_unchecked(src_stride..); + r_dst = r_dst.get_unchecked_mut(width..); + g_dst = g_dst.get_unchecked_mut(width..); + b_dst = b_dst.get_unchecked_mut(width..); + } + } + UnpackedRgbImage::new(r_chan, g_chan, b_chan) +} diff --git a/src/packing/avx/deinterleave_rgba.rs b/src/packing/avx/deinterleave_rgba.rs new file mode 100644 index 0000000..9770f06 --- /dev/null +++ b/src/packing/avx/deinterleave_rgba.rs @@ -0,0 +1,142 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::packing::avx::v_load::_mm256_load_deinterleave_rgba; +use crate::packing::sse::{_mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba}; +use crate::packing::UnpackedRgbaImage; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub fn deinterleave_rgba_avx( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbaImage { + unsafe { deinterleave_rgba_avx_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn deinterleave_rgba_avx_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbaImage { + if rgb_image.len() != width * height * 4 { + panic!( + "Image bounds in deinterleave_rgba_sse is mismatched! 
Expected {} but got {}", + width * height * 4, + rgb_image.len() + ); + } + let mut unpacked_image = UnpackedRgbaImage::alloc(ImageSize::new(width, height)); + + let mut r_dst: &mut [u8] = unpacked_image.r_channel.as_mut_slice(); + let mut g_dst: &mut [u8] = unpacked_image.g_channel.as_mut_slice(); + let mut b_dst: &mut [u8] = unpacked_image.b_channel.as_mut_slice(); + let mut a_dst: &mut [u8] = unpacked_image.a_channel.as_mut_slice(); + + let src_stride = width * 4; + + let mut src = rgb_image; + unsafe { + for _ in 0..height { + let mut _cx = 0usize; + + while _cx + 32 < width { + let px = _cx * 4; + let pixels = _mm256_load_deinterleave_rgba(src.as_ptr().add(px)); + _mm256_storeu_si256(r_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.0); + _mm256_storeu_si256(g_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.1); + _mm256_storeu_si256(b_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.2); + _mm256_storeu_si256(a_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.3); + _cx += 32; + } + + while _cx + 16 < width { + let px = _cx * 4; + let pixels = _mm_load_deinterleave_rgba(src.as_ptr().add(px)); + _mm_storeu_si128(r_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.0); + _mm_storeu_si128(g_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.1); + _mm_storeu_si128(b_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.2); + _mm_storeu_si128(a_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.3); + _cx += 16; + } + + while _cx + 8 < width { + let px = _cx * 4; + let pixels = _mm_load_deinterleave_half_rgba(src.as_ptr().add(px), 0); + let v0 = pixels.0; + std::ptr::copy_nonoverlapping( + &v0 as *const _ as *const u8, + r_dst.as_mut_ptr().add(_cx), + 8, + ); + let v1 = pixels.1; + std::ptr::copy_nonoverlapping( + &v1 as *const _ as *const u8, + g_dst.as_mut_ptr().add(_cx), + 8, + ); + let v2 = pixels.2; + std::ptr::copy_nonoverlapping( + &v2 as *const _ as *const u8, + b_dst.as_mut_ptr().add(_cx), + 8, + ); + let v3 = pixels.3; + std::ptr::copy_nonoverlapping( + &v3 as *const _ as *const u8, + a_dst.as_mut_ptr().add(_cx), + 8, + ); + _cx += 8; + } + + while _cx < width { + let px = _cx * 4; + let src_align = src.get_unchecked(px..); + *r_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(0); + *g_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(1); + *b_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(2); + *a_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(3); + _cx += 1; + } + + src = src.get_unchecked(src_stride..); + r_dst = r_dst.get_unchecked_mut(width..); + g_dst = g_dst.get_unchecked_mut(width..); + b_dst = b_dst.get_unchecked_mut(width..); + a_dst = a_dst.get_unchecked_mut(width..); + } + } + unpacked_image +} diff --git a/src/packing/avx/mod.rs b/src/packing/avx/mod.rs index e9e8546..ea4ddb5 100644 --- a/src/packing/avx/mod.rs +++ b/src/packing/avx/mod.rs @@ -26,4 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +mod deinterleave_rgb; +mod deinterleave_rgba; mod v_load; + +pub use deinterleave_rgb::deinterleave_rgb_avx; +pub use deinterleave_rgba::deinterleave_rgba_avx; diff --git a/src/packing/avx/v_load.rs b/src/packing/avx/v_load.rs index ff706e2..053cb30 100644 --- a/src/packing/avx/v_load.rs +++ b/src/packing/avx/v_load.rs @@ -145,3 +145,24 @@ pub unsafe fn _mm256_deinterleave_rgb( let r0 = _mm256_shuffle_epi8(r0, sh_r); (b0, g0, r0) } + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn _mm256_load_deinterleave_rgb(ptr: *const u8) -> (__m256i, __m256i, __m256i) { + let row0 = _mm256_loadu_si256(ptr as *const __m256i); + let row1 = _mm256_loadu_si256(ptr.add(32) as *const __m256i); + let row2 = _mm256_loadu_si256(ptr.add(64) as *const __m256i); + _mm256_deinterleave_rgb(row0, row1, row2) +} + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn _mm256_load_deinterleave_rgba( + ptr: *const u8, +) -> (__m256i, __m256i, __m256i, __m256i) { + let row0 = _mm256_loadu_si256(ptr as *const __m256i); + let row1 = _mm256_loadu_si256(ptr.add(32) as *const __m256i); + let row2 = _mm256_loadu_si256(ptr.add(64) as *const __m256i); + let row3 = _mm256_loadu_si256(ptr.add(96) as *const __m256i); + _mm256_deinterleave_rgba_epi8(row0, row1, row2, row3) +} diff --git a/src/packing/mod.rs b/src/packing/mod.rs index ee78823..1e35c6a 100644 --- a/src/packing/mod.rs +++ b/src/packing/mod.rs @@ -30,23 +30,23 @@ pub mod avx; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] mod neon; +mod pack_gray_alpha; mod pack_rgb; mod pack_rgba; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod sse; mod traits; +mod unpack_gray_alpha; mod unpack_rgb; mod unpack_rgba; +mod unpacked_gray_alpha; mod unpacked_rgb_image; mod unpacked_rgba_image; -mod unpack_gray_alpha; -mod unpacked_gray_alpha; -mod pack_gray_alpha; pub use pack_rgb::pack_rgb; -pub use traits::{RgbPackable, RgbaPackable, GrayAlphaPackable}; +pub use traits::{GrayAlphaPackable, RgbPackable, RgbaPackable}; pub use unpack_rgb::unpack_rgb; pub use unpack_rgba::unpack_rgba; +pub use unpacked_gray_alpha::UnpackedGrayAlpha; pub use unpacked_rgb_image::UnpackedRgbImage; pub use unpacked_rgba_image::UnpackedRgbaImage; -pub use unpacked_gray_alpha::UnpackedGrayAlpha; \ No newline at end of file diff --git a/src/packing/neon/deinterleave_rgba.rs b/src/packing/neon/deinterleave_rgba.rs index 4cf8780..195a663 100644 --- a/src/packing/neon/deinterleave_rgba.rs +++ b/src/packing/neon/deinterleave_rgba.rs @@ -28,8 +28,8 @@ */ use crate::packing::UnpackedRgbaImage; -use std::arch::aarch64::*; use crate::ImageSize; +use std::arch::aarch64::*; pub fn deinterleave_rgba_neon( rgb_image: &[u8], diff --git a/src/packing/pack_gray_alpha.rs b/src/packing/pack_gray_alpha.rs index 9432a18..3dd864b 100644 --- a/src/packing/pack_gray_alpha.rs +++ b/src/packing/pack_gray_alpha.rs @@ -44,4 +44,4 @@ pub fn pack_gray_alpha_naive( src[0] = *r; src[1] = *g; } -} \ No newline at end of file +} diff --git a/src/packing/pack_rgb.rs b/src/packing/pack_rgb.rs index 4c398db..8990b99 100644 --- a/src/packing/pack_rgb.rs +++ b/src/packing/pack_rgb.rs @@ -28,6 +28,8 @@ */ #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::packing::neon::pack_rgb_neon; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::sse::pack_rgb_sse; use crate::packing::UnpackedRgbImage; use crate::ImageSize; @@ -61,6 +63,12 @@ pub fn pack_rgb( { _dispatcher = pack_rgb_neon; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if 
std::arch::is_x86_feature_detected!("sse4.1") { + _dispatcher = pack_rgb_sse; + } + } _dispatcher( unpacked_rgb_image, dst_image, diff --git a/src/packing/pack_rgba.rs b/src/packing/pack_rgba.rs index 99b2550..818ccae 100644 --- a/src/packing/pack_rgba.rs +++ b/src/packing/pack_rgba.rs @@ -26,9 +26,12 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::packing::neon::pack_rgba_neon; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::sse::pack_rgba_sse; use crate::packing::UnpackedRgbaImage; use crate::ImageSize; -use crate::packing::neon::pack_rgba_neon; pub fn interleave_rgba_naive( unpacked_rgba_image: &UnpackedRgbaImage, @@ -63,6 +66,12 @@ pub fn pack_rgba( { _dispatcher = pack_rgba_neon; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _dispatcher = pack_rgba_sse; + } + } _dispatcher( unpacked_rgb_image, dst_image, diff --git a/src/packing/sse/deinterleave_rgb.rs b/src/packing/sse/deinterleave_rgb.rs index 1456ae1..77117cf 100644 --- a/src/packing/sse/deinterleave_rgb.rs +++ b/src/packing/sse/deinterleave_rgb.rs @@ -34,6 +34,16 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub fn deinterleave_rgb_sse(rgb_image: &[u8], width: usize, height: usize) -> UnpackedRgbImage { + unsafe { deinterleave_rgb_sse_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn deinterleave_rgb_sse_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbImage { if rgb_image.len() != width * height * 3 { panic!( "Image bounds in deinterleave_rgb_sse is mismatched! Expected {} but got {}", diff --git a/src/packing/sse/deinterleave_rgba.rs b/src/packing/sse/deinterleave_rgba.rs index fbaa721..b1dc3a0 100644 --- a/src/packing/sse/deinterleave_rgba.rs +++ b/src/packing/sse/deinterleave_rgba.rs @@ -26,9 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::packing::sse::{ - _mm_load_deinterleave_half_rgb, _mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba, -}; +use crate::packing::sse::{_mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba}; use crate::packing::UnpackedRgbaImage; use crate::ImageSize; #[cfg(target_arch = "x86")] @@ -40,6 +38,16 @@ pub fn deinterleave_rgba_sse( rgb_image: &[u8], width: usize, height: usize, +) -> UnpackedRgbaImage { + unsafe { deinterleave_rgba_sse_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn deinterleave_rgba_sse_impl( + rgb_image: &[u8], + width: usize, + height: usize, ) -> UnpackedRgbaImage { if rgb_image.len() != width * height * 4 { panic!( @@ -55,7 +63,7 @@ pub fn deinterleave_rgba_sse( let mut b_dst: &mut [u8] = unpacked_image.b_channel.as_mut_slice(); let mut a_dst: &mut [u8] = unpacked_image.a_channel.as_mut_slice(); - let src_stride = width * 3; + let src_stride = width * 4; let mut src = rgb_image; unsafe { diff --git a/src/packing/sse/mod.rs b/src/packing/sse/mod.rs index cda014a..1364626 100644 --- a/src/packing/sse/mod.rs +++ b/src/packing/sse/mod.rs @@ -28,8 +28,13 @@ */ mod deinterleave_rgb; mod deinterleave_rgba; +mod pack_rgb; +mod pack_rgba; pub mod v_load; +mod v_store; pub use deinterleave_rgb::deinterleave_rgb_sse; pub use deinterleave_rgba::deinterleave_rgba_sse; +pub use pack_rgb::pack_rgb_sse; +pub use pack_rgba::pack_rgba_sse; pub use v_load::*; diff --git a/src/packing/sse/pack_rgb.rs b/src/packing/sse/pack_rgb.rs new file mode 100644 index 0000000..99ed15d --- /dev/null +++ b/src/packing/sse/pack_rgb.rs @@ -0,0 +1,111 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+use crate::packing::sse::v_store::{_mm_store_interleaved_rgb, _mm_store_rgb_half_u8};
+use crate::packing::UnpackedRgbImage;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub fn pack_rgb_sse(
+    unpacked_rgb_image: &UnpackedRgbImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    unsafe {
+        pack_rgb_sse_impl(unpacked_rgb_image, dst_image, width, height);
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn pack_rgb_sse_impl(
+    unpacked_rgb_image: &UnpackedRgbImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    if dst_image.len() != width * height * 3 {
+        panic!(
+            "Image bounds in pack_rgb_sse is mismatched! Expected {} but got {}",
+            width * height * 3,
+            dst_image.len()
+        );
+    }
+
+    let mut r_src: &[u8] = unpacked_rgb_image.r_channel.as_slice();
+    let mut g_src: &[u8] = unpacked_rgb_image.g_channel.as_slice();
+    let mut b_src: &[u8] = unpacked_rgb_image.b_channel.as_slice();
+
+    let src_stride = width * 3;
+
+    let mut dst = dst_image;
+    unsafe {
+        for _ in 0..height {
+            let mut _cx = 0usize;
+
+            while _cx + 16 < width {
+                let px = _cx * 3;
+                let set = (
+                    _mm_loadu_si128(r_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(g_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(b_src.as_ptr().add(_cx) as *const __m128i),
+                );
+                _mm_store_interleaved_rgb(dst.as_mut_ptr().add(px), set.0, set.1, set.2);
+                _cx += 16;
+            }
+
+            while _cx + 8 < width {
+                let px = _cx * 3;
+                let set = (
+                    _mm_loadu_si64(r_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(g_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(b_src.as_ptr().add(_cx)),
+                );
+                _mm_store_rgb_half_u8(dst.as_mut_ptr().add(px), set.0, set.1, set.2);
+                _cx += 8;
+            }
+
+            while _cx < width {
+                let px = _cx * 3;
+                let dst_align = dst.get_unchecked_mut(px..);
+                *dst_align.get_unchecked_mut(0) = *r_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(1) = *g_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(2) = *b_src.get_unchecked(_cx);
+                _cx += 1;
+            }
+
+            dst = dst.get_unchecked_mut(src_stride..);
+            r_src = r_src.get_unchecked(width..);
+            g_src = g_src.get_unchecked(width..);
+            b_src = b_src.get_unchecked(width..);
+        }
+    }
+}
diff --git a/src/packing/sse/pack_rgba.rs b/src/packing/sse/pack_rgba.rs
new file mode 100644
index 0000000..809468a
--- /dev/null
+++ b/src/packing/sse/pack_rgba.rs
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::packing::sse::v_store::{_mm_store_interleaved_half_rgba, _mm_store_interleaved_rgba};
+use crate::packing::UnpackedRgbaImage;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub fn pack_rgba_sse(
+    unpacked_rgb_image: &UnpackedRgbaImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    unsafe {
+        pack_rgba_sse_impl(unpacked_rgb_image, dst_image, width, height);
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn pack_rgba_sse_impl(
+    unpacked_rgb_image: &UnpackedRgbaImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    if dst_image.len() != width * height * 4 {
+        panic!(
+            "Image bounds in pack_rgba_sse is mismatched! Expected {} but got {}",
+            width * height * 4,
+            dst_image.len()
+        );
+    }
+
+    let mut r_src: &[u8] = unpacked_rgb_image.r_channel.as_slice();
+    let mut g_src: &[u8] = unpacked_rgb_image.g_channel.as_slice();
+    let mut b_src: &[u8] = unpacked_rgb_image.b_channel.as_slice();
+    let mut a_src: &[u8] = unpacked_rgb_image.a_channel.as_slice();
+
+    let src_stride = width * 4;
+
+    let mut dst = dst_image;
+    unsafe {
+        for _ in 0..height {
+            let mut _cx = 0usize;
+
+            while _cx + 16 < width {
+                let px = _cx * 4;
+                let set = (
+                    _mm_loadu_si128(r_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(g_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(b_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(a_src.as_ptr().add(_cx) as *const __m128i),
+                );
+                _mm_store_interleaved_rgba(dst.as_mut_ptr().add(px), set.0, set.1, set.2, set.3);
+                _cx += 16;
+            }
+
+            while _cx + 8 < width {
+                let px = _cx * 4;
+                let set = (
+                    _mm_loadu_si64(r_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(g_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(b_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(a_src.as_ptr().add(_cx)),
+                );
+                _mm_store_interleaved_half_rgba(
+                    dst.as_mut_ptr().add(px),
+                    set.0,
+                    set.1,
+                    set.2,
+                    set.3,
+                );
+                _cx += 8;
+            }
+
+            while _cx < width {
+                let px = _cx * 4;
+                let dst_align = dst.get_unchecked_mut(px..);
+                *dst_align.get_unchecked_mut(0) = *r_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(1) = *g_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(2) = *b_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(3) = *a_src.get_unchecked(_cx);
+                _cx += 1;
+            }
+
+            dst = dst.get_unchecked_mut(src_stride..);
+            r_src = r_src.get_unchecked(width..);
+            g_src = g_src.get_unchecked(width..);
+            b_src = b_src.get_unchecked(width..);
+            a_src = a_src.get_unchecked(width..);
+        }
+    }
+}
diff --git a/src/packing/sse/v_store.rs b/src/packing/sse/v_store.rs
new file mode 100644
index 0000000..45edbe5
--- /dev/null
+++ b/src/packing/sse/v_store.rs
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_interleave_rgba( + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) -> (__m128i, __m128i, __m128i, __m128i) { + let rg_lo = _mm_unpacklo_epi8(r, g); + let rg_hi = _mm_unpackhi_epi8(r, g); + let ba_lo = _mm_unpacklo_epi8(b, a); + let ba_hi = _mm_unpackhi_epi8(b, a); + + let rgba_0_lo = _mm_unpacklo_epi16(rg_lo, ba_lo); + let rgba_0_hi = _mm_unpackhi_epi16(rg_lo, ba_lo); + let rgba_1_lo = _mm_unpacklo_epi16(rg_hi, ba_hi); + let rgba_1_hi = _mm_unpackhi_epi16(rg_hi, ba_hi); + (rgba_0_lo, rgba_0_hi, rgba_1_lo, rgba_1_hi) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_rgba( + ptr: *mut u8, + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) { + let (row1, row2, row3, row4) = _mm_interleave_rgba(r, g, b, a); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, row3); + _mm_storeu_si128(ptr.add(48) as *mut __m128i, row4); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_half_rgba( + ptr: *mut u8, + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) { + let (row1, row2, _, _) = _mm_interleave_rgba(r, g, b, a); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_interleave_rgb( + r: __m128i, + g: __m128i, + b: __m128i, +) -> (__m128i, __m128i, __m128i) { + let sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + let sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + let sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + let a0 = _mm_shuffle_epi8(r, sh_a); + let b0 = _mm_shuffle_epi8(g, sh_b); + let c0 = _mm_shuffle_epi8(b, sh_c); + + let m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + let m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 
0); + let v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); + let v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); + let v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); + (v0, v1, v2) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_rgb(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) { + let (row1, row2, row3) = _mm_interleave_rgb(r, g, b); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, row3); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_rgb_half_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) { + let (v0, v1, _) = _mm_interleave_rgb(r, g, b); + _mm_storeu_si128(ptr as *mut __m128i, v0); + std::ptr::copy_nonoverlapping(&v1 as *const _ as *const u8, ptr.add(16), 8); +} diff --git a/src/packing/unpack_rgb.rs b/src/packing/unpack_rgb.rs index 8c44183..d8a833d 100644 --- a/src/packing/unpack_rgb.rs +++ b/src/packing/unpack_rgb.rs @@ -26,6 +26,8 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::avx::deinterleave_rgb_avx; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::packing::neon::deinterleave_rgb_neon; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -77,6 +79,9 @@ pub fn unpack_rgb(rgb_image: &[u8], image_size: ImageSize) -> UnpackedRgbImage UnpackedRgbaImage if std::arch::is_x86_feature_detected!("sse4.1") { _dispatcher = deinterleave_rgba_sse; } + if std::arch::is_x86_feature_detected!("avx2") { + _dispatcher = deinterleave_rgba_avx; + } } _dispatcher(rgb_image, image_size.width, image_size.height) }
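
The SSE packers above are the inverse of the SSE deinterleavers, so a planar round trip should reproduce the interleaved input byte for byte. The sketch below is illustrative only and is not part of this patch; it assumes the sse4.1 runtime guard and the crate-internal exports deinterleave_rgb_sse / pack_rgb_sse from crate::packing::sse shown above, and the helper name is hypothetical.

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn rgb_pack_roundtrip_sketch(width: usize, height: usize) {
    // Hypothetical check, not part of this patch: round-trips an interleaved
    // RGB buffer through the planar representation using the SSE paths above.
    if !std::arch::is_x86_feature_detected!("sse4.1") {
        return; // nothing to exercise without SSE4.1
    }
    // Deterministic interleaved test pattern, width * height * 3 bytes.
    let src: Vec<u8> = (0..width * height * 3).map(|i| (i % 251) as u8).collect();
    // Split into planar R/G/B planes, then re-interleave with pack_rgb_sse.
    let planar = crate::packing::sse::deinterleave_rgb_sse(&src, width, height);
    let mut dst = vec![0u8; width * height * 3];
    crate::packing::sse::pack_rgb_sse(&planar, &mut dst, width, height);
    assert_eq!(src, dst, "SSE RGB pack/deinterleave round-trip mismatch");
}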