From 841ab4ee0cd9a7698e5442abe25641cc09ccfc7e Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 25 Sep 2024 18:25:13 +0100 Subject: [PATCH] Test, improvements on SSE, AVX --- Cargo.lock | 11 -- README.md | 16 +- app/Cargo.toml | 2 +- app/benches/dilation/main.rs | 22 +-- app/src/main.rs | 13 +- src/filter.rs | 68 +++++++- src/morph_base.rs | 2 +- src/morph_rgb.rs | 9 +- src/morph_rgba.rs | 9 +- src/op.rs | 2 +- src/op_f32.rs | 2 +- src/op_impl.rs | 2 +- src/ops/avx/mod.rs | 3 + src/ops/avx/morph_op.rs | 230 ++++++++++++++++++++++++++ src/ops/mod.rs | 2 + src/ops/neon/mod.rs | 2 +- src/ops/sse/hminmax.rs | 73 -------- src/ops/sse/mod.rs | 8 +- src/ops/sse/morph_op.rs | 1 + src/ops/sse/morph_op_f32.rs | 187 +++++++++++++++++++++ src/ops/sse/morph_op_u16.rs | 186 +++++++++++++++++++++ src/packing/avx/deinterleave_rgb.rs | 128 ++++++++++++++ src/packing/avx/deinterleave_rgba.rs | 142 ++++++++++++++++ src/packing/avx/mod.rs | 5 + src/packing/avx/v_load.rs | 21 +++ src/packing/mod.rs | 10 +- src/packing/neon/deinterleave_rgba.rs | 2 +- src/packing/pack_gray_alpha.rs | 2 +- src/packing/pack_rgb.rs | 8 + src/packing/pack_rgba.rs | 11 +- src/packing/sse/deinterleave_rgb.rs | 10 ++ src/packing/sse/deinterleave_rgba.rs | 16 +- src/packing/sse/mod.rs | 5 + src/packing/sse/pack_rgb.rs | 111 +++++++++++++ src/packing/sse/pack_rgba.rs | 122 ++++++++++++++ src/packing/sse/v_store.rs | 121 ++++++++++++++ src/packing/unpack_rgb.rs | 5 + src/packing/unpack_rgba.rs | 34 +--- 38 files changed, 1418 insertions(+), 185 deletions(-) create mode 100644 src/ops/avx/mod.rs create mode 100644 src/ops/avx/morph_op.rs delete mode 100644 src/ops/sse/hminmax.rs create mode 100644 src/ops/sse/morph_op_f32.rs create mode 100644 src/ops/sse/morph_op_u16.rs create mode 100644 src/packing/avx/deinterleave_rgb.rs create mode 100644 src/packing/avx/deinterleave_rgba.rs create mode 100644 src/packing/sse/pack_rgb.rs create mode 100644 src/packing/sse/pack_rgba.rs create mode 100644 src/packing/sse/v_store.rs diff --git a/Cargo.lock b/Cargo.lock index a7b79db..d0cbfc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,7 +261,6 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", ] [[package]] @@ -679,16 +678,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "libloading" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" -dependencies = [ - "cfg-if", - "windows-targets", -] - [[package]] name = "libm" version = "0.2.8" diff --git a/README.md b/README.md index 2c31e10..fb245b9 100644 --- a/README.md +++ b/README.md @@ -50,17 +50,17 @@ M3 Pro, NEON dilation RGBA image 2731x4096 with specified kernel size SSE dilation RGB image 2731x4096 with specified kernel size -| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | -|--------|---------|----------|----------|----------|-------| -| FM | 84.19ms | 186.53ms | 254.70ms | 673.45ms | 1.37s | -| OpenCV | 28.61ms | 62.43ms | 114.80ms | 428.87ms | 1.16s | +| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | +|--------|---------|---------|----------|----------|----------| +| FM | 30.71ms | 34.87ms | 39.93ms | 81.56ms | 149.37ms | +| OpenCV | 27.36ms | 63.05ms | 112.54ms | 419.40ms | 1.08s | SSE dilation RGBA image 2731x4096 with specified kernel size -| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | -|--------|----------|----------|----------|----------|-------| -| FM | 109.37ms | 229.11ms | 329.31ms | 
981.48ms | 2.05s | -| OpenCV | 39.20ms | 76.09ms | 149.12ms | 569.36ms | 1.33s | +| SE | 9x9 | 15x15 | 21x21 | 41x41 | 61x61 | +|--------|---------|---------|----------|----------|----------| +| FM | 45.03ms | 49.03ms | 56.40ms | 114.72ms | 206.05ms | +| OpenCV | 35.50ms | 79.60ms | 147.32ms | 556.56ms | 1.33s | This project is licensed under either of diff --git a/app/Cargo.toml b/app/Cargo.toml index bfbe51f..9da90f9 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" image = "0.25.2" fast_morphology = {path = "../"} imageproc = "0.25.0" -opencv = {version = "0.93.0", features = ["imgproc", "clang-runtime"]} +opencv = {version = "0.93.0", features = ["imgproc"]} [dev-dependencies] criterion = {version = "0.5.1", features = ["html_reports"]} diff --git a/app/benches/dilation/main.rs b/app/benches/dilation/main.rs index bfa901b..f41276d 100644 --- a/app/benches/dilation/main.rs +++ b/app/benches/dilation/main.rs @@ -316,19 +316,19 @@ pub fn criterion_benchmark(c: &mut Criterion) { opencv::core::set_use_ipp(false).expect("Failed to disable IPP"); opencv::core::set_use_optimized(false).expect("Failed to disable opts"); - // exec_bench_rgb(c, 4); - // exec_bench_rgb(c, 7); - // exec_bench_rgb(c, 10); - // exec_bench_rgb(c, 20); - // exec_bench_rgb(c, 30); + exec_bench_rgb(c, 4); + exec_bench_rgb(c, 7); + exec_bench_rgb(c, 10); + exec_bench_rgb(c, 20); + exec_bench_rgb(c, 30); // - exec_bench_rgba(c, 4); - exec_bench_rgba(c, 7); - exec_bench_rgba(c, 10); - exec_bench_rgba(c, 20); - exec_bench_rgba(c, 30); + // exec_bench_rgba(c, 4); + // exec_bench_rgba(c, 7); + // exec_bench_rgba(c, 10); + // exec_bench_rgba(c, 20); + // exec_bench_rgba(c, 30); - exec_bench_gray(c, 4); + // exec_bench_gray(c, 4); // exec_bench_gray(c, 7); // exec_bench_gray(c, 10); // exec_bench_gray(c, 20); diff --git a/app/src/main.rs b/app/src/main.rs index b6ed134..ff1c4cc 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,10 +1,9 @@ use fast_morphology::{ - dilate, dilate_rgb, BorderMode, ImageSize, KernelShape, MorphologyThreadingPolicy, + dilate, dilate_rgb, dilate_rgba, BorderMode, ImageSize, KernelShape, MorphologyThreadingPolicy, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use opencv::core::{ - Mat, MatTrait, MatTraitConstManual, Point, Scalar, - BORDER_REPLICATE, CV_8U, CV_8UC3, + Mat, MatTrait, MatTraitConstManual, Point, Scalar, BORDER_REPLICATE, CV_8U, CV_8UC3, }; use opencv::imgproc; use std::time::Instant; @@ -164,11 +163,11 @@ fn main() { } let rgba_image = transient_rgba.as_bytes(); - let mut dst = vec![0u8; saved_origin.len()]; + let mut dst = vec![0u8; rgba_image.len()]; let exec_time = Instant::now(); - dilate_rgb( - &saved_origin, + dilate_rgba( + &rgba_image, &mut dst, image_size, &structuring_element, @@ -237,7 +236,7 @@ fn main() { &dst, dimensions.0, dimensions.1, - image::ColorType::Rgb8, + image::ColorType::Rgba8, ) .unwrap(); diff --git a/src/filter.rs b/src/filter.rs index a82d5d9..079b576 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -28,15 +28,17 @@ */ use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; use crate::op_type::MorphOp; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::ops::avx::MorphOpFilterAvx2DRow; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::ops::neon::MorphOpFilterNeon2DRow; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::ops::sse::{MorphOpFilterSse2DRow}; +use 
crate::ops::sse::{MorphOpFilterSse2DRow, MorphOpFilterSse2DRowF32, MorphOpFilterSse2DRowU16}; use crate::ops::MorphFilterFlat2DRow; use crate::unsafe_slice::UnsafeSlice; use crate::ImageSize; -use crate::morph_base::MorphNativeOp; pub struct MorthFilterFlat2DRow where @@ -78,9 +80,34 @@ where } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - _result = Box::new( - MorphOpFilterSse2DRow::<{ MorphOp::Dilate as u8 }>::default(), - ); + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRow::< + { MorphOp::Dilate as u8 }, + >::default()); + } + if std::arch::is_x86_feature_detected!("avx2") { + _result = Box::new(MorphOpFilterAvx2DRow::< + { MorphOp::Dilate as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "u16" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowU16::< + { MorphOp::Dilate as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "f32" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowF32::< + { MorphOp::Dilate as u8 }, + >::default()); + } } } _result @@ -97,9 +124,34 @@ where } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - _result = Box::new( - MorphOpFilterSse2DRow::<{ MorphOp::Erode as u8 }>::default(), - ); + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new( + MorphOpFilterSse2DRow::<{ MorphOp::Erode as u8 }>::default(), + ); + } + if std::arch::is_x86_feature_detected!("avx2") { + _result = Box::new( + MorphOpFilterAvx2DRow::<{ MorphOp::Erode as u8 }>::default(), + ); + } + } + } else if std::any::type_name::() == "u16" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowU16::< + { MorphOp::Erode as u8 }, + >::default()); + } + } + } else if std::any::type_name::() == "f32" { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _result = Box::new(MorphOpFilterSse2DRowF32::< + { MorphOp::Erode as u8 }, + >::default()); + } } } _result diff --git a/src/morph_base.rs b/src/morph_base.rs index 4463260..0f8eb9c 100644 --- a/src/morph_base.rs +++ b/src/morph_base.rs @@ -60,4 +60,4 @@ impl MorphNativeOp for f32 { MorphOp::Erode => (*self).min(other), } } -} \ No newline at end of file +} diff --git a/src/morph_rgb.rs b/src/morph_rgb.rs index cbf40db..c5ba9fb 100644 --- a/src/morph_rgb.rs +++ b/src/morph_rgb.rs @@ -41,14 +41,7 @@ pub(crate) unsafe fn make_morphology_rgb( threading_policy: MorphologyThreadingPolicy, ) -> Result<(), String> where - T: RgbPackable - + Copy - + 'static - + Sync - + Send - + Clone - + Default - + MorphNativeOp, + T: RgbPackable + Copy + 'static + Sync + Send + Clone + Default + MorphNativeOp, { let unpacked = T::unpack(src, image_size); let mut dst_unpacked = UnpackedRgbImage::alloc(image_size); diff --git a/src/morph_rgba.rs b/src/morph_rgba.rs index a618b0f..11883a5 100644 --- a/src/morph_rgba.rs +++ b/src/morph_rgba.rs @@ -41,14 +41,7 @@ pub(crate) unsafe fn make_morphology_rgba( threading_policy: MorphologyThreadingPolicy, ) -> Result<(), String> where - T: RgbaPackable - + Default - + Copy - + Clone - + Send - + Sync - + 'static - + MorphNativeOp, + T: RgbaPackable + Default + Copy + Clone + Send + Sync + 'static + MorphNativeOp, { let 
unpacked = T::unpack(src, image_size); let mut dst_unpacked = UnpackedRgbaImage::alloc(image_size); diff --git a/src/op.rs b/src/op.rs index 70bb4b1..eb258db 100644 --- a/src/op.rs +++ b/src/op.rs @@ -27,13 +27,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::border_mode::BorderMode; +use crate::morph_gray_alpha::make_morphology_gray_alpha; use crate::morph_rgb::make_morphology_rgb; use crate::morph_rgba::make_morphology_rgba; use crate::op_impl::make_morphology; use crate::op_type::MorphOp; use crate::structuring_element::KernelShape; use crate::{ImageSize, MorphologyThreadingPolicy}; -use crate::morph_gray_alpha::make_morphology_gray_alpha; /// Dilate a gray (planar) image /// diff --git a/src/op_f32.rs b/src/op_f32.rs index 5e64740..ec42037 100644 --- a/src/op_f32.rs +++ b/src/op_f32.rs @@ -27,13 +27,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::border_mode::BorderMode; +use crate::morph_gray_alpha::make_morphology_gray_alpha; use crate::morph_rgb::make_morphology_rgb; use crate::morph_rgba::make_morphology_rgba; use crate::op_impl::make_morphology; use crate::op_type::MorphOp; use crate::structuring_element::KernelShape; use crate::{ImageSize, MorphologyThreadingPolicy}; -use crate::morph_gray_alpha::make_morphology_gray_alpha; /// Dilate a gray (planar) stored in u16 image /// diff --git a/src/op_impl.rs b/src/op_impl.rs index 78cecfa..d45596d 100644 --- a/src/op_impl.rs +++ b/src/op_impl.rs @@ -30,13 +30,13 @@ use crate::arena::make_arena; use crate::border_mode::BorderMode; use crate::filter::MorthFilterFlat2DRow; use crate::filter_op_declare::MorthOpFilterFlat2DRow; +use crate::morph_base::MorphNativeOp; use crate::op_type::MorphOp; use crate::se_scan::scan_se; use crate::structuring_element::KernelShape; use crate::unsafe_slice::UnsafeSlice; use crate::{ImageSize, MorphologyThreadingPolicy}; use std::sync::Arc; -use crate::morph_base::MorphNativeOp; pub(crate) unsafe fn make_morphology( src: &[T], diff --git a/src/ops/avx/mod.rs b/src/ops/avx/mod.rs new file mode 100644 index 0000000..8102778 --- /dev/null +++ b/src/ops/avx/mod.rs @@ -0,0 +1,3 @@ +mod morph_op; + +pub use morph_op::MorphOpFilterAvx2DRow; diff --git a/src/ops/avx/morph_op.rs b/src/ops/avx/morph_op.rs new file mode 100644 index 0000000..d4e16dc --- /dev/null +++ b/src/ops/avx/morph_op.rs @@ -0,0 +1,230 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterAvx2DRow {} + +impl Default for MorphOpFilterAvx2DRow { + fn default() -> Self { + MorphOpFilterAvx2DRow {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterAvx2DRow +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "avx2")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_epu8, + MorphOp::Erode => _mm_min_epu8, + }; + + let decision_avx = match op_type { + MorphOp::Dilate => _mm256_max_epu8, + MorphOp::Erode => _mm256_min_epu8, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 128 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + let mut row1 = _mm256_loadu_si256(ptr0.add(32) as *const __m256i); + let mut row2 = _mm256_loadu_si256(ptr0.add(64) as *const __m256i); + let mut row3 = _mm256_loadu_si256(ptr0.add(96) as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + let new_row1 = _mm256_loadu_si256(ptr_d.add(32) as *const __m256i); + let new_row2 = _mm256_loadu_si256(ptr_d.add(64) as *const __m256i); + let new_row3 = _mm256_loadu_si256(ptr_d.add(96) as *const __m256i); + row0 = decision_avx(row0, new_row0); + row1 = decision_avx(row1, new_row1); + row2 = decision_avx(row2, new_row2); + row3 = decision_avx(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + _mm256_storeu_si256(v_dst.add(32) as *mut __m256i, row1); + _mm256_storeu_si256(v_dst.add(64) as *mut __m256i, row2); + _mm256_storeu_si256(v_dst.add(96) as *mut __m256i, row3); + + _cx += 128; + } + + while _cx + 64 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + 
let mut row1 = _mm256_loadu_si256(ptr0.add(32) as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + let new_row1 = _mm256_loadu_si256(ptr_d.add(32) as *const __m256i); + row0 = decision_avx(row0, new_row0); + row1 = decision_avx(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + _mm256_storeu_si256(v_dst.add(32) as *mut __m256i, row1); + + _cx += 64; + } + + while _cx + 32 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm256_loadu_si256(ptr0 as *const __m256i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm256_loadu_si256(ptr_d as *const __m256i); + row0 = decision_avx(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm256_storeu_si256(v_dst as *mut __m256i, row0); + + _cx += 32; + } + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + _mm_storeu_si128(v_dst as *mut __m128i, row0); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si64(ptr0); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si64(ptr_d); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + std::ptr::copy_nonoverlapping(&row0 as *const _ as *const u8, v_dst, 8); + + _cx += 8; + } + + for x in (_cx..width.saturating_sub(4)).step_by(4) { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + let mut k1 = *(*offsets.get_unchecked(0)).get_unchecked(x + 1); + let mut k2 = *(*offsets.get_unchecked(0)).get_unchecked(x + 2); + let mut k3 = *(*offsets.get_unchecked(0)).get_unchecked(x + 3); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + k1 = k1.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 1)); + k2 = k2.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 2)); + k3 = k3.op::(*(*offsets.get_unchecked(i)).get_unchecked(x + 3)); + } + + let dst_offset = y * stride + x; + + dst.write(dst_offset, k0); + dst.write(dst_offset + 1, k1); + dst.write(dst_offset + 2, k2); + dst.write(dst_offset + 3, k3); + _cx = x; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/ops/mod.rs b/src/ops/mod.rs index 65800fc..7fd6373 100644 --- a/src/ops/mod.rs +++ b/src/ops/mod.rs @@ -26,6 +26,8 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub mod avx; mod morph_row_op; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon; diff --git a/src/ops/neon/mod.rs b/src/ops/neon/mod.rs index b1f550b..8deee91 100644 --- a/src/ops/neon/mod.rs +++ b/src/ops/neon/mod.rs @@ -28,4 +28,4 @@ */ mod morph_op; -pub use morph_op::MorphOpFilterNeon2DRow; \ No newline at end of file +pub use morph_op::MorphOpFilterNeon2DRow; diff --git a/src/ops/sse/hminmax.rs b/src/ops/sse/hminmax.rs deleted file mode 100644 index 00ab939..0000000 --- a/src/ops/sse/hminmax.rs +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_hmax_epu8(v: __m128i) -> u8 { - let mut vmax = v; - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<1>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<2>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<4>(vmax, vmax)); - vmax = _mm_max_epu8(vmax, _mm_alignr_epi8::<8>(vmax, vmax)); - _mm_extract_epi8::<0>(vmax) as u8 -} - -#[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_hmin_epu8(v: __m128i) -> u8 { - let mut vmax = v; - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<1>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<2>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<4>(vmax, vmax)); - vmax = _mm_min_epu8(vmax, _mm_alignr_epi8::<8>(vmax, vmax)); - _mm_extract_epi8::<0>(vmax) as u8 -} - -#[cfg(test)] -mod tests { - use crate::ops::sse::hminmax::{_mm_hmax_epu8, _mm_hmin_epu8}; - use std::arch::x86_64::{__m128i, _mm_loadu_si128}; - - #[test] - fn test_hmax() { - unsafe { - let values: [u8; 16] = [1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - let row = _mm_loadu_si128(values.as_ptr() as *const __m128i); - let max = _mm_hmax_epu8(row); - let min = _mm_hmin_epu8(row); - assert_eq!(max, 15); - assert_eq!(min, 0); - } - } -} diff --git a/src/ops/sse/mod.rs b/src/ops/sse/mod.rs index 99526c9..6a621d5 100644 --- a/src/ops/sse/mod.rs +++ b/src/ops/sse/mod.rs @@ -26,10 +26,10 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -mod hminmax; mod morph_op; -mod op; +mod morph_op_f32; +mod morph_op_u16; -pub use hminmax::{_mm_hmax_epu8, _mm_hmin_epu8}; pub use morph_op::MorphOpFilterSse2DRow; -pub use op::{fast_morph_op_1d_sse, fast_morph_op_3d_sse, fast_morph_op_4d_sse}; +pub use morph_op_f32::MorphOpFilterSse2DRowF32; +pub use morph_op_u16::MorphOpFilterSse2DRowU16; diff --git a/src/ops/sse/morph_op.rs b/src/ops/sse/morph_op.rs index 044b6f6..797c6ef 100644 --- a/src/ops/sse/morph_op.rs +++ b/src/ops/sse/morph_op.rs @@ -50,6 +50,7 @@ impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRow, { + #[target_feature(enable = "sse4.1")] unsafe fn dispatch_row( &self, arena: &Arena, diff --git a/src/ops/sse/morph_op_f32.rs b/src/ops/sse/morph_op_f32.rs new file mode 100644 index 0000000..15e92be --- /dev/null +++ b/src/ops/sse/morph_op_f32.rs @@ -0,0 +1,187 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterSse2DRowF32 {} + +impl Default for MorphOpFilterSse2DRowF32 { + fn default() -> Self { + MorphOpFilterSse2DRowF32 {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRowF32 +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "sse4.1")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_ps, + MorphOp::Erode => _mm_min_ps, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + let mut row1 = _mm_loadu_ps(ptr0.add(4)); + let mut row2 = _mm_loadu_ps(ptr0.add(8)); + let mut row3 = _mm_loadu_ps(ptr0.add(12)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); + let new_row1 = _mm_loadu_ps(ptr_d.add(4)); + let new_row2 = _mm_loadu_ps(ptr_d.add(8)); + let new_row3 = _mm_loadu_ps(ptr_d.add(12)); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + row2 = decision(row2, new_row2); + row3 = decision(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + + _mm_storeu_ps(v_dst, row0); + _mm_storeu_ps(v_dst.add(4), row1); + _mm_storeu_ps(v_dst.add(8), row2); + _mm_storeu_ps(v_dst.add(12), row3); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + let mut row1 = _mm_loadu_ps(ptr0.add(4)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); 
+ let new_row1 = _mm_loadu_ps(ptr_d.add(4)); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + + _mm_storeu_ps(v_dst, row0); + _mm_storeu_ps(v_dst.add(8), row1); + + _cx += 8; + } + + while _cx + 4 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_ps(ptr0); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_ps(ptr_d); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut f32; + _mm_storeu_ps(v_dst, row0); + + _cx += 4; + } + + while _cx + 2 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_castsi128_ps(_mm_loadu_si64(ptr0 as *const u8)); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_castsi128_ps(_mm_loadu_si64(ptr_d as *const u8)); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + let v0 = _mm_castps_si128(row0); + std::ptr::copy_nonoverlapping(&v0 as *const _ as *const u8, v_dst, 8); + + _cx += 2; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/ops/sse/morph_op_u16.rs b/src/ops/sse/morph_op_u16.rs new file mode 100644 index 0000000..8e0fdf6 --- /dev/null +++ b/src/ops/sse/morph_op_u16.rs @@ -0,0 +1,186 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_op_declare::{Arena, MorthOpFilterFlat2DRow}; +use crate::flat_se::AnalyzedSe; +use crate::morph_base::MorphNativeOp; +use crate::op_type::MorphOp; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[derive(Clone)] +pub struct MorphOpFilterSse2DRowU16 {} + +impl Default for MorphOpFilterSse2DRowU16 { + fn default() -> Self { + MorphOpFilterSse2DRowU16 {} + } +} + +impl MorthOpFilterFlat2DRow for MorphOpFilterSse2DRowU16 +where + T: Copy + 'static + MorphNativeOp, +{ + #[target_feature(enable = "sse4.1")] + unsafe fn dispatch_row( + &self, + arena: &Arena, + dst: &UnsafeSlice, + image_size: ImageSize, + analyzed_se: AnalyzedSe, + y: usize, + ) { + let width = image_size.width; + + let op_type: MorphOp = OP_TYPE.into(); + let stride = width; + + let decision = match op_type { + MorphOp::Dilate => _mm_max_epu16, + MorphOp::Erode => _mm_min_epu16, + }; + + let src: &Vec = std::mem::transmute(&arena.arena); + let dst: &UnsafeSlice = std::mem::transmute(dst); + + let dx = arena.pad_w as i32; + let dy = arena.pad_h as i32; + + let arena_stride = arena.width; + + let offsets = analyzed_se + .left_front + .element_offsets + .iter() + .map(|&x| { + src.get_unchecked( + ((x.y + dy + y as i32) as usize * arena_stride + (x.x + dx) as usize).., + ) + }) + .collect::>(); + + let length = analyzed_se.left_front.element_offsets.iter().len(); + + let mut _cx = 0usize; + + while _cx + 32 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + let mut row1 = _mm_loadu_si128(ptr0.add(8) as *const __m128i); + let mut row2 = _mm_loadu_si128(ptr0.add(16) as *const __m128i); + let mut row3 = _mm_loadu_si128(ptr0.add(24) as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + let new_row1 = _mm_loadu_si128(ptr_d.add(8) as *const __m128i); + let new_row2 = _mm_loadu_si128(ptr_d.add(16) as *const __m128i); + let new_row3 = _mm_loadu_si128(ptr_d.add(24) as *const __m128i); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + row2 = decision(row2, new_row2); + row3 = decision(row3, new_row3); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm_storeu_si128(v_dst as *mut __m128i, row0); + _mm_storeu_si128(v_dst.add(8) as *mut __m128i, row1); + _mm_storeu_si128(v_dst.add(16) as *mut __m128i, row2); + _mm_storeu_si128(v_dst.add(24) as *mut __m128i, row3); + + _cx += 32; + } + + while _cx + 16 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const __m128i); + let mut row1 = _mm_loadu_si128(ptr0.add(8) as *const __m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + let new_row1 = _mm_loadu_si128(ptr_d.add(8) as *const __m128i); + row0 = decision(row0, new_row0); + row1 = decision(row1, new_row1); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + + _mm_storeu_si128(v_dst as *mut __m128i, row0); + _mm_storeu_si128(v_dst.add(8) as *mut __m128i, row1); + + _cx += 16; + } + + while _cx + 8 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si128(ptr0 as *const 
__m128i); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si128(ptr_d as *const __m128i); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + _mm_storeu_si128(v_dst as *mut __m128i, row0); + + _cx += 8; + } + + while _cx + 4 < width { + let ptr0 = (*offsets.get_unchecked(0).get_unchecked(_cx..)).as_ptr(); + let mut row0 = _mm_loadu_si64(ptr0 as *const u8); + + for i in 1..length { + let ptr_d = (*offsets.get_unchecked(i)).get_unchecked(_cx..).as_ptr(); + let new_row0 = _mm_loadu_si64(ptr_d as *const u8); + row0 = decision(row0, new_row0); + } + + let v_dst = dst.slice.as_ptr().add(y * stride + _cx) as *mut u8; + std::ptr::copy_nonoverlapping(&row0 as *const _ as *const u8, v_dst, 8); + + _cx += 4; + } + + for x in _cx..width { + let mut k0 = *(*offsets.get_unchecked(0)).get_unchecked(x); + + for i in 1..length { + k0 = k0.op::(*(*offsets.get_unchecked(i)).get_unchecked(x)); + } + dst.write(y * stride + x, k0); + } + } +} diff --git a/src/packing/avx/deinterleave_rgb.rs b/src/packing/avx/deinterleave_rgb.rs new file mode 100644 index 0000000..90de228 --- /dev/null +++ b/src/packing/avx/deinterleave_rgb.rs @@ -0,0 +1,128 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::packing::avx::v_load::_mm256_load_deinterleave_rgb; +use crate::packing::sse::{_mm_load_deinterleave_half_rgb, _mm_load_deinterleave_rgb}; +use crate::packing::UnpackedRgbImage; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub fn deinterleave_rgb_avx(rgb_image: &[u8], width: usize, height: usize) -> UnpackedRgbImage { + unsafe { deinterleave_rgb_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn deinterleave_rgb_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbImage { + if rgb_image.len() != width * height * 3 { + panic!( + "Image bounds in deinterleave_rgb_sse is mismatched! Expected {} but got {}", + width * height * 3, + rgb_image.len() + ); + } + let mut r_chan = vec![0u8; width * height]; + let mut g_chan = vec![0u8; width * height]; + let mut b_chan = vec![0u8; width * height]; + + let mut r_dst = r_chan.as_mut_slice(); + let mut g_dst = g_chan.as_mut_slice(); + let mut b_dst = b_chan.as_mut_slice(); + + let src_stride = width * 3; + + let mut src = rgb_image; + unsafe { + for _ in 0..height { + let mut _cx = 0usize; + + while _cx + 32 < width { + let px = _cx * 3; + let pixels = _mm256_load_deinterleave_rgb(src.as_ptr().add(px)); + _mm256_storeu_si256(r_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.0); + _mm256_storeu_si256(g_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.1); + _mm256_storeu_si256(b_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.2); + _cx += 32; + } + + while _cx + 16 < width { + let px = _cx * 3; + let pixels = _mm_load_deinterleave_rgb(src.as_ptr().add(px)); + _mm_storeu_si128(r_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.0); + _mm_storeu_si128(g_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.1); + _mm_storeu_si128(b_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.2); + _cx += 16; + } + + while _cx + 8 < width { + let px = _cx * 3; + let pixels = _mm_load_deinterleave_half_rgb(src.as_ptr().add(px), 0); + let v0 = pixels.0; + std::ptr::copy_nonoverlapping( + &v0 as *const _ as *const u8, + r_dst.as_mut_ptr().add(_cx), + 8, + ); + let v1 = pixels.1; + std::ptr::copy_nonoverlapping( + &v1 as *const _ as *const u8, + g_dst.as_mut_ptr().add(_cx), + 8, + ); + let v2 = pixels.2; + std::ptr::copy_nonoverlapping( + &v2 as *const _ as *const u8, + b_dst.as_mut_ptr().add(_cx), + 8, + ); + _cx += 8; + } + + while _cx < width { + let px = _cx * 3; + let src_align = src.get_unchecked(px..); + *r_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(0); + *g_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(1); + *b_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(2); + _cx += 1; + } + + src = src.get_unchecked(src_stride..); + r_dst = r_dst.get_unchecked_mut(width..); + g_dst = g_dst.get_unchecked_mut(width..); + b_dst = b_dst.get_unchecked_mut(width..); + } + } + UnpackedRgbImage::new(r_chan, g_chan, b_chan) +} diff --git a/src/packing/avx/deinterleave_rgba.rs b/src/packing/avx/deinterleave_rgba.rs new file mode 100644 index 0000000..9770f06 --- /dev/null +++ b/src/packing/avx/deinterleave_rgba.rs @@ -0,0 +1,142 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::packing::avx::v_load::_mm256_load_deinterleave_rgba; +use crate::packing::sse::{_mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba}; +use crate::packing::UnpackedRgbaImage; +use crate::ImageSize; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub fn deinterleave_rgba_avx( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbaImage { + unsafe { deinterleave_rgba_avx_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn deinterleave_rgba_avx_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbaImage { + if rgb_image.len() != width * height * 4 { + panic!( + "Image bounds in deinterleave_rgba_sse is mismatched! 
Expected {} but got {}", + width * height * 4, + rgb_image.len() + ); + } + let mut unpacked_image = UnpackedRgbaImage::alloc(ImageSize::new(width, height)); + + let mut r_dst: &mut [u8] = unpacked_image.r_channel.as_mut_slice(); + let mut g_dst: &mut [u8] = unpacked_image.g_channel.as_mut_slice(); + let mut b_dst: &mut [u8] = unpacked_image.b_channel.as_mut_slice(); + let mut a_dst: &mut [u8] = unpacked_image.a_channel.as_mut_slice(); + + let src_stride = width * 4; + + let mut src = rgb_image; + unsafe { + for _ in 0..height { + let mut _cx = 0usize; + + while _cx + 32 < width { + let px = _cx * 4; + let pixels = _mm256_load_deinterleave_rgba(src.as_ptr().add(px)); + _mm256_storeu_si256(r_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.0); + _mm256_storeu_si256(g_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.1); + _mm256_storeu_si256(b_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.2); + _mm256_storeu_si256(a_dst.as_mut_ptr().add(_cx) as *mut __m256i, pixels.3); + _cx += 32; + } + + while _cx + 16 < width { + let px = _cx * 4; + let pixels = _mm_load_deinterleave_rgba(src.as_ptr().add(px)); + _mm_storeu_si128(r_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.0); + _mm_storeu_si128(g_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.1); + _mm_storeu_si128(b_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.2); + _mm_storeu_si128(a_dst.as_mut_ptr().add(_cx) as *mut __m128i, pixels.3); + _cx += 16; + } + + while _cx + 8 < width { + let px = _cx * 4; + let pixels = _mm_load_deinterleave_half_rgba(src.as_ptr().add(px), 0); + let v0 = pixels.0; + std::ptr::copy_nonoverlapping( + &v0 as *const _ as *const u8, + r_dst.as_mut_ptr().add(_cx), + 8, + ); + let v1 = pixels.1; + std::ptr::copy_nonoverlapping( + &v1 as *const _ as *const u8, + g_dst.as_mut_ptr().add(_cx), + 8, + ); + let v2 = pixels.2; + std::ptr::copy_nonoverlapping( + &v2 as *const _ as *const u8, + b_dst.as_mut_ptr().add(_cx), + 8, + ); + let v3 = pixels.3; + std::ptr::copy_nonoverlapping( + &v3 as *const _ as *const u8, + a_dst.as_mut_ptr().add(_cx), + 8, + ); + _cx += 8; + } + + while _cx < width { + let px = _cx * 4; + let src_align = src.get_unchecked(px..); + *r_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(0); + *g_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(1); + *b_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(2); + *a_dst.get_unchecked_mut(_cx) = *src_align.get_unchecked(3); + _cx += 1; + } + + src = src.get_unchecked(src_stride..); + r_dst = r_dst.get_unchecked_mut(width..); + g_dst = g_dst.get_unchecked_mut(width..); + b_dst = b_dst.get_unchecked_mut(width..); + a_dst = a_dst.get_unchecked_mut(width..); + } + } + unpacked_image +} diff --git a/src/packing/avx/mod.rs b/src/packing/avx/mod.rs index e9e8546..ea4ddb5 100644 --- a/src/packing/avx/mod.rs +++ b/src/packing/avx/mod.rs @@ -26,4 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +mod deinterleave_rgb; +mod deinterleave_rgba; mod v_load; + +pub use deinterleave_rgb::deinterleave_rgb_avx; +pub use deinterleave_rgba::deinterleave_rgba_avx; diff --git a/src/packing/avx/v_load.rs b/src/packing/avx/v_load.rs index ff706e2..053cb30 100644 --- a/src/packing/avx/v_load.rs +++ b/src/packing/avx/v_load.rs @@ -145,3 +145,24 @@ pub unsafe fn _mm256_deinterleave_rgb( let r0 = _mm256_shuffle_epi8(r0, sh_r); (b0, g0, r0) } + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn _mm256_load_deinterleave_rgb(ptr: *const u8) -> (__m256i, __m256i, __m256i) { + let row0 = _mm256_loadu_si256(ptr as *const __m256i); + let row1 = _mm256_loadu_si256(ptr.add(32) as *const __m256i); + let row2 = _mm256_loadu_si256(ptr.add(64) as *const __m256i); + _mm256_deinterleave_rgb(row0, row1, row2) +} + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn _mm256_load_deinterleave_rgba( + ptr: *const u8, +) -> (__m256i, __m256i, __m256i, __m256i) { + let row0 = _mm256_loadu_si256(ptr as *const __m256i); + let row1 = _mm256_loadu_si256(ptr.add(32) as *const __m256i); + let row2 = _mm256_loadu_si256(ptr.add(64) as *const __m256i); + let row3 = _mm256_loadu_si256(ptr.add(96) as *const __m256i); + _mm256_deinterleave_rgba_epi8(row0, row1, row2, row3) +} diff --git a/src/packing/mod.rs b/src/packing/mod.rs index ee78823..1e35c6a 100644 --- a/src/packing/mod.rs +++ b/src/packing/mod.rs @@ -30,23 +30,23 @@ pub mod avx; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] mod neon; +mod pack_gray_alpha; mod pack_rgb; mod pack_rgba; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod sse; mod traits; +mod unpack_gray_alpha; mod unpack_rgb; mod unpack_rgba; +mod unpacked_gray_alpha; mod unpacked_rgb_image; mod unpacked_rgba_image; -mod unpack_gray_alpha; -mod unpacked_gray_alpha; -mod pack_gray_alpha; pub use pack_rgb::pack_rgb; -pub use traits::{RgbPackable, RgbaPackable, GrayAlphaPackable}; +pub use traits::{GrayAlphaPackable, RgbPackable, RgbaPackable}; pub use unpack_rgb::unpack_rgb; pub use unpack_rgba::unpack_rgba; +pub use unpacked_gray_alpha::UnpackedGrayAlpha; pub use unpacked_rgb_image::UnpackedRgbImage; pub use unpacked_rgba_image::UnpackedRgbaImage; -pub use unpacked_gray_alpha::UnpackedGrayAlpha; \ No newline at end of file diff --git a/src/packing/neon/deinterleave_rgba.rs b/src/packing/neon/deinterleave_rgba.rs index 4cf8780..195a663 100644 --- a/src/packing/neon/deinterleave_rgba.rs +++ b/src/packing/neon/deinterleave_rgba.rs @@ -28,8 +28,8 @@ */ use crate::packing::UnpackedRgbaImage; -use std::arch::aarch64::*; use crate::ImageSize; +use std::arch::aarch64::*; pub fn deinterleave_rgba_neon( rgb_image: &[u8], diff --git a/src/packing/pack_gray_alpha.rs b/src/packing/pack_gray_alpha.rs index 9432a18..3dd864b 100644 --- a/src/packing/pack_gray_alpha.rs +++ b/src/packing/pack_gray_alpha.rs @@ -44,4 +44,4 @@ pub fn pack_gray_alpha_naive( src[0] = *r; src[1] = *g; } -} \ No newline at end of file +} diff --git a/src/packing/pack_rgb.rs b/src/packing/pack_rgb.rs index 4c398db..8990b99 100644 --- a/src/packing/pack_rgb.rs +++ b/src/packing/pack_rgb.rs @@ -28,6 +28,8 @@ */ #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::packing::neon::pack_rgb_neon; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::sse::pack_rgb_sse; use crate::packing::UnpackedRgbImage; use crate::ImageSize; @@ -61,6 +63,12 @@ pub fn pack_rgb( { _dispatcher = pack_rgb_neon; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if 
std::arch::is_x86_feature_detected!("sse4.1") { + _dispatcher = pack_rgb_sse; + } + } _dispatcher( unpacked_rgb_image, dst_image, diff --git a/src/packing/pack_rgba.rs b/src/packing/pack_rgba.rs index 99b2550..818ccae 100644 --- a/src/packing/pack_rgba.rs +++ b/src/packing/pack_rgba.rs @@ -26,9 +26,12 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::packing::neon::pack_rgba_neon; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::sse::pack_rgba_sse; use crate::packing::UnpackedRgbaImage; use crate::ImageSize; -use crate::packing::neon::pack_rgba_neon; pub fn interleave_rgba_naive( unpacked_rgba_image: &UnpackedRgbaImage, @@ -63,6 +66,12 @@ pub fn pack_rgba( { _dispatcher = pack_rgba_neon; } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("sse4.1") { + _dispatcher = pack_rgba_sse; + } + } _dispatcher( unpacked_rgb_image, dst_image, diff --git a/src/packing/sse/deinterleave_rgb.rs b/src/packing/sse/deinterleave_rgb.rs index 1456ae1..77117cf 100644 --- a/src/packing/sse/deinterleave_rgb.rs +++ b/src/packing/sse/deinterleave_rgb.rs @@ -34,6 +34,16 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub fn deinterleave_rgb_sse(rgb_image: &[u8], width: usize, height: usize) -> UnpackedRgbImage { + unsafe { deinterleave_rgb_sse_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn deinterleave_rgb_sse_impl( + rgb_image: &[u8], + width: usize, + height: usize, +) -> UnpackedRgbImage { if rgb_image.len() != width * height * 3 { panic!( "Image bounds in deinterleave_rgb_sse is mismatched! Expected {} but got {}", diff --git a/src/packing/sse/deinterleave_rgba.rs b/src/packing/sse/deinterleave_rgba.rs index fbaa721..b1dc3a0 100644 --- a/src/packing/sse/deinterleave_rgba.rs +++ b/src/packing/sse/deinterleave_rgba.rs @@ -26,9 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::packing::sse::{ - _mm_load_deinterleave_half_rgb, _mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba, -}; +use crate::packing::sse::{_mm_load_deinterleave_half_rgba, _mm_load_deinterleave_rgba}; use crate::packing::UnpackedRgbaImage; use crate::ImageSize; #[cfg(target_arch = "x86")] @@ -40,6 +38,16 @@ pub fn deinterleave_rgba_sse( rgb_image: &[u8], width: usize, height: usize, +) -> UnpackedRgbaImage { + unsafe { deinterleave_rgba_sse_impl(rgb_image, width, height) } +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn deinterleave_rgba_sse_impl( + rgb_image: &[u8], + width: usize, + height: usize, ) -> UnpackedRgbaImage { if rgb_image.len() != width * height * 4 { panic!( @@ -55,7 +63,7 @@ pub fn deinterleave_rgba_sse( let mut b_dst: &mut [u8] = unpacked_image.b_channel.as_mut_slice(); let mut a_dst: &mut [u8] = unpacked_image.a_channel.as_mut_slice(); - let src_stride = width * 3; + let src_stride = width * 4; let mut src = rgb_image; unsafe { diff --git a/src/packing/sse/mod.rs b/src/packing/sse/mod.rs index cda014a..1364626 100644 --- a/src/packing/sse/mod.rs +++ b/src/packing/sse/mod.rs @@ -28,8 +28,13 @@ */ mod deinterleave_rgb; mod deinterleave_rgba; +mod pack_rgb; +mod pack_rgba; pub mod v_load; +mod v_store; pub use deinterleave_rgb::deinterleave_rgb_sse; pub use deinterleave_rgba::deinterleave_rgba_sse; +pub use pack_rgb::pack_rgb_sse; +pub use pack_rgba::pack_rgba_sse; pub use v_load::*; diff --git a/src/packing/sse/pack_rgb.rs b/src/packing/sse/pack_rgb.rs new file mode 100644 index 0000000..99ed15d --- /dev/null +++ b/src/packing/sse/pack_rgb.rs @@ -0,0 +1,111 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+use crate::packing::sse::v_store::{_mm_store_interleaved_rgb, _mm_store_rgb_half_u8};
+use crate::packing::UnpackedRgbImage;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub fn pack_rgb_sse(
+    unpacked_rgb_image: &UnpackedRgbImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    unsafe {
+        pack_rgb_sse_impl(unpacked_rgb_image, dst_image, width, height);
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn pack_rgb_sse_impl(
+    unpacked_rgb_image: &UnpackedRgbImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    if dst_image.len() != width * height * 3 {
+        panic!(
+            "Image bounds in pack_rgb_sse is mismatched! Expected {} but got {}",
+            width * height * 3,
+            dst_image.len()
+        );
+    }
+
+    let mut r_src: &[u8] = unpacked_rgb_image.r_channel.as_slice();
+    let mut g_src: &[u8] = unpacked_rgb_image.g_channel.as_slice();
+    let mut b_src: &[u8] = unpacked_rgb_image.b_channel.as_slice();
+
+    let src_stride = width * 3;
+
+    let mut dst = dst_image;
+    unsafe {
+        for _ in 0..height {
+            let mut _cx = 0usize;
+
+            while _cx + 16 < width {
+                let px = _cx * 3;
+                let set = (
+                    _mm_loadu_si128(r_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(g_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(b_src.as_ptr().add(_cx) as *const __m128i),
+                );
+                _mm_store_interleaved_rgb(dst.as_mut_ptr().add(px), set.0, set.1, set.2);
+                _cx += 16;
+            }
+
+            while _cx + 8 < width {
+                let px = _cx * 3;
+                let set = (
+                    _mm_loadu_si64(r_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(g_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(b_src.as_ptr().add(_cx)),
+                );
+                _mm_store_rgb_half_u8(dst.as_mut_ptr().add(px), set.0, set.1, set.2);
+                _cx += 8;
+            }
+
+            while _cx < width {
+                let px = _cx * 3;
+                let dst_align = dst.get_unchecked_mut(px..);
+                *dst_align.get_unchecked_mut(0) = *r_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(1) = *g_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(2) = *b_src.get_unchecked(_cx);
+                _cx += 1;
+            }
+
+            dst = dst.get_unchecked_mut(src_stride..);
+            r_src = r_src.get_unchecked(width..);
+            g_src = g_src.get_unchecked(width..);
+            b_src = b_src.get_unchecked(width..);
+        }
+    }
+}
diff --git a/src/packing/sse/pack_rgba.rs b/src/packing/sse/pack_rgba.rs
new file mode 100644
index 0000000..809468a
--- /dev/null
+++ b/src/packing/sse/pack_rgba.rs
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::packing::sse::v_store::{_mm_store_interleaved_half_rgba, _mm_store_interleaved_rgba};
+use crate::packing::UnpackedRgbaImage;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+pub fn pack_rgba_sse(
+    unpacked_rgb_image: &UnpackedRgbaImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    unsafe {
+        pack_rgba_sse_impl(unpacked_rgb_image, dst_image, width, height);
+    }
+}
+
+#[inline]
+#[target_feature(enable = "sse4.1")]
+unsafe fn pack_rgba_sse_impl(
+    unpacked_rgb_image: &UnpackedRgbaImage,
+    dst_image: &mut [u8],
+    width: usize,
+    height: usize,
+) {
+    if dst_image.len() != width * height * 4 {
+        panic!(
+            "Image bounds in pack_rgba_sse is mismatched! Expected {} but got {}",
+            width * height * 4,
+            dst_image.len()
+        );
+    }
+
+    let mut r_src: &[u8] = unpacked_rgb_image.r_channel.as_slice();
+    let mut g_src: &[u8] = unpacked_rgb_image.g_channel.as_slice();
+    let mut b_src: &[u8] = unpacked_rgb_image.b_channel.as_slice();
+    let mut a_src: &[u8] = unpacked_rgb_image.a_channel.as_slice();
+
+    let src_stride = width * 4;
+
+    let mut dst = dst_image;
+    unsafe {
+        for _ in 0..height {
+            let mut _cx = 0usize;
+
+            while _cx + 16 < width {
+                let px = _cx * 4;
+                let set = (
+                    _mm_loadu_si128(r_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(g_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(b_src.as_ptr().add(_cx) as *const __m128i),
+                    _mm_loadu_si128(a_src.as_ptr().add(_cx) as *const __m128i),
+                );
+                _mm_store_interleaved_rgba(dst.as_mut_ptr().add(px), set.0, set.1, set.2, set.3);
+                _cx += 16;
+            }
+
+            while _cx + 8 < width {
+                let px = _cx * 4;
+                let set = (
+                    _mm_loadu_si64(r_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(g_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(b_src.as_ptr().add(_cx)),
+                    _mm_loadu_si64(a_src.as_ptr().add(_cx)),
+                );
+                _mm_store_interleaved_half_rgba(
+                    dst.as_mut_ptr().add(px),
+                    set.0,
+                    set.1,
+                    set.2,
+                    set.3,
+                );
+                _cx += 8;
+            }
+
+            while _cx < width {
+                let px = _cx * 4;
+                let dst_align = dst.get_unchecked_mut(px..);
+                *dst_align.get_unchecked_mut(0) = *r_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(1) = *g_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(2) = *b_src.get_unchecked(_cx);
+                *dst_align.get_unchecked_mut(3) = *a_src.get_unchecked(_cx);
+                _cx += 1;
+            }
+
+            dst = dst.get_unchecked_mut(src_stride..);
+            r_src = r_src.get_unchecked(width..);
+            g_src = g_src.get_unchecked(width..);
+            b_src = b_src.get_unchecked(width..);
+            a_src = a_src.get_unchecked(width..);
+        }
+    }
+}
diff --git a/src/packing/sse/v_store.rs b/src/packing/sse/v_store.rs
new file mode 100644
index 0000000..45edbe5
--- /dev/null
+++ b/src/packing/sse/v_store.rs
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_interleave_rgba( + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) -> (__m128i, __m128i, __m128i, __m128i) { + let rg_lo = _mm_unpacklo_epi8(r, g); + let rg_hi = _mm_unpackhi_epi8(r, g); + let ba_lo = _mm_unpacklo_epi8(b, a); + let ba_hi = _mm_unpackhi_epi8(b, a); + + let rgba_0_lo = _mm_unpacklo_epi16(rg_lo, ba_lo); + let rgba_0_hi = _mm_unpackhi_epi16(rg_lo, ba_lo); + let rgba_1_lo = _mm_unpacklo_epi16(rg_hi, ba_hi); + let rgba_1_hi = _mm_unpackhi_epi16(rg_hi, ba_hi); + (rgba_0_lo, rgba_0_hi, rgba_1_lo, rgba_1_hi) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_rgba( + ptr: *mut u8, + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) { + let (row1, row2, row3, row4) = _mm_interleave_rgba(r, g, b, a); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, row3); + _mm_storeu_si128(ptr.add(48) as *mut __m128i, row4); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_half_rgba( + ptr: *mut u8, + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, +) { + let (row1, row2, _, _) = _mm_interleave_rgba(r, g, b, a); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_interleave_rgb( + r: __m128i, + g: __m128i, + b: __m128i, +) -> (__m128i, __m128i, __m128i) { + let sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + let sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + let sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + let a0 = _mm_shuffle_epi8(r, sh_a); + let b0 = _mm_shuffle_epi8(g, sh_b); + let c0 = _mm_shuffle_epi8(b, sh_c); + + let m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + let m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 
0); + let v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); + let v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); + let v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); + (v0, v1, v2) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_interleaved_rgb(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) { + let (row1, row2, row3) = _mm_interleave_rgb(r, g, b); + _mm_storeu_si128(ptr as *mut __m128i, row1); + _mm_storeu_si128(ptr.add(16) as *mut __m128i, row2); + _mm_storeu_si128(ptr.add(32) as *mut __m128i, row3); +} + +#[inline] +#[target_feature(enable = "sse4.1")] +pub unsafe fn _mm_store_rgb_half_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i) { + let (v0, v1, _) = _mm_interleave_rgb(r, g, b); + _mm_storeu_si128(ptr as *mut __m128i, v0); + std::ptr::copy_nonoverlapping(&v1 as *const _ as *const u8, ptr.add(16), 8); +} diff --git a/src/packing/unpack_rgb.rs b/src/packing/unpack_rgb.rs index 8c44183..d8a833d 100644 --- a/src/packing/unpack_rgb.rs +++ b/src/packing/unpack_rgb.rs @@ -26,6 +26,8 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::packing::avx::deinterleave_rgb_avx; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::packing::neon::deinterleave_rgb_neon; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -77,6 +79,9 @@ pub fn unpack_rgb(rgb_image: &[u8], image_size: ImageSize) -> UnpackedRgbImage UnpackedRgbaImage if std::arch::is_x86_feature_detected!("sse4.1") { _dispatcher = deinterleave_rgba_sse; } + if std::arch::is_x86_feature_detected!("avx2") { + _dispatcher = deinterleave_rgba_avx; + } } _dispatcher(rgb_image, image_size.width, image_size.height) }
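
The SSE packers above are the inverse of the SSE deinterleavers, so a planar round trip should reproduce the interleaved input byte for byte. The sketch below is illustrative only and is not part of this patch; it assumes the sse4.1 runtime guard and the crate-internal exports deinterleave_rgb_sse / pack_rgb_sse from crate::packing::sse shown above, and the helper name is hypothetical.

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn rgb_pack_roundtrip_sketch(width: usize, height: usize) {
    // Hypothetical check, not part of this patch: round-trips an interleaved
    // RGB buffer through the planar representation using the SSE paths above.
    if !std::arch::is_x86_feature_detected!("sse4.1") {
        return; // nothing to exercise without SSE4.1
    }
    // Deterministic interleaved test pattern, width * height * 3 bytes.
    let src: Vec<u8> = (0..width * height * 3).map(|i| (i % 251) as u8).collect();
    // Split into planar R/G/B planes, then re-interleave with pack_rgb_sse.
    let planar = crate::packing::sse::deinterleave_rgb_sse(&src, width, height);
    let mut dst = vec![0u8; width * height * 3];
    crate::packing::sse::pack_rgb_sse(&planar, &mut dst, width, height);
    assert_eq!(src, dst, "SSE RGB pack/deinterleave round-trip mismatch");
}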