From f1b75ae19f0450109aa85fb5f96e026f31bce321 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 29 Sep 2024 00:16:52 +0100 Subject: [PATCH] Replace handwritten SIMD implementation with autovectorization for a surprising performance gain --- src/filter.rs | 77 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/src/filter.rs b/src/filter.rs index 9290a040..444a14d6 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -13,17 +13,17 @@ mod simd { use std::simd::num::{SimdInt, SimdUint}; use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount}; - /// This is an equivalent of the `PaethPredictor` function from - /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) - /// except that it simultaneously calculates the predictor for all SIMD lanes. - /// Mapping between parameter names and pixel positions can be found in - /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). + /// Scalar Paeth function wrapped in SIMD scaffolding. /// - /// Examples of how different pixel types may be represented as multiple SIMD lanes: - /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A - /// - RGB => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value + /// This is needed because simply running the function on the inputs + /// makes the compiler think our inputs are too short + /// to benefit from vectorization. + /// Putting it in SIMD scaffolding fixes that. + /// https://github.com/image-rs/image-png/issues/511 /// - /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280). + /// Funnily, the autovectorizer does a better job here + /// than a handwritten algorithm using std::simd! + /// We used to have a handwritten one but this is just faster. fn paeth_predictor( a: Simd, b: Simd, @@ -32,28 +32,26 @@ mod simd { where LaneCount: SupportedLaneCount, { - let pa = b - c; // (p-a) == (a+b-c - a) == (b-c) - let pb = a - c; // (p-b) == (a+b-c - b) == (a-c) - let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) - - let pa = pa.abs(); - let pb = pb.abs(); - let pc = pc.abs(); - - let smallest = pc.simd_min(pa.simd_min(pb)); - - // Paeth algorithm breaks ties favoring a over b over c, so we execute the following - // lane-wise selection: - // - // if smalest == pa - // then select a - // else select (if smallest == pb then select b else select c) - smallest - .simd_eq(pa) - .select(a, smallest.simd_eq(pb).select(b, c)) + let mut out = [0; N]; + for i in 0..N { + out[i] = super::filter_paeth_decode_i16(a[i].into(), b[i].into(), c[i].into()); + } + out.into() } - /// Equivalent to `simd::paeth_predictor` but does not temporarily convert + /// This is an equivalent of the `PaethPredictor` function from + /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) + /// except that it simultaneously calculates the predictor for all SIMD lanes. + /// Mapping between parameter names and pixel positions can be found in + /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). + /// + /// Examples of how different pixel types may be represented as multiple SIMD lanes: + /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A + /// - RGB => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value + /// + /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280). + /// + /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert /// the SIMD elements to `i16`. fn paeth_predictor_u8( a: Simd, @@ -340,6 +338,27 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 { out } +#[cfg(feature = "unstable")] +fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { + // Like `filter_paeth_decode` but vectorizes better when wrapped in SIMD + let pa = (b - c).abs(); + let pb = (a - c).abs(); + let pc = ((a - c) + (b - c)).abs(); + + let mut out = a; + let mut min = pa; + + if pb < min { + min = pb; + out = b; + } + if pc < min { + out = c; + } + + out +} + fn filter_paeth(a: u8, b: u8, c: u8) -> u8 { // This is an optimized version of the paeth filter from the PNG specification, proposed by // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates