From f1b75ae19f0450109aa85fb5f96e026f31bce321 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 29 Sep 2024 00:16:52 +0100
Subject: [PATCH] Replace handwritten SIMD implementation with
 autovectorization for a surprising performance gain

---
 src/filter.rs | 77 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 29 deletions(-)
diff --git a/src/filter.rs b/src/filter.rs
index 9290a040..444a14d6 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -13,17 +13,17 @@ mod simd {
     use std::simd::num::{SimdInt, SimdUint};
     use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};
 
-    /// This is an equivalent of the `PaethPredictor` function from
-    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
-    /// except that it simultaneously calculates the predictor for all SIMD lanes.
-    /// Mapping between parameter names and pixel positions can be found in
-    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
+    /// Scalar Paeth function wrapped in SIMD scaffolding.
     ///
-    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
-    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
-    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value
+    /// This is needed because simply running the function on the inputs
+    /// makes the compiler think our inputs are too short
+    /// to benefit from vectorization.
+    /// Putting it in SIMD scaffolding fixes that.
+    /// <https://github.com/image-rs/image-png/issues/511>
     ///
-    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
+    /// Funnily, the autovectorizer does a better job here
+    /// than a handwritten algorithm using `std::simd`!
+    /// We used to have a handwritten one but this is just faster.
     fn paeth_predictor<const N: usize>(
         a: Simd<i16, N>,
         b: Simd<i16, N>,
@@ -32,28 +32,26 @@ mod simd {
     where
         LaneCount<N>: SupportedLaneCount,
     {
-        let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
-        let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
-        let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
-
-        let pa = pa.abs();
-        let pb = pb.abs();
-        let pc = pc.abs();
-
-        let smallest = pc.simd_min(pa.simd_min(pb));
-
-        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
-        // lane-wise selection:
-        //
-        //     if smalest == pa
-        //         then select a
-        //         else select (if smallest == pb then select b else select c)
-        smallest
-            .simd_eq(pa)
-            .select(a, smallest.simd_eq(pb).select(b, c))
+        let mut out = [0; N];
+        for i in 0..N {
+            out[i] = super::filter_paeth_decode_i16(a[i].into(), b[i].into(), c[i].into());
+        }
+        out.into()
     }
 
-    /// Equivalent to `simd::paeth_predictor` but does not temporarily convert
+    /// This is an equivalent of the `PaethPredictor` function from
+    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
+    /// except that it simultaneously calculates the predictor for all SIMD lanes.
+    /// Mapping between parameter names and pixel positions can be found in
+    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
+    ///
+    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
+    /// - RGBA => 4 lanes of `u8x4` contain R, G, B, A
+    /// - RGB  => 4 lanes of `u8x4` contain R, G, B, and an ignored 4th value
+    ///
+    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
+    ///
+    /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
     /// the SIMD elements to `i16`.
     fn paeth_predictor_u8<const N: usize>(
         a: Simd<u8, N>,
@@ -340,6 +338,27 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     out
 }
 
+#[cfg(feature = "unstable")]
+fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 {
+    // Like `filter_paeth_decode`, but on `i16`; vectorizes better when wrapped in SIMD.
+    let pa = (b - c).abs(); // |p - a|, where p = a + b - c
+    let pb = (a - c).abs(); // |p - b|
+    let pc = ((a - c) + (b - c)).abs(); // |p - c|
+    // NOTE(review): presumably inputs are widened `u8`s, so none of this overflows — confirm.
+    let mut out = a;
+    let mut min = pa;
+    // Strict `<` keeps the earlier candidate on ties: `a` over `b` over `c`.
+    if pb < min {
+        min = pb;
+        out = b;
+    }
+    if pc < min {
+        out = c;
+    }
+
+    out
+}
+
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // This is an optimized version of the paeth filter from the PNG specification, proposed by
     // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates