image-rs · Shnatsel · Oct 5, 2024 · Sep 28, 2024 · okaneco · Oct 5, 2024
diff --git a/src/filter.rs b/src/filter.rs
@@ -13,17 +13,17 @@ mod simd {
     use std::simd::num::{SimdInt, SimdUint};
     use std::simd::{u8x4, u8x8, LaneCount, Simd, SimdElement, SupportedLaneCount};
 
-    /// This is an equivalent of the `PaethPredictor` function from
-    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
-    /// except that it simultaneously calculates the predictor for all SIMD lanes.
-    /// Mapping between parameter names and pixel positions can be found in
-    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
+    /// Scalar Paeth function wrapped in SIMD scaffolding.
     ///
-    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
-    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
-    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and a ignored 4th value
+    /// This is needed because simply running the function on the inputs
+    /// makes the compiler think our inputs are too short
+    /// to benefit from vectorization.
+    /// Putting it in SIMD scaffolding fixes that.
+    /// See <https://github.com/image-rs/image-png/issues/511>.
     ///
-    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
+    /// Funnily, the autovectorizer does a better job here
+    /// than a handwritten algorithm using std::simd!
+    /// We used to have a handwritten one but this is just faster.
     fn paeth_predictor<const N: usize>(
         a: Simd<i16, N>,
         b: Simd<i16, N>,
@@ -32,28 +32,26 @@ mod simd {
     where
         LaneCount<N>: SupportedLaneCount,
     {
-        let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
-        let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
-        let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
-
-        let pa = pa.abs();
-        let pb = pb.abs();
-        let pc = pc.abs();
-
-        let smallest = pc.simd_min(pa.simd_min(pb));
-
-        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
-        // lane-wise selection:
-        //
-        //     if smalest == pa
-        //         then select a
-        //         else select (if smallest == pb then select b else select c)
-        smallest
-            .simd_eq(pa)
-            .select(a, smallest.simd_eq(pb).select(b, c))
+        let mut out = [0; N];
+        for i in 0..N {
+            out[i] = super::filter_paeth_decode_i16(a[i].into(), b[i].into(), c[i].into());
+        }
+        out.into()
     }
 
-    /// Equivalent to `simd::paeth_predictor` but does not temporarily convert
+    /// This is an equivalent of the `PaethPredictor` function from
+    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
+    /// except that it simultaneously calculates the predictor for all SIMD lanes.
+    /// Mapping between parameter names and pixel positions can be found in
+    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
+    ///
+    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
+    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
+    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
+    ///
+    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // Paeth predictor computed purely in `u8` arithmetic, following the formulation that
     // Luca Versari proposed for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf).
     // Staying in unsigned 8-bit values makes the code friendlier to vectorization.
     //
     // With `p = a + b - c`, the three distances from the PNG specification simplify to:
     //
     //     pa = |p - a| = |b - c|
     //     pb = |p - b| = |a - c|
     //     pc = |p - c| = |(a - c) + (b - c)|
     //
     // `pa` and `pb` are plain absolute differences. For `pc` there are two situations:
     //
     //   * `c` lies between `a` and `b` (the two terms have opposite signs, or one is zero):
     //     the terms partially cancel and `pc = |pa - pb|`.
     //   * otherwise both terms point the same way and `pc = pa + pb`, which can never be
     //     smaller than `pa` or `pb`. `c` therefore cannot win the selection below, so any
     //     value at least that large works and the maximum `u8` is used instead.
     let dist = |x: u8, y: u8| x.max(y) - x.min(y);
     let dist_a = dist(b, c);
     let dist_b = dist(a, c);
     let c_is_between = (a < c) == (c < b);
     let dist_c = if c_is_between {
         dist(dist_a, dist_b)
     } else {
         u8::MAX
     };
     // Ties are resolved in favor of `a`, then `b`, then `c`.
     if dist_a <= dist_b && dist_a <= dist_c {
         return a;
     }
     if dist_b <= dist_c {
         return b;
     }
     c
 }
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // `u8`-only Paeth predictor, after the optimization Luca Versari described for
     // [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). Avoiding wider signed
     // intermediates keeps the computation amenable to vectorization.
     //
     // The specification defines `p = a + b - c` and picks whichever of `a`, `b`, `c` is
     // closest to `p`. Substituting `p` gives:
     //
     //     da = |p - a| = |b - c| = max(b, c) - min(b, c)
     //     db = |p - b| = |a - c| = max(a, c) - min(a, c)
     //     dc = |p - c| = |(a - c) + (b - c)|
     //
     // If `c` is not between `a` and `b`, then `dc = da + db`, so `dc >= da` and `dc >= db`
     // and `c` is never selected; substituting 255 keeps the outcome unchanged. If `c` is
     // between them, the two terms have opposite signs and
     // `dc = |da - db| = max(da, db) - min(da, db)`.
     let (da, db) = (b.max(c) - b.min(c), a.max(c) - a.min(c));
     let dc = match (a < c, c < b) {
         (true, true) | (false, false) => da.max(db) - da.min(db),
         _ => 255,
     };
     // Preference order on ties: `a`, then `b`, then `c`.
     match (da <= db && da <= dc, db <= dc) {
         (true, _) => a,
         (false, true) => b,
         (false, false) => c,
     }
 }
+    ///
+    /// Functionally equivalent to `simd::paeth_predictor` but does not temporarily convert
     /// the SIMD elements to `i16`.
     fn paeth_predictor_u8<const N: usize>(
         a: Simd<u8, N>,
@@ -340,6 +338,27 @@ fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
     out
 }
 
+#[cfg(feature = "unstable")] // only compiled when the `unstable` feature is enabled
+fn filter_paeth_decode_i16(a: i16, b: i16, c: i16) -> i16 { // returns whichever of `a`, `b`, `c` is closest to `p = a + b - c`
+    // Scalar Paeth predictor on `i16`; like `filter_paeth_decode`, but vectorizes better when called per lane from the SIMD wrapper.
+    let pa = (b - c).abs(); // |p - a| with p = a + b - c
+    let pb = (a - c).abs(); // |p - b|
+    let pc = ((a - c) + (b - c)).abs(); // |p - c|; NOTE(review): presumably inputs are widened u8 values so this cannot overflow i16 -- confirm
+
+    let mut out = a; // start with `a` as the candidate
+    let mut min = pa; // smallest distance seen so far
+
+    if pb < min { // strict `<`: on a tie `a` is kept over `b`
+        min = pb;
+        out = b;
+    }
+    if pc < min { // strict `<`: on a tie the earlier candidate is kept over `c`
+        out = c;
+    }
+
+    out
+}
+
 fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
     // This is an optimized version of the paeth filter from the PNG specification, proposed by
     // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates