0002-ARM-NEON-Add-inline-assembly-version-of-blit_row_s32.patch
From 4ab707b7097c91719a73ee77fc09bd66a37cfc27 Mon Sep 17 00:00:00 2001
From: Ben Avison <[email protected]>
Date: Thu, 20 Dec 2018 19:07:48 +0000
Subject: [PATCH 2/8] ARM NEON: Add inline assembly version of
 blit_row_s32a_opaque(), scheduled for Cortex-A53

Benchmarks show:
* opaque images 1.6% faster in RAM, 18% faster in L1 cache
* transparent images 5.6% faster in RAM, 20% faster in L1 cache
* mixed images 11% faster in RAM, 0.9% slower in L1 cache

Implemented only for AArch32; AArch64 builds still use the old intrinsics code.
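
For reference (not introduced by this patch), the per-channel operation the
assembly implements is the same one the intrinsics path documents:
dst = src + SkMulDiv255Round(255 - src_alpha, dst). A minimal scalar sketch
of the rounding step, matching Skia's SkMulDiv255Round() (the helper name
here is illustrative only):

    // Round-to-nearest (x * y) / 255 for 8-bit channels
    static inline uint8_t mul_div_255_round(uint8_t x, uint8_t y) {
        unsigned prod = (unsigned)x * y + 128;       // +128 biases toward nearest
        return (uint8_t)((prod + (prod >> 8)) >> 8); // approximates prod / 255, rounded
    }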
---
src/opts/SkBlitRow_opts.h | 339 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 339 insertions(+)
--- a/third_party/skia/src/opts/SkBlitRow_opts.h
+++ b/third_party/skia/src/opts/SkBlitRow_opts.h
@@ -19,6 +19,9 @@
namespace SK_OPTS_NS {
#if defined(SK_ARM_HAS_NEON)
+#ifdef __ARM_64BIT_STATE
+// No attempt has been made to adapt the inline assembly version for AArch64
+// so fall back to the less performant version that uses intrinsics instead
// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
@@ -55,8 +58,192 @@
return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}
+#else // __ARM_64BIT_STATE
+// Inline ARM AArch32 assembly version
+
+// Macros to specify instructions to only include if targeting ARM or Thumb instruction sets
+#ifdef __thumb__
+#define A(x)
+#define T(x) x
+#else
+#define A(x) x
+#define T(x)
+#endif
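+// e.g. A( "teq %[alo], #0" ) emits the TEQ only when assembling for ARM state,
+// while T(...) does the same for Thumb; they are used below where the two
+// instruction sets need different flag-setting idioms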
+
+// These macros permit optionally-included features to be switched using a parameter to another macro
+#define YES(x) x
+#define NO(x)
+
+// How far ahead (pixels) to preload (undefine to disable prefetch) - determined empirically
+#define PREFETCH_DISTANCE "24"
+
+#ifdef PREFETCH_DISTANCE
+#define IF_PRELOAD YES
+#else
+#define IF_PRELOAD NO
#endif
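+// The distance is consumed in the blend macro as "pld [%[src], %[tmp], lsl #2]",
+// i.e. the pixel count is scaled to a byte offset of 4 bytes per 32bpp pixel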
+/// Macro to load or store 1..7 pixels in groups of increasing power-of-2 size - suitable for leading pixels
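+/// The count is decoded with flag tricks: "tst #1" selects the single-pixel
+/// transfer, and "lsls #30" moves bit 1 into N and bit 2 into C so that
+/// "bpl" / "bcc" skip the 2-pixel and 4-pixel transfers respectively
+/// (the trailing variant below uses the same tests in the opposite order)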
+#define S32A_LOADSTORE_LEADING_7(ls, r0, r1, r2, r3, base, opt) \
+ "tst %[group_size], #1 \n\t" \
+ opt" \n\t" \
+ "beq 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[1],"#r1"[1],"#r2"[1],"#r3"[1]}, [%["#base"]:32]! \n\t" \
+ "1: \n\t" \
+ "lsls %[tmp], %[group_size], #30 \n\t" \
+ "add %[tmp], %["#base"], #4 \n\t" \
+ "bpl 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[2],"#r1"[2],"#r2"[2],"#r3"[2]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[3],"#r1"[3],"#r2"[3],"#r3"[3]}, [%[tmp]:32], %[eight] \n\t" \
+ "1: \n\t" \
+ "bcc 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[4],"#r1"[4],"#r2"[4],"#r3"[4]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[5],"#r1"[5],"#r2"[5],"#r3"[5]}, [%[tmp]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[6],"#r1"[6],"#r2"[6],"#r3"[6]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[7],"#r1"[7],"#r2"[7],"#r3"[7]}, [%[tmp]:32], %[eight] \n\t" \
+ "1: \n\t" \
+
+/// Macro to load or store 1..7 pixels in groups of decreasing power-of-2 size - suitable for trailing pixels
+#define S32A_LOADSTORE_TRAILING_7(ls, r0, r1, r2, r3, base, opt) \
+ "lsls %[tmp], %[group_size], #30 \n\t" \
+ "add %[tmp], %["#base"], #4 \n\t" \
+ opt" \n\t" \
+ "bcc 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[0],"#r1"[0],"#r2"[0],"#r3"[0]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[1],"#r1"[1],"#r2"[1],"#r3"[1]}, [%[tmp]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[2],"#r1"[2],"#r2"[2],"#r3"[2]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[3],"#r1"[3],"#r2"[3],"#r3"[3]}, [%[tmp]:32], %[eight] \n\t" \
+ "1: \n\t" \
+ "bpl 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[4],"#r1"[4],"#r2"[4],"#r3"[4]}, [%["#base"]:32], %[eight] \n\t" \
+ "v"#ls"4.8 {"#r0"[5],"#r1"[5],"#r2"[5],"#r3"[5]}, [%[tmp]:32], %[eight] \n\t" \
+ "1: \n\t" \
+ "tst %[group_size], #1 \n\t" \
+ "beq 1f \n\t" \
+ "v"#ls"4.8 {"#r0"[6],"#r1"[6],"#r2"[6],"#r3"[6]}, [%["#base"]:32]! \n\t" \
+ "1: \n\t" \
+
+/// Macro to do testing and "over" compositing of a group of 1..7 32bpp premultiplied ARGB leading or trailing pixels
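+/// Each group is first classified via its alpha lanes: an all-transparent group
+/// leaves the destination untouched, an all-opaque group is stored directly, and
+/// only mixed or translucent groups pay for the full blend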
+#define S32A_OPAQUE_7PIX_PROCESS(loadstore_7, size) \
+ do { \
+ __asm__ volatile ( \
+ /* Load the leading/trailing source pixels, \
+ * after initialising all the unused indexes from the first pixel \
+ * so the all-opaque and all-transparent tests still work */ \
+ loadstore_7(ld, d0, d1, d2, d3, src, \
+ "vld4.8 {d0[],d1[],d2[],d3[]}, [%[src]]") \
+ /* Test for all-opaque or all-transparent */ \
+ "vmov %[alo], s6 \n\t" \
+ "vmov %[ahi], s7 \n\t" \
+ "vmvn d31, d3 \n\t" \
+ "orrs %[tmp], %[alo], %[ahi] \n\t" \
+ "it eq \n\t" \
+ "addeq %[dst], %[group_size], lsl #2 \n\t" \
+ "beq 9f \n\t" \
+ "cmp %[alo], #-1 \n\t" \
+ "it eq \n\t" \
+ "cmpeq %[ahi], #-1 \n\t" \
+ "beq 5f \n\t" \
+ /* Translucency used, or a mixture of opaque and transparent */ \
+ loadstore_7(ld, d20, d21, d22, d23, dst, ) \
+ "sub %[dst], %[group_size], lsl #2 \n\t" \
+ S32A_OPAQUE_8PIX_BLEND(, , q0, q1,, NO, NO, NO) \
+ "5: \n\t" \
+ loadstore_7(st, d0, d1, d2, d3, dst, ) \
+ /* Drop through */ \
+ "9: \n\t" \
+ : /* Outputs */ \
+ [alo]"=&r"(alo), \
+ [ahi]"=&r"(ahi), \
+ [tmp]"=&r"(tmp), \
+ [src]"+r"(src), \
+ [dst]"+r"(dst) \
+ : /* Inputs */ \
+ [group_size]"r"(size), \
+ [eight]"r"(eight) \
+ : /* Clobbers */ \
+ "cc", "memory" \
+ ); \
+ } while (0)
+
+/// Macro to do testing and "over" compositing of an aligned group of 8 32bpp premultiplied ARGB leading or trailing pixels
+#define S32A_OPAQUE_8PIX_PROCESS(align, if_load) \
+ do { \
+ __asm__ ( \
+if_load( "vld4.8 {d0-d3}, [%[src]]! \n\t") \
+ /* Test for all-opaque or all-transparent */ \
+ "vmov %[alo], s6 \n\t" \
+ "vmov %[ahi], s7 \n\t" \
+ "vmvn d31, d3 \n\t" \
+ "orrs %[tmp], %[alo], %[ahi] \n\t" \
+ "it eq \n\t" \
+ "addeq %[dst], #8*4 \n\t" \
+ "beq 9f \n\t" \
+ "cmp %[alo], #-1 \n\t" \
+ "it eq \n\t" \
+ "cmpeq %[ahi], #-1 \n\t" \
+ "beq 5f \n\t" \
+ /* Translucency used, or a mixture of opaque and transparent */ \
+ S32A_OPAQUE_8PIX_BLEND(align, , q0, q1, "5:", YES, NO, NO) \
+ /* Drop through */ \
+ "9: \n\t" \
+ : /* Outputs */ \
+ [alo]"=&r"(alo), \
+ [ahi]"=&r"(ahi), \
+ [tmp]"=&r"(tmp), \
+ [src]"+r"(src), \
+ [dst]"+r"(dst) \
+ : /* Inputs */ \
+ : /* Clobbers */ \
+ "cc", "memory" \
+ ); \
+ } while (0)
+
+/// Macro to do "over" compositing blending on 8 32bpp premultiplied ARGB pixels
+/// which are either translucent or a mixture of opaque and transparent.
+/// Relies on A(x) to determine whether to emit code in ARM state (as opposed to Thumb state).
+/// @arg align bit-alignment specifier on destination loads/stores (optional)
+/// @arg other_src_alpha D-register specifier for alpha source in other bank (only IF_OVERLAP)
+/// @arg src0 Q-register specifier for blue/green source in this bank
+/// @arg src1 Q-register specifier for red/alpha source in this bank
+/// @arg opt optional instruction to emit
+/// @arg if_loadstore YES or NO: whether to do load/store
+/// @arg if_overlap YES or NO: whether to interleave processing of next iteration
+/// @arg if_preload YES or NO: whether to insert prefetch instructions
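+/// Per channel this computes dst = src + SkMulDiv255Round(255 - src_alpha, dst):
+/// vmull.u8 forms 16-bit products of the destination and the inverse alpha (d31),
+/// vrshr.u16 #8 plus vraddhn.u16 together perform the round-to-nearest division
+/// by 255, and vadd.i8 then adds the source pixels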
+#define S32A_OPAQUE_8PIX_BLEND(align, other_src_alpha, src0, src1, opt, if_loadstore, if_overlap, if_preload) \
+if_loadstore( "vld4.8 {d20-d23}, [%[dst]"#align"] \n\t") \
+if_preload( "sub %[tmp], %[len], #1 \n\t") \
+if_overlap( "vmov %[alo], %[ahi], "#other_src_alpha" \n\t") \
+if_preload( "cmp %[tmp], #" PREFETCH_DISTANCE " \n\t") \
+ "vmull.u8 q8, d20, d31 \n\t" \
+if_preload( "it cs \n\t") \
+if_preload( "movcs %[tmp], #" PREFETCH_DISTANCE " \n\t") \
+ "vmull.u8 q9, d21, d31 \n\t" \
+ "vmull.u8 q10, d22, d31 \n\t" \
+ "vmull.u8 q11, d23, d31 \n\t" \
+if_preload( "pld [%[src], %[tmp], lsl #2] \n\t") \
+ "vrshr.u16 q12, q8, #8 \n\t" \
+if_preload( "add %[tmp], #(32+32)/4 \n\t") \
+ "vrshr.u16 q13, q9, #8 \n\t" \
+ "vrshr.u16 q14, q10, #8 \n\t" \
+ "vrshr.u16 q15, q11, #8 \n\t" \
+if_preload( "pld [%[dst], %[tmp], lsl #2] \n\t") \
+ "vraddhn.u16 d16, q8, q12 \n\t" \
+ "vraddhn.u16 d17, q9, q13 \n\t" \
+ "vraddhn.u16 d18, q10, q14 \n\t" \
+ "vraddhn.u16 d19, q11, q15 \n\t" \
+if_overlap( "mvn %[tmp], %[alo] \n\t") \
+if_overlap( "vmvn d31, "#other_src_alpha" \n\t") \
+if_overlap(A( "orr %[alo], %[ahi] \n\t")) \
+ "vadd.i8 "#src0", q8 \n\t" \
+if_overlap(A( "mvn %[ahi], %[ahi] \n\t")) \
+ "vadd.i8 "#src1", q9 \n\t" \
+ opt" \n\t" \
+if_loadstore( "vst4.8 {"#src0", "#src1"}, [%[dst]"#align"]! \n\t") \
+
+#endif // __ARM_64BIT_STATE
+#endif // defined(SK_ARM_HAS_NEON)
+
/*not static*/ inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
SkASSERT(alpha == 0xFF);
@@ -162,6 +349,10 @@
}
#elif defined(SK_ARM_HAS_NEON)
+#ifdef __ARM_64BIT_STATE
+ // No attempt has been made to adapt the inline assembly version for AArch64
+ // so fall back to the less performant version that uses intrinsics instead
+
// Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
// underperformed on some of the platforms under test for inputs with frequent transitions of
// alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
@@ -208,6 +399,154 @@
vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
}
return;
+
+#else // __ARM_64BIT_STATE
+ // Inline ARM AArch32 assembly version
+ uint32_t tmp, alo, ahi;
+ const int eight = 8;
+ if (len < 15) {
+ // Too short to attempt aligned processing
+ if (len & 8)
+ S32A_OPAQUE_8PIX_PROCESS(, YES);
+ if (len & 7)
+ S32A_OPAQUE_7PIX_PROCESS(S32A_LOADSTORE_TRAILING_7, len & 7);
+ } else {
+ // The last 8 - 15 pixels (starting from a 4-pixel boundary) are handled together
+ uintptr_t startrup = (uintptr_t) dst / sizeof (*dst) + 3;
+ uintptr_t end = (uintptr_t) dst / sizeof (*dst) + len;
+ size_t trailing;
+ if ((end & 3) == 0)
+ // No blocks of <8 pixels used at end in these cases
+ trailing = 8;
+ else
+ // If length (discounting alignment to 4-pixel boundaries) is an odd number of 4-pixels,
+ // assign this to trailing end to avoid possibility of a leading run of exactly 4
+ trailing = 8 + ((startrup ^ end) & 4) + (end & 3);
+ // The inner loop handles an integer number (0+) of 16-pixel blocks at 4-pixel boundaries
+ // The 0..15 pixels leading up to this are handled together
+ size_t leading8 = (len - trailing) & 8;
+ size_t leading7 = (len - trailing) & 7;
+
+ // Do leading pixels
+ if (leading7 != 0) {
+ len -= leading7;
+ S32A_OPAQUE_7PIX_PROCESS(S32A_LOADSTORE_LEADING_7, leading7);
+ }
+ if (leading8 != 0) {
+ len -= 8;
+ S32A_OPAQUE_8PIX_PROCESS(:128, YES);
+ }
+
+ // Do inner loop
+ __asm__ (
+ // We enter and leave each iteration of the inner loop with the source
+ // pointer 8 pixels ahead and the destination pointer 8 pixels behind
+ // in order to permit good pipelining. The count of remaining pixels is
+ // reduced by 16 to allow the loop termination test to be combined with
+ // the decrementing of the remaining length.
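+ // The two register banks d0-d3 and d4-d7 are used alternately, so the
+ // load for the next group can issue while the current group is processed.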
+ "sub %[dst], #8*4 \n\t"
+ "vld4.8 {d0-d3}, [%[src]]! \n\t"
+ "subs %[len], #16 \n\t"
+ "bcc 49f \n\t"
+
+ "10: \n\t"
+ // Move alpha to ARM registers for comparison
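+ // (d3 holds the eight alpha bytes and aliases s6/s7, so two 32-bit moves fetch them all)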
+ "vmov %[alo], s6 \n\t"
+ "vmov %[ahi], s7 \n\t"
+ // Fetch source data for next iteration
+ "vld4.8 {d4-d7}, [%[src]]! \n\t"
+ "add %[dst], #8*4 \n\t"
+ // Test if all source pixels are transparent (alpha=0)
+ "orrs %[tmp], %[alo], %[ahi] \n\t"
+ "beq 19f \n\t"
+ // Find inverse alpha in case full blending required
+ "vmvn d31, d3 \n\t"
+ // Test if all source pixels are opaque (alpha=0xff)
+ "cmp %[alo], #-1 \n\t"
+ "it eq \n\t"
+ "cmpeq %[ahi], #-1 \n\t"
+ "bne 30f \n\t"
+ // Opaque case: copy source to destination
+ "vst4.8 {d0-d3}, [%[dst]:128] \n\t"
+ // Drop through
+ "19: \n\t"
+
+ // Move alpha to ARM registers for comparison
+ "vmov %[alo], s14 \n\t"
+ "vmov %[ahi], s15 \n\t"
+ // Fetch source data for next iteration
+ "vld4.8 {d0-d3}, [%[src]]! \n\t"
+ "add %[dst], #8*4 \n\t"
+ // Test if all source pixels are transparent (alpha=0)
+ "orrs %[tmp], %[alo], %[ahi] \n\t"
+ "beq 29f \n\t"
+ // Find inverse alpha in case full blending required
+ "vmvn d31, d7 \n\t"
+ // Test if all source pixels are opaque (alpha=0xff)
+ "cmp %[alo], #-1 \n\t"
+ "it eq \n\t"
+ "cmpeq %[ahi], #-1 \n\t"
+ "bne 40f \n\t"
+ // Opaque case: copy source to destination
+ "vst4.8 {d4-d7}, [%[dst]:128] \n\t"
+ // Drop through
+ "29: \n\t"
+ "subs %[len], #16 \n\t"
+ "bcs 10b \n\t"
+ "b 49f \n\t"
+
+ // Mixed or translucent pixels in d0-d3
+ "30: \n\t"
+ S32A_OPAQUE_8PIX_BLEND(:128, d7, q0, q1,, YES, YES, IF_PRELOAD)
+A( "teq %[alo], #0 \n\t")
+T( "orrs %[alo], %[alo], %[ahi] \n\t")
+ "vld4.8 {d0-d3}, [%[src]]! \n\t"
+ "beq 29b \n\t"
+A( "orrs %[tmp], %[tmp], %[ahi] \n\t")
+T( "orns %[tmp], %[tmp], %[ahi] \n\t")
+ "bne 40f \n\t"
+ "vst4.8 {d4-d7}, [%[dst]:128] \n\t"
+ "b 29b \n\t"
+
+ // Mixed or translucent pixels in d4-d7
+ "40: \n\t"
+ S32A_OPAQUE_8PIX_BLEND(:128, d3, q2, q3, \
+ "subs %[len], #16", YES, YES, NO)
+ "bcc 50f \n\t"
+A( "teq %[alo], #0 \n\t")
+T( "orrs %[alo], %[alo], %[ahi] \n\t")
+ "vld4.8 {d4-d7}, [%[src]]! \n\t"
+ "beq 19b \n\t"
+A( "orrs %[tmp], %[tmp], %[ahi] \n\t")
+T( "orns %[tmp], %[tmp], %[ahi] \n\t")
+ "bne 30b \n\t"
+ "vst4.8 {d0-d3}, [%[dst]:128] \n\t"
+ "b 19b \n\t"
+
+ "49: \n\t"
+ "add %[dst], #8*4 \n\t"
+ "50: \n\t"
+ : // Outputs
+ [dst]"+r"(dst),
+ [src]"+r"(src),
+ [len]"+r"(len),
+ [alo]"+r"(alo),
+ [ahi]"+r"(ahi),
+ [tmp]"+r"(tmp)
+ : // Inputs
+ : // Clobbers
+ "cc", "memory"
+ );
+
+ // Do trailing pixels.
+ // There will always be at least 8 of these, and the first 8 are already in d0-d3
+ S32A_OPAQUE_8PIX_PROCESS(:128, NO);
+ if (len & 7)
+ S32A_OPAQUE_7PIX_PROCESS(S32A_LOADSTORE_TRAILING_7, len & 7);
+ }
+ return;
+
+#endif // __ARM_64BIT_STATE
#endif
while (len-- > 0) {