-
Notifications
You must be signed in to change notification settings - Fork 0
/
0008-ARM-NEON-Add-inline-assembly-version-of-S32_alpha_D3.patch
231 lines (226 loc) · 11.5 KB
/
0008-ARM-NEON-Add-inline-assembly-version-of-S32_alpha_D3.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
From ac84df39fa2fa20245501a4c24eb79e492b5fd7b Mon Sep 17 00:00:00 2001
From: Ben Avison <[email protected]>
Date: Thu, 14 Mar 2019 21:23:19 +0000
Subject: [PATCH 8/8] ARM NEON: Add inline assembly version of
S32_alpha_D32_filter_DX(), scheduled for Cortex-A53
Benchmarks show this is 111% faster (more than double the speed) in RAM,
132% faster in L1 cache.
Implemented only for AArch32; AArch64 builds still use the old intrinsics code.
---
src/opts/SkBitmapProcState_opts.h | 208 ++++++++++++++++++++++++++++++++++++++
1 file changed, 208 insertions(+)
--- a/third_party/skia/src/opts/SkBitmapProcState_opts.h
+++ b/third_party/skia/src/opts/SkBitmapProcState_opts.h
@@ -252,6 +252,214 @@
}
}
+#elif defined(SK_ARM_HAS_NEON) && !defined(__ARM_64BIT_STATE)
+
+#define S32_ALPHA_D32_FILTER_DX_1PIX_NEON(opt) \
+ "ldr %[x], [%[xy]], #4 \n\t" \
+ "uxth %[tmp2], %[x], ror #16 \n\t" \
+ "lsl %[tmp3], %[x], #2 \n\t" \
+ "bic %[tmp2], #3 \n\t" \
+ "uxth %[tmp3], %[tmp3] \n\t" \
+ "add %[tmp0], %[row0], %[tmp2] \n\t" \
+ "add %[tmp1], %[row0], %[tmp3] \n\t" \
+ "add %[tmp2], %[row1], %[tmp2] \n\t" \
+ "add %[tmp3], %[row1], %[tmp3] \n\t" \
+ "lsr %[x], #14 \n\t" \
+ "vldr s0, [%[tmp0]] \n\t" \
+ "and %[x], #0xf \n\t" \
+ "vldr s1, [%[tmp1]] \n\t" \
+ "vldr s2, [%[tmp2]] \n\t" \
+ "vldr s3, [%[tmp3]] \n\t" \
+ "vdup.16 d2, %[x] \n\t" \
+ "vsub.i16 d3, d23, d2 \n\t" \
+ "vmull.u8 q2, d0, d31 \n\t" \
+ "vmlal.u8 q2, d1, d30 \n\t" \
+ "vmul.u16 d0, d4, d3 \n\t" \
+ "vmla.u16 d0, d5, d2 \n\t" \
+ "vshr.u16 d0, #8 \n\t" \
+ "vmul.u16 d0, d10 \n\t" \
+ opt" \n\t" \
+ "vshrn.u16 d0, q0, #8 \n\t" \
+ "vst1.32 {d0[0]}, [%[dst]:32]! \n\t" \
+
+void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+ const uint32_t* SK_RESTRICT xy,
+ int count, SkPMColor* SK_RESTRICT colors) {
+ SkASSERT(count > 0 && colors != nullptr);
+ SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
+ SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
+ SkASSERT(s.fAlphaScale <= 256);
+
+ int y0, y1, wy;
+ decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
+
+ auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
+ row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
+
+ uint32_t tmp0, tmp1, tmp2, tmp3, x;
+ __asm__ volatile (
+ "vpush {q4-q5} \n\t"
+ "vmov.i16 d22, #0xf \n\t"
+ "vmov.i16 d23, #0x10 \n\t"
+ "vmov.i32 q12, #0x3fff \n\t"
+ "vdup.32 q13, %[row0] \n\t"
+ "vdup.32 q14, %[row1] \n\t"
+ "vdup.i8 d30, %[subY] \n\t"
+ "vmov.i8 d31, #16 \n\t"
+ "vdup.16 q5, %[alpha] \n\t"
+ "vshl.i32 q12, #2 \n\t"
+ "tst %[dst], #0xc \n\t"
+ "vsub.i8 d31, d30 \n\t"
+ "beq 2f \n\t"
+
+ "1: \n\t"
+ S32_ALPHA_D32_FILTER_DX_1PIX_NEON(
+ "add %[tmp0], %[dst], #4 \n\t"
+ "subs %[len], #1 \n\t"
+ "it ne \n\t"
+ "tstne %[tmp0], #0xc"
+ )
+ "bne 1b \n\t"
+
+ "2:"
+ "subs %[len], #4 \n\t"
+ "bmi 13f \n\t"
+
+ "vld1.32 {q8}, [%[xy]]! \n\t"
+ "vshr.u32 q9, q8, #16 \n\t"
+ "vand q9, q12 \n\t"
+ "vadd.i32 q1, q13, q9 \n\t"
+ "vshl.i32 q0, q8, #2 \n\t"
+ "vand q0, q12 \n\t"
+ "vadd.i32 q2, q13, q0 \n\t"
+ "vmov %[tmp0], s4 \n\t"
+ "vmov %[tmp1], s5 \n\t"
+ "vadd.i32 q3, q14, q9 \n\t"
+ "vmov %[tmp2], %[tmp3], d3 \n\t"
+
+ "11: \n\t"
+ "vadd.i32 q4, q14, q0 \n\t"
+ "vldr s4, [%[tmp0]] \n\t"
+ "vmov %[tmp0], s8 \n\t"
+ "vldr s5, [%[tmp1]] \n\t"
+ "vmov %[tmp1], s9 \n\t"
+ "vldr s6, [%[tmp2]] \n\t"
+ "vmov %[tmp2], s10 \n\t"
+ "vldr s7, [%[tmp3]] \n\t"
+ "vmov %[tmp3], s11 \n\t"
+ "vldr s8, [%[tmp0]] \n\t"
+ "vmov %[tmp0], s12 \n\t"
+ "vldr s9, [%[tmp1]] \n\t"
+ "vmov %[tmp1], s13 \n\t"
+ "vldr s10, [%[tmp2]] \n\t"
+ "vmov %[tmp2], s14 \n\t"
+ "vldr s11, [%[tmp3]] \n\t"
+ "vmov %[tmp3], s15 \n\t"
+ "vldr s12, [%[tmp0]] \n\t"
+ "vmov %[tmp0], s16 \n\t"
+ "vldr s13, [%[tmp1]] \n\t"
+ "vmov %[tmp1], s17 \n\t"
+ "vldr s14, [%[tmp2]] \n\t"
+ "vmov %[tmp2], s18 \n\t"
+ "vldr s15, [%[tmp3]] \n\t"
+ "vmov %[tmp3], s19 \n\t"
+ "vldr s16, [%[tmp0]] \n\t"
+ "vshrn.i32 d1, q8, #14 \n\t"
+ "vldr s17, [%[tmp1]] \n\t"
+ "vand d1, d22 \n\t"
+ "vldr s18, [%[tmp2]] \n\t"
+ "vsub.i16 d0, d23, d1 \n\t"
+ "vldr s19, [%[tmp3]] \n\t"
+ "vmull.u8 q10, d2, d31 \n\t"
+ "vmlal.u8 q10, d6, d30 \n\t"
+ "vmull.u8 q1, d3, d31 \n\t"
+ "vmlal.u8 q1, d7, d30 \n\t"
+ "vmull.u8 q3, d4, d31 \n\t"
+ "subs %[len], #4 \n\t"
+ "vmlal.u8 q3, d8, d30 \n\t"
+ "bmi 12f \n\t"
+
+ " vld1.32 {q8}, [%[xy]]! \n\t"
+ "vmull.u8 q2, d5, d31 \n\t"
+ "vmlal.u8 q2, d9, d30 \n\t"
+ "vmul.u16 d8, d20, d0[0] \n\t"
+ " vshr.u32 d18, d16, #16 \n\t"
+ "vmul.u16 d9, d21, d0[1] \n\t"
+ " vshr.u32 d19, d17, #16 \n\t"
+ "vmul.u16 d20, d2, d0[2] \n\t"
+ " vand d18, d24 \n\t"
+ "vmul.u16 d21, d3, d0[3] \n\t"
+ " vand d19, d25 \n\t"
+ "vmla.u16 d8, d6, d1[0] \n\t"
+ " vadd.i32 d2, d26, d18 \n\t"
+ "vmla.u16 d9, d7, d1[1] \n\t"
+ " vadd.i32 d3, d27, d19 \n\t"
+ "vmla.u16 d20, d4, d1[2] \n\t"
+ " vshl.i32 d0, d16, #2 \n\t"
+ "vmla.u16 d21, d5, d1[3] \n\t"
+ " vshl.i32 d1, d17, #2 \n\t"
+ " vand q0, q12 \n\t"
+ " vadd.i32 q2, q13, q0 \n\t"
+ "vshr.u16 q4, #8 \n\t"
+ "vshr.u16 q10, #8 \n\t"
+ "vmul.u16 q4, q5 \n\t"
+ "vmul.u16 q10, q5 \n\t"
+ " vmov %[tmp0], %[tmp1], d2 \n\t"
+ " vadd.i32 q3, q14, q9 \n\t"
+ " vmov %[tmp2], %[tmp3], d3 \n\t"
+ "vshrn.u16 d8, q4, #8 \n\t"
+ "vshrn.u16 d9, q10, #8 \n\t"
+ "vst1.32 {q4}, [%[dst]:128]! \n\t"
+ "b 11b \n\t"
+
+ "12: \n\t"
+ "vmull.u8 q2, d5, d31 \n\t"
+ "vmlal.u8 q2, d9, d30 \n\t"
+ "vmul.u16 d8, d20, d0[0] \n\t"
+ "vmul.u16 d9, d21, d0[1] \n\t"
+ "vmul.u16 d20, d2, d0[2] \n\t"
+ "vmul.u16 d21, d3, d0[3] \n\t"
+ "vmla.u16 d8, d6, d1[0] \n\t"
+ "vmla.u16 d9, d7, d1[1] \n\t"
+ "vmla.u16 d20, d4, d1[2] \n\t"
+ "vmla.u16 d21, d5, d1[3] \n\t"
+ "vshr.u16 q4, #8 \n\t"
+ "vshr.u16 q10, #8 \n\t"
+ "vmul.u16 q4, q5 \n\t"
+ "vmul.u16 q10, q5 \n\t"
+ "vshrn.u16 d8, q4, #8 \n\t"
+ "vshrn.u16 d9, q10, #8 \n\t"
+ "vst1.32 {q4}, [%[dst]:128]! \n\t"
+
+ "13: \n\t"
+ "adds %[len], #4-1 \n\t"
+ "bmi 22f \n\t"
+
+ "21: \n\t"
+ S32_ALPHA_D32_FILTER_DX_1PIX_NEON("subs %[len], #1")
+ "bpl 21b \n\t"
+
+ "22: \n\t"
+ "vpop {q4-q5} \n\t"
+ : // Outputs
+ [dst]"+r"(colors),
+ [xy]"+r"(xy),
+ [len]"+r"(count),
+ [tmp0]"=&r"(tmp0),
+ [tmp1]"=&r"(tmp1),
+ [tmp2]"=&r"(tmp2),
+ [tmp3]"=&r"(tmp3),
+ [x]"=&r"(x)
+ : // Inputs
+ [alpha]"r"(s.fAlphaScale),
+ [row0]"r"(row0),
+ [row1]"r"(row1),
+ [subY]"r"(wy)
+ : // Clobbers
+ "cc", "memory"
+ );
+}
+
#else
// The NEON code only actually differs from the portable code in the