/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

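/* For reference, the 8888 -> 565 conversion used by the opaque blit below
 * keeps the top bits of each channel, roughly:
 *     d565 = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
 * SkPixel32ToPixel16_neon8() applies this to eight pixels at a time.
 */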
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "ands       ip, %[count], #7            \n\t"
            "vmov.u8    d31, #1<<7                  \n\t"
            "vld1.16    {q12}, [%[dst]]             \n\t"
            "vld4.8     {d0-d3}, [%[src]]           \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it         eq                          \n\t"
            "moveq      ip, #8                      \n\t"
            "mov        %[keep_dst], %[dst]         \n\t"

            "add        %[src], %[src], ip, LSL#2   \n\t"
            "add        %[dst], %[dst], ip, LSL#1   \n\t"
            "subs       %[count], %[count], ip      \n\t"
            "b          9f                          \n\t"
            // LOOP
            "2:                                     \n\t"

            "vld1.16    {q12}, [%[dst]]!            \n\t"
            "vld4.8     {d0-d3}, [%[src]]!          \n\t"
            "vst1.16    {q10}, [%[keep_dst]]        \n\t"
            "sub        %[keep_dst], %[dst], #8*2   \n\t"
            "subs       %[count], %[count], #8      \n\t"
            "9:                                     \n\t"
            "pld        [%[dst],#32]                \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                     \n\t"
            "vshr.u16   q11, q12, #5                \n\t"
            "vshr.u16   q10, q12, #6+5              \n\t"
            "vmovn.u16  d5, q11                     \n\t"
            "vmovn.u16  d6, q10                     \n\t"
            "vshl.u8    d4, d4, #3                  \n\t"
            "vshl.u8    d5, d5, #2                  \n\t"
            "vshl.u8    d6, d6, #3                  \n\t"

            "vmovl.u8   q14, d31                    \n\t"
            "vmovl.u8   q13, d31                    \n\t"
            "vmovl.u8   q12, d31                    \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                     \n\t"
            "vmlal.u8   q14, d30, d6                \n\t"
            "vmlal.u8   q13, d30, d5                \n\t"
            "vmlal.u8   q12, d30, d4                \n\t"
            "vshr.u16   q8, q14, #5                 \n\t"
            "vshr.u16   q9, q13, #6                 \n\t"
            "vaddhn.u16 d6, q14, q8                 \n\t"
            "vshr.u16   q8, q12, #5                 \n\t"
            "vaddhn.u16 d5, q13, q9                 \n\t"
            "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8                 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8   d5, d5, d1                  \n\t"
            "vqadd.u8   d4, d4, d2                  \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                 \n\t"
            "vshll.u8   q3, d5, #8                  \n\t"
            "vshll.u8   q2, d4, #8                  \n\t"
            "vsri.u16   q10, q3, #5                 \n\t"
            "vsri.u16   q10, q2, #11                \n\t"

            "bne        2b                          \n\t"

            "1:                                     \n\t"
            "vst1.16    {q10}, [%[keep_dst]]        \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "vmov.u8    d31, #1<<7                  \n\t"
            "mov        %[keep_dst], %[dst]         \n\t"

            "tst        %[count], #4                \n\t"
            "beq        14f                         \n\t"
            "vld1.16    {d25}, [%[dst]]!            \n\t"
            "vld1.32    {q1}, [%[src]]!             \n\t"

            "14:                                    \n\t"
            "tst        %[count], #2                \n\t"
            "beq        12f                         \n\t"
            "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
            "vld1.32    {d1}, [%[src]]!             \n\t"

            "12:                                    \n\t"
            "tst        %[count], #1                \n\t"
            "beq        11f                         \n\t"
            "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
            "vld1.32    {d0[1]}, [%[src]]!          \n\t"

            "11:                                    \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzpq.u16  q0, q1                      \n\t"
            "vuzp.u8    d0, d1                      \n\t"
            "vuzp.u8    d2, d3                      \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                     \n\t"
            "vshr.u16   q11, q12, #5                \n\t"
            "vshr.u16   q10, q12, #6+5              \n\t"
            "vmovn.u16  d5, q11                     \n\t"
            "vmovn.u16  d6, q10                     \n\t"
            "vshl.u8    d4, d4, #3                  \n\t"
            "vshl.u8    d5, d5, #2                  \n\t"
            "vshl.u8    d6, d6, #3                  \n\t"

            "vmovl.u8   q14, d31                    \n\t"
            "vmovl.u8   q13, d31                    \n\t"
            "vmovl.u8   q12, d31                    \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                     \n\t"
            "vmlal.u8   q14, d30, d6                \n\t"
            "vmlal.u8   q13, d30, d5                \n\t"
            "vmlal.u8   q12, d30, d4                \n\t"
            "vshr.u16   q8, q14, #5                 \n\t"
            "vshr.u16   q9, q13, #6                 \n\t"
            "vaddhn.u16 d6, q14, q8                 \n\t"
            "vshr.u16   q8, q12, #5                 \n\t"
            "vaddhn.u16 d5, q13, q9                 \n\t"
            "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8                 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8   d5, d5, d1                  \n\t"
            "vqadd.u8   d4, d4, d2                  \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                 \n\t"
            "vshll.u8   q3, d5, #8                  \n\t"
            "vshll.u8   q2, d4, #8                  \n\t"
            "vsri.u16   q10, q3, #5                 \n\t"
            "vsri.u16   q10, q2, #11                \n\t"

            // store
            "tst        %[count], #4                \n\t"
            "beq        24f                         \n\t"
            "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

            "24:                                    \n\t"
            "tst        %[count], #2                \n\t"
            "beq        22f                         \n\t"
            "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

            "22:                                    \n\t"
            "tst        %[count], #1                \n\t"
            "beq        21f                         \n\t"
            "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

            "21:                                    \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}

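/* Rounding divide by 255 on eight 16-bit lanes, using the usual
 * (x + 128 + ((x + 128) >> 8)) >> 8 identity, which is exact for the
 * 16-bit products formed by the blends below.
 */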
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

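/* Per channel this computes, approximately,
 *     result = (src565 * alpha + dst565 * (255 - srcA * alpha / 255)) / 255
 * with the final divide replaced by a rounded shift in the default build,
 * which is where the +/-1 mismatches mentioned below come from.
 */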
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);    // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                 // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);   // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};
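
/* Example of how the matrix is indexed: for a span starting at (x, y),
 * dstart = &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)] selects the row for
 * scanline y at one of four starting offsets, and an 8-byte load from there
 * stays inside that row because each 4-entry pattern is repeated three times.
 */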

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

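/* Per pixel this is standard premultiplied src-over, matching the scalar
 * SkPMSrcOver(): dst = src + ((dst * (256 - srcA)) >> 8), where 256 - srcA
 * is the collapsed form of SkAlpha255To256(255 - srcA).
 */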
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

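/* Variant of S32A_Opaque_BlitRow32 that inspects the source alpha and
 * switches between three fast paths: runs of fully transparent pixels are
 * skipped, runs of fully opaque pixels are copied, and everything else goes
 * through the general blend, with a scalar tail once fewer than UNROLL
 * pixels remain.
 */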
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
      we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
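/* Per pixel: res = (src * src_scale + dst * dst_scale) >> 8, where
 * src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale,
 * processed two pixels (one 64-bit register) per iteration.
 */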
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

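/* Blends 8888 source pixels over a 565 destination with dithering, eight
 * pixels at a time. Per pixel the dither value is first scaled by the
 * source alpha (d = (dither * (alpha + 1)) >> 8), the source channels are
 * dithered down towards 565 precision, and the destination is blended in
 * with scale 256 - alpha.
 */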
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL  8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif


            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sg = d1; sb = d2; sa = d3;
#endif
            }

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);    // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad=0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad=1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

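/* Opaque (alpha == 255) dithering version: per channel the source is
 * adjusted as c - (c >> bits) + d (bits is 5 for red/blue, 6 for green,
 * with the dither value halved for green), and the top bits are then
 * packed into 565.
 */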
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL  8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sb = d2;
#endif
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;    // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

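/* Blends a constant color over each source pixel into dst:
 *     dst = color + ((src * (256 - SkAlpha255To256(colorA))) >> 8)
 * per channel, with the fully transparent and fully opaque color cases
 * short-circuited below.
 */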
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,           // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,                           // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,                           // S32_Opaque,
    S32_Blend_BlitRow32_neon,       // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon       // S32A_Blend
};