gfx/skia/trunk/src/opts/SkBlitRow_opts_arm_neon.cpp

branch TOR_BUG_3246, changeset 7:129ffea94266
1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRow_opts_arm_neon.h"
9
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16
17 #include "SkCachePreload_arm.h"
18 #include "SkColor_opts_neon.h"
19 #include <arm_neon.h>
20
21 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src, int count,
23 U8CPU alpha, int /*x*/, int /*y*/) {
24 SkASSERT(255 == alpha);
25
26 while (count >= 8) {
27 uint8x8x4_t vsrc;
28 uint16x8_t vdst;
29
30 // Load
31 vsrc = vld4_u8((uint8_t*)src);
32
33 // Convert src to 565
34 vdst = SkPixel32ToPixel16_neon8(vsrc);
35
36 // Store
37 vst1q_u16(dst, vdst);
38
39 // Prepare next iteration
40 dst += 8;
41 src += 8;
42 count -= 8;
43 }
44
45 // Leftovers
46 while (count > 0) {
47 SkPMColor c = *src++;
48 SkPMColorAssert(c);
49 *dst = SkPixel32ToPixel16_ToU16(c);
50 dst++;
51 count--;
52 }
53 }
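/* Illustrative scalar sketch (not part of the build; the helper name below is
 * made up): each lane of the NEON loop above performs the usual 8888 -> 565
 * truncation, the same conversion the leftover loop gets from
 * SkPixel32ToPixel16_ToU16():
 *
 *     static inline uint16_t pixel32_to_565_sketch(SkPMColor c) {
 *         unsigned r = SkGetPackedR32(c) >> 3;   // keep top 5 bits of red
 *         unsigned g = SkGetPackedG32(c) >> 2;   // keep top 6 bits of green
 *         unsigned b = SkGetPackedB32(c) >> 3;   // keep top 5 bits of blue
 *         return (uint16_t)((r << 11) | (g << 5) | b);
 *     }
 */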
54
55 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
56 const SkPMColor* SK_RESTRICT src, int count,
57 U8CPU alpha, int /*x*/, int /*y*/) {
58 SkASSERT(255 == alpha);
59
60 if (count >= 8) {
61 uint16_t* SK_RESTRICT keep_dst = 0;
62
63 asm volatile (
64 "ands ip, %[count], #7 \n\t"
65 "vmov.u8 d31, #1<<7 \n\t"
66 "vld1.16 {q12}, [%[dst]] \n\t"
67 "vld4.8 {d0-d3}, [%[src]] \n\t"
68 // Thumb does not support the standard ARM conditional
69 // instructions but instead requires the 'it' instruction
70 // to signal conditional execution
71 "it eq \n\t"
72 "moveq ip, #8 \n\t"
73 "mov %[keep_dst], %[dst] \n\t"
74
75 "add %[src], %[src], ip, LSL#2 \n\t"
76 "add %[dst], %[dst], ip, LSL#1 \n\t"
77 "subs %[count], %[count], ip \n\t"
78 "b 9f \n\t"
79 // LOOP
80 "2: \n\t"
81
82 "vld1.16 {q12}, [%[dst]]! \n\t"
83 "vld4.8 {d0-d3}, [%[src]]! \n\t"
84 "vst1.16 {q10}, [%[keep_dst]] \n\t"
85 "sub %[keep_dst], %[dst], #8*2 \n\t"
86 "subs %[count], %[count], #8 \n\t"
87 "9: \n\t"
88 "pld [%[dst],#32] \n\t"
89 // expand 0565 q12 to 8888 {d4-d7}
90 "vmovn.u16 d4, q12 \n\t"
91 "vshr.u16 q11, q12, #5 \n\t"
92 "vshr.u16 q10, q12, #6+5 \n\t"
93 "vmovn.u16 d5, q11 \n\t"
94 "vmovn.u16 d6, q10 \n\t"
95 "vshl.u8 d4, d4, #3 \n\t"
96 "vshl.u8 d5, d5, #2 \n\t"
97 "vshl.u8 d6, d6, #3 \n\t"
98
99 "vmovl.u8 q14, d31 \n\t"
100 "vmovl.u8 q13, d31 \n\t"
101 "vmovl.u8 q12, d31 \n\t"
102
103 // duplicate in 4/2/1 & 8pix vsns
104 "vmvn.8 d30, d3 \n\t"
105 "vmlal.u8 q14, d30, d6 \n\t"
106 "vmlal.u8 q13, d30, d5 \n\t"
107 "vmlal.u8 q12, d30, d4 \n\t"
108 "vshr.u16 q8, q14, #5 \n\t"
109 "vshr.u16 q9, q13, #6 \n\t"
110 "vaddhn.u16 d6, q14, q8 \n\t"
111 "vshr.u16 q8, q12, #5 \n\t"
112 "vaddhn.u16 d5, q13, q9 \n\t"
113 "vqadd.u8 d6, d6, d0 \n\t" // moved up
114 "vaddhn.u16 d4, q12, q8 \n\t"
115 // intentionally don't calculate alpha
116 // result in d4-d6
117
118 "vqadd.u8 d5, d5, d1 \n\t"
119 "vqadd.u8 d4, d4, d2 \n\t"
120
121 // pack 8888 {d4-d6} to 0565 q10
122 "vshll.u8 q10, d6, #8 \n\t"
123 "vshll.u8 q3, d5, #8 \n\t"
124 "vshll.u8 q2, d4, #8 \n\t"
125 "vsri.u16 q10, q3, #5 \n\t"
126 "vsri.u16 q10, q2, #11 \n\t"
127
128 "bne 2b \n\t"
129
130 "1: \n\t"
131 "vst1.16 {q10}, [%[keep_dst]] \n\t"
132 : [count] "+r" (count)
133 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
134 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
135 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
136 "d30","d31"
137 );
138 }
139 else
140 { // handle count < 8
141 uint16_t* SK_RESTRICT keep_dst = 0;
142
143 asm volatile (
144 "vmov.u8 d31, #1<<7 \n\t"
145 "mov %[keep_dst], %[dst] \n\t"
146
147 "tst %[count], #4 \n\t"
148 "beq 14f \n\t"
149 "vld1.16 {d25}, [%[dst]]! \n\t"
150 "vld1.32 {q1}, [%[src]]! \n\t"
151
152 "14: \n\t"
153 "tst %[count], #2 \n\t"
154 "beq 12f \n\t"
155 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
156 "vld1.32 {d1}, [%[src]]! \n\t"
157
158 "12: \n\t"
159 "tst %[count], #1 \n\t"
160 "beq 11f \n\t"
161 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
162 "vld1.32 {d0[1]}, [%[src]]! \n\t"
163
164 "11: \n\t"
165 // unzips achieve the same as a vld4 operation
166 "vuzpq.u16 q0, q1 \n\t"
167 "vuzp.u8 d0, d1 \n\t"
168 "vuzp.u8 d2, d3 \n\t"
169 // expand 0565 q12 to 8888 {d4-d7}
170 "vmovn.u16 d4, q12 \n\t"
171 "vshr.u16 q11, q12, #5 \n\t"
172 "vshr.u16 q10, q12, #6+5 \n\t"
173 "vmovn.u16 d5, q11 \n\t"
174 "vmovn.u16 d6, q10 \n\t"
175 "vshl.u8 d4, d4, #3 \n\t"
176 "vshl.u8 d5, d5, #2 \n\t"
177 "vshl.u8 d6, d6, #3 \n\t"
178
179 "vmovl.u8 q14, d31 \n\t"
180 "vmovl.u8 q13, d31 \n\t"
181 "vmovl.u8 q12, d31 \n\t"
182
183 // duplicate in 4/2/1 & 8pix vsns
184 "vmvn.8 d30, d3 \n\t"
185 "vmlal.u8 q14, d30, d6 \n\t"
186 "vmlal.u8 q13, d30, d5 \n\t"
187 "vmlal.u8 q12, d30, d4 \n\t"
188 "vshr.u16 q8, q14, #5 \n\t"
189 "vshr.u16 q9, q13, #6 \n\t"
190 "vaddhn.u16 d6, q14, q8 \n\t"
191 "vshr.u16 q8, q12, #5 \n\t"
192 "vaddhn.u16 d5, q13, q9 \n\t"
193 "vqadd.u8 d6, d6, d0 \n\t" // moved up
194 "vaddhn.u16 d4, q12, q8 \n\t"
195 // intentionally don't calculate alpha
196 // result in d4-d6
197
198 "vqadd.u8 d5, d5, d1 \n\t"
199 "vqadd.u8 d4, d4, d2 \n\t"
200
201 // pack 8888 {d4-d6} to 0565 q10
202 "vshll.u8 q10, d6, #8 \n\t"
203 "vshll.u8 q3, d5, #8 \n\t"
204 "vshll.u8 q2, d4, #8 \n\t"
205 "vsri.u16 q10, q3, #5 \n\t"
206 "vsri.u16 q10, q2, #11 \n\t"
207
208 // store
209 "tst %[count], #4 \n\t"
210 "beq 24f \n\t"
211 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
212
213 "24: \n\t"
214 "tst %[count], #2 \n\t"
215 "beq 22f \n\t"
216 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
217
218 "22: \n\t"
219 "tst %[count], #1 \n\t"
220 "beq 21f \n\t"
221 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
222
223 "21: \n\t"
224 : [count] "+r" (count)
225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
228 "d30","d31"
229 );
230 }
231 }
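/* Rough per-pixel model of the assembly above (illustrative only, not part of
 * the build; the helper name is made up): the 565 destination is expanded to
 * 8888, the premultiplied source is composited over it, and the result is
 * packed back to 565. The NEON code approximates the divide by 255, so the
 * low bits may differ slightly from this reference:
 *
 *     static inline uint16_t srcover_565_sketch(SkPMColor s, uint16_t d) {
 *         SkPMColor d32 = SkPixel16ToPixel32(d);          // expand 565 -> 8888
 *         return SkPixel32ToPixel16(SkPMSrcOver(s, d32)); // blend, pack to 565
 *     }
 */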
232
233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
234 prod += vdupq_n_u16(128);
235 prod += vshrq_n_u16(prod, 8);
236 return vshrq_n_u16(prod, 8);
237 }
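/* Scalar model of the helper above (illustrative): this is the same rounding
 * divide-by-255 that Skia's scalar SkDiv255Round() performs on a 16-bit
 * product, done here on eight lanes at once:
 *
 *     unsigned div255_round_sketch(unsigned prod) {   // prod <= 255*255
 *         prod += 128;                  // bias so the truncation rounds to nearest
 *         return (prod + (prod >> 8)) >> 8;
 *     }
 */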
238
239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
240 const SkPMColor* SK_RESTRICT src, int count,
241 U8CPU alpha, int /*x*/, int /*y*/) {
242 SkASSERT(255 > alpha);
243
244 /* This code implements a Neon version of S32A_D565_Blend. The results have
245 * a few mismatches compared to the original code. These mismatches never
246 * exceed 1.
247 */
248
249 if (count >= 8) {
250 uint16x8_t valpha_max, vmask_blue;
251 uint8x8_t valpha;
252
253 // prepare constants
254 valpha_max = vmovq_n_u16(255);
255 valpha = vdup_n_u8(alpha);
256 vmask_blue = vmovq_n_u16(SK_B16_MASK);
257
258 do {
259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
260 uint16x8_t vres_a, vres_r, vres_g, vres_b;
261 uint8x8x4_t vsrc;
262
263 // load pixels
264 vdst = vld1q_u16(dst);
265 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
266 asm (
267 "vld4.u8 %h[vsrc], [%[src]]!"
268 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
269 : :
270 );
271 #else
272 register uint8x8_t d0 asm("d0");
273 register uint8x8_t d1 asm("d1");
274 register uint8x8_t d2 asm("d2");
275 register uint8x8_t d3 asm("d3");
276
277 asm volatile (
278 "vld4.u8 {d0-d3},[%[src]]!;"
279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
280 [src] "+&r" (src)
281 : :
282 );
283 vsrc.val[0] = d0;
284 vsrc.val[1] = d1;
285 vsrc.val[2] = d2;
286 vsrc.val[3] = d3;
287 #endif
288
289
290 // deinterleave dst
291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
292 vdst_b = vdst & vmask_blue; // extract blue
293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
295
296 // shift src to 565
297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
300
301 // calc src * src_scale
302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
306
307 // prepare dst_scale
308 vres_a = SkDiv255Round_neon8(vres_a);
309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
310
311 // add dst * dst_scale to previous result
312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
315
316 #ifdef S32A_D565_BLEND_EXACT
317 // It is possible to get exact results with this but it is slow,
318 // even slower than C code in some cases
319 vres_r = SkDiv255Round_neon8(vres_r);
320 vres_g = SkDiv255Round_neon8(vres_g);
321 vres_b = SkDiv255Round_neon8(vres_b);
322 #else
323 vres_r = vrshrq_n_u16(vres_r, 8);
324 vres_g = vrshrq_n_u16(vres_g, 8);
325 vres_b = vrshrq_n_u16(vres_b, 8);
326 #endif
327 // pack result
328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
330
331 // store
332 vst1q_u16(dst, vres_b);
333 dst += 8;
334 count -= 8;
335 } while (count >= 8);
336 }
337
338 // leftovers
339 while (count-- > 0) {
340 SkPMColor sc = *src++;
341 if (sc) {
342 uint16_t dc = *dst;
343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
348 }
349 dst += 1;
350 }
351 }
352
353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
354 * each dither value is spaced out into byte lanes, and repeated
355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
356 * start of each row.
357 */
358 static const uint8_t gDitherMatrix_Neon[48] = {
359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
360 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
361 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
362 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
363
364 };
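/* Example (illustrative): for a span starting at x == 2 on scanline y == 1,
 * &gDitherMatrix_Neon[(y&3)*12 + (x&3)] points at {7, 3, 6, 2, 7, 3, 6, 2},
 * so a single 8-byte vld1_u8 picks up the per-pixel dither values that the
 * scalar DITHER_VALUE(x) macro would produce for those eight x positions.
 */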
365
366 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
367 int count, U8CPU alpha, int x, int y)
368 {
369
370 SkASSERT(255 > alpha);
371
372 // rescale alpha to range 1 - 256
373 int scale = SkAlpha255To256(alpha);
374
375 if (count >= 8) {
376 /* select row and offset for dither array */
377 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
378
379 uint8x8_t vdither = vld1_u8(dstart); // load dither values
380 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
381
382 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
383 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
384
385 do {
386
387 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
388 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
389 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
390 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
391 uint16x8_t vdst;
392 uint16x8_t vdst_r, vdst_g, vdst_b;
393 int16x8_t vres_r, vres_g, vres_b;
394 int8x8_t vres8_r, vres8_g, vres8_b;
395
396 // Load source and add dither
397 {
398 register uint8x8_t d0 asm("d0");
399 register uint8x8_t d1 asm("d1");
400 register uint8x8_t d2 asm("d2");
401 register uint8x8_t d3 asm("d3");
402
403 asm (
404 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
405 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
406 :
407 );
408 vsrc_g = d1;
409 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
410 vsrc_r = d2; vsrc_b = d0;
411 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
412 vsrc_r = d0; vsrc_b = d2;
413 #endif
414 }
415
416 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
417 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
418 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
419
420 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
421 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
422 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
423
424 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
425 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
426 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
427
428 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
429 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
430 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
431
432 // Load dst and unpack
433 vdst = vld1q_u16(dst);
434 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
435 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
436 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
437
438 // subtract dst from src and widen
439 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
440 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
441 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
442
443 // multiply diffs by scale and shift
444 vres_r = vmulq_s16(vres_r, vscale);
445 vres_g = vmulq_s16(vres_g, vscale);
446 vres_b = vmulq_s16(vres_b, vscale);
447
448 vres8_r = vshrn_n_s16(vres_r, 8);
449 vres8_g = vshrn_n_s16(vres_g, 8);
450 vres8_b = vshrn_n_s16(vres_b, 8);
451
452 // add dst to result
453 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
454 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
455 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
456
457 // put result into 565 format
458 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
459 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
460
461 // Store result
462 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
463
464 // Next iteration
465 dst += 8;
466 count -= 8;
467
468 } while (count >= 8);
469 }
470
471 // Leftovers
472 if (count > 0) {
473 int scale = SkAlpha255To256(alpha);
474 DITHER_565_SCAN(y);
475 do {
476 SkPMColor c = *src++;
477 SkPMColorAssert(c);
478
479 int dither = DITHER_VALUE(x);
480 int sr = SkGetPackedR32(c);
481 int sg = SkGetPackedG32(c);
482 int sb = SkGetPackedB32(c);
483 sr = SkDITHER_R32To565(sr, dither);
484 sg = SkDITHER_G32To565(sg, dither);
485 sb = SkDITHER_B32To565(sb, dither);
486
487 uint16_t d = *dst;
488 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
489 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
490 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
491 DITHER_INC_X(x);
492 } while (--count != 0);
493 }
494 }
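/* Illustrative note (not part of the build): per channel, the vector path
 * above computes the same thing as the SkAlphaBlend() calls in the residual
 * loop, namely
 *
 *     blended = dst565 + (((src565_dithered - dst565) * scale) >> 8);
 *
 * with scale = SkAlpha255To256(alpha).
 */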
495
496 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
497 const SkPMColor* SK_RESTRICT src,
498 int count, U8CPU alpha) {
499
500 SkASSERT(255 == alpha);
501 if (count > 0) {
502
503
504 uint8x8_t alpha_mask;
505
506 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
507 alpha_mask = vld1_u8(alpha_mask_setup);
508
509 /* do the NEON unrolled code */
510 #define UNROLL 4
511 while (count >= UNROLL) {
512 uint8x8_t src_raw, dst_raw, dst_final;
513 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
514
515 /* The two prefetches below may make the code slightly
516 * slower for small values of count but are worth having
517 * in the general case.
518 */
519 __builtin_prefetch(src+32);
520 __builtin_prefetch(dst+32);
521
522 /* get the source */
523 src_raw = vreinterpret_u8_u32(vld1_u32(src));
524 #if UNROLL > 2
525 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
526 #endif
527
528 /* get and hold the dst too */
529 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
530 #if UNROLL > 2
531 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
532 #endif
533
534 /* 1st and 2nd bits of the unrolling */
535 {
536 uint8x8_t dst_cooked;
537 uint16x8_t dst_wide;
538 uint8x8_t alpha_narrow;
539 uint16x8_t alpha_wide;
540
541 /* get the alphas spread out properly */
542 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
543 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
544
545 /* spread the dest */
546 dst_wide = vmovl_u8(dst_raw);
547
548 /* alpha mul the dest */
549 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
550 dst_cooked = vshrn_n_u16(dst_wide, 8);
551
552 /* sum -- ignoring any byte lane overflows */
553 dst_final = vadd_u8(src_raw, dst_cooked);
554 }
555
556 #if UNROLL > 2
557 /* the 3rd and 4th bits of our unrolling */
558 {
559 uint8x8_t dst_cooked;
560 uint16x8_t dst_wide;
561 uint8x8_t alpha_narrow;
562 uint16x8_t alpha_wide;
563
564 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
565 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
566
567 /* spread the dest */
568 dst_wide = vmovl_u8(dst_raw_2);
569
570 /* alpha mul the dest */
571 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
572 dst_cooked = vshrn_n_u16(dst_wide, 8);
573
574 /* sum -- ignoring any byte lane overflows */
575 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
576 }
577 #endif
578
579 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
580 #if UNROLL > 2
581 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
582 #endif
583
584 src += UNROLL;
585 dst += UNROLL;
586 count -= UNROLL;
587 }
588 #undef UNROLL
589
590 /* do any residual iterations */
591 while (--count >= 0) {
592 *dst = SkPMSrcOver(*src, *dst);
593 src += 1;
594 dst += 1;
595 }
596 }
597 }
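/* Per-channel model of the unrolled loop above (illustrative only; the helper
 * below is hypothetical): every byte of the destination pixel is scaled by
 * (256 - src_alpha) and the premultiplied source byte is added on top, which
 * is the same math SkPMSrcOver() performs in the residual loop:
 *
 *     static inline unsigned srcover_channel_sketch(unsigned s8, unsigned d8,
 *                                                   unsigned src_alpha) {
 *         return s8 + ((d8 * (256 - src_alpha)) >> 8);
 *     }
 */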
598
599 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
600 const SkPMColor* SK_RESTRICT src,
601 int count, U8CPU alpha) {
602 SkASSERT(255 == alpha);
603
604 if (count <= 0)
605 return;
606
607 /* Use these to check if src is transparent or opaque */
608 const unsigned int ALPHA_OPAQ = 0xFF000000;
609 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
610
611 #define UNROLL 4
612 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
613 const SkPMColor* SK_RESTRICT src_temp = src;
614
615 /* set up the NEON variables */
616 uint8x8_t alpha_mask;
617 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
618 alpha_mask = vld1_u8(alpha_mask_setup);
619
620 uint8x8_t src_raw, dst_raw, dst_final;
621 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
622 uint8x8_t dst_cooked;
623 uint16x8_t dst_wide;
624 uint8x8_t alpha_narrow;
625 uint16x8_t alpha_wide;
626
627 /* choose the first processing type */
628 if( src >= src_end)
629 goto TAIL;
630 if(*src <= ALPHA_TRANS)
631 goto ALPHA_0;
632 if(*src >= ALPHA_OPAQ)
633 goto ALPHA_255;
634 /* fall-thru */
635
636 ALPHA_1_TO_254:
637 do {
638
639 /* get the source */
640 src_raw = vreinterpret_u8_u32(vld1_u32(src));
641 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
642
643 /* get and hold the dst too */
644 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
645 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
646
647
648 /* get the alphas spread out properly */
649 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
650 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
651 /* we collapsed (255-a)+1 ... */
652 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
653
654 /* spread the dest */
655 dst_wide = vmovl_u8(dst_raw);
656
657 /* alpha mul the dest */
658 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
659 dst_cooked = vshrn_n_u16(dst_wide, 8);
660
661 /* sum -- ignoring any byte lane overflows */
662 dst_final = vadd_u8(src_raw, dst_cooked);
663
664 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
665 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
666 /* we collapsed (255-a)+1 ... */
667 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
668
669 /* spread the dest */
670 dst_wide = vmovl_u8(dst_raw_2);
671
672 /* alpha mul the dest */
673 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
674 dst_cooked = vshrn_n_u16(dst_wide, 8);
675
676 /* sum -- ignoring any byte lane overflows */
677 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
678
679 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
680 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
681
682 src += UNROLL;
683 dst += UNROLL;
684
685 /* if 2 of the next pixels aren't between 1 and 254
686 it might make sense to go to the optimized loops */
687 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
688 break;
689
690 } while(src < src_end);
691
692 if (src >= src_end)
693 goto TAIL;
694
695 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
696 goto ALPHA_255;
697
698 /*fall-thru*/
699
700 ALPHA_0:
701
702 /*In this state, we know the current alpha is 0 and
703 we optimize for the next alpha also being zero. */
704 src_temp = src; //so we don't have to increment dst every time
705 do {
706 if(*(++src) > ALPHA_TRANS)
707 break;
708 if(*(++src) > ALPHA_TRANS)
709 break;
710 if(*(++src) > ALPHA_TRANS)
711 break;
712 if(*(++src) > ALPHA_TRANS)
713 break;
714 } while(src < src_end);
715
716 dst += (src - src_temp);
717
718 /* no longer alpha 0, so determine where to go next. */
719 if( src >= src_end)
720 goto TAIL;
721 if(*src >= ALPHA_OPAQ)
722 goto ALPHA_255;
723 else
724 goto ALPHA_1_TO_254;
725
726 ALPHA_255:
727 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
728 dst[0]=src[0];
729 dst[1]=src[1];
730 dst[2]=src[2];
731 dst[3]=src[3];
732 src+=UNROLL;
733 dst+=UNROLL;
734 if(src >= src_end)
735 goto TAIL;
736 }
737
738 //Handle remainder.
739 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
740 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
741 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
742 }
743 }
744
745 if( src >= src_end)
746 goto TAIL;
747 if(*src <= ALPHA_TRANS)
748 goto ALPHA_0;
749 else
750 goto ALPHA_1_TO_254;
751
752 TAIL:
753 /* do any residual iterations */
754 src_end += UNROLL + 1; //goto the real end
755 while(src != src_end) {
756 if( *src != 0 ) {
757 if( *src >= ALPHA_OPAQ ) {
758 *dst = *src;
759 }
760 else {
761 *dst = SkPMSrcOver(*src, *dst);
762 }
763 }
764 src++;
765 dst++;
766 }
767
768 #undef UNROLL
769 return;
770 }
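/* Illustrative note: the whole-pixel comparisons against ALPHA_TRANS and
 * ALPHA_OPAQ above only classify pixels correctly when the alpha byte is the
 * most significant byte of an SkPMColor:
 *
 *     bool is_transparent = (c <= 0x00FFFFFF);   // alpha byte == 0
 *     bool is_opaque      = (c >= 0xFF000000);   // alpha byte == 255
 *
 * which is why the proc table at the bottom of this file only selects this
 * routine when SK_A32_SHIFT == 24.
 */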
771
772 /* Neon version of S32_Blend_BlitRow32()
773 * portable version is in src/core/SkBlitRow_D32.cpp
774 */
775 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
776 const SkPMColor* SK_RESTRICT src,
777 int count, U8CPU alpha) {
778 SkASSERT(alpha <= 255);
779
780 if (count <= 0) {
781 return;
782 }
783
784 uint16_t src_scale = SkAlpha255To256(alpha);
785 uint16_t dst_scale = 256 - src_scale;
786
787 while (count >= 2) {
788 uint8x8_t vsrc, vdst, vres;
789 uint16x8_t vsrc_wide, vdst_wide;
790
791 /* These commented prefetches are a big win for count
792 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
793 * They also hurt a little (<5%) on an A15
794 */
795 //__builtin_prefetch(src+32);
796 //__builtin_prefetch(dst+32);
797
798 // Load
799 vsrc = vreinterpret_u8_u32(vld1_u32(src));
800 vdst = vreinterpret_u8_u32(vld1_u32(dst));
801
802 // Process src
803 vsrc_wide = vmovl_u8(vsrc);
804 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
805
806 // Process dst
807 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
808
809 // Combine
810 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
811
812 // Store
813 vst1_u32(dst, vreinterpret_u32_u8(vres));
814
815 src += 2;
816 dst += 2;
817 count -= 2;
818 }
819
820 if (count == 1) {
821 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
822 uint16x8_t vsrc_wide, vdst_wide;
823
824 // Load
825 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
826 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
827
828 // Process
829 vsrc_wide = vmovl_u8(vsrc);
830 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
831 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
832 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
833
834 // Store
835 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
836 }
837 }
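/* Scalar model of the loop above (illustrative only): with
 * src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale, each
 * 8-bit channel is blended as
 *
 *     result = (src * src_scale + dst * dst_scale) >> 8;
 *
 * which is what the widening multiplies and the vshrn_n_u16(..., 8) shifts
 * compute for eight channels (two pixels) at a time.
 */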
838
839 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
840 const SkPMColor* SK_RESTRICT src,
841 int count, U8CPU alpha) {
842
843 SkASSERT(255 >= alpha);
844
845 if (count <= 0) {
846 return;
847 }
848
849 unsigned alpha256 = SkAlpha255To256(alpha);
850
851 // First deal with odd counts
852 if (count & 1) {
853 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
854 uint16x8_t vdst_wide, vsrc_wide;
855 unsigned dst_scale;
856
857 // Load
858 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
859 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
860
861 // Calc dst_scale
862 dst_scale = vget_lane_u8(vsrc, 3);
863 dst_scale *= alpha256;
864 dst_scale >>= 8;
865 dst_scale = 256 - dst_scale;
866
867 // Process src
868 vsrc_wide = vmovl_u8(vsrc);
869 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
870
871 // Process dst
872 vdst_wide = vmovl_u8(vdst);
873 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
874
875 // Combine
876 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
877
878 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
879 dst++;
880 src++;
881 count--;
882 }
883
884 if (count) {
885 uint8x8_t alpha_mask;
886 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
887 alpha_mask = vld1_u8(alpha_mask_setup);
888
889 do {
890
891 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
892 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
893
894 __builtin_prefetch(src+32);
895 __builtin_prefetch(dst+32);
896
897 // Load
898 vsrc = vreinterpret_u8_u32(vld1_u32(src));
899 vdst = vreinterpret_u8_u32(vld1_u32(dst));
900
901 // Prepare src_scale
902 vsrc_scale = vdupq_n_u16(alpha256);
903
904 // Calc dst_scale
905 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
906 vdst_scale = vmovl_u8(vsrc_alphas);
907 vdst_scale *= vsrc_scale;
908 vdst_scale = vshrq_n_u16(vdst_scale, 8);
909 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
910
911 // Process src
912 vsrc_wide = vmovl_u8(vsrc);
913 vsrc_wide *= vsrc_scale;
914
915 // Process dst
916 vdst_wide = vmovl_u8(vdst);
917 vdst_wide *= vdst_scale;
918
919 // Combine
920 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
921
922 vst1_u32(dst, vreinterpret_u32_u8(vres));
923
924 src += 2;
925 dst += 2;
926 count -= 2;
927 } while(count);
928 }
929 }
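/* Scalar model of S32A_Blend_BlitRow32_neon (illustrative only; names are
 * made up): for each pixel the destination scale depends on both the source
 * alpha and the global alpha,
 *
 *     unsigned src_scale = SkAlpha255To256(alpha);               // 1..256
 *     unsigned dst_scale = 256 - ((src_alpha * src_scale) >> 8);
 *     result_channel = (src * src_scale + dst * dst_scale) >> 8;
 *
 * matching the vdst_scale computation in the vector loop above.
 */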
930
931 ///////////////////////////////////////////////////////////////////////////////
932
933 #undef DEBUG_OPAQUE_DITHER
934
935 #if defined(DEBUG_OPAQUE_DITHER)
936 static void showme8(char *str, void *p, int len)
937 {
938 static char buf[256];
939 char tbuf[32];
940 int i;
941 char *pc = (char*) p;
942 sprintf(buf,"%8s:", str);
943 for(i=0;i<len;i++) {
944 sprintf(tbuf, " %02x", pc[i]);
945 strcat(buf, tbuf);
946 }
947 SkDebugf("%s\n", buf);
948 }
949 static void showme16(char *str, void *p, int len)
950 {
951 static char buf[256];
952 char tbuf[32];
953 int i;
954 uint16_t *pc = (uint16_t*) p;
955 sprintf(buf,"%8s:", str);
956 len = (len / sizeof(uint16_t)); /* passed as bytes */
957 for(i=0;i<len;i++) {
958 sprintf(tbuf, " %04x", pc[i]);
959 strcat(buf, tbuf);
960 }
961 SkDebugf("%s\n", buf);
962 }
963 #endif
964
965 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
966 const SkPMColor* SK_RESTRICT src,
967 int count, U8CPU alpha, int x, int y) {
968 SkASSERT(255 == alpha);
969
970 #define UNROLL 8
971
972 if (count >= UNROLL) {
973
974 #if defined(DEBUG_OPAQUE_DITHER)
975 uint16_t tmpbuf[UNROLL];
976 int td[UNROLL];
977 int tdv[UNROLL];
978 int ta[UNROLL];
979 int tap[UNROLL];
980 uint16_t in_dst[UNROLL];
981 int offset = 0;
982 int noisy = 0;
983 #endif
984
985 uint8x8_t dbase;
986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
987 dbase = vld1_u8(dstart);
988
989 do {
990 uint8x8_t sr, sg, sb, sa, d;
991 uint16x8_t dst8, scale8, alpha8;
992 uint16x8_t dst_r, dst_g, dst_b;
993
994 #if defined(DEBUG_OPAQUE_DITHER)
995 // calculate 8 elements worth into a temp buffer
996 {
997 int my_y = y;
998 int my_x = x;
999 SkPMColor* my_src = (SkPMColor*)src;
1000 uint16_t* my_dst = dst;
1001 int i;
1002
1003 DITHER_565_SCAN(my_y);
1004 for(i = 0; i < UNROLL; i++) {
1005 SkPMColor c = *my_src++;
1006 SkPMColorAssert(c);
1007 if (c) {
1008 unsigned a = SkGetPackedA32(c);
1009
1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1011 tdv[i] = DITHER_VALUE(my_x);
1012 ta[i] = a;
1013 tap[i] = SkAlpha255To256(a);
1014 td[i] = d;
1015
1016 unsigned sr = SkGetPackedR32(c);
1017 unsigned sg = SkGetPackedG32(c);
1018 unsigned sb = SkGetPackedB32(c);
1019 sr = SkDITHER_R32_FOR_565(sr, d);
1020 sg = SkDITHER_G32_FOR_565(sg, d);
1021 sb = SkDITHER_B32_FOR_565(sb, d);
1022
1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1026 // now src and dst expanded are in g:11 r:10 x:1 b:10
1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1028 td[i] = d;
1029 } else {
1030 tmpbuf[i] = *my_dst;
1031 ta[i] = tdv[i] = td[i] = 0xbeef;
1032 }
1033 in_dst[i] = *my_dst;
1034 my_dst += 1;
1035 DITHER_INC_X(my_x);
1036 }
1037 }
1038 #endif
1039
1040
1041 {
1042 register uint8x8_t d0 asm("d0");
1043 register uint8x8_t d1 asm("d1");
1044 register uint8x8_t d2 asm("d2");
1045 register uint8x8_t d3 asm("d3");
1046
1047 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1048 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1049 :
1050 );
1051 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1052 sr = d2; sg = d1; sb = d0; sa = d3;
1053 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1054 sr = d0; sg = d1; sb = d2; sa = d3;
1055 #endif
1056 }
1057
1058 /* calculate 'd', which will be 0..7
1059 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1060 */
1061 alpha8 = vmovl_u8(dbase);
1062 alpha8 = vmlal_u8(alpha8, sa, dbase);
1063 d = vshrn_n_u16(alpha8, 8); // narrowing too
1064
1065 // sr = sr - (sr>>5) + d
1066 /* watching for 8-bit overflow. d is 0..7; risky range of
1067 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1068 * safe as long as we do ((sr-sr>>5) + d)
1069 */
1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1071 sr = vadd_u8(sr, d);
1072
1073 // sb = sb - (sb>>5) + d
1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1075 sb = vadd_u8(sb, d);
1076
1077 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1079 sg = vadd_u8(sg, vshr_n_u8(d,1));
1080
1081 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1082 dst8 = vld1q_u16(dst);
1083 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1084 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1085 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
1086
1087 // blend
1088 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1089
1090 // combine the addq and mul, save 3 insns
1091 scale8 = vshrq_n_u16(scale8, 3);
1092 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1093 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1094 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1095
1096 // repack to store
1097 dst8 = vshrq_n_u16(dst_b, 5);
1098 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1099 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1100
1101 vst1q_u16(dst, dst8);
1102
1103 #if defined(DEBUG_OPAQUE_DITHER)
1104 // verify my 8 elements match the temp buffer
1105 {
1106 int i, bad=0;
1107 static int invocation;
1108
1109 for (i = 0; i < UNROLL; i++) {
1110 if (tmpbuf[i] != dst[i]) {
1111 bad=1;
1112 }
1113 }
1114 if (bad) {
1115 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1116 invocation, offset);
1117 SkDebugf(" alpha 0x%x\n", alpha);
1118 for (i = 0; i < UNROLL; i++)
1119 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1120 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1121 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1122
1123 showme16("alpha8", &alpha8, sizeof(alpha8));
1124 showme16("scale8", &scale8, sizeof(scale8));
1125 showme8("d", &d, sizeof(d));
1126 showme16("dst8", &dst8, sizeof(dst8));
1127 showme16("dst_b", &dst_b, sizeof(dst_b));
1128 showme16("dst_g", &dst_g, sizeof(dst_g));
1129 showme16("dst_r", &dst_r, sizeof(dst_r));
1130 showme8("sb", &sb, sizeof(sb));
1131 showme8("sg", &sg, sizeof(sg));
1132 showme8("sr", &sr, sizeof(sr));
1133
1134 return;
1135 }
1136 offset += UNROLL;
1137 invocation++;
1138 }
1139 #endif
1140 dst += UNROLL;
1141 count -= UNROLL;
1142 // skip x += UNROLL, since it's unchanged mod-4
1143 } while (count >= UNROLL);
1144 }
1145 #undef UNROLL
1146
1147 // residuals
1148 if (count > 0) {
1149 DITHER_565_SCAN(y);
1150 do {
1151 SkPMColor c = *src++;
1152 SkPMColorAssert(c);
1153 if (c) {
1154 unsigned a = SkGetPackedA32(c);
1155
1156 // dither and alpha are just temporary variables to work-around
1157 // an ICE in debug.
1158 unsigned dither = DITHER_VALUE(x);
1159 unsigned alpha = SkAlpha255To256(a);
1160 int d = SkAlphaMul(dither, alpha);
1161
1162 unsigned sr = SkGetPackedR32(c);
1163 unsigned sg = SkGetPackedG32(c);
1164 unsigned sb = SkGetPackedB32(c);
1165 sr = SkDITHER_R32_FOR_565(sr, d);
1166 sg = SkDITHER_G32_FOR_565(sg, d);
1167 sb = SkDITHER_B32_FOR_565(sb, d);
1168
1169 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1170 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1171 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1172 // now src and dst expanded are in g:11 r:10 x:1 b:10
1173 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1174 }
1175 dst += 1;
1176 DITHER_INC_X(x);
1177 } while (--count != 0);
1178 }
1179 }
1180
1181 ///////////////////////////////////////////////////////////////////////////////
1182
1183 #undef DEBUG_S32_OPAQUE_DITHER
1184
1185 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1186 const SkPMColor* SK_RESTRICT src,
1187 int count, U8CPU alpha, int x, int y) {
1188 SkASSERT(255 == alpha);
1189
1190 #define UNROLL 8
1191 if (count >= UNROLL) {
1192 uint8x8_t d;
1193 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1194 d = vld1_u8(dstart);
1195
1196 while (count >= UNROLL) {
1197 uint8x8_t sr, sg, sb;
1198 uint16x8_t dr, dg, db;
1199 uint16x8_t dst8;
1200
1201 {
1202 register uint8x8_t d0 asm("d0");
1203 register uint8x8_t d1 asm("d1");
1204 register uint8x8_t d2 asm("d2");
1205 register uint8x8_t d3 asm("d3");
1206
1207 asm (
1208 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1209 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1210 :
1211 );
1212 sg = d1;
1213 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1214 sr = d2; sb = d0;
1215 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1216 sr = d0; sb = d2;
1217 #endif
1218 }
1219 /* XXX: if we want to prefetch, hide it in the above asm()
1220 * using the gcc __builtin_prefetch(), the prefetch will
1221 * fall to the bottom of the loop -- it won't stick up
1222 * at the top of the loop, just after the vld4.
1223 */
1224
1225 // sr = sr - (sr>>5) + d
1226 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1227 dr = vaddl_u8(sr, d);
1228
1229 // sb = sb - (sb>>5) + d
1230 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1231 db = vaddl_u8(sb, d);
1232
1233 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1234 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1235 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1236
1237 // pack high bits of each into 565 format (rgb, b is lsb)
1238 dst8 = vshrq_n_u16(db, 3);
1239 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1240 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1241
1242 // store it
1243 vst1q_u16(dst, dst8);
1244
1245 #if defined(DEBUG_S32_OPAQUE_DITHER)
1246 // always good to know if we generated good results
1247 {
1248 int i, myx = x, myy = y;
1249 DITHER_565_SCAN(myy);
1250 for (i=0;i<UNROLL;i++) {
1251 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1252 SkPMColor c = src[i-8];
1253 unsigned dither = DITHER_VALUE(myx);
1254 uint16_t val = SkDitherRGB32To565(c, dither);
1255 if (val != dst[i]) {
1256 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1257 c, dither, val, dst[i], dstart[i]);
1258 }
1259 DITHER_INC_X(myx);
1260 }
1261 }
1262 #endif
1263
1264 dst += UNROLL;
1265 // we don't need to increment src as the asm above has already done it
1266 count -= UNROLL;
1267 x += UNROLL; // probably superfluous
1268 }
1269 }
1270 #undef UNROLL
1271
1272 // residuals
1273 if (count > 0) {
1274 DITHER_565_SCAN(y);
1275 do {
1276 SkPMColor c = *src++;
1277 SkPMColorAssert(c);
1278 SkASSERT(SkGetPackedA32(c) == 255);
1279
1280 unsigned dither = DITHER_VALUE(x);
1281 *dst++ = SkDitherRGB32To565(c, dither);
1282 DITHER_INC_X(x);
1283 } while (--count != 0);
1284 }
1285 }
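/* Illustrative correspondence (not part of the build): per channel, the
 * vector math above reduces to the dither-and-truncate used by the residual
 * loop, roughly
 *
 *     r5 = (sr - (sr >> 5) + d) >> 3;          // 5-bit red
 *     g6 = (sg - (sg >> 6) + (d >> 1)) >> 2;   // 6-bit green
 *     b5 = (sb - (sb >> 5) + d) >> 3;          // 5-bit blue
 *
 * where d is the 3-bit dither value for the pixel's (x, y) position.
 */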
1286
1287 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1288 SkPMColor color) {
1289 if (count <= 0) {
1290 return;
1291 }
1292
1293 if (0 == color) {
1294 if (src != dst) {
1295 memcpy(dst, src, count * sizeof(SkPMColor));
1296 }
1297 return;
1298 }
1299
1300 unsigned colorA = SkGetPackedA32(color);
1301 if (255 == colorA) {
1302 sk_memset32(dst, color, count);
1303 } else {
1304 unsigned scale = 256 - SkAlpha255To256(colorA);
1305
1306 if (count >= 8) {
1307 // at the end of this assembly, count will have been decremented
1308 // to a negative value. That is, if count mod 8 = x, it will be
1309 // -8 +x coming out.
1310 asm volatile (
1311 PLD128(src, 0)
1312
1313 "vdup.32 q0, %[color] \n\t"
1314
1315 PLD128(src, 128)
1316
1317 // scale numerical interval [0-255], so load as 8 bits
1318 "vdup.8 d2, %[scale] \n\t"
1319
1320 PLD128(src, 256)
1321
1322 "subs %[count], %[count], #8 \n\t"
1323
1324 PLD128(src, 384)
1325
1326 "Loop_Color32: \n\t"
1327
1328 // load src color, 8 pixels, 4 64 bit registers
1329 // (and increment src).
1330 "vld1.32 {d4-d7}, [%[src]]! \n\t"
1331
1332 PLD128(src, 384)
1333
1334 // multiply long by scale, 64 bits at a time,
1335 // destination into a 128 bit register.
1336 "vmull.u8 q4, d4, d2 \n\t"
1337 "vmull.u8 q5, d5, d2 \n\t"
1338 "vmull.u8 q6, d6, d2 \n\t"
1339 "vmull.u8 q7, d7, d2 \n\t"
1340
1341 // shift the 128 bit registers, containing the 16
1342 // bit scaled values back to 8 bits, narrowing the
1343 // results to 64 bit registers.
1344 "vshrn.i16 d8, q4, #8 \n\t"
1345 "vshrn.i16 d9, q5, #8 \n\t"
1346 "vshrn.i16 d10, q6, #8 \n\t"
1347 "vshrn.i16 d11, q7, #8 \n\t"
1348
1349 // adding back the color, using 128 bit registers.
1350 "vadd.i8 q6, q4, q0 \n\t"
1351 "vadd.i8 q7, q5, q0 \n\t"
1352
1353 // store back the 8 calculated pixels (2 128 bit
1354 // registers), and increment dst.
1355 "vst1.32 {d12-d15}, [%[dst]]! \n\t"
1356
1357 "subs %[count], %[count], #8 \n\t"
1358 "bge Loop_Color32 \n\t"
1359 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1360 : [color] "r" (color), [scale] "r" (scale)
1361 : "cc", "memory",
1362 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1363 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1364 );
1365 // At this point, if we went through the inline assembly, count is
1366 // a negative value:
1367 // if the value is -8, there is no pixel left to process.
1368 // if the value is -7, there is one pixel left to process
1369 // ...
1370 // And'ing it with 7 will give us the number of pixels
1371 // left to process.
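// For example (illustrative): with count == 19 the assembly processes
// 16 pixels and exits with count == -5, and (-5) & 7 == 3, the number of
// pixels left for the scalar loop below.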
1372 count = count & 0x7;
1373 }
1374
1375 while (count > 0) {
1376 *dst = color + SkAlphaMulQ(*src, scale);
1377 src += 1;
1378 dst += 1;
1379 count--;
1380 }
1381 }
1382 }
1383
1384 ///////////////////////////////////////////////////////////////////////////////
1385
1386 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1387 // no dither
1388 // NOTE: For the S32_D565_Blend function below, we don't have a special
1389 // version that assumes that each source pixel is opaque. But our
1390 // S32A is still faster than the default, so use it.
1391 S32_D565_Opaque_neon,
1392 S32A_D565_Blend_neon, // really S32_D565_Blend
1393 S32A_D565_Opaque_neon,
1394 S32A_D565_Blend_neon,
1395
1396 // dither
1397 S32_D565_Opaque_Dither_neon,
1398 S32_D565_Blend_Dither_neon,
1399 S32A_D565_Opaque_Dither_neon,
1400 NULL, // S32A_D565_Blend_Dither
1401 };
1402
1403 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1404 NULL, // S32_Opaque,
1405 S32_Blend_BlitRow32_neon, // S32_Blend,
1406 /*
1407 * We have two choices for S32A_Opaque procs. One reads the src alpha
1408 * value and attempts to optimize accordingly. The optimization is
1409 * sensitive to the source content and is not a win in all cases. For
1410 * example, if there are a lot of transitions between the alpha states,
1411 * the performance will almost certainly be worse. However, for many
1412 * common cases the performance is equivalent or better than the standard
1413 * case where we do not inspect the src alpha.
1414 */
1415 #if SK_A32_SHIFT == 24
1416 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1417 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1418 #else
1419 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1420 #endif
1421 S32A_Blend_BlitRow32_neon // S32A_Blend
1422 };
