Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double-key logic according to private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
One reservation remains: how to convince FindCookie users to test the
condition and pass a nullptr when double-key logic is disabled.
1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkUtils.h"
16 #include <emmintrin.h>
18 /* SSE2 version of S32_Blend_BlitRow32()
19 * portable version is in core/SkBlitRow_D32.cpp
20 */
21 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src,
23 int count, U8CPU alpha) {
24 SkASSERT(alpha <= 255);
25 if (count <= 0) {
26 return;
27 }
29 uint32_t src_scale = SkAlpha255To256(alpha);
30 uint32_t dst_scale = 256 - src_scale;
32 if (count >= 4) {
33 SkASSERT(((size_t)dst & 0x03) == 0);
34 while (((size_t)dst & 0x0F) != 0) {
35 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
36 src++;
37 dst++;
38 count--;
39 }
41 const __m128i *s = reinterpret_cast<const __m128i*>(src);
42 __m128i *d = reinterpret_cast<__m128i*>(dst);
43 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
44 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
46 // Move scale factors to upper byte of word
47 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
48 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
49 while (count >= 4) {
50 // Load 4 pixels each of src and dest.
51 __m128i src_pixel = _mm_loadu_si128(s);
52 __m128i dst_pixel = _mm_load_si128(d);
54 // Interleave Atom port 0/1 operations based on the execution port
55 // constraints that multiply can only be executed on port 0 (while
56 // boolean operations can be executed on either port 0 or port 1)
57 // because GCC currently doesn't do a good job scheduling
58 // instructions based on these constraints.
60 // Get red and blue pixels into lower byte of each word.
61 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
62 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
64 // Multiply by scale.
65 // (4 x (0, rs.h, 0, bs.h))
66 // where rs.h stands for the higher byte of r * scale, and
67 // bs.h the higher byte of b * scale.
68 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
70 // Get alpha and green pixels into higher byte of each word.
71 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
72 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
74 // Multiply by scale.
75 // (4 x (as.h, as.l, gs.h, gs.l))
76 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
78 // Clear the lower byte of the a*scale and g*scale results
79 // (4 x (as.h, 0, gs.h, 0))
80 src_ag = _mm_and_si128(src_ag, ag_mask);
82 // Operations the destination pixels are the same as on the
83 // source pixels. See the comments above.
84 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
85 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
86 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
87 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
88 dst_ag = _mm_and_si128(dst_ag, ag_mask);
90 // Combine back into RGBA.
91 // (4 x (as.h, rs.h, gs.h, bs.h))
92 src_pixel = _mm_or_si128(src_rb, src_ag);
93 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
95 // Add result
96 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
97 _mm_store_si128(d, result);
98 s++;
99 d++;
100 count -= 4;
101 }
102 src = reinterpret_cast<const SkPMColor*>(s);
103 dst = reinterpret_cast<SkPMColor*>(d);
104 }
106 while (count > 0) {
107 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
108 src++;
109 dst++;
110 count--;
111 }
112 }
114 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
115 const SkPMColor* SK_RESTRICT src,
116 int count, U8CPU alpha) {
117 SkASSERT(alpha == 255);
118 if (count <= 0) {
119 return;
120 }
122 if (count >= 4) {
123 SkASSERT(((size_t)dst & 0x03) == 0);
124 while (((size_t)dst & 0x0F) != 0) {
125 *dst = SkPMSrcOver(*src, *dst);
126 src++;
127 dst++;
128 count--;
129 }
131 const __m128i *s = reinterpret_cast<const __m128i*>(src);
132 __m128i *d = reinterpret_cast<__m128i*>(dst);
133 #ifdef SK_USE_ACCURATE_BLENDING
134 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
135 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
136 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
137 while (count >= 4) {
138 // Load 4 pixels
139 __m128i src_pixel = _mm_loadu_si128(s);
140 __m128i dst_pixel = _mm_load_si128(d);
142 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
143 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
144 // Shift alphas down to lower 8 bits of each quad.
145 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
147 // Copy alpha to upper 3rd byte of each quad
148 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
150 // Subtract alphas from 255, to get 0..255
151 alpha = _mm_sub_epi16(c_255, alpha);
153 // Multiply by red and blue by src alpha.
154 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
155 // Multiply by alpha and green by src alpha.
156 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
158 // dst_rb_low = (dst_rb >> 8)
159 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
160 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
162 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
163 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
164 dst_rb = _mm_add_epi16(dst_rb, c_128);
165 dst_rb = _mm_srli_epi16(dst_rb, 8);
167 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
168 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
169 dst_ag = _mm_add_epi16(dst_ag, c_128);
170 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
172 // Combine back into RGBA.
173 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
175 // Add result
176 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
177 _mm_store_si128(d, result);
178 s++;
179 d++;
180 count -= 4;
181 }
182 #else
183 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
184 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
185 while (count >= 4) {
186 // Load 4 pixels
187 __m128i src_pixel = _mm_loadu_si128(s);
188 __m128i dst_pixel = _mm_load_si128(d);
190 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
191 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
193 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
194 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
196 // (a0, a0, a1, a1, a2, g2, a3, g3)
197 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
199 // (a0, a0, a1, a1, a2, a2, a3, a3)
200 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
202 // Subtract alphas from 256, to get 1..256
203 alpha = _mm_sub_epi16(c_256, alpha);
205 // Multiply by red and blue by src alpha.
206 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
207 // Multiply by alpha and green by src alpha.
208 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
210 // Divide by 256.
211 dst_rb = _mm_srli_epi16(dst_rb, 8);
213 // Mask out high bits (already in the right place)
214 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
216 // Combine back into RGBA.
217 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
219 // Add result
220 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
221 _mm_store_si128(d, result);
222 s++;
223 d++;
224 count -= 4;
225 }
226 #endif
227 src = reinterpret_cast<const SkPMColor*>(s);
228 dst = reinterpret_cast<SkPMColor*>(d);
229 }
231 while (count > 0) {
232 *dst = SkPMSrcOver(*src, *dst);
233 src++;
234 dst++;
235 count--;
236 }
237 }
239 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
240 const SkPMColor* SK_RESTRICT src,
241 int count, U8CPU alpha) {
242 SkASSERT(alpha <= 255);
243 if (count <= 0) {
244 return;
245 }
247 if (count >= 4) {
248 while (((size_t)dst & 0x0F) != 0) {
249 *dst = SkBlendARGB32(*src, *dst, alpha);
250 src++;
251 dst++;
252 count--;
253 }
255 uint32_t src_scale = SkAlpha255To256(alpha);
257 const __m128i *s = reinterpret_cast<const __m128i*>(src);
258 __m128i *d = reinterpret_cast<__m128i*>(dst);
259 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
260 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
261 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
262 while (count >= 4) {
263 // Load 4 pixels each of src and dest.
264 __m128i src_pixel = _mm_loadu_si128(s);
265 __m128i dst_pixel = _mm_load_si128(d);
267 // Get red and blue pixels into lower byte of each word.
268 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
269 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
271 // Get alpha and green into lower byte of each word.
272 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
273 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
275 // Put per-pixel alpha in low byte of each word.
276 // After the following two statements, the dst_alpha looks like
277 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
278 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
279 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
281 // dst_alpha = dst_alpha * src_scale
282 // Because src_scales are in the higher byte of each word and
283 // we use mulhi here, the resulting alpha values are already
284 // in the right place and don't need to be divided by 256.
285 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
286 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
288 // Subtract alphas from 256, to get 1..256
289 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
291 // Multiply red and blue by dst pixel alpha.
292 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
293 // Multiply alpha and green by dst pixel alpha.
294 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
296 // Multiply red and blue by global alpha.
297 // (4 x (0, rs.h, 0, bs.h))
298 // where rs.h stands for the higher byte of r * src_scale,
299 // and bs.h the higher byte of b * src_scale.
300 // Again, because we use mulhi, the resuling red and blue
301 // values are already in the right place and don't need to
302 // be divided by 256.
303 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
304 // Multiply alpha and green by global alpha.
305 // (4 x (0, as.h, 0, gs.h))
306 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
308 // Divide by 256.
309 dst_rb = _mm_srli_epi16(dst_rb, 8);
311 // Mask out low bits (goodies already in the right place; no need to divide)
312 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
313 // Shift alpha and green to higher byte of each word.
314 // (4 x (as.h, 0, gs.h, 0))
315 src_ag = _mm_slli_epi16(src_ag, 8);
317 // Combine back into RGBA.
318 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
319 src_pixel = _mm_or_si128(src_rb, src_ag);
321 // Add two pixels into result.
322 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
323 _mm_store_si128(d, result);
324 s++;
325 d++;
326 count -= 4;
327 }
328 src = reinterpret_cast<const SkPMColor*>(s);
329 dst = reinterpret_cast<SkPMColor*>(d);
330 }
332 while (count > 0) {
333 *dst = SkBlendARGB32(*src, *dst, alpha);
334 src++;
335 dst++;
336 count--;
337 }
338 }
340 /* SSE2 version of Color32()
341 * portable version is in core/SkBlitRow_D32.cpp
342 */
343 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
344 SkPMColor color) {
346 if (count <= 0) {
347 return;
348 }
350 if (0 == color) {
351 if (src != dst) {
352 memcpy(dst, src, count * sizeof(SkPMColor));
353 }
354 return;
355 }
357 unsigned colorA = SkGetPackedA32(color);
358 if (255 == colorA) {
359 sk_memset32(dst, color, count);
360 } else {
361 unsigned scale = 256 - SkAlpha255To256(colorA);
363 if (count >= 4) {
364 SkASSERT(((size_t)dst & 0x03) == 0);
365 while (((size_t)dst & 0x0F) != 0) {
366 *dst = color + SkAlphaMulQ(*src, scale);
367 src++;
368 dst++;
369 count--;
370 }
372 const __m128i *s = reinterpret_cast<const __m128i*>(src);
373 __m128i *d = reinterpret_cast<__m128i*>(dst);
374 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
375 __m128i src_scale_wide = _mm_set1_epi16(scale);
376 __m128i color_wide = _mm_set1_epi32(color);
377 while (count >= 4) {
378 // Load 4 pixels each of src and dest.
379 __m128i src_pixel = _mm_loadu_si128(s);
381 // Get red and blue pixels into lower byte of each word.
382 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
384 // Get alpha and green into lower byte of each word.
385 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
387 // Multiply by scale.
388 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
389 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
391 // Divide by 256.
392 src_rb = _mm_srli_epi16(src_rb, 8);
393 src_ag = _mm_andnot_si128(rb_mask, src_ag);
395 // Combine back into RGBA.
396 src_pixel = _mm_or_si128(src_rb, src_ag);
398 // Add color to result.
399 __m128i result = _mm_add_epi8(color_wide, src_pixel);
401 // Store result.
402 _mm_store_si128(d, result);
403 s++;
404 d++;
405 count -= 4;
406 }
407 src = reinterpret_cast<const SkPMColor*>(s);
408 dst = reinterpret_cast<SkPMColor*>(d);
409 }
411 while (count > 0) {
412 *dst = color + SkAlphaMulQ(*src, scale);
413 src += 1;
414 dst += 1;
415 count--;
416 }
417 }
418 }
420 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
421 size_t maskRB, SkColor origColor,
422 int width, int height) {
423 SkPMColor color = SkPreMultiplyColor(origColor);
424 size_t dstOffset = dstRB - (width << 2);
425 size_t maskOffset = maskRB - width;
426 SkPMColor* dst = (SkPMColor *)device;
427 const uint8_t* mask = (const uint8_t*)maskPtr;
428 do {
429 int count = width;
430 if (count >= 4) {
431 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
432 *dst = SkBlendARGB32(color, *dst, *mask);
433 mask++;
434 dst++;
435 count--;
436 }
437 __m128i *d = reinterpret_cast<__m128i*>(dst);
438 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
439 __m128i c_256 = _mm_set1_epi16(256);
440 __m128i c_1 = _mm_set1_epi16(1);
441 __m128i src_pixel = _mm_set1_epi32(color);
442 while (count >= 4) {
443 // Load 4 pixels each of src and dest.
444 __m128i dst_pixel = _mm_load_si128(d);
446 //set the aphla value
447 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
448 0, *(mask+3),0, \
449 *(mask+2),0, *(mask+2),\
450 0,*(mask+1), 0,*(mask+1),\
451 0, *mask,0,*mask);
453 //call SkAlpha255To256()
454 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
456 // Get red and blue pixels into lower byte of each word.
457 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
458 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
460 // Get alpha and green into lower byte of each word.
461 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
462 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
464 // Put per-pixel alpha in low byte of each word.
465 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
466 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
468 // dst_alpha = dst_alpha * src_scale
469 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
471 // Divide by 256.
472 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
474 // Subtract alphas from 256, to get 1..256
475 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
476 // Multiply red and blue by dst pixel alpha.
477 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
478 // Multiply alpha and green by dst pixel alpha.
479 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
481 // Multiply red and blue by global alpha.
482 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
483 // Multiply alpha and green by global alpha.
484 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
485 // Divide by 256.
486 dst_rb = _mm_srli_epi16(dst_rb, 8);
487 src_rb = _mm_srli_epi16(src_rb, 8);
489 // Mask out low bits (goodies already in the right place; no need to divide)
490 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
491 src_ag = _mm_andnot_si128(rb_mask, src_ag);
493 // Combine back into RGBA.
494 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
495 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
497 // Add two pixels into result.
498 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
499 _mm_store_si128(d, result);
500 // load the next 4 pixel
501 mask = mask + 4;
502 d++;
503 count -= 4;
504 }
505 dst = reinterpret_cast<SkPMColor *>(d);
506 }
507 while(count > 0) {
508 *dst= SkBlendARGB32(color, *dst, *mask);
509 dst += 1;
510 mask++;
511 count --;
512 }
513 dst = (SkPMColor *)((char*)dst + dstOffset);
514 mask += maskOffset;
515 } while (--height != 0);
516 }
// Shift amounts that move the top 5 bits of each 16-bit mask component onto
// the corresponding component of an SkPMColor. A positive amount is a left
// shift, a negative amount a right shift, zero means already in place.
// The mask's RGB16 component order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: shift every 32-bit lane of x by SK_R16x5_R32x5_SHIFT.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: shift every 32-bit lane of x by SK_G16x5_G32x5_SHIFT.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: shift every 32-bit lane of x by SK_B16x5_B32x5_SHIFT.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
549 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
550 __m128i &mask, __m128i &srcA) {
551 // In the following comments, the components of src, dst and mask are
552 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
553 // by an R, G, B, or A suffix. Components of one of the four pixels that
554 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
555 // example is the blue channel of the second destination pixel. Memory
556 // layout is shown for an ARGB byte order in a color value.
558 // src and srcA store 8-bit values interleaved with zeros.
559 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
560 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
561 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
562 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
563 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
564 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
565 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
567 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
568 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
569 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
570 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
572 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
573 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
574 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
576 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
577 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
578 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
581 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
582 // 8-bit position
583 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
584 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
585 mask = _mm_or_si128(_mm_or_si128(r, g), b);
587 // Interleave R,G,B into the lower byte of word.
588 // i.e. split the sixteen 8-bit values from mask into two sets of eight
589 // 16-bit values, padded by zero.
590 __m128i maskLo, maskHi;
591 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
592 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
593 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
594 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
596 // Upscale from 0..31 to 0..32
597 // (allows to replace division by left-shift further down)
598 // Left-shift each component by 4 and add the result back to that component,
599 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
600 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
601 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
603 // Multiply each component of maskLo and maskHi by srcA
604 maskLo = _mm_mullo_epi16(maskLo, srcA);
605 maskHi = _mm_mullo_epi16(maskHi, srcA);
607 // Left shift mask components by 8 (divide by 256)
608 maskLo = _mm_srli_epi16(maskLo, 8);
609 maskHi = _mm_srli_epi16(maskHi, 8);
611 // Interleave R,G,B into the lower byte of the word
612 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
613 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
614 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
615 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
617 // mask = (src - dst) * mask
618 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
619 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
621 // mask = (src - dst) * mask >> 5
622 maskLo = _mm_srai_epi16(maskLo, 5);
623 maskHi = _mm_srai_epi16(maskHi, 5);
625 // Add two pixels into result.
626 // result = dst + ((src - dst) * mask >> 5)
627 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
628 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
630 // Pack into 4 32bit dst pixels.
631 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
632 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
633 // clamping to 255 if necessary.
634 return _mm_packus_epi16(resultLo, resultHi);
635 }
637 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
638 __m128i &mask) {
639 // In the following comments, the components of src, dst and mask are
640 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
641 // by an R, G, B, or A suffix. Components of one of the four pixels that
642 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
643 // example is the blue channel of the second destination pixel. Memory
644 // layout is shown for an ARGB byte order in a color value.
646 // src and srcA store 8-bit values interleaved with zeros.
647 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
648 // mask stores 16-bit values (shown as high and low bytes) interleaved with
649 // zeros
650 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
651 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
653 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
654 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
655 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
656 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
658 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
659 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
660 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
662 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
663 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
664 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
666 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
667 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
668 // 8-bit position
669 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
670 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
671 mask = _mm_or_si128(_mm_or_si128(r, g), b);
673 // Interleave R,G,B into the lower byte of word.
674 // i.e. split the sixteen 8-bit values from mask into two sets of eight
675 // 16-bit values, padded by zero.
676 __m128i maskLo, maskHi;
677 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
678 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
679 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
680 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
682 // Upscale from 0..31 to 0..32
683 // (allows to replace division by left-shift further down)
684 // Left-shift each component by 4 and add the result back to that component,
685 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
686 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
687 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
689 // Interleave R,G,B into the lower byte of the word
690 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
691 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
692 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
693 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
695 // mask = (src - dst) * mask
696 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
697 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
699 // mask = (src - dst) * mask >> 5
700 maskLo = _mm_srai_epi16(maskLo, 5);
701 maskHi = _mm_srai_epi16(maskHi, 5);
703 // Add two pixels into result.
704 // result = dst + ((src - dst) * mask >> 5)
705 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
706 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
708 // Pack into 4 32bit dst pixels and force opaque.
709 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
710 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
711 // clamping to 255 if necessary. Set alpha components to 0xFF.
712 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
713 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
714 }
716 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
717 SkColor src, int width, SkPMColor) {
718 if (width <= 0) {
719 return;
720 }
722 int srcA = SkColorGetA(src);
723 int srcR = SkColorGetR(src);
724 int srcG = SkColorGetG(src);
725 int srcB = SkColorGetB(src);
727 srcA = SkAlpha255To256(srcA);
729 if (width >= 4) {
730 SkASSERT(((size_t)dst & 0x03) == 0);
731 while (((size_t)dst & 0x0F) != 0) {
732 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
733 mask++;
734 dst++;
735 width--;
736 }
738 __m128i *d = reinterpret_cast<__m128i*>(dst);
739 // Set alpha to 0xFF and replicate source four times in SSE register.
740 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
741 // Interleave with zeros to get two sets of four 16-bit values.
742 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
743 // Set srcA_sse to contain eight copies of srcA, padded with zero.
744 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
745 __m128i srcA_sse = _mm_set1_epi16(srcA);
746 while (width >= 4) {
747 // Load four destination pixels into dst_sse.
748 __m128i dst_sse = _mm_load_si128(d);
749 // Load four 16-bit masks into lower half of mask_sse.
750 __m128i mask_sse = _mm_loadl_epi64(
751 reinterpret_cast<const __m128i*>(mask));
753 // Check whether masks are equal to 0 and get the highest bit
754 // of each byte of result, if masks are all zero, we will get
755 // pack_cmp to 0xFFFF
756 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
757 _mm_setzero_si128()));
759 // if mask pixels are not all zero, we will blend the dst pixels
760 if (pack_cmp != 0xFFFF) {
761 // Unpack 4 16bit mask pixels to
762 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
763 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
764 mask_sse = _mm_unpacklo_epi16(mask_sse,
765 _mm_setzero_si128());
767 // Process 4 32bit dst pixels
768 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
769 mask_sse, srcA_sse);
770 _mm_store_si128(d, result);
771 }
773 d++;
774 mask += 4;
775 width -= 4;
776 }
778 dst = reinterpret_cast<SkPMColor*>(d);
779 }
781 while (width > 0) {
782 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
783 mask++;
784 dst++;
785 width--;
786 }
787 }
789 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
790 SkColor src, int width, SkPMColor opaqueDst) {
791 if (width <= 0) {
792 return;
793 }
795 int srcR = SkColorGetR(src);
796 int srcG = SkColorGetG(src);
797 int srcB = SkColorGetB(src);
799 if (width >= 4) {
800 SkASSERT(((size_t)dst & 0x03) == 0);
801 while (((size_t)dst & 0x0F) != 0) {
802 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
803 mask++;
804 dst++;
805 width--;
806 }
808 __m128i *d = reinterpret_cast<__m128i*>(dst);
809 // Set alpha to 0xFF and replicate source four times in SSE register.
810 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
811 // Set srcA_sse to contain eight copies of srcA, padded with zero.
812 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
813 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
814 while (width >= 4) {
815 // Load four destination pixels into dst_sse.
816 __m128i dst_sse = _mm_load_si128(d);
817 // Load four 16-bit masks into lower half of mask_sse.
818 __m128i mask_sse = _mm_loadl_epi64(
819 reinterpret_cast<const __m128i*>(mask));
821 // Check whether masks are equal to 0 and get the highest bit
822 // of each byte of result, if masks are all zero, we will get
823 // pack_cmp to 0xFFFF
824 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
825 _mm_setzero_si128()));
827 // if mask pixels are not all zero, we will blend the dst pixels
828 if (pack_cmp != 0xFFFF) {
829 // Unpack 4 16bit mask pixels to
830 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
831 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
832 mask_sse = _mm_unpacklo_epi16(mask_sse,
833 _mm_setzero_si128());
835 // Process 4 32bit dst pixels
836 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
837 mask_sse);
838 _mm_store_si128(d, result);
839 }
841 d++;
842 mask += 4;
843 width -= 4;
844 }
846 dst = reinterpret_cast<SkPMColor*>(d);
847 }
849 while (width > 0) {
850 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
851 mask++;
852 dst++;
853 width--;
854 }
855 }
857 /* SSE2 version of S32_D565_Opaque()
858 * portable version is in core/SkBlitRow_D16.cpp
859 */
860 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
861 const SkPMColor* SK_RESTRICT src, int count,
862 U8CPU alpha, int /*x*/, int /*y*/) {
863 SkASSERT(255 == alpha);
865 if (count <= 0) {
866 return;
867 }
869 if (count >= 8) {
870 while (((size_t)dst & 0x0F) != 0) {
871 SkPMColor c = *src++;
872 SkPMColorAssert(c);
874 *dst++ = SkPixel32ToPixel16_ToU16(c);
875 count--;
876 }
878 const __m128i* s = reinterpret_cast<const __m128i*>(src);
879 __m128i* d = reinterpret_cast<__m128i*>(dst);
880 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
881 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
882 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
884 while (count >= 8) {
885 // Load 8 pixels of src.
886 __m128i src_pixel1 = _mm_loadu_si128(s++);
887 __m128i src_pixel2 = _mm_loadu_si128(s++);
889 // Calculate result r.
890 __m128i r1 = _mm_srli_epi32(src_pixel1,
891 SK_R32_SHIFT + (8 - SK_R16_BITS));
892 r1 = _mm_and_si128(r1, r16_mask);
893 __m128i r2 = _mm_srli_epi32(src_pixel2,
894 SK_R32_SHIFT + (8 - SK_R16_BITS));
895 r2 = _mm_and_si128(r2, r16_mask);
896 __m128i r = _mm_packs_epi32(r1, r2);
898 // Calculate result g.
899 __m128i g1 = _mm_srli_epi32(src_pixel1,
900 SK_G32_SHIFT + (8 - SK_G16_BITS));
901 g1 = _mm_and_si128(g1, g16_mask);
902 __m128i g2 = _mm_srli_epi32(src_pixel2,
903 SK_G32_SHIFT + (8 - SK_G16_BITS));
904 g2 = _mm_and_si128(g2, g16_mask);
905 __m128i g = _mm_packs_epi32(g1, g2);
907 // Calculate result b.
908 __m128i b1 = _mm_srli_epi32(src_pixel1,
909 SK_B32_SHIFT + (8 - SK_B16_BITS));
910 b1 = _mm_and_si128(b1, b16_mask);
911 __m128i b2 = _mm_srli_epi32(src_pixel2,
912 SK_B32_SHIFT + (8 - SK_B16_BITS));
913 b2 = _mm_and_si128(b2, b16_mask);
914 __m128i b = _mm_packs_epi32(b1, b2);
916 // Store 8 16-bit colors in dst.
917 __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
918 _mm_store_si128(d++, d_pixel);
919 count -= 8;
920 }
921 src = reinterpret_cast<const SkPMColor*>(s);
922 dst = reinterpret_cast<uint16_t*>(d);
923 }
925 if (count > 0) {
926 do {
927 SkPMColor c = *src++;
928 SkPMColorAssert(c);
929 *dst++ = SkPixel32ToPixel16_ToU16(c);
930 } while (--count != 0);
931 }
932 }
934 /* SSE2 version of S32A_D565_Opaque()
935 * portable version is in core/SkBlitRow_D16.cpp
936 */
937 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
938 const SkPMColor* SK_RESTRICT src,
939 int count, U8CPU alpha, int /*x*/, int /*y*/) {
940 SkASSERT(255 == alpha);
942 if (count <= 0) {
943 return;
944 }
946 if (count >= 8) {
947 // Make dst 16 bytes alignment
948 while (((size_t)dst & 0x0F) != 0) {
949 SkPMColor c = *src++;
950 if (c) {
951 *dst = SkSrcOver32To16(c, *dst);
952 }
953 dst += 1;
954 count--;
955 }
957 const __m128i* s = reinterpret_cast<const __m128i*>(src);
958 __m128i* d = reinterpret_cast<__m128i*>(dst);
959 __m128i var255 = _mm_set1_epi16(255);
960 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
961 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
962 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
964 while (count >= 8) {
965 // Load 8 pixels of src.
966 __m128i src_pixel1 = _mm_loadu_si128(s++);
967 __m128i src_pixel2 = _mm_loadu_si128(s++);
969 // Check whether src pixels are equal to 0 and get the highest bit
970 // of each byte of result, if src pixels are all zero, src_cmp1 and
971 // src_cmp2 will be 0xFFFF.
972 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
973 _mm_setzero_si128()));
974 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
975 _mm_setzero_si128()));
976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
977 d++;
978 count -= 8;
979 continue;
980 }
982 // Load 8 pixels of dst.
983 __m128i dst_pixel = _mm_load_si128(d);
985 // Extract A from src.
986 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
987 sa1 = _mm_srli_epi32(sa1, 24);
988 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
989 sa2 = _mm_srli_epi32(sa2, 24);
990 __m128i sa = _mm_packs_epi32(sa1, sa2);
992 // Extract R from src.
993 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
994 sr1 = _mm_srli_epi32(sr1, 24);
995 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
996 sr2 = _mm_srli_epi32(sr2, 24);
997 __m128i sr = _mm_packs_epi32(sr1, sr2);
999 // Extract G from src.
1000 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1001 sg1 = _mm_srli_epi32(sg1, 24);
1002 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1003 sg2 = _mm_srli_epi32(sg2, 24);
1004 __m128i sg = _mm_packs_epi32(sg1, sg2);
1006 // Extract B from src.
1007 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1008 sb1 = _mm_srli_epi32(sb1, 24);
1009 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1010 sb2 = _mm_srli_epi32(sb2, 24);
1011 __m128i sb = _mm_packs_epi32(sb1, sb2);
1013 // Extract R G B from dst.
1014 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1015 dr = _mm_and_si128(dr, r16_mask);
1016 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1017 dg = _mm_and_si128(dg, g16_mask);
1018 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1019 db = _mm_and_si128(db, b16_mask);
1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1023 // Calculate R G B of result.
1024 // Original algorithm is in SkSrcOver32To16().
1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1032 // Pack R G B into 16-bit color.
1033 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1035 // Store 8 16-bit colors in dst.
1036 _mm_store_si128(d++, d_pixel);
1037 count -= 8;
1038 }
1040 src = reinterpret_cast<const SkPMColor*>(s);
1041 dst = reinterpret_cast<uint16_t*>(d);
1042 }
1044 if (count > 0) {
1045 do {
1046 SkPMColor c = *src++;
1047 SkPMColorAssert(c);
1048 if (c) {
1049 *dst = SkSrcOver32To16(c, *dst);
1050 }
1051 dst += 1;
1052 } while (--count != 0);
1053 }
1054 }
1056 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1057 const SkPMColor* SK_RESTRICT src,
1058 int count, U8CPU alpha, int x, int y) {
1059 SkASSERT(255 == alpha);
1061 if (count <= 0) {
1062 return;
1063 }
1065 if (count >= 8) {
1066 while (((size_t)dst & 0x0F) != 0) {
1067 DITHER_565_SCAN(y);
1068 SkPMColor c = *src++;
1069 SkPMColorAssert(c);
1071 unsigned dither = DITHER_VALUE(x);
1072 *dst++ = SkDitherRGB32To565(c, dither);
1073 DITHER_INC_X(x);
1074 count--;
1075 }
1077 unsigned short dither_value[8];
1078 __m128i dither;
1079 #ifdef ENABLE_DITHER_MATRIX_4X4
1080 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1081 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1082 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1083 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1084 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1085 #else
1086 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1087 dither_value[0] = dither_value[4] = (dither_scan
1088 >> (((x) & 3) << 2)) & 0xF;
1089 dither_value[1] = dither_value[5] = (dither_scan
1090 >> (((x + 1) & 3) << 2)) & 0xF;
1091 dither_value[2] = dither_value[6] = (dither_scan
1092 >> (((x + 2) & 3) << 2)) & 0xF;
1093 dither_value[3] = dither_value[7] = (dither_scan
1094 >> (((x + 3) & 3) << 2)) & 0xF;
1095 #endif
1096 dither = _mm_loadu_si128((__m128i*) dither_value);
1098 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1099 __m128i* d = reinterpret_cast<__m128i*>(dst);
1101 while (count >= 8) {
1102 // Load 8 pixels of src.
1103 __m128i src_pixel1 = _mm_loadu_si128(s++);
1104 __m128i src_pixel2 = _mm_loadu_si128(s++);
1106 // Extract R from src.
1107 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1108 sr1 = _mm_srli_epi32(sr1, 24);
1109 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1110 sr2 = _mm_srli_epi32(sr2, 24);
1111 __m128i sr = _mm_packs_epi32(sr1, sr2);
1113 // SkDITHER_R32To565(sr, dither)
1114 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1115 sr = _mm_add_epi16(sr, dither);
1116 sr = _mm_sub_epi16(sr, sr_offset);
1117 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1119 // Extract G from src.
1120 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1121 sg1 = _mm_srli_epi32(sg1, 24);
1122 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1123 sg2 = _mm_srli_epi32(sg2, 24);
1124 __m128i sg = _mm_packs_epi32(sg1, sg2);
1126 // SkDITHER_R32To565(sg, dither)
1127 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1128 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1129 sg = _mm_sub_epi16(sg, sg_offset);
1130 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1132 // Extract B from src.
1133 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1134 sb1 = _mm_srli_epi32(sb1, 24);
1135 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1136 sb2 = _mm_srli_epi32(sb2, 24);
1137 __m128i sb = _mm_packs_epi32(sb1, sb2);
1139 // SkDITHER_R32To565(sb, dither)
1140 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1141 sb = _mm_add_epi16(sb, dither);
1142 sb = _mm_sub_epi16(sb, sb_offset);
1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1145 // Pack and store 16-bit dst pixel.
1146 __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
1147 _mm_store_si128(d++, d_pixel);
1149 count -= 8;
1150 x += 8;
1151 }
1153 src = reinterpret_cast<const SkPMColor*>(s);
1154 dst = reinterpret_cast<uint16_t*>(d);
1155 }
1157 if (count > 0) {
1158 DITHER_565_SCAN(y);
1159 do {
1160 SkPMColor c = *src++;
1161 SkPMColorAssert(c);
1163 unsigned dither = DITHER_VALUE(x);
1164 *dst++ = SkDitherRGB32To565(c, dither);
1165 DITHER_INC_X(x);
1166 } while (--count != 0);
1167 }
1168 }
1170 /* SSE2 version of S32A_D565_Opaque_Dither()
1171 * portable version is in core/SkBlitRow_D16.cpp
1172 */
1173 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1174 const SkPMColor* SK_RESTRICT src,
1175 int count, U8CPU alpha, int x, int y) {
1176 SkASSERT(255 == alpha);
1178 if (count <= 0) {
1179 return;
1180 }
1182 if (count >= 8) {
1183 while (((size_t)dst & 0x0F) != 0) {
1184 DITHER_565_SCAN(y);
1185 SkPMColor c = *src++;
1186 SkPMColorAssert(c);
1187 if (c) {
1188 unsigned a = SkGetPackedA32(c);
1190 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1192 unsigned sr = SkGetPackedR32(c);
1193 unsigned sg = SkGetPackedG32(c);
1194 unsigned sb = SkGetPackedB32(c);
1195 sr = SkDITHER_R32_FOR_565(sr, d);
1196 sg = SkDITHER_G32_FOR_565(sg, d);
1197 sb = SkDITHER_B32_FOR_565(sb, d);
1199 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1200 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1201 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1202 // now src and dst expanded are in g:11 r:10 x:1 b:10
1203 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1204 }
1205 dst += 1;
1206 DITHER_INC_X(x);
1207 count--;
1208 }
1210 unsigned short dither_value[8];
1211 __m128i dither, dither_cur;
1212 #ifdef ENABLE_DITHER_MATRIX_4X4
1213 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1214 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1215 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1216 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1217 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1218 #else
1219 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1220 dither_value[0] = dither_value[4] = (dither_scan
1221 >> (((x) & 3) << 2)) & 0xF;
1222 dither_value[1] = dither_value[5] = (dither_scan
1223 >> (((x + 1) & 3) << 2)) & 0xF;
1224 dither_value[2] = dither_value[6] = (dither_scan
1225 >> (((x + 2) & 3) << 2)) & 0xF;
1226 dither_value[3] = dither_value[7] = (dither_scan
1227 >> (((x + 3) & 3) << 2)) & 0xF;
1228 #endif
1229 dither = _mm_loadu_si128((__m128i*) dither_value);
1231 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1232 __m128i* d = reinterpret_cast<__m128i*>(dst);
1233 __m128i var256 = _mm_set1_epi16(256);
1234 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1238 while (count >= 8) {
1239 // Load 8 pixels of src and dst.
1240 __m128i src_pixel1 = _mm_loadu_si128(s++);
1241 __m128i src_pixel2 = _mm_loadu_si128(s++);
1242 __m128i dst_pixel = _mm_load_si128(d);
1244 // Extract A from src.
1245 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
1246 sa1 = _mm_srli_epi32(sa1, 24);
1247 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
1248 sa2 = _mm_srli_epi32(sa2, 24);
1249 __m128i sa = _mm_packs_epi32(sa1, sa2);
1251 // Calculate current dither value.
1252 dither_cur = _mm_mullo_epi16(dither,
1253 _mm_add_epi16(sa, _mm_set1_epi16(1)));
1254 dither_cur = _mm_srli_epi16(dither_cur, 8);
1256 // Extract R from src.
1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1258 sr1 = _mm_srli_epi32(sr1, 24);
1259 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1260 sr2 = _mm_srli_epi32(sr2, 24);
1261 __m128i sr = _mm_packs_epi32(sr1, sr2);
1263 // SkDITHER_R32_FOR_565(sr, d)
1264 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1265 sr = _mm_add_epi16(sr, dither_cur);
1266 sr = _mm_sub_epi16(sr, sr_offset);
1268 // Expand sr.
1269 sr = _mm_slli_epi16(sr, 2);
1271 // Extract G from src.
1272 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1273 sg1 = _mm_srli_epi32(sg1, 24);
1274 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1275 sg2 = _mm_srli_epi32(sg2, 24);
1276 __m128i sg = _mm_packs_epi32(sg1, sg2);
1278 // sg = SkDITHER_G32_FOR_565(sg, d).
1279 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1280 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1281 sg = _mm_sub_epi16(sg, sg_offset);
1283 // Expand sg.
1284 sg = _mm_slli_epi16(sg, 3);
1286 // Extract B from src.
1287 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1288 sb1 = _mm_srli_epi32(sb1, 24);
1289 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1290 sb2 = _mm_srli_epi32(sb2, 24);
1291 __m128i sb = _mm_packs_epi32(sb1, sb2);
1293 // sb = SkDITHER_B32_FOR_565(sb, d).
1294 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1295 sb = _mm_add_epi16(sb, dither_cur);
1296 sb = _mm_sub_epi16(sb, sb_offset);
1298 // Expand sb.
1299 sb = _mm_slli_epi16(sb, 2);
1301 // Extract R G B from dst.
1302 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1303 dr = _mm_and_si128(dr, r16_mask);
1304 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1305 dg = _mm_and_si128(dg, g16_mask);
1306 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1307 db = _mm_and_si128(db, b16_mask);
1309 // SkAlpha255To256(255 - a) >> 3
1310 __m128i isa = _mm_sub_epi16(var256, sa);
1311 isa = _mm_srli_epi16(isa, 3);
1313 dr = _mm_mullo_epi16(dr, isa);
1314 dr = _mm_add_epi16(dr, sr);
1315 dr = _mm_srli_epi16(dr, 5);
1317 dg = _mm_mullo_epi16(dg, isa);
1318 dg = _mm_add_epi16(dg, sg);
1319 dg = _mm_srli_epi16(dg, 5);
1321 db = _mm_mullo_epi16(db, isa);
1322 db = _mm_add_epi16(db, sb);
1323 db = _mm_srli_epi16(db, 5);
1325 // Package and store dst pixel.
1326 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1327 _mm_store_si128(d++, d_pixel);
1329 count -= 8;
1330 x += 8;
1331 }
1333 src = reinterpret_cast<const SkPMColor*>(s);
1334 dst = reinterpret_cast<uint16_t*>(d);
1335 }
1337 if (count > 0) {
1338 DITHER_565_SCAN(y);
1339 do {
1340 SkPMColor c = *src++;
1341 SkPMColorAssert(c);
1342 if (c) {
1343 unsigned a = SkGetPackedA32(c);
1345 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1347 unsigned sr = SkGetPackedR32(c);
1348 unsigned sg = SkGetPackedG32(c);
1349 unsigned sb = SkGetPackedB32(c);
1350 sr = SkDITHER_R32_FOR_565(sr, d);
1351 sg = SkDITHER_G32_FOR_565(sg, d);
1352 sb = SkDITHER_B32_FOR_565(sb, d);
1354 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1357 // now src and dst expanded are in g:11 r:10 x:1 b:10
1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1359 }
1360 dst += 1;
1361 DITHER_INC_X(x);
1362 } while (--count != 0);
1363 }
1364 }