gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic according to private browsing mode or
the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
Some reservations remain about how to convince FindCookie users to test the
condition and pass a nullptr when disabling double-key logic.

michael@0 1 /*
michael@0 2 * Copyright 2012 The Android Open Source Project
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license that can be
michael@0 5 * found in the LICENSE file.
michael@0 6 */
michael@0 7
michael@0 8
michael@0 9 #include "SkBlitRow_opts_SSE2.h"
michael@0 10 #include "SkBitmapProcState_opts_SSE2.h"
michael@0 11 #include "SkColorPriv.h"
michael@0 12 #include "SkColor_opts_SSE2.h"
michael@0 13 #include "SkDither.h"
michael@0 14 #include "SkUtils.h"
michael@0 15
michael@0 16 #include <emmintrin.h>
michael@0 17
michael@0 18 /* SSE2 version of S32_Blend_BlitRow32()
michael@0 19 * portable version is in core/SkBlitRow_D32.cpp
michael@0 20 */
michael@0 21 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
michael@0 22 const SkPMColor* SK_RESTRICT src,
michael@0 23 int count, U8CPU alpha) {
michael@0 24 SkASSERT(alpha <= 255);
michael@0 25 if (count <= 0) {
michael@0 26 return;
michael@0 27 }
michael@0 28
michael@0 29 uint32_t src_scale = SkAlpha255To256(alpha);
michael@0 30 uint32_t dst_scale = 256 - src_scale;
michael@0 31
michael@0 32 if (count >= 4) {
michael@0 33 SkASSERT(((size_t)dst & 0x03) == 0);
michael@0 34 while (((size_t)dst & 0x0F) != 0) {
michael@0 35 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
michael@0 36 src++;
michael@0 37 dst++;
michael@0 38 count--;
michael@0 39 }
michael@0 40
michael@0 41 const __m128i *s = reinterpret_cast<const __m128i*>(src);
michael@0 42 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 43 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 44 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
michael@0 45
michael@0 46 // Move scale factors to upper byte of word
michael@0 47 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
michael@0 48 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
michael@0 49 while (count >= 4) {
michael@0 50 // Load 4 pixels each of src and dest.
michael@0 51 __m128i src_pixel = _mm_loadu_si128(s);
michael@0 52 __m128i dst_pixel = _mm_load_si128(d);
michael@0 53
michael@0 54 // Interleave Atom port 0/1 operations based on the execution port
michael@0 55 // constraints that multiply can only be executed on port 0 (while
michael@0 56 // boolean operations can be executed on either port 0 or port 1)
michael@0 57 // because GCC currently doesn't do a good job scheduling
michael@0 58 // instructions based on these constraints.
michael@0 59
michael@0 60 // Get red and blue pixels into lower byte of each word.
michael@0 61 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
michael@0 62 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
michael@0 63
michael@0 64 // Multiply by scale.
michael@0 65 // (4 x (0, rs.h, 0, bs.h))
michael@0 66 // where rs.h stands for the higher byte of r * scale, and
michael@0 67 // bs.h the higher byte of b * scale.
michael@0 68 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
michael@0 69
michael@0 70 // Get alpha and green pixels into higher byte of each word.
michael@0 71 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
michael@0 72 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
michael@0 73
michael@0 74 // Multiply by scale.
michael@0 75 // (4 x (as.h, as.l, gs.h, gs.l))
michael@0 76 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
michael@0 77
michael@0 78 // Clear the lower byte of the a*scale and g*scale results
michael@0 79 // (4 x (as.h, 0, gs.h, 0))
michael@0 80 src_ag = _mm_and_si128(src_ag, ag_mask);
michael@0 81
michael@0 82 // Operations the destination pixels are the same as on the
michael@0 83 // source pixels. See the comments above.
michael@0 84 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
michael@0 85 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
michael@0 86 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
michael@0 87 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
michael@0 88 dst_ag = _mm_and_si128(dst_ag, ag_mask);
michael@0 89
michael@0 90 // Combine back into RGBA.
michael@0 91 // (4 x (as.h, rs.h, gs.h, bs.h))
michael@0 92 src_pixel = _mm_or_si128(src_rb, src_ag);
michael@0 93 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
michael@0 94
michael@0 95 // Add result
michael@0 96 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
michael@0 97 _mm_store_si128(d, result);
michael@0 98 s++;
michael@0 99 d++;
michael@0 100 count -= 4;
michael@0 101 }
michael@0 102 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 103 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 104 }
michael@0 105
michael@0 106 while (count > 0) {
michael@0 107 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
michael@0 108 src++;
michael@0 109 dst++;
michael@0 110 count--;
michael@0 111 }
michael@0 112 }
michael@0 113
michael@0 114 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
michael@0 115 const SkPMColor* SK_RESTRICT src,
michael@0 116 int count, U8CPU alpha) {
michael@0 117 SkASSERT(alpha == 255);
michael@0 118 if (count <= 0) {
michael@0 119 return;
michael@0 120 }
michael@0 121
michael@0 122 if (count >= 4) {
michael@0 123 SkASSERT(((size_t)dst & 0x03) == 0);
michael@0 124 while (((size_t)dst & 0x0F) != 0) {
michael@0 125 *dst = SkPMSrcOver(*src, *dst);
michael@0 126 src++;
michael@0 127 dst++;
michael@0 128 count--;
michael@0 129 }
michael@0 130
michael@0 131 const __m128i *s = reinterpret_cast<const __m128i*>(src);
michael@0 132 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 133 #ifdef SK_USE_ACCURATE_BLENDING
michael@0 134 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 135 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
michael@0 136 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
michael@0 137 while (count >= 4) {
michael@0 138 // Load 4 pixels
michael@0 139 __m128i src_pixel = _mm_loadu_si128(s);
michael@0 140 __m128i dst_pixel = _mm_load_si128(d);
michael@0 141
michael@0 142 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
michael@0 143 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
michael@0 144 // Shift alphas down to lower 8 bits of each quad.
michael@0 145 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
michael@0 146
michael@0 147 // Copy alpha to upper 3rd byte of each quad
michael@0 148 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
michael@0 149
michael@0 150 // Subtract alphas from 255, to get 0..255
michael@0 151 alpha = _mm_sub_epi16(c_255, alpha);
michael@0 152
michael@0 153 // Multiply by red and blue by src alpha.
michael@0 154 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
michael@0 155 // Multiply by alpha and green by src alpha.
michael@0 156 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
michael@0 157
michael@0 158 // dst_rb_low = (dst_rb >> 8)
michael@0 159 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
michael@0 160 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
michael@0 161
michael@0 162 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
michael@0 163 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
michael@0 164 dst_rb = _mm_add_epi16(dst_rb, c_128);
michael@0 165 dst_rb = _mm_srli_epi16(dst_rb, 8);
michael@0 166
michael@0 167 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
michael@0 168 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
michael@0 169 dst_ag = _mm_add_epi16(dst_ag, c_128);
michael@0 170 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
michael@0 171
michael@0 172 // Combine back into RGBA.
michael@0 173 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
michael@0 174
michael@0 175 // Add result
michael@0 176 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
michael@0 177 _mm_store_si128(d, result);
michael@0 178 s++;
michael@0 179 d++;
michael@0 180 count -= 4;
michael@0 181 }
michael@0 182 #else
michael@0 183 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 184 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
michael@0 185 while (count >= 4) {
michael@0 186 // Load 4 pixels
michael@0 187 __m128i src_pixel = _mm_loadu_si128(s);
michael@0 188 __m128i dst_pixel = _mm_load_si128(d);
michael@0 189
michael@0 190 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
michael@0 191 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
michael@0 192
michael@0 193 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
michael@0 194 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
michael@0 195
michael@0 196 // (a0, a0, a1, a1, a2, g2, a3, g3)
michael@0 197 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
michael@0 198
michael@0 199 // (a0, a0, a1, a1, a2, a2, a3, a3)
michael@0 200 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
michael@0 201
michael@0 202 // Subtract alphas from 256, to get 1..256
michael@0 203 alpha = _mm_sub_epi16(c_256, alpha);
michael@0 204
michael@0 205 // Multiply by red and blue by src alpha.
michael@0 206 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
michael@0 207 // Multiply by alpha and green by src alpha.
michael@0 208 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
michael@0 209
michael@0 210 // Divide by 256.
michael@0 211 dst_rb = _mm_srli_epi16(dst_rb, 8);
michael@0 212
michael@0 213 // Mask out high bits (already in the right place)
michael@0 214 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
michael@0 215
michael@0 216 // Combine back into RGBA.
michael@0 217 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
michael@0 218
michael@0 219 // Add result
michael@0 220 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
michael@0 221 _mm_store_si128(d, result);
michael@0 222 s++;
michael@0 223 d++;
michael@0 224 count -= 4;
michael@0 225 }
michael@0 226 #endif
michael@0 227 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 228 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 229 }
michael@0 230
michael@0 231 while (count > 0) {
michael@0 232 *dst = SkPMSrcOver(*src, *dst);
michael@0 233 src++;
michael@0 234 dst++;
michael@0 235 count--;
michael@0 236 }
michael@0 237 }
michael@0 238
michael@0 239 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
michael@0 240 const SkPMColor* SK_RESTRICT src,
michael@0 241 int count, U8CPU alpha) {
michael@0 242 SkASSERT(alpha <= 255);
michael@0 243 if (count <= 0) {
michael@0 244 return;
michael@0 245 }
michael@0 246
michael@0 247 if (count >= 4) {
michael@0 248 while (((size_t)dst & 0x0F) != 0) {
michael@0 249 *dst = SkBlendARGB32(*src, *dst, alpha);
michael@0 250 src++;
michael@0 251 dst++;
michael@0 252 count--;
michael@0 253 }
michael@0 254
michael@0 255 uint32_t src_scale = SkAlpha255To256(alpha);
michael@0 256
michael@0 257 const __m128i *s = reinterpret_cast<const __m128i*>(src);
michael@0 258 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 259 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
michael@0 260 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 261 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
michael@0 262 while (count >= 4) {
michael@0 263 // Load 4 pixels each of src and dest.
michael@0 264 __m128i src_pixel = _mm_loadu_si128(s);
michael@0 265 __m128i dst_pixel = _mm_load_si128(d);
michael@0 266
michael@0 267 // Get red and blue pixels into lower byte of each word.
michael@0 268 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
michael@0 269 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
michael@0 270
michael@0 271 // Get alpha and green into lower byte of each word.
michael@0 272 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
michael@0 273 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
michael@0 274
michael@0 275 // Put per-pixel alpha in low byte of each word.
michael@0 276 // After the following two statements, the dst_alpha looks like
michael@0 277 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
michael@0 278 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
michael@0 279 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
michael@0 280
michael@0 281 // dst_alpha = dst_alpha * src_scale
michael@0 282 // Because src_scales are in the higher byte of each word and
michael@0 283 // we use mulhi here, the resulting alpha values are already
michael@0 284 // in the right place and don't need to be divided by 256.
michael@0 285 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
michael@0 286 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
michael@0 287
michael@0 288 // Subtract alphas from 256, to get 1..256
michael@0 289 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
michael@0 290
michael@0 291 // Multiply red and blue by dst pixel alpha.
michael@0 292 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
michael@0 293 // Multiply alpha and green by dst pixel alpha.
michael@0 294 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
michael@0 295
michael@0 296 // Multiply red and blue by global alpha.
michael@0 297 // (4 x (0, rs.h, 0, bs.h))
michael@0 298 // where rs.h stands for the higher byte of r * src_scale,
michael@0 299 // and bs.h the higher byte of b * src_scale.
michael@0 300 // Again, because we use mulhi, the resuling red and blue
michael@0 301 // values are already in the right place and don't need to
michael@0 302 // be divided by 256.
michael@0 303 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
michael@0 304 // Multiply alpha and green by global alpha.
michael@0 305 // (4 x (0, as.h, 0, gs.h))
michael@0 306 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
michael@0 307
michael@0 308 // Divide by 256.
michael@0 309 dst_rb = _mm_srli_epi16(dst_rb, 8);
michael@0 310
michael@0 311 // Mask out low bits (goodies already in the right place; no need to divide)
michael@0 312 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
michael@0 313 // Shift alpha and green to higher byte of each word.
michael@0 314 // (4 x (as.h, 0, gs.h, 0))
michael@0 315 src_ag = _mm_slli_epi16(src_ag, 8);
michael@0 316
michael@0 317 // Combine back into RGBA.
michael@0 318 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
michael@0 319 src_pixel = _mm_or_si128(src_rb, src_ag);
michael@0 320
michael@0 321 // Add two pixels into result.
michael@0 322 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
michael@0 323 _mm_store_si128(d, result);
michael@0 324 s++;
michael@0 325 d++;
michael@0 326 count -= 4;
michael@0 327 }
michael@0 328 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 329 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 330 }
michael@0 331
michael@0 332 while (count > 0) {
michael@0 333 *dst = SkBlendARGB32(*src, *dst, alpha);
michael@0 334 src++;
michael@0 335 dst++;
michael@0 336 count--;
michael@0 337 }
michael@0 338 }
michael@0 339
michael@0 340 /* SSE2 version of Color32()
michael@0 341 * portable version is in core/SkBlitRow_D32.cpp
michael@0 342 */
michael@0 343 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
michael@0 344 SkPMColor color) {
michael@0 345
michael@0 346 if (count <= 0) {
michael@0 347 return;
michael@0 348 }
michael@0 349
michael@0 350 if (0 == color) {
michael@0 351 if (src != dst) {
michael@0 352 memcpy(dst, src, count * sizeof(SkPMColor));
michael@0 353 }
michael@0 354 return;
michael@0 355 }
michael@0 356
michael@0 357 unsigned colorA = SkGetPackedA32(color);
michael@0 358 if (255 == colorA) {
michael@0 359 sk_memset32(dst, color, count);
michael@0 360 } else {
michael@0 361 unsigned scale = 256 - SkAlpha255To256(colorA);
michael@0 362
michael@0 363 if (count >= 4) {
michael@0 364 SkASSERT(((size_t)dst & 0x03) == 0);
michael@0 365 while (((size_t)dst & 0x0F) != 0) {
michael@0 366 *dst = color + SkAlphaMulQ(*src, scale);
michael@0 367 src++;
michael@0 368 dst++;
michael@0 369 count--;
michael@0 370 }
michael@0 371
michael@0 372 const __m128i *s = reinterpret_cast<const __m128i*>(src);
michael@0 373 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 374 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 375 __m128i src_scale_wide = _mm_set1_epi16(scale);
michael@0 376 __m128i color_wide = _mm_set1_epi32(color);
michael@0 377 while (count >= 4) {
michael@0 378 // Load 4 pixels each of src and dest.
michael@0 379 __m128i src_pixel = _mm_loadu_si128(s);
michael@0 380
michael@0 381 // Get red and blue pixels into lower byte of each word.
michael@0 382 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
michael@0 383
michael@0 384 // Get alpha and green into lower byte of each word.
michael@0 385 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
michael@0 386
michael@0 387 // Multiply by scale.
michael@0 388 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
michael@0 389 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
michael@0 390
michael@0 391 // Divide by 256.
michael@0 392 src_rb = _mm_srli_epi16(src_rb, 8);
michael@0 393 src_ag = _mm_andnot_si128(rb_mask, src_ag);
michael@0 394
michael@0 395 // Combine back into RGBA.
michael@0 396 src_pixel = _mm_or_si128(src_rb, src_ag);
michael@0 397
michael@0 398 // Add color to result.
michael@0 399 __m128i result = _mm_add_epi8(color_wide, src_pixel);
michael@0 400
michael@0 401 // Store result.
michael@0 402 _mm_store_si128(d, result);
michael@0 403 s++;
michael@0 404 d++;
michael@0 405 count -= 4;
michael@0 406 }
michael@0 407 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 408 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 409 }
michael@0 410
michael@0 411 while (count > 0) {
michael@0 412 *dst = color + SkAlphaMulQ(*src, scale);
michael@0 413 src += 1;
michael@0 414 dst += 1;
michael@0 415 count--;
michael@0 416 }
michael@0 417 }
michael@0 418 }
michael@0 419
michael@0 420 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
michael@0 421 size_t maskRB, SkColor origColor,
michael@0 422 int width, int height) {
michael@0 423 SkPMColor color = SkPreMultiplyColor(origColor);
michael@0 424 size_t dstOffset = dstRB - (width << 2);
michael@0 425 size_t maskOffset = maskRB - width;
michael@0 426 SkPMColor* dst = (SkPMColor *)device;
michael@0 427 const uint8_t* mask = (const uint8_t*)maskPtr;
michael@0 428 do {
michael@0 429 int count = width;
michael@0 430 if (count >= 4) {
michael@0 431 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
michael@0 432 *dst = SkBlendARGB32(color, *dst, *mask);
michael@0 433 mask++;
michael@0 434 dst++;
michael@0 435 count--;
michael@0 436 }
michael@0 437 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 438 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
michael@0 439 __m128i c_256 = _mm_set1_epi16(256);
michael@0 440 __m128i c_1 = _mm_set1_epi16(1);
michael@0 441 __m128i src_pixel = _mm_set1_epi32(color);
michael@0 442 while (count >= 4) {
michael@0 443 // Load 4 pixels each of src and dest.
michael@0 444 __m128i dst_pixel = _mm_load_si128(d);
michael@0 445
michael@0 446 //set the aphla value
michael@0 447 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
michael@0 448 0, *(mask+3),0, \
michael@0 449 *(mask+2),0, *(mask+2),\
michael@0 450 0,*(mask+1), 0,*(mask+1),\
michael@0 451 0, *mask,0,*mask);
michael@0 452
michael@0 453 //call SkAlpha255To256()
michael@0 454 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
michael@0 455
michael@0 456 // Get red and blue pixels into lower byte of each word.
michael@0 457 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
michael@0 458 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
michael@0 459
michael@0 460 // Get alpha and green into lower byte of each word.
michael@0 461 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
michael@0 462 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
michael@0 463
michael@0 464 // Put per-pixel alpha in low byte of each word.
michael@0 465 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
michael@0 466 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
michael@0 467
michael@0 468 // dst_alpha = dst_alpha * src_scale
michael@0 469 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
michael@0 470
michael@0 471 // Divide by 256.
michael@0 472 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
michael@0 473
michael@0 474 // Subtract alphas from 256, to get 1..256
michael@0 475 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
michael@0 476 // Multiply red and blue by dst pixel alpha.
michael@0 477 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
michael@0 478 // Multiply alpha and green by dst pixel alpha.
michael@0 479 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
michael@0 480
michael@0 481 // Multiply red and blue by global alpha.
michael@0 482 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
michael@0 483 // Multiply alpha and green by global alpha.
michael@0 484 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
michael@0 485 // Divide by 256.
michael@0 486 dst_rb = _mm_srli_epi16(dst_rb, 8);
michael@0 487 src_rb = _mm_srli_epi16(src_rb, 8);
michael@0 488
michael@0 489 // Mask out low bits (goodies already in the right place; no need to divide)
michael@0 490 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
michael@0 491 src_ag = _mm_andnot_si128(rb_mask, src_ag);
michael@0 492
michael@0 493 // Combine back into RGBA.
michael@0 494 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
michael@0 495 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
michael@0 496
michael@0 497 // Add two pixels into result.
michael@0 498 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
michael@0 499 _mm_store_si128(d, result);
michael@0 500 // load the next 4 pixel
michael@0 501 mask = mask + 4;
michael@0 502 d++;
michael@0 503 count -= 4;
michael@0 504 }
michael@0 505 dst = reinterpret_cast<SkPMColor *>(d);
michael@0 506 }
michael@0 507 while(count > 0) {
michael@0 508 *dst= SkBlendARGB32(color, *dst, *mask);
michael@0 509 dst += 1;
michael@0 510 mask++;
michael@0 511 count --;
michael@0 512 }
michael@0 513 dst = (SkPMColor *)((char*)dst + dstOffset);
michael@0 514 mask += maskOffset;
michael@0 515 } while (--height != 0);
michael@0 516 }
michael@0 517
// Shift amounts that move the top 5 bits of each RGB16 mask component so that
// they start at the bit position of the matching SkPMColor component. The
// amount can be zero or negative because the component order of the RGB16
// mask may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// For each component: no shift, a left shift, or a right shift by the
// magnitude, applied to every 32-bit lane. The result is "unmasked": bits
// belonging to the other components are shifted along too and are not cleared
// by these macros.
#if SK_R16x5_R32x5_SHIFT == 0
#  define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
#  define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
#  define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
#  define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
#  define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
#  define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
#  define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
#  define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
#  define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
michael@0 548
michael@0 549 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
michael@0 550 __m128i &mask, __m128i &srcA) {
michael@0 551 // In the following comments, the components of src, dst and mask are
michael@0 552 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
michael@0 553 // by an R, G, B, or A suffix. Components of one of the four pixels that
michael@0 554 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
michael@0 555 // example is the blue channel of the second destination pixel. Memory
michael@0 556 // layout is shown for an ARGB byte order in a color value.
michael@0 557
michael@0 558 // src and srcA store 8-bit values interleaved with zeros.
michael@0 559 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
michael@0 560 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
michael@0 561 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
michael@0 562 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
michael@0 563 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
michael@0 564 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
michael@0 565 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
michael@0 566
michael@0 567 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
michael@0 568 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
michael@0 569 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
michael@0 570 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
michael@0 571
michael@0 572 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
michael@0 573 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
michael@0 574 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
michael@0 575
michael@0 576 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
michael@0 577 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
michael@0 578 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
michael@0 579
michael@0 580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
michael@0 581 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
michael@0 582 // 8-bit position
michael@0 583 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
michael@0 584 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
michael@0 585 mask = _mm_or_si128(_mm_or_si128(r, g), b);
michael@0 586
michael@0 587 // Interleave R,G,B into the lower byte of word.
michael@0 588 // i.e. split the sixteen 8-bit values from mask into two sets of eight
michael@0 589 // 16-bit values, padded by zero.
michael@0 590 __m128i maskLo, maskHi;
michael@0 591 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
michael@0 592 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
michael@0 593 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
michael@0 594 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
michael@0 595
michael@0 596 // Upscale from 0..31 to 0..32
michael@0 597 // (allows to replace division by left-shift further down)
michael@0 598 // Left-shift each component by 4 and add the result back to that component,
michael@0 599 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
michael@0 600 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
michael@0 601 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
michael@0 602
michael@0 603 // Multiply each component of maskLo and maskHi by srcA
michael@0 604 maskLo = _mm_mullo_epi16(maskLo, srcA);
michael@0 605 maskHi = _mm_mullo_epi16(maskHi, srcA);
michael@0 606
michael@0 607 // Left shift mask components by 8 (divide by 256)
michael@0 608 maskLo = _mm_srli_epi16(maskLo, 8);
michael@0 609 maskHi = _mm_srli_epi16(maskHi, 8);
michael@0 610
michael@0 611 // Interleave R,G,B into the lower byte of the word
michael@0 612 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
michael@0 613 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
michael@0 614 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
michael@0 615 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
michael@0 616
michael@0 617 // mask = (src - dst) * mask
michael@0 618 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
michael@0 619 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
michael@0 620
michael@0 621 // mask = (src - dst) * mask >> 5
michael@0 622 maskLo = _mm_srai_epi16(maskLo, 5);
michael@0 623 maskHi = _mm_srai_epi16(maskHi, 5);
michael@0 624
michael@0 625 // Add two pixels into result.
michael@0 626 // result = dst + ((src - dst) * mask >> 5)
michael@0 627 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
michael@0 628 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
michael@0 629
michael@0 630 // Pack into 4 32bit dst pixels.
michael@0 631 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
michael@0 632 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
michael@0 633 // clamping to 255 if necessary.
michael@0 634 return _mm_packus_epi16(resultLo, resultHi);
michael@0 635 }
michael@0 636
michael@0 637 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
michael@0 638 __m128i &mask) {
michael@0 639 // In the following comments, the components of src, dst and mask are
michael@0 640 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
michael@0 641 // by an R, G, B, or A suffix. Components of one of the four pixels that
michael@0 642 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
michael@0 643 // example is the blue channel of the second destination pixel. Memory
michael@0 644 // layout is shown for an ARGB byte order in a color value.
michael@0 645
michael@0 646 // src and srcA store 8-bit values interleaved with zeros.
michael@0 647 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
michael@0 648 // mask stores 16-bit values (shown as high and low bytes) interleaved with
michael@0 649 // zeros
michael@0 650 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
michael@0 651 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
michael@0 652
michael@0 653 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
michael@0 654 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
michael@0 655 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
michael@0 656 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
michael@0 657
michael@0 658 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
michael@0 659 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
michael@0 660 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
michael@0 661
michael@0 662 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
michael@0 663 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
michael@0 664 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
michael@0 665
michael@0 666 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
michael@0 667 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
michael@0 668 // 8-bit position
michael@0 669 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
michael@0 670 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
michael@0 671 mask = _mm_or_si128(_mm_or_si128(r, g), b);
michael@0 672
michael@0 673 // Interleave R,G,B into the lower byte of word.
michael@0 674 // i.e. split the sixteen 8-bit values from mask into two sets of eight
michael@0 675 // 16-bit values, padded by zero.
michael@0 676 __m128i maskLo, maskHi;
michael@0 677 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
michael@0 678 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
michael@0 679 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
michael@0 680 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
michael@0 681
michael@0 682 // Upscale from 0..31 to 0..32
michael@0 683 // (allows to replace division by left-shift further down)
michael@0 684 // Left-shift each component by 4 and add the result back to that component,
michael@0 685 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
michael@0 686 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
michael@0 687 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
michael@0 688
michael@0 689 // Interleave R,G,B into the lower byte of the word
michael@0 690 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
michael@0 691 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
michael@0 692 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
michael@0 693 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
michael@0 694
michael@0 695 // mask = (src - dst) * mask
michael@0 696 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
michael@0 697 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
michael@0 698
michael@0 699 // mask = (src - dst) * mask >> 5
michael@0 700 maskLo = _mm_srai_epi16(maskLo, 5);
michael@0 701 maskHi = _mm_srai_epi16(maskHi, 5);
michael@0 702
michael@0 703 // Add two pixels into result.
michael@0 704 // result = dst + ((src - dst) * mask >> 5)
michael@0 705 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
michael@0 706 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
michael@0 707
michael@0 708 // Pack into 4 32bit dst pixels and force opaque.
michael@0 709 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
michael@0 710 // Merge into one SSE register with sixteen 8-bit values (four pixels),
michael@0 711 // clamping to 255 if necessary. Set alpha components to 0xFF.
michael@0 712 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
michael@0 713 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
michael@0 714 }
michael@0 715
michael@0 716 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
michael@0 717 SkColor src, int width, SkPMColor) {
michael@0 718 if (width <= 0) {
michael@0 719 return;
michael@0 720 }
michael@0 721
michael@0 722 int srcA = SkColorGetA(src);
michael@0 723 int srcR = SkColorGetR(src);
michael@0 724 int srcG = SkColorGetG(src);
michael@0 725 int srcB = SkColorGetB(src);
michael@0 726
michael@0 727 srcA = SkAlpha255To256(srcA);
michael@0 728
michael@0 729 if (width >= 4) {
michael@0 730 SkASSERT(((size_t)dst & 0x03) == 0);
michael@0 731 while (((size_t)dst & 0x0F) != 0) {
michael@0 732 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
michael@0 733 mask++;
michael@0 734 dst++;
michael@0 735 width--;
michael@0 736 }
michael@0 737
michael@0 738 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 739 // Set alpha to 0xFF and replicate source four times in SSE register.
michael@0 740 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
michael@0 741 // Interleave with zeros to get two sets of four 16-bit values.
michael@0 742 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
michael@0 743 // Set srcA_sse to contain eight copies of srcA, padded with zero.
michael@0 744 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
michael@0 745 __m128i srcA_sse = _mm_set1_epi16(srcA);
michael@0 746 while (width >= 4) {
michael@0 747 // Load four destination pixels into dst_sse.
michael@0 748 __m128i dst_sse = _mm_load_si128(d);
michael@0 749 // Load four 16-bit masks into lower half of mask_sse.
michael@0 750 __m128i mask_sse = _mm_loadl_epi64(
michael@0 751 reinterpret_cast<const __m128i*>(mask));
michael@0 752
michael@0 753 // Check whether masks are equal to 0 and get the highest bit
michael@0 754 // of each byte of result, if masks are all zero, we will get
michael@0 755 // pack_cmp to 0xFFFF
michael@0 756 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
michael@0 757 _mm_setzero_si128()));
michael@0 758
michael@0 759 // if mask pixels are not all zero, we will blend the dst pixels
michael@0 760 if (pack_cmp != 0xFFFF) {
michael@0 761 // Unpack 4 16bit mask pixels to
michael@0 762 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
michael@0 763 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
michael@0 764 mask_sse = _mm_unpacklo_epi16(mask_sse,
michael@0 765 _mm_setzero_si128());
michael@0 766
michael@0 767 // Process 4 32bit dst pixels
michael@0 768 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
michael@0 769 mask_sse, srcA_sse);
michael@0 770 _mm_store_si128(d, result);
michael@0 771 }
michael@0 772
michael@0 773 d++;
michael@0 774 mask += 4;
michael@0 775 width -= 4;
michael@0 776 }
michael@0 777
michael@0 778 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 779 }
michael@0 780
michael@0 781 while (width > 0) {
michael@0 782 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
michael@0 783 mask++;
michael@0 784 dst++;
michael@0 785 width--;
michael@0 786 }
michael@0 787 }
michael@0 788
michael@0 789 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
michael@0 790 SkColor src, int width, SkPMColor opaqueDst) {
michael@0 791 if (width <= 0) {
michael@0 792 return;
michael@0 793 }
michael@0 794
michael@0 795 int srcR = SkColorGetR(src);
michael@0 796 int srcG = SkColorGetG(src);
michael@0 797 int srcB = SkColorGetB(src);
michael@0 798
michael@0 799 if (width >= 4) {
michael@0 800 SkASSERT(((size_t)dst & 0x03) == 0);
michael@0 801 while (((size_t)dst & 0x0F) != 0) {
michael@0 802 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
michael@0 803 mask++;
michael@0 804 dst++;
michael@0 805 width--;
michael@0 806 }
michael@0 807
michael@0 808 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 809 // Set alpha to 0xFF and replicate source four times in SSE register.
michael@0 810 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
michael@0 811 // Set srcA_sse to contain eight copies of srcA, padded with zero.
michael@0 812 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
michael@0 813 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
michael@0 814 while (width >= 4) {
michael@0 815 // Load four destination pixels into dst_sse.
michael@0 816 __m128i dst_sse = _mm_load_si128(d);
michael@0 817 // Load four 16-bit masks into lower half of mask_sse.
michael@0 818 __m128i mask_sse = _mm_loadl_epi64(
michael@0 819 reinterpret_cast<const __m128i*>(mask));
michael@0 820
michael@0 821 // Check whether masks are equal to 0 and get the highest bit
michael@0 822 // of each byte of result, if masks are all zero, we will get
michael@0 823 // pack_cmp to 0xFFFF
michael@0 824 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
michael@0 825 _mm_setzero_si128()));
michael@0 826
michael@0 827 // if mask pixels are not all zero, we will blend the dst pixels
michael@0 828 if (pack_cmp != 0xFFFF) {
michael@0 829 // Unpack 4 16bit mask pixels to
michael@0 830 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
michael@0 831 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
michael@0 832 mask_sse = _mm_unpacklo_epi16(mask_sse,
michael@0 833 _mm_setzero_si128());
michael@0 834
michael@0 835 // Process 4 32bit dst pixels
michael@0 836 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
michael@0 837 mask_sse);
michael@0 838 _mm_store_si128(d, result);
michael@0 839 }
michael@0 840
michael@0 841 d++;
michael@0 842 mask += 4;
michael@0 843 width -= 4;
michael@0 844 }
michael@0 845
michael@0 846 dst = reinterpret_cast<SkPMColor*>(d);
michael@0 847 }
michael@0 848
michael@0 849 while (width > 0) {
michael@0 850 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
michael@0 851 mask++;
michael@0 852 dst++;
michael@0 853 width--;
michael@0 854 }
michael@0 855 }
michael@0 856
michael@0 857 /* SSE2 version of S32_D565_Opaque()
michael@0 858 * portable version is in core/SkBlitRow_D16.cpp
michael@0 859 */
michael@0 860 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
michael@0 861 const SkPMColor* SK_RESTRICT src, int count,
michael@0 862 U8CPU alpha, int /*x*/, int /*y*/) {
michael@0 863 SkASSERT(255 == alpha);
michael@0 864
michael@0 865 if (count <= 0) {
michael@0 866 return;
michael@0 867 }
michael@0 868
michael@0 869 if (count >= 8) {
michael@0 870 while (((size_t)dst & 0x0F) != 0) {
michael@0 871 SkPMColor c = *src++;
michael@0 872 SkPMColorAssert(c);
michael@0 873
michael@0 874 *dst++ = SkPixel32ToPixel16_ToU16(c);
michael@0 875 count--;
michael@0 876 }
michael@0 877
michael@0 878 const __m128i* s = reinterpret_cast<const __m128i*>(src);
michael@0 879 __m128i* d = reinterpret_cast<__m128i*>(dst);
michael@0 880 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
michael@0 881 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
michael@0 882 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
michael@0 883
michael@0 884 while (count >= 8) {
michael@0 885 // Load 8 pixels of src.
michael@0 886 __m128i src_pixel1 = _mm_loadu_si128(s++);
michael@0 887 __m128i src_pixel2 = _mm_loadu_si128(s++);
michael@0 888
michael@0 889 // Calculate result r.
michael@0 890 __m128i r1 = _mm_srli_epi32(src_pixel1,
michael@0 891 SK_R32_SHIFT + (8 - SK_R16_BITS));
michael@0 892 r1 = _mm_and_si128(r1, r16_mask);
michael@0 893 __m128i r2 = _mm_srli_epi32(src_pixel2,
michael@0 894 SK_R32_SHIFT + (8 - SK_R16_BITS));
michael@0 895 r2 = _mm_and_si128(r2, r16_mask);
michael@0 896 __m128i r = _mm_packs_epi32(r1, r2);
michael@0 897
michael@0 898 // Calculate result g.
michael@0 899 __m128i g1 = _mm_srli_epi32(src_pixel1,
michael@0 900 SK_G32_SHIFT + (8 - SK_G16_BITS));
michael@0 901 g1 = _mm_and_si128(g1, g16_mask);
michael@0 902 __m128i g2 = _mm_srli_epi32(src_pixel2,
michael@0 903 SK_G32_SHIFT + (8 - SK_G16_BITS));
michael@0 904 g2 = _mm_and_si128(g2, g16_mask);
michael@0 905 __m128i g = _mm_packs_epi32(g1, g2);
michael@0 906
michael@0 907 // Calculate result b.
michael@0 908 __m128i b1 = _mm_srli_epi32(src_pixel1,
michael@0 909 SK_B32_SHIFT + (8 - SK_B16_BITS));
michael@0 910 b1 = _mm_and_si128(b1, b16_mask);
michael@0 911 __m128i b2 = _mm_srli_epi32(src_pixel2,
michael@0 912 SK_B32_SHIFT + (8 - SK_B16_BITS));
michael@0 913 b2 = _mm_and_si128(b2, b16_mask);
michael@0 914 __m128i b = _mm_packs_epi32(b1, b2);
michael@0 915
michael@0 916 // Store 8 16-bit colors in dst.
michael@0 917 __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
michael@0 918 _mm_store_si128(d++, d_pixel);
michael@0 919 count -= 8;
michael@0 920 }
michael@0 921 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 922 dst = reinterpret_cast<uint16_t*>(d);
michael@0 923 }
michael@0 924
michael@0 925 if (count > 0) {
michael@0 926 do {
michael@0 927 SkPMColor c = *src++;
michael@0 928 SkPMColorAssert(c);
michael@0 929 *dst++ = SkPixel32ToPixel16_ToU16(c);
michael@0 930 } while (--count != 0);
michael@0 931 }
michael@0 932 }
michael@0 933
michael@0 934 /* SSE2 version of S32A_D565_Opaque()
michael@0 935 * portable version is in core/SkBlitRow_D16.cpp
michael@0 936 */
michael@0 937 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
michael@0 938 const SkPMColor* SK_RESTRICT src,
michael@0 939 int count, U8CPU alpha, int /*x*/, int /*y*/) {
michael@0 940 SkASSERT(255 == alpha);
michael@0 941
michael@0 942 if (count <= 0) {
michael@0 943 return;
michael@0 944 }
michael@0 945
michael@0 946 if (count >= 8) {
michael@0 947 // Make dst 16 bytes alignment
michael@0 948 while (((size_t)dst & 0x0F) != 0) {
michael@0 949 SkPMColor c = *src++;
michael@0 950 if (c) {
michael@0 951 *dst = SkSrcOver32To16(c, *dst);
michael@0 952 }
michael@0 953 dst += 1;
michael@0 954 count--;
michael@0 955 }
michael@0 956
michael@0 957 const __m128i* s = reinterpret_cast<const __m128i*>(src);
michael@0 958 __m128i* d = reinterpret_cast<__m128i*>(dst);
michael@0 959 __m128i var255 = _mm_set1_epi16(255);
michael@0 960 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
michael@0 961 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
michael@0 962 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
michael@0 963
michael@0 964 while (count >= 8) {
michael@0 965 // Load 8 pixels of src.
michael@0 966 __m128i src_pixel1 = _mm_loadu_si128(s++);
michael@0 967 __m128i src_pixel2 = _mm_loadu_si128(s++);
michael@0 968
michael@0 969 // Check whether src pixels are equal to 0 and get the highest bit
michael@0 970 // of each byte of result, if src pixels are all zero, src_cmp1 and
michael@0 971 // src_cmp2 will be 0xFFFF.
michael@0 972 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
michael@0 973 _mm_setzero_si128()));
michael@0 974 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
michael@0 975 _mm_setzero_si128()));
michael@0 976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
michael@0 977 d++;
michael@0 978 count -= 8;
michael@0 979 continue;
michael@0 980 }
michael@0 981
michael@0 982 // Load 8 pixels of dst.
michael@0 983 __m128i dst_pixel = _mm_load_si128(d);
michael@0 984
michael@0 985 // Extract A from src.
michael@0 986 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
michael@0 987 sa1 = _mm_srli_epi32(sa1, 24);
michael@0 988 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
michael@0 989 sa2 = _mm_srli_epi32(sa2, 24);
michael@0 990 __m128i sa = _mm_packs_epi32(sa1, sa2);
michael@0 991
michael@0 992 // Extract R from src.
michael@0 993 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
michael@0 994 sr1 = _mm_srli_epi32(sr1, 24);
michael@0 995 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
michael@0 996 sr2 = _mm_srli_epi32(sr2, 24);
michael@0 997 __m128i sr = _mm_packs_epi32(sr1, sr2);
michael@0 998
michael@0 999 // Extract G from src.
michael@0 1000 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
michael@0 1001 sg1 = _mm_srli_epi32(sg1, 24);
michael@0 1002 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
michael@0 1003 sg2 = _mm_srli_epi32(sg2, 24);
michael@0 1004 __m128i sg = _mm_packs_epi32(sg1, sg2);
michael@0 1005
michael@0 1006 // Extract B from src.
michael@0 1007 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
michael@0 1008 sb1 = _mm_srli_epi32(sb1, 24);
michael@0 1009 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
michael@0 1010 sb2 = _mm_srli_epi32(sb2, 24);
michael@0 1011 __m128i sb = _mm_packs_epi32(sb1, sb2);
michael@0 1012
michael@0 1013 // Extract R G B from dst.
michael@0 1014 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
michael@0 1015 dr = _mm_and_si128(dr, r16_mask);
michael@0 1016 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
michael@0 1017 dg = _mm_and_si128(dg, g16_mask);
michael@0 1018 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
michael@0 1019 db = _mm_and_si128(db, b16_mask);
michael@0 1020
michael@0 1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
michael@0 1022
michael@0 1023 // Calculate R G B of result.
michael@0 1024 // Original algorithm is in SkSrcOver32To16().
michael@0 1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
michael@0 1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
michael@0 1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
michael@0 1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
michael@0 1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
michael@0 1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
michael@0 1031
michael@0 1032 // Pack R G B into 16-bit color.
michael@0 1033 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
michael@0 1034
michael@0 1035 // Store 8 16-bit colors in dst.
michael@0 1036 _mm_store_si128(d++, d_pixel);
michael@0 1037 count -= 8;
michael@0 1038 }
michael@0 1039
michael@0 1040 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 1041 dst = reinterpret_cast<uint16_t*>(d);
michael@0 1042 }
michael@0 1043
michael@0 1044 if (count > 0) {
michael@0 1045 do {
michael@0 1046 SkPMColor c = *src++;
michael@0 1047 SkPMColorAssert(c);
michael@0 1048 if (c) {
michael@0 1049 *dst = SkSrcOver32To16(c, *dst);
michael@0 1050 }
michael@0 1051 dst += 1;
michael@0 1052 } while (--count != 0);
michael@0 1053 }
michael@0 1054 }
michael@0 1055
michael@0 1056 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
michael@0 1057 const SkPMColor* SK_RESTRICT src,
michael@0 1058 int count, U8CPU alpha, int x, int y) {
michael@0 1059 SkASSERT(255 == alpha);
michael@0 1060
michael@0 1061 if (count <= 0) {
michael@0 1062 return;
michael@0 1063 }
michael@0 1064
michael@0 1065 if (count >= 8) {
michael@0 1066 while (((size_t)dst & 0x0F) != 0) {
michael@0 1067 DITHER_565_SCAN(y);
michael@0 1068 SkPMColor c = *src++;
michael@0 1069 SkPMColorAssert(c);
michael@0 1070
michael@0 1071 unsigned dither = DITHER_VALUE(x);
michael@0 1072 *dst++ = SkDitherRGB32To565(c, dither);
michael@0 1073 DITHER_INC_X(x);
michael@0 1074 count--;
michael@0 1075 }
michael@0 1076
michael@0 1077 unsigned short dither_value[8];
michael@0 1078 __m128i dither;
michael@0 1079 #ifdef ENABLE_DITHER_MATRIX_4X4
michael@0 1080 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
michael@0 1081 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
michael@0 1082 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
michael@0 1083 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
michael@0 1084 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
michael@0 1085 #else
michael@0 1086 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
michael@0 1087 dither_value[0] = dither_value[4] = (dither_scan
michael@0 1088 >> (((x) & 3) << 2)) & 0xF;
michael@0 1089 dither_value[1] = dither_value[5] = (dither_scan
michael@0 1090 >> (((x + 1) & 3) << 2)) & 0xF;
michael@0 1091 dither_value[2] = dither_value[6] = (dither_scan
michael@0 1092 >> (((x + 2) & 3) << 2)) & 0xF;
michael@0 1093 dither_value[3] = dither_value[7] = (dither_scan
michael@0 1094 >> (((x + 3) & 3) << 2)) & 0xF;
michael@0 1095 #endif
michael@0 1096 dither = _mm_loadu_si128((__m128i*) dither_value);
michael@0 1097
michael@0 1098 const __m128i* s = reinterpret_cast<const __m128i*>(src);
michael@0 1099 __m128i* d = reinterpret_cast<__m128i*>(dst);
michael@0 1100
michael@0 1101 while (count >= 8) {
michael@0 1102 // Load 8 pixels of src.
michael@0 1103 __m128i src_pixel1 = _mm_loadu_si128(s++);
michael@0 1104 __m128i src_pixel2 = _mm_loadu_si128(s++);
michael@0 1105
michael@0 1106 // Extract R from src.
michael@0 1107 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
michael@0 1108 sr1 = _mm_srli_epi32(sr1, 24);
michael@0 1109 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
michael@0 1110 sr2 = _mm_srli_epi32(sr2, 24);
michael@0 1111 __m128i sr = _mm_packs_epi32(sr1, sr2);
michael@0 1112
michael@0 1113 // SkDITHER_R32To565(sr, dither)
michael@0 1114 __m128i sr_offset = _mm_srli_epi16(sr, 5);
michael@0 1115 sr = _mm_add_epi16(sr, dither);
michael@0 1116 sr = _mm_sub_epi16(sr, sr_offset);
michael@0 1117 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
michael@0 1118
michael@0 1119 // Extract G from src.
michael@0 1120 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
michael@0 1121 sg1 = _mm_srli_epi32(sg1, 24);
michael@0 1122 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
michael@0 1123 sg2 = _mm_srli_epi32(sg2, 24);
michael@0 1124 __m128i sg = _mm_packs_epi32(sg1, sg2);
michael@0 1125
michael@0 1126 // SkDITHER_R32To565(sg, dither)
michael@0 1127 __m128i sg_offset = _mm_srli_epi16(sg, 6);
michael@0 1128 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
michael@0 1129 sg = _mm_sub_epi16(sg, sg_offset);
michael@0 1130 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
michael@0 1131
michael@0 1132 // Extract B from src.
michael@0 1133 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
michael@0 1134 sb1 = _mm_srli_epi32(sb1, 24);
michael@0 1135 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
michael@0 1136 sb2 = _mm_srli_epi32(sb2, 24);
michael@0 1137 __m128i sb = _mm_packs_epi32(sb1, sb2);
michael@0 1138
michael@0 1139 // SkDITHER_R32To565(sb, dither)
michael@0 1140 __m128i sb_offset = _mm_srli_epi16(sb, 5);
michael@0 1141 sb = _mm_add_epi16(sb, dither);
michael@0 1142 sb = _mm_sub_epi16(sb, sb_offset);
michael@0 1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
michael@0 1144
michael@0 1145 // Pack and store 16-bit dst pixel.
michael@0 1146 __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
michael@0 1147 _mm_store_si128(d++, d_pixel);
michael@0 1148
michael@0 1149 count -= 8;
michael@0 1150 x += 8;
michael@0 1151 }
michael@0 1152
michael@0 1153 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 1154 dst = reinterpret_cast<uint16_t*>(d);
michael@0 1155 }
michael@0 1156
michael@0 1157 if (count > 0) {
michael@0 1158 DITHER_565_SCAN(y);
michael@0 1159 do {
michael@0 1160 SkPMColor c = *src++;
michael@0 1161 SkPMColorAssert(c);
michael@0 1162
michael@0 1163 unsigned dither = DITHER_VALUE(x);
michael@0 1164 *dst++ = SkDitherRGB32To565(c, dither);
michael@0 1165 DITHER_INC_X(x);
michael@0 1166 } while (--count != 0);
michael@0 1167 }
michael@0 1168 }
michael@0 1169
michael@0 1170 /* SSE2 version of S32A_D565_Opaque_Dither()
michael@0 1171 * portable version is in core/SkBlitRow_D16.cpp
michael@0 1172 */
michael@0 1173 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
michael@0 1174 const SkPMColor* SK_RESTRICT src,
michael@0 1175 int count, U8CPU alpha, int x, int y) {
michael@0 1176 SkASSERT(255 == alpha);
michael@0 1177
michael@0 1178 if (count <= 0) {
michael@0 1179 return;
michael@0 1180 }
michael@0 1181
michael@0 1182 if (count >= 8) {
michael@0 1183 while (((size_t)dst & 0x0F) != 0) {
michael@0 1184 DITHER_565_SCAN(y);
michael@0 1185 SkPMColor c = *src++;
michael@0 1186 SkPMColorAssert(c);
michael@0 1187 if (c) {
michael@0 1188 unsigned a = SkGetPackedA32(c);
michael@0 1189
michael@0 1190 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
michael@0 1191
michael@0 1192 unsigned sr = SkGetPackedR32(c);
michael@0 1193 unsigned sg = SkGetPackedG32(c);
michael@0 1194 unsigned sb = SkGetPackedB32(c);
michael@0 1195 sr = SkDITHER_R32_FOR_565(sr, d);
michael@0 1196 sg = SkDITHER_G32_FOR_565(sg, d);
michael@0 1197 sb = SkDITHER_B32_FOR_565(sb, d);
michael@0 1198
michael@0 1199 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
michael@0 1200 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
michael@0 1201 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
michael@0 1202 // now src and dst expanded are in g:11 r:10 x:1 b:10
michael@0 1203 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
michael@0 1204 }
michael@0 1205 dst += 1;
michael@0 1206 DITHER_INC_X(x);
michael@0 1207 count--;
michael@0 1208 }
michael@0 1209
michael@0 1210 unsigned short dither_value[8];
michael@0 1211 __m128i dither, dither_cur;
michael@0 1212 #ifdef ENABLE_DITHER_MATRIX_4X4
michael@0 1213 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
michael@0 1214 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
michael@0 1215 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
michael@0 1216 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
michael@0 1217 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
michael@0 1218 #else
michael@0 1219 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
michael@0 1220 dither_value[0] = dither_value[4] = (dither_scan
michael@0 1221 >> (((x) & 3) << 2)) & 0xF;
michael@0 1222 dither_value[1] = dither_value[5] = (dither_scan
michael@0 1223 >> (((x + 1) & 3) << 2)) & 0xF;
michael@0 1224 dither_value[2] = dither_value[6] = (dither_scan
michael@0 1225 >> (((x + 2) & 3) << 2)) & 0xF;
michael@0 1226 dither_value[3] = dither_value[7] = (dither_scan
michael@0 1227 >> (((x + 3) & 3) << 2)) & 0xF;
michael@0 1228 #endif
michael@0 1229 dither = _mm_loadu_si128((__m128i*) dither_value);
michael@0 1230
michael@0 1231 const __m128i* s = reinterpret_cast<const __m128i*>(src);
michael@0 1232 __m128i* d = reinterpret_cast<__m128i*>(dst);
michael@0 1233 __m128i var256 = _mm_set1_epi16(256);
michael@0 1234 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
michael@0 1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
michael@0 1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
michael@0 1237
michael@0 1238 while (count >= 8) {
michael@0 1239 // Load 8 pixels of src and dst.
michael@0 1240 __m128i src_pixel1 = _mm_loadu_si128(s++);
michael@0 1241 __m128i src_pixel2 = _mm_loadu_si128(s++);
michael@0 1242 __m128i dst_pixel = _mm_load_si128(d);
michael@0 1243
michael@0 1244 // Extract A from src.
michael@0 1245 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
michael@0 1246 sa1 = _mm_srli_epi32(sa1, 24);
michael@0 1247 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
michael@0 1248 sa2 = _mm_srli_epi32(sa2, 24);
michael@0 1249 __m128i sa = _mm_packs_epi32(sa1, sa2);
michael@0 1250
michael@0 1251 // Calculate current dither value.
michael@0 1252 dither_cur = _mm_mullo_epi16(dither,
michael@0 1253 _mm_add_epi16(sa, _mm_set1_epi16(1)));
michael@0 1254 dither_cur = _mm_srli_epi16(dither_cur, 8);
michael@0 1255
michael@0 1256 // Extract R from src.
michael@0 1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
michael@0 1258 sr1 = _mm_srli_epi32(sr1, 24);
michael@0 1259 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
michael@0 1260 sr2 = _mm_srli_epi32(sr2, 24);
michael@0 1261 __m128i sr = _mm_packs_epi32(sr1, sr2);
michael@0 1262
michael@0 1263 // SkDITHER_R32_FOR_565(sr, d)
michael@0 1264 __m128i sr_offset = _mm_srli_epi16(sr, 5);
michael@0 1265 sr = _mm_add_epi16(sr, dither_cur);
michael@0 1266 sr = _mm_sub_epi16(sr, sr_offset);
michael@0 1267
michael@0 1268 // Expand sr.
michael@0 1269 sr = _mm_slli_epi16(sr, 2);
michael@0 1270
michael@0 1271 // Extract G from src.
michael@0 1272 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
michael@0 1273 sg1 = _mm_srli_epi32(sg1, 24);
michael@0 1274 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
michael@0 1275 sg2 = _mm_srli_epi32(sg2, 24);
michael@0 1276 __m128i sg = _mm_packs_epi32(sg1, sg2);
michael@0 1277
michael@0 1278 // sg = SkDITHER_G32_FOR_565(sg, d).
michael@0 1279 __m128i sg_offset = _mm_srli_epi16(sg, 6);
michael@0 1280 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
michael@0 1281 sg = _mm_sub_epi16(sg, sg_offset);
michael@0 1282
michael@0 1283 // Expand sg.
michael@0 1284 sg = _mm_slli_epi16(sg, 3);
michael@0 1285
michael@0 1286 // Extract B from src.
michael@0 1287 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
michael@0 1288 sb1 = _mm_srli_epi32(sb1, 24);
michael@0 1289 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
michael@0 1290 sb2 = _mm_srli_epi32(sb2, 24);
michael@0 1291 __m128i sb = _mm_packs_epi32(sb1, sb2);
michael@0 1292
michael@0 1293 // sb = SkDITHER_B32_FOR_565(sb, d).
michael@0 1294 __m128i sb_offset = _mm_srli_epi16(sb, 5);
michael@0 1295 sb = _mm_add_epi16(sb, dither_cur);
michael@0 1296 sb = _mm_sub_epi16(sb, sb_offset);
michael@0 1297
michael@0 1298 // Expand sb.
michael@0 1299 sb = _mm_slli_epi16(sb, 2);
michael@0 1300
michael@0 1301 // Extract R G B from dst.
michael@0 1302 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
michael@0 1303 dr = _mm_and_si128(dr, r16_mask);
michael@0 1304 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
michael@0 1305 dg = _mm_and_si128(dg, g16_mask);
michael@0 1306 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
michael@0 1307 db = _mm_and_si128(db, b16_mask);
michael@0 1308
michael@0 1309 // SkAlpha255To256(255 - a) >> 3
michael@0 1310 __m128i isa = _mm_sub_epi16(var256, sa);
michael@0 1311 isa = _mm_srli_epi16(isa, 3);
michael@0 1312
michael@0 1313 dr = _mm_mullo_epi16(dr, isa);
michael@0 1314 dr = _mm_add_epi16(dr, sr);
michael@0 1315 dr = _mm_srli_epi16(dr, 5);
michael@0 1316
michael@0 1317 dg = _mm_mullo_epi16(dg, isa);
michael@0 1318 dg = _mm_add_epi16(dg, sg);
michael@0 1319 dg = _mm_srli_epi16(dg, 5);
michael@0 1320
michael@0 1321 db = _mm_mullo_epi16(db, isa);
michael@0 1322 db = _mm_add_epi16(db, sb);
michael@0 1323 db = _mm_srli_epi16(db, 5);
michael@0 1324
michael@0 1325 // Package and store dst pixel.
michael@0 1326 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
michael@0 1327 _mm_store_si128(d++, d_pixel);
michael@0 1328
michael@0 1329 count -= 8;
michael@0 1330 x += 8;
michael@0 1331 }
michael@0 1332
michael@0 1333 src = reinterpret_cast<const SkPMColor*>(s);
michael@0 1334 dst = reinterpret_cast<uint16_t*>(d);
michael@0 1335 }
michael@0 1336
michael@0 1337 if (count > 0) {
michael@0 1338 DITHER_565_SCAN(y);
michael@0 1339 do {
michael@0 1340 SkPMColor c = *src++;
michael@0 1341 SkPMColorAssert(c);
michael@0 1342 if (c) {
michael@0 1343 unsigned a = SkGetPackedA32(c);
michael@0 1344
michael@0 1345 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
michael@0 1346
michael@0 1347 unsigned sr = SkGetPackedR32(c);
michael@0 1348 unsigned sg = SkGetPackedG32(c);
michael@0 1349 unsigned sb = SkGetPackedB32(c);
michael@0 1350 sr = SkDITHER_R32_FOR_565(sr, d);
michael@0 1351 sg = SkDITHER_G32_FOR_565(sg, d);
michael@0 1352 sb = SkDITHER_B32_FOR_565(sb, d);
michael@0 1353
michael@0 1354 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
michael@0 1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
michael@0 1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
michael@0 1357 // now src and dst expanded are in g:11 r:10 x:1 b:10
michael@0 1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
michael@0 1359 }
michael@0 1360 dst += 1;
michael@0 1361 DITHER_INC_X(x);
michael@0 1362 } while (--count != 0);
michael@0 1363 }
michael@0 1364 }

mercurial