1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1364 @@ 1.4 +/* 1.5 + * Copyright 2012 The Android Open Source Project 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license that can be 1.8 + * found in the LICENSE file. 1.9 + */ 1.10 + 1.11 + 1.12 +#include "SkBlitRow_opts_SSE2.h" 1.13 +#include "SkBitmapProcState_opts_SSE2.h" 1.14 +#include "SkColorPriv.h" 1.15 +#include "SkColor_opts_SSE2.h" 1.16 +#include "SkDither.h" 1.17 +#include "SkUtils.h" 1.18 + 1.19 +#include <emmintrin.h> 1.20 + 1.21 +/* SSE2 version of S32_Blend_BlitRow32() 1.22 + * portable version is in core/SkBlitRow_D32.cpp 1.23 + */ 1.24 +void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1.25 + const SkPMColor* SK_RESTRICT src, 1.26 + int count, U8CPU alpha) { 1.27 + SkASSERT(alpha <= 255); 1.28 + if (count <= 0) { 1.29 + return; 1.30 + } 1.31 + 1.32 + uint32_t src_scale = SkAlpha255To256(alpha); 1.33 + uint32_t dst_scale = 256 - src_scale; 1.34 + 1.35 + if (count >= 4) { 1.36 + SkASSERT(((size_t)dst & 0x03) == 0); 1.37 + while (((size_t)dst & 0x0F) != 0) { 1.38 + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1.39 + src++; 1.40 + dst++; 1.41 + count--; 1.42 + } 1.43 + 1.44 + const __m128i *s = reinterpret_cast<const __m128i*>(src); 1.45 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.46 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.47 + __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 1.48 + 1.49 + // Move scale factors to upper byte of word 1.50 + __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 1.51 + __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 1.52 + while (count >= 4) { 1.53 + // Load 4 pixels each of src and dest. 
1.54 + __m128i src_pixel = _mm_loadu_si128(s); 1.55 + __m128i dst_pixel = _mm_load_si128(d); 1.56 + 1.57 + // Interleave Atom port 0/1 operations based on the execution port 1.58 + // constraints that multiply can only be executed on port 0 (while 1.59 + // boolean operations can be executed on either port 0 or port 1) 1.60 + // because GCC currently doesn't do a good job scheduling 1.61 + // instructions based on these constraints. 1.62 + 1.63 + // Get red and blue pixels into lower byte of each word. 1.64 + // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 1.65 + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 1.66 + 1.67 + // Multiply by scale. 1.68 + // (4 x (0, rs.h, 0, bs.h)) 1.69 + // where rs.h stands for the higher byte of r * scale, and 1.70 + // bs.h the higher byte of b * scale. 1.71 + src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 1.72 + 1.73 + // Get alpha and green pixels into higher byte of each word. 1.74 + // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 1.75 + __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 1.76 + 1.77 + // Multiply by scale. 1.78 + // (4 x (as.h, as.l, gs.h, gs.l)) 1.79 + src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 1.80 + 1.81 + // Clear the lower byte of the a*scale and g*scale results 1.82 + // (4 x (as.h, 0, gs.h, 0)) 1.83 + src_ag = _mm_and_si128(src_ag, ag_mask); 1.84 + 1.85 + // Operations the destination pixels are the same as on the 1.86 + // source pixels. See the comments above. 1.87 + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 1.88 + dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 1.89 + __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 1.90 + dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 1.91 + dst_ag = _mm_and_si128(dst_ag, ag_mask); 1.92 + 1.93 + // Combine back into RGBA. 
1.94 + // (4 x (as.h, rs.h, gs.h, bs.h)) 1.95 + src_pixel = _mm_or_si128(src_rb, src_ag); 1.96 + dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1.97 + 1.98 + // Add result 1.99 + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 1.100 + _mm_store_si128(d, result); 1.101 + s++; 1.102 + d++; 1.103 + count -= 4; 1.104 + } 1.105 + src = reinterpret_cast<const SkPMColor*>(s); 1.106 + dst = reinterpret_cast<SkPMColor*>(d); 1.107 + } 1.108 + 1.109 + while (count > 0) { 1.110 + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1.111 + src++; 1.112 + dst++; 1.113 + count--; 1.114 + } 1.115 +} 1.116 + 1.117 +void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1.118 + const SkPMColor* SK_RESTRICT src, 1.119 + int count, U8CPU alpha) { 1.120 + SkASSERT(alpha == 255); 1.121 + if (count <= 0) { 1.122 + return; 1.123 + } 1.124 + 1.125 + if (count >= 4) { 1.126 + SkASSERT(((size_t)dst & 0x03) == 0); 1.127 + while (((size_t)dst & 0x0F) != 0) { 1.128 + *dst = SkPMSrcOver(*src, *dst); 1.129 + src++; 1.130 + dst++; 1.131 + count--; 1.132 + } 1.133 + 1.134 + const __m128i *s = reinterpret_cast<const __m128i*>(src); 1.135 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.136 +#ifdef SK_USE_ACCURATE_BLENDING 1.137 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.138 + __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 1.139 + __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 1.140 + while (count >= 4) { 1.141 + // Load 4 pixels 1.142 + __m128i src_pixel = _mm_loadu_si128(s); 1.143 + __m128i dst_pixel = _mm_load_si128(d); 1.144 + 1.145 + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 1.146 + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1.147 + // Shift alphas down to lower 8 bits of each quad. 
1.148 + __m128i alpha = _mm_srli_epi32(src_pixel, 24); 1.149 + 1.150 + // Copy alpha to upper 3rd byte of each quad 1.151 + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 1.152 + 1.153 + // Subtract alphas from 255, to get 0..255 1.154 + alpha = _mm_sub_epi16(c_255, alpha); 1.155 + 1.156 + // Multiply by red and blue by src alpha. 1.157 + dst_rb = _mm_mullo_epi16(dst_rb, alpha); 1.158 + // Multiply by alpha and green by src alpha. 1.159 + dst_ag = _mm_mullo_epi16(dst_ag, alpha); 1.160 + 1.161 + // dst_rb_low = (dst_rb >> 8) 1.162 + __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 1.163 + __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 1.164 + 1.165 + // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 1.166 + dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 1.167 + dst_rb = _mm_add_epi16(dst_rb, c_128); 1.168 + dst_rb = _mm_srli_epi16(dst_rb, 8); 1.169 + 1.170 + // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 1.171 + dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 1.172 + dst_ag = _mm_add_epi16(dst_ag, c_128); 1.173 + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 1.174 + 1.175 + // Combine back into RGBA. 
1.176 + dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1.177 + 1.178 + // Add result 1.179 + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 1.180 + _mm_store_si128(d, result); 1.181 + s++; 1.182 + d++; 1.183 + count -= 4; 1.184 + } 1.185 + #else 1.186 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.187 + __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 1.188 + while (count >= 4) { 1.189 + // Load 4 pixels 1.190 + __m128i src_pixel = _mm_loadu_si128(s); 1.191 + __m128i dst_pixel = _mm_load_si128(d); 1.192 + 1.193 + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 1.194 + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1.195 + 1.196 + // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 1.197 + __m128i alpha = _mm_srli_epi16(src_pixel, 8); 1.198 + 1.199 + // (a0, a0, a1, a1, a2, g2, a3, g3) 1.200 + alpha = _mm_shufflehi_epi16(alpha, 0xF5); 1.201 + 1.202 + // (a0, a0, a1, a1, a2, a2, a3, a3) 1.203 + alpha = _mm_shufflelo_epi16(alpha, 0xF5); 1.204 + 1.205 + // Subtract alphas from 256, to get 1..256 1.206 + alpha = _mm_sub_epi16(c_256, alpha); 1.207 + 1.208 + // Multiply by red and blue by src alpha. 1.209 + dst_rb = _mm_mullo_epi16(dst_rb, alpha); 1.210 + // Multiply by alpha and green by src alpha. 1.211 + dst_ag = _mm_mullo_epi16(dst_ag, alpha); 1.212 + 1.213 + // Divide by 256. 1.214 + dst_rb = _mm_srli_epi16(dst_rb, 8); 1.215 + 1.216 + // Mask out high bits (already in the right place) 1.217 + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 1.218 + 1.219 + // Combine back into RGBA. 
1.220 + dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1.221 + 1.222 + // Add result 1.223 + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 1.224 + _mm_store_si128(d, result); 1.225 + s++; 1.226 + d++; 1.227 + count -= 4; 1.228 + } 1.229 +#endif 1.230 + src = reinterpret_cast<const SkPMColor*>(s); 1.231 + dst = reinterpret_cast<SkPMColor*>(d); 1.232 + } 1.233 + 1.234 + while (count > 0) { 1.235 + *dst = SkPMSrcOver(*src, *dst); 1.236 + src++; 1.237 + dst++; 1.238 + count--; 1.239 + } 1.240 +} 1.241 + 1.242 +void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1.243 + const SkPMColor* SK_RESTRICT src, 1.244 + int count, U8CPU alpha) { 1.245 + SkASSERT(alpha <= 255); 1.246 + if (count <= 0) { 1.247 + return; 1.248 + } 1.249 + 1.250 + if (count >= 4) { 1.251 + while (((size_t)dst & 0x0F) != 0) { 1.252 + *dst = SkBlendARGB32(*src, *dst, alpha); 1.253 + src++; 1.254 + dst++; 1.255 + count--; 1.256 + } 1.257 + 1.258 + uint32_t src_scale = SkAlpha255To256(alpha); 1.259 + 1.260 + const __m128i *s = reinterpret_cast<const __m128i*>(src); 1.261 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.262 + __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 1.263 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.264 + __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 1.265 + while (count >= 4) { 1.266 + // Load 4 pixels each of src and dest. 1.267 + __m128i src_pixel = _mm_loadu_si128(s); 1.268 + __m128i dst_pixel = _mm_load_si128(d); 1.269 + 1.270 + // Get red and blue pixels into lower byte of each word. 1.271 + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 1.272 + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 1.273 + 1.274 + // Get alpha and green into lower byte of each word. 1.275 + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1.276 + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 1.277 + 1.278 + // Put per-pixel alpha in low byte of each word. 
1.279 + // After the following two statements, the dst_alpha looks like 1.280 + // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 1.281 + __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 1.282 + dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 1.283 + 1.284 + // dst_alpha = dst_alpha * src_scale 1.285 + // Because src_scales are in the higher byte of each word and 1.286 + // we use mulhi here, the resulting alpha values are already 1.287 + // in the right place and don't need to be divided by 256. 1.288 + // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 1.289 + dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 1.290 + 1.291 + // Subtract alphas from 256, to get 1..256 1.292 + dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 1.293 + 1.294 + // Multiply red and blue by dst pixel alpha. 1.295 + dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 1.296 + // Multiply alpha and green by dst pixel alpha. 1.297 + dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 1.298 + 1.299 + // Multiply red and blue by global alpha. 1.300 + // (4 x (0, rs.h, 0, bs.h)) 1.301 + // where rs.h stands for the higher byte of r * src_scale, 1.302 + // and bs.h the higher byte of b * src_scale. 1.303 + // Again, because we use mulhi, the resulting red and blue 1.304 + // values are already in the right place and don't need to 1.305 + // be divided by 256. 1.306 + src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 1.307 + // Multiply alpha and green by global alpha. 1.308 + // (4 x (0, as.h, 0, gs.h)) 1.309 + src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 1.310 + 1.311 + // Divide by 256. 1.312 + dst_rb = _mm_srli_epi16(dst_rb, 8); 1.313 + 1.314 + // Mask out low bits (goodies already in the right place; no need to divide) 1.315 + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 1.316 + // Shift alpha and green to higher byte of each word. 1.317 + // (4 x (as.h, 0, gs.h, 0)) 1.318 + src_ag = _mm_slli_epi16(src_ag, 8); 1.319 + 1.320 + // Combine back into RGBA.
1.321 + dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1.322 + src_pixel = _mm_or_si128(src_rb, src_ag); 1.323 + 1.324 + // Add two pixels into result. 1.325 + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 1.326 + _mm_store_si128(d, result); 1.327 + s++; 1.328 + d++; 1.329 + count -= 4; 1.330 + } 1.331 + src = reinterpret_cast<const SkPMColor*>(s); 1.332 + dst = reinterpret_cast<SkPMColor*>(d); 1.333 + } 1.334 + 1.335 + while (count > 0) { 1.336 + *dst = SkBlendARGB32(*src, *dst, alpha); 1.337 + src++; 1.338 + dst++; 1.339 + count--; 1.340 + } 1.341 +} 1.342 + 1.343 +/* SSE2 version of Color32() 1.344 + * portable version is in core/SkBlitRow_D32.cpp 1.345 + */ 1.346 +void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 1.347 + SkPMColor color) { 1.348 + 1.349 + if (count <= 0) { 1.350 + return; 1.351 + } 1.352 + 1.353 + if (0 == color) { 1.354 + if (src != dst) { 1.355 + memcpy(dst, src, count * sizeof(SkPMColor)); 1.356 + } 1.357 + return; 1.358 + } 1.359 + 1.360 + unsigned colorA = SkGetPackedA32(color); 1.361 + if (255 == colorA) { 1.362 + sk_memset32(dst, color, count); 1.363 + } else { 1.364 + unsigned scale = 256 - SkAlpha255To256(colorA); 1.365 + 1.366 + if (count >= 4) { 1.367 + SkASSERT(((size_t)dst & 0x03) == 0); 1.368 + while (((size_t)dst & 0x0F) != 0) { 1.369 + *dst = color + SkAlphaMulQ(*src, scale); 1.370 + src++; 1.371 + dst++; 1.372 + count--; 1.373 + } 1.374 + 1.375 + const __m128i *s = reinterpret_cast<const __m128i*>(src); 1.376 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.377 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.378 + __m128i src_scale_wide = _mm_set1_epi16(scale); 1.379 + __m128i color_wide = _mm_set1_epi32(color); 1.380 + while (count >= 4) { 1.381 + // Load 4 pixels each of src and dest. 1.382 + __m128i src_pixel = _mm_loadu_si128(s); 1.383 + 1.384 + // Get red and blue pixels into lower byte of each word. 
1.385 + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 1.386 + 1.387 + // Get alpha and green into lower byte of each word. 1.388 + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 1.389 + 1.390 + // Multiply by scale. 1.391 + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 1.392 + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 1.393 + 1.394 + // Divide by 256. 1.395 + src_rb = _mm_srli_epi16(src_rb, 8); 1.396 + src_ag = _mm_andnot_si128(rb_mask, src_ag); 1.397 + 1.398 + // Combine back into RGBA. 1.399 + src_pixel = _mm_or_si128(src_rb, src_ag); 1.400 + 1.401 + // Add color to result. 1.402 + __m128i result = _mm_add_epi8(color_wide, src_pixel); 1.403 + 1.404 + // Store result. 1.405 + _mm_store_si128(d, result); 1.406 + s++; 1.407 + d++; 1.408 + count -= 4; 1.409 + } 1.410 + src = reinterpret_cast<const SkPMColor*>(s); 1.411 + dst = reinterpret_cast<SkPMColor*>(d); 1.412 + } 1.413 + 1.414 + while (count > 0) { 1.415 + *dst = color + SkAlphaMulQ(*src, scale); 1.416 + src += 1; 1.417 + dst += 1; 1.418 + count--; 1.419 + } 1.420 + } 1.421 +} 1.422 + 1.423 +void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 1.424 + size_t maskRB, SkColor origColor, 1.425 + int width, int height) { 1.426 + SkPMColor color = SkPreMultiplyColor(origColor); 1.427 + size_t dstOffset = dstRB - (width << 2); 1.428 + size_t maskOffset = maskRB - width; 1.429 + SkPMColor* dst = (SkPMColor *)device; 1.430 + const uint8_t* mask = (const uint8_t*)maskPtr; 1.431 + do { 1.432 + int count = width; 1.433 + if (count >= 4) { 1.434 + while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 1.435 + *dst = SkBlendARGB32(color, *dst, *mask); 1.436 + mask++; 1.437 + dst++; 1.438 + count--; 1.439 + } 1.440 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.441 + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1.442 + __m128i c_256 = _mm_set1_epi16(256); 1.443 + __m128i c_1 = _mm_set1_epi16(1); 1.444 + __m128i src_pixel = _mm_set1_epi32(color); 1.445 + while (count >= 4) { 1.446 
+ // Load 4 destination pixels (the source is the constant color). 1.447 + __m128i dst_pixel = _mm_load_si128(d); 1.448 + 1.449 + //set the alpha value 1.450 + __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 1.451 + 0, *(mask+3),0, \ 1.452 + *(mask+2),0, *(mask+2),\ 1.453 + 0,*(mask+1), 0,*(mask+1),\ 1.454 + 0, *mask,0,*mask); 1.455 + 1.456 + //call SkAlpha255To256() 1.457 + src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 1.458 + 1.459 + // Get red and blue pixels into lower byte of each word. 1.460 + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 1.461 + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 1.462 + 1.463 + // Get alpha and green into lower byte of each word. 1.464 + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1.465 + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 1.466 + 1.467 + // Put per-pixel alpha in low byte of each word. 1.468 + __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 1.469 + dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 1.470 + 1.471 + // dst_alpha = dst_alpha * src_scale 1.472 + dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 1.473 + 1.474 + // Divide by 256. 1.475 + dst_alpha = _mm_srli_epi16(dst_alpha, 8); 1.476 + 1.477 + // Subtract alphas from 256, to get 1..256 1.478 + dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 1.479 + // Multiply red and blue by dst pixel alpha. 1.480 + dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 1.481 + // Multiply alpha and green by dst pixel alpha. 1.482 + dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 1.483 + 1.484 + // Multiply red and blue by global alpha. 1.485 + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 1.486 + // Multiply alpha and green by global alpha. 1.487 + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 1.488 + // Divide by 256.
1.489 + dst_rb = _mm_srli_epi16(dst_rb, 8); 1.490 + src_rb = _mm_srli_epi16(src_rb, 8); 1.491 + 1.492 + // Mask out low bits (goodies already in the right place; no need to divide) 1.493 + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 1.494 + src_ag = _mm_andnot_si128(rb_mask, src_ag); 1.495 + 1.496 + // Combine back into RGBA. 1.497 + dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1.498 + __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 1.499 + 1.500 + // Add two pixels into result. 1.501 + __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 1.502 + _mm_store_si128(d, result); 1.503 + // load the next 4 pixel 1.504 + mask = mask + 4; 1.505 + d++; 1.506 + count -= 4; 1.507 + } 1.508 + dst = reinterpret_cast<SkPMColor *>(d); 1.509 + } 1.510 + while(count > 0) { 1.511 + *dst= SkBlendARGB32(color, *dst, *mask); 1.512 + dst += 1; 1.513 + mask++; 1.514 + count --; 1.515 + } 1.516 + dst = (SkPMColor *)((char*)dst + dstOffset); 1.517 + mask += maskOffset; 1.518 + } while (--height != 0); 1.519 +} 1.520 + 1.521 +// The following (left) shifts cause the top 5 bits of the mask components to 1.522 +// line up with the corresponding components in an SkPMColor. 1.523 +// Note that the mask's RGB16 order may differ from the SkPMColor order. 
1.524 +#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 1.525 +#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 1.526 +#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 1.527 + 1.528 +#if SK_R16x5_R32x5_SHIFT == 0 1.529 + #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 1.530 +#elif SK_R16x5_R32x5_SHIFT > 0 1.531 + #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 1.532 +#else 1.533 + #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 1.534 +#endif 1.535 + 1.536 +#if SK_G16x5_G32x5_SHIFT == 0 1.537 + #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 1.538 +#elif SK_G16x5_G32x5_SHIFT > 0 1.539 + #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 1.540 +#else 1.541 + #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 1.542 +#endif 1.543 + 1.544 +#if SK_B16x5_B32x5_SHIFT == 0 1.545 + #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 1.546 +#elif SK_B16x5_B32x5_SHIFT > 0 1.547 + #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 1.548 +#else 1.549 + #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 1.550 +#endif 1.551 + 1.552 +static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 1.553 + __m128i &mask, __m128i &srcA) { 1.554 + // In the following comments, the components of src, dst and mask are 1.555 + // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 1.556 + // by an R, G, B, or A suffix. Components of one of the four pixels that 1.557 + // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 1.558 + // example is the blue channel of the second destination pixel. Memory 1.559 + // layout is shown for an ARGB byte order in a color value. 1.560 + 1.561 + // src and srcA store 8-bit values interleaved with zeros. 
1.562 + // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 1.563 + // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 1.564 + // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 1.565 + // mask stores 16-bit values (compressed three channels) interleaved with zeros. 1.566 + // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 1.567 + // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 1.568 + // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 1.569 + 1.570 + // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 1.571 + // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 1.572 + __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 1.573 + _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 1.574 + 1.575 + // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 1.576 + __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 1.577 + _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 1.578 + 1.579 + // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 1.580 + __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 1.581 + _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 1.582 + 1.583 + // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 1.584 + // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 1.585 + // 8-bit position 1.586 + // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 1.587 + // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 1.588 + mask = _mm_or_si128(_mm_or_si128(r, g), b); 1.589 + 1.590 + // Interleave R,G,B into the lower byte of word. 1.591 + // i.e. split the sixteen 8-bit values from mask into two sets of eight 1.592 + // 16-bit values, padded by zero. 
1.593 + __m128i maskLo, maskHi; 1.594 + // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 1.595 + maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 1.596 + // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 1.597 + maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 1.598 + 1.599 + // Upscale from 0..31 to 0..32 1.600 + // (allows replacing the division by a right-shift further down) 1.601 + // Right-shift each component by 4 and add the result back to that component, 1.602 + // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 1.603 + maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 1.604 + maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 1.605 + 1.606 + // Multiply each component of maskLo and maskHi by srcA 1.607 + maskLo = _mm_mullo_epi16(maskLo, srcA); 1.608 + maskHi = _mm_mullo_epi16(maskHi, srcA); 1.609 + 1.610 + // Right shift mask components by 8 (divide by 256) 1.611 + maskLo = _mm_srli_epi16(maskLo, 8); 1.612 + maskHi = _mm_srli_epi16(maskHi, 8); 1.613 + 1.614 + // Interleave R,G,B into the lower byte of the word 1.615 + // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 1.616 + __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 1.617 + // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 1.618 + __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 1.619 + 1.620 + // mask = (src - dst) * mask 1.621 + maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 1.622 + maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 1.623 + 1.624 + // mask = (src - dst) * mask >> 5 1.625 + maskLo = _mm_srai_epi16(maskLo, 5); 1.626 + maskHi = _mm_srai_epi16(maskHi, 5); 1.627 + 1.628 + // Add two pixels into result.
1.629 + // result = dst + ((src - dst) * mask >> 5) 1.630 + __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 1.631 + __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 1.632 + 1.633 + // Pack into 4 32bit dst pixels. 1.634 + // resultLo and resultHi contain eight 16-bit components (two pixels) each. 1.635 + // Merge into one SSE register with sixteen 8-bit values (four pixels), 1.636 + // clamping to 255 if necessary. 1.637 + return _mm_packus_epi16(resultLo, resultHi); 1.638 +} 1.639 + 1.640 +static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 1.641 + __m128i &mask) { 1.642 + // In the following comments, the components of src, dst and mask are 1.643 + // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 1.644 + // by an R, G, B, or A suffix. Components of one of the four pixels that 1.645 + // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 1.646 + // example is the blue channel of the second destination pixel. Memory 1.647 + // layout is shown for an ARGB byte order in a color value. 1.648 + 1.649 + // src stores 8-bit values interleaved with zeros. 1.650 + // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 1.651 + // mask stores 16-bit values (shown as low and high bytes) interleaved with 1.652 + // zeros 1.653 + // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 1.654 + // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 1.655 + 1.656 + // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1.657 + // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 1.658 + __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 1.659 + _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 1.660 + 1.661 + // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 1.662 + __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 1.663 + _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 1.664 + 1.665 + // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 1.666 + __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 1.667 + _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 1.668 + 1.669 + // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 1.670 + // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 1.671 + // 8-bit position 1.672 + // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 1.673 + // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 1.674 + mask = _mm_or_si128(_mm_or_si128(r, g), b); 1.675 + 1.676 + // Interleave R,G,B into the lower byte of word. 1.677 + // i.e. split the sixteen 8-bit values from mask into two sets of eight 1.678 + // 16-bit values, padded by zero. 
1.679 + __m128i maskLo, maskHi; 1.680 + // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 1.681 + maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 1.682 + // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 1.683 + maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 1.684 + 1.685 + // Upscale from 0..31 to 0..32 1.686 + // (allows replacing the division by a right-shift further down) 1.687 + // Right-shift each component by 4 and add the result back to that component, 1.688 + // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 1.689 + maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 1.690 + maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 1.691 + 1.692 + // Interleave R,G,B into the lower byte of the word 1.693 + // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 1.694 + __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 1.695 + // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 1.696 + __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 1.697 + 1.698 + // mask = (src - dst) * mask 1.699 + maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 1.700 + maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 1.701 + 1.702 + // mask = (src - dst) * mask >> 5 1.703 + maskLo = _mm_srai_epi16(maskLo, 5); 1.704 + maskHi = _mm_srai_epi16(maskHi, 5); 1.705 + 1.706 + // Add two pixels into result. 1.707 + // result = dst + ((src - dst) * mask >> 5) 1.708 + __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 1.709 + __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 1.710 + 1.711 + // Pack into 4 32bit dst pixels and force opaque. 1.712 + // resultLo and resultHi contain eight 16-bit components (two pixels) each. 1.713 + // Merge into one SSE register with sixteen 8-bit values (four pixels), 1.714 + // clamping to 255 if necessary. Set alpha components to 0xFF.
// Re-pack the two 16-bit halves to bytes and OR the alpha mask into each 32-bit lane.
    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
}

/* SSE2 row blitter that blends one colour into dst through 16-bit masks.
 *
 * dst   - row of 32-bit pixels, updated in place. When width >= 4 it is
 *         asserted to be 4-byte aligned.
 * mask  - one 16-bit mask value per destination pixel (presumably packed
 *         RGB coverage, judging by the names used below -- confirm against
 *         SkBlendLCD16).
 * src   - colour to blend. A/R/G/B are extracted with SkColorGet*(), and the
 *         alpha is rescaled with SkAlpha255To256().
 * width - number of pixels; values <= 0 return immediately.
 * The trailing unnamed SkPMColor parameter is not used.
 *
 * Pixels in front of the first 16-byte boundary of dst, and any leftover
 * pixels at the end (fewer than four), go through the scalar SkBlendLCD16().
 * Everything in between is handled four pixels at a time by
 * SkBlendLCD16_SSE2().
 */
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
                         SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        // dst advances 4 bytes per pixel, so the loop below can only reach a
        // 16-byte boundary if dst starts out 4-byte aligned.
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Scalar head: step until dst is 16-byte aligned, which the aligned
        // _mm_load_si128/_mm_store_si128 in the vector loop require.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Set srcA_sse to contain eight 16-bit copies of srcA.
        __m128i srcA_sse = _mm_set1_epi16(srcA);
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse
            // (the upper half is zeroed by _mm_loadl_epi64).
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Compare every 16-bit lane with zero and collect the top bit of
            // each result byte. The zeroed upper half always compares equal,
            // so pack_cmp is 0xFFFF exactly when all four masks are zero.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // If the mask pixels are not all zero, blend the dst pixels;
            // otherwise leave these four dst pixels untouched.
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
                                                   mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail; this is also the whole job when width < 4.
    while (width > 0) {
        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}

/* SSE2 row blitter for the opaque-source case of the 16-bit mask blend.
 *
 * dst       - row of 32-bit pixels, updated in place. When width >= 4 it is
 *             asserted to be 4-byte aligned.
 * mask      - one 16-bit mask value per destination pixel.
 * src       - colour to blend; only its R/G/B components are read.
 * width     - number of pixels; values <= 0 return immediately.
 * opaqueDst - forwarded to the scalar SkBlendLCD16Opaque() only. The vector
 *             path (SkBlendLCD16Opaque_SSE2) does not receive it.
 *
 * Same structure as SkBlitLCD16Row_SSE2: scalar head until dst is 16-byte
 * aligned, four pixels per iteration with SSE2, then a scalar tail.
 */
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    if (width >= 4) {
        // dst advances 4 bytes per pixel, so the loop below can only reach a
        // 16-byte boundary if dst starts out 4-byte aligned.
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Scalar head: step until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse
            // (the upper half is zeroed by _mm_loadl_epi64).
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Compare every 16-bit lane with zero and collect the top bit of
            // each result byte. The zeroed upper half always compares equal,
            // so pack_cmp is 0xFFFF exactly when all four masks are zero.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // If the mask pixels are not all zero, blend the dst pixels;
            // otherwise leave these four dst pixels untouched.
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
                                                         mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail; this is also the whole job when width < 4.
    while (width > 0) {
        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}

/* SSE2 version of S32_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Converts count 32-bit SkPMColor pixels from src into 16-bit pixels in dst.
 * alpha is only checked by the assert (it must be 255); the x and y
 * parameters are unused. Rows shorter than 8 pixels are converted entirely
 * by the scalar loop at the end.
 */
void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar head: step until dst is 16-byte aligned so the vector loop
        // can use the aligned store. src is read with unaligned loads, so its
        // alignment does not matter.
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            *dst++ = SkPixel32ToPixel16_ToU16(c);
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        // Per-channel 16-bit masks, replicated into every 32-bit lane.
        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Calculate result r: shift the channel down to its 16-bit
            // precision, mask it, then narrow 2 x 4 lanes to 8 16-bit lanes.
            __m128i r1 = _mm_srli_epi32(src_pixel1,
                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
            r1 = _mm_and_si128(r1, r16_mask);
            __m128i r2 = _mm_srli_epi32(src_pixel2,
                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
            r2 = _mm_and_si128(r2, r16_mask);
            __m128i r = _mm_packs_epi32(r1, r2);

            // Calculate result g.
            __m128i g1 = _mm_srli_epi32(src_pixel1,
                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
            g1 = _mm_and_si128(g1, g16_mask);
            __m128i g2 = _mm_srli_epi32(src_pixel2,
                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
            g2 = _mm_and_si128(g2, g16_mask);
            __m128i g = _mm_packs_epi32(g1, g2);

            // Calculate result b.
            __m128i b1 = _mm_srli_epi32(src_pixel1,
                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
            b1 = _mm_and_si128(b1, b16_mask);
            __m128i b2 = _mm_srli_epi32(src_pixel2,
                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
            b2 = _mm_and_si128(b2, b16_mask);
            __m128i b = _mm_packs_epi32(b1, b2);

            // Store 8 16-bit colors in dst.
            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }
        // Hand the advanced positions back to the scalar tail.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (fewer than 8 pixels remain after the vector loop).
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            *dst++ = SkPixel32ToPixel16_ToU16(c);
        } while (--count != 0);
    }
}

/* SSE2 version of S32A_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends count 32-bit SkPMColor pixels from src over the 16-bit pixels in
 * dst, following SkSrcOver32To16(). alpha is only checked by the assert (it
 * must be 255); the x and y parameters are unused. Source pixels equal to
 * zero leave the destination untouched.
 */
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src,
                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar head: advance until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var255 = _mm_set1_epi16(255);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Check whether src pixels are equal to 0 and get the highest bit
            // of each byte of result, if src pixels are all zero, src_cmp1 and
            // src_cmp2 will be 0xFFFF.
            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
                                             _mm_setzero_si128()));
            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
                                             _mm_setzero_si128()));
            // All eight source pixels are zero: skip the blend and leave dst
            // as it is. s was already advanced by the two loads above.
            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
                d++;
                count -= 8;
                continue;
            }

            // Load 8 pixels of dst.
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src: shift the channel to the top byte of each
            // 32-bit lane, shift it back down to the bottom byte, then narrow
            // the 2 x 4 lanes into 8 16-bit lanes.
            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa

            // Calculate R G B of result.
            // Original algorithm is in SkSrcOver32To16().
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        // Hand the advanced positions back to the scalar tail.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (fewer than 8 pixels remain after the vector loop).
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* SSE2 row converter from 32-bit SkPMColor to 16-bit pixels with dithering.
 * (Presumably the SSE2 counterpart of S32_D565_Opaque_Dither() -- confirm.)
 *
 * dst   - destination row of 16-bit pixels.
 * src   - source row of 32-bit pixels.
 * count - number of pixels; values <= 0 return immediately.
 * alpha - only checked by the assert; must be 255.
 * x, y  - position used to select the dither value: y picks the matrix row,
 *         x (mod 4) picks the entry within it.
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar head: step until dst is 16-byte aligned, keeping x in sync
        // so the dither phase is right when the vector loop starts.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Build the dither values for 8 consecutive pixels starting at the
        // current x. The values repeat every 4 pixels, so lanes 4..7 mirror
        // lanes 0..3.
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Each matrix row is packed as four 4-bit entries in one 16-bit word.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32To565(sr, dither)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Green dither step: uses half the dither value and a (sg >> 6)
            // correction. (The original comment named SkDITHER_R32To565 here;
            // presumably SkDITHER_G32To565(sg, dither) is meant.)
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Blue dither step, same form as red. (The original comment named
            // SkDITHER_R32To565 here; presumably SkDITHER_B32To565(sb, dither)
            // is meant.)
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 16-bit dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // x moves by a multiple of 4, so the 4-periodic dither vector
            // built above stays in phase for the next 8 pixels.
            x += 8;
        }

        // Hand the advanced positions back to the scalar tail.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (fewer than 8 pixels remain after the vector loop).
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

/* SSE2 version of S32A_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends count 32-bit SkPMColor pixels from src onto the 16-bit pixels in
 * dst, applying a dither value that is scaled by each source pixel's alpha.
 * alpha is only checked by the assert (it must be 255). y picks the dither
 * matrix row and x (mod 4) picks the entry within it.
 */
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar head: step until dst is 16-byte aligned, keeping x in sync
        // so the dither phase is right when the vector loop starts.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the source alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }

        // Build the dither values for 8 consecutive pixels starting at the
        // current x. The values repeat every 4 pixels, so lanes 4..7 mirror
        // lanes 0..3.
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Each matrix row is packed as four 4-bit entries in one 16-bit word.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var256 = _mm_set1_epi16(256);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src.
            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Calculate current dither value: (dither * (sa + 1)) >> 8,
            // the vector form of the scalar path's
            // SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)).
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32_FOR_565(sr, d)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr.
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d).
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg.
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d).
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb.
            sb = _mm_slli_epi16(sb, 2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // SkAlpha255To256(255 - a) >> 3
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);

            // Per channel: (dst * isa + expanded src) >> 5, the per-channel
            // form of the scalar path's expanded-format blend.
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Package and store dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // x moves by a multiple of 4, so the 4-periodic dither vector
            // built above stays in phase for the next 8 pixels.
            x += 8;
        }

        // Hand the advanced positions back to the scalar tail.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (fewer than 8 pixels remain after the vector loop).
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the source alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}