gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1364 @@
     1.4 +/*
     1.5 + * Copyright 2012 The Android Open Source Project
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + */
    1.10 +
    1.11 +
    1.12 +#include "SkBlitRow_opts_SSE2.h"
    1.13 +#include "SkBitmapProcState_opts_SSE2.h"
    1.14 +#include "SkColorPriv.h"
    1.15 +#include "SkColor_opts_SSE2.h"
    1.16 +#include "SkDither.h"
    1.17 +#include "SkUtils.h"
    1.18 +
    1.19 +#include <emmintrin.h>
    1.20 +
    1.21 +/* SSE2 version of S32_Blend_BlitRow32()
    1.22 + * portable version is in core/SkBlitRow_D32.cpp
    1.23 + */
    1.24 +void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    1.25 +                              const SkPMColor* SK_RESTRICT src,
    1.26 +                              int count, U8CPU alpha) {
    1.27 +    SkASSERT(alpha <= 255);
    1.28 +    if (count <= 0) {
    1.29 +        return;
    1.30 +    }
    1.31 +
    1.32 +    uint32_t src_scale = SkAlpha255To256(alpha);
    1.33 +    uint32_t dst_scale = 256 - src_scale;
    1.34 +
    1.35 +    if (count >= 4) {
    1.36 +        SkASSERT(((size_t)dst & 0x03) == 0);
    1.37 +        while (((size_t)dst & 0x0F) != 0) {
    1.38 +            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    1.39 +            src++;
    1.40 +            dst++;
    1.41 +            count--;
    1.42 +        }
    1.43 +
    1.44 +        const __m128i *s = reinterpret_cast<const __m128i*>(src);
    1.45 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
    1.46 +        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    1.47 +        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
    1.48 +
    1.49 +        // Move scale factors to upper byte of word
    1.50 +        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
    1.51 +        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
    1.52 +        while (count >= 4) {
    1.53 +            // Load 4 pixels each of src and dest.
    1.54 +            __m128i src_pixel = _mm_loadu_si128(s);
    1.55 +            __m128i dst_pixel = _mm_load_si128(d);
    1.56 +
    1.57 +            // Interleave Atom port 0/1 operations based on the execution port
    1.58 +            // constraints that multiply can only be executed on port 0 (while
    1.59 +            // boolean operations can be executed on either port 0 or port 1)
    1.60 +            // because GCC currently doesn't do a good job scheduling
    1.61 +            // instructions based on these constraints.
    1.62 +
    1.63 +            // Get red and blue pixels into lower byte of each word.
    1.64 +            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
    1.65 +            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    1.66 +
    1.67 +            // Multiply by scale.
    1.68 +            // (4 x (0, rs.h, 0, bs.h))
    1.69 +            // where rs.h stands for the higher byte of r * scale, and
    1.70 +            // bs.h the higher byte of b * scale.
    1.71 +            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
    1.72 +
    1.73 +            // Get alpha and green pixels into higher byte of each word.
    1.74 +            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
    1.75 +            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
    1.76 +
    1.77 +            // Multiply by scale.
    1.78 +            // (4 x (as.h, as.l, gs.h, gs.l))
    1.79 +            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
    1.80 +
    1.81 +            // Clear the lower byte of the a*scale and g*scale results
    1.82 +            // (4 x (as.h, 0, gs.h, 0))
    1.83 +            src_ag = _mm_and_si128(src_ag, ag_mask);
    1.84 +
    1.85 +            // Operations the destination pixels are the same as on the
    1.86 +            // source pixels. See the comments above.
    1.87 +            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    1.88 +            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
    1.89 +            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
    1.90 +            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
    1.91 +            dst_ag = _mm_and_si128(dst_ag, ag_mask);
    1.92 +
    1.93 +            // Combine back into RGBA.
    1.94 +            // (4 x (as.h, rs.h, gs.h, bs.h))
    1.95 +            src_pixel = _mm_or_si128(src_rb, src_ag);
    1.96 +            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    1.97 +
    1.98 +            // Add result
    1.99 +            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   1.100 +            _mm_store_si128(d, result);
   1.101 +            s++;
   1.102 +            d++;
   1.103 +            count -= 4;
   1.104 +        }
   1.105 +        src = reinterpret_cast<const SkPMColor*>(s);
   1.106 +        dst = reinterpret_cast<SkPMColor*>(d);
   1.107 +    }
   1.108 +
   1.109 +    while (count > 0) {
   1.110 +        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
   1.111 +        src++;
   1.112 +        dst++;
   1.113 +        count--;
   1.114 +    }
   1.115 +}
   1.116 +
   1.117 +void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
   1.118 +                                const SkPMColor* SK_RESTRICT src,
   1.119 +                                int count, U8CPU alpha) {
   1.120 +    SkASSERT(alpha == 255);
   1.121 +    if (count <= 0) {
   1.122 +        return;
   1.123 +    }
   1.124 +
   1.125 +    if (count >= 4) {
   1.126 +        SkASSERT(((size_t)dst & 0x03) == 0);
   1.127 +        while (((size_t)dst & 0x0F) != 0) {
   1.128 +            *dst = SkPMSrcOver(*src, *dst);
   1.129 +            src++;
   1.130 +            dst++;
   1.131 +            count--;
   1.132 +        }
   1.133 +
   1.134 +        const __m128i *s = reinterpret_cast<const __m128i*>(src);
   1.135 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.136 +#ifdef SK_USE_ACCURATE_BLENDING
   1.137 +        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   1.138 +        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
   1.139 +        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
   1.140 +        while (count >= 4) {
   1.141 +            // Load 4 pixels
   1.142 +            __m128i src_pixel = _mm_loadu_si128(s);
   1.143 +            __m128i dst_pixel = _mm_load_si128(d);
   1.144 +
   1.145 +            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   1.146 +            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   1.147 +            // Shift alphas down to lower 8 bits of each quad.
   1.148 +            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
   1.149 +
   1.150 +            // Copy alpha to upper 3rd byte of each quad
   1.151 +            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
   1.152 +
   1.153 +            // Subtract alphas from 255, to get 0..255
   1.154 +            alpha = _mm_sub_epi16(c_255, alpha);
   1.155 +
   1.156 +            // Multiply by red and blue by src alpha.
   1.157 +            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
   1.158 +            // Multiply by alpha and green by src alpha.
   1.159 +            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
   1.160 +
   1.161 +            // dst_rb_low = (dst_rb >> 8)
   1.162 +            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
   1.163 +            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
   1.164 +
   1.165 +            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
   1.166 +            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
   1.167 +            dst_rb = _mm_add_epi16(dst_rb, c_128);
   1.168 +            dst_rb = _mm_srli_epi16(dst_rb, 8);
   1.169 +
   1.170 +            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
   1.171 +            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
   1.172 +            dst_ag = _mm_add_epi16(dst_ag, c_128);
   1.173 +            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   1.174 +
   1.175 +            // Combine back into RGBA.
   1.176 +            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   1.177 +
   1.178 +            // Add result
   1.179 +            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   1.180 +            _mm_store_si128(d, result);
   1.181 +            s++;
   1.182 +            d++;
   1.183 +            count -= 4;
   1.184 +        }
   1.185 +    #else
   1.186 +        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   1.187 +        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
   1.188 +        while (count >= 4) {
   1.189 +            // Load 4 pixels
   1.190 +            __m128i src_pixel = _mm_loadu_si128(s);
   1.191 +            __m128i dst_pixel = _mm_load_si128(d);
   1.192 +
   1.193 +            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   1.194 +            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   1.195 +
   1.196 +            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
   1.197 +            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
   1.198 +
   1.199 +            // (a0, a0, a1, a1, a2, g2, a3, g3)
   1.200 +            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
   1.201 +
   1.202 +            // (a0, a0, a1, a1, a2, a2, a3, a3)
   1.203 +            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
   1.204 +
   1.205 +            // Subtract alphas from 256, to get 1..256
   1.206 +            alpha = _mm_sub_epi16(c_256, alpha);
   1.207 +
   1.208 +            // Multiply by red and blue by src alpha.
   1.209 +            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
   1.210 +            // Multiply by alpha and green by src alpha.
   1.211 +            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
   1.212 +
   1.213 +            // Divide by 256.
   1.214 +            dst_rb = _mm_srli_epi16(dst_rb, 8);
   1.215 +
   1.216 +            // Mask out high bits (already in the right place)
   1.217 +            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   1.218 +
   1.219 +            // Combine back into RGBA.
   1.220 +            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   1.221 +
   1.222 +            // Add result
   1.223 +            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   1.224 +            _mm_store_si128(d, result);
   1.225 +            s++;
   1.226 +            d++;
   1.227 +            count -= 4;
   1.228 +        }
   1.229 +#endif
   1.230 +        src = reinterpret_cast<const SkPMColor*>(s);
   1.231 +        dst = reinterpret_cast<SkPMColor*>(d);
   1.232 +    }
   1.233 +
   1.234 +    while (count > 0) {
   1.235 +        *dst = SkPMSrcOver(*src, *dst);
   1.236 +        src++;
   1.237 +        dst++;
   1.238 +        count--;
   1.239 +    }
   1.240 +}
   1.241 +
   1.242 +void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
   1.243 +                               const SkPMColor* SK_RESTRICT src,
   1.244 +                               int count, U8CPU alpha) {
   1.245 +    SkASSERT(alpha <= 255);
   1.246 +    if (count <= 0) {
   1.247 +        return;
   1.248 +    }
   1.249 +
   1.250 +    if (count >= 4) {
   1.251 +        while (((size_t)dst & 0x0F) != 0) {
   1.252 +            *dst = SkBlendARGB32(*src, *dst, alpha);
   1.253 +            src++;
   1.254 +            dst++;
   1.255 +            count--;
   1.256 +        }
   1.257 +
   1.258 +        uint32_t src_scale = SkAlpha255To256(alpha);
   1.259 +
   1.260 +        const __m128i *s = reinterpret_cast<const __m128i*>(src);
   1.261 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.262 +        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
   1.263 +        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   1.264 +        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
   1.265 +        while (count >= 4) {
   1.266 +            // Load 4 pixels each of src and dest.
   1.267 +            __m128i src_pixel = _mm_loadu_si128(s);
   1.268 +            __m128i dst_pixel = _mm_load_si128(d);
   1.269 +
   1.270 +            // Get red and blue pixels into lower byte of each word.
   1.271 +            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   1.272 +            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   1.273 +
   1.274 +            // Get alpha and green into lower byte of each word.
   1.275 +            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   1.276 +            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   1.277 +
   1.278 +            // Put per-pixel alpha in low byte of each word.
   1.279 +            // After the following two statements, the dst_alpha looks like
   1.280 +            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
   1.281 +            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
   1.282 +            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
   1.283 +
   1.284 +            // dst_alpha = dst_alpha * src_scale
   1.285 +            // Because src_scales are in the higher byte of each word and
   1.286 +            // we use mulhi here, the resulting alpha values are already
   1.287 +            // in the right place and don't need to be divided by 256.
   1.288 +            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
   1.289 +            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
   1.290 +
   1.291 +            // Subtract alphas from 256, to get 1..256
   1.292 +            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
   1.293 +
   1.294 +            // Multiply red and blue by dst pixel alpha.
   1.295 +            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
   1.296 +            // Multiply alpha and green by dst pixel alpha.
   1.297 +            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
   1.298 +
   1.299 +            // Multiply red and blue by global alpha.
   1.300 +            // (4 x (0, rs.h, 0, bs.h))
   1.301 +            // where rs.h stands for the higher byte of r * src_scale,
   1.302 +            // and bs.h the higher byte of b * src_scale.
   1.303 +            // Again, because we use mulhi, the resuling red and blue
   1.304 +            // values are already in the right place and don't need to
   1.305 +            // be divided by 256.
   1.306 +            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
   1.307 +            // Multiply alpha and green by global alpha.
   1.308 +            // (4 x (0, as.h, 0, gs.h))
   1.309 +            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
   1.310 +
   1.311 +            // Divide by 256.
   1.312 +            dst_rb = _mm_srli_epi16(dst_rb, 8);
   1.313 +
   1.314 +            // Mask out low bits (goodies already in the right place; no need to divide)
   1.315 +            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   1.316 +            // Shift alpha and green to higher byte of each word.
   1.317 +            // (4 x (as.h, 0, gs.h, 0))
   1.318 +            src_ag = _mm_slli_epi16(src_ag, 8);
   1.319 +
   1.320 +            // Combine back into RGBA.
   1.321 +            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   1.322 +            src_pixel = _mm_or_si128(src_rb, src_ag);
   1.323 +
   1.324 +            // Add two pixels into result.
   1.325 +            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   1.326 +            _mm_store_si128(d, result);
   1.327 +            s++;
   1.328 +            d++;
   1.329 +            count -= 4;
   1.330 +        }
   1.331 +        src = reinterpret_cast<const SkPMColor*>(s);
   1.332 +        dst = reinterpret_cast<SkPMColor*>(d);
   1.333 +    }
   1.334 +
   1.335 +    while (count > 0) {
   1.336 +        *dst = SkBlendARGB32(*src, *dst, alpha);
   1.337 +        src++;
   1.338 +        dst++;
   1.339 +        count--;
   1.340 +    }
   1.341 +}
   1.342 +
   1.343 +/* SSE2 version of Color32()
   1.344 + * portable version is in core/SkBlitRow_D32.cpp
   1.345 + */
   1.346 +void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
   1.347 +                  SkPMColor color) {
   1.348 +
   1.349 +    if (count <= 0) {
   1.350 +        return;
   1.351 +    }
   1.352 +
   1.353 +    if (0 == color) {
   1.354 +        if (src != dst) {
   1.355 +            memcpy(dst, src, count * sizeof(SkPMColor));
   1.356 +        }
   1.357 +        return;
   1.358 +    }
   1.359 +
   1.360 +    unsigned colorA = SkGetPackedA32(color);
   1.361 +    if (255 == colorA) {
   1.362 +        sk_memset32(dst, color, count);
   1.363 +    } else {
   1.364 +        unsigned scale = 256 - SkAlpha255To256(colorA);
   1.365 +
   1.366 +        if (count >= 4) {
   1.367 +            SkASSERT(((size_t)dst & 0x03) == 0);
   1.368 +            while (((size_t)dst & 0x0F) != 0) {
   1.369 +                *dst = color + SkAlphaMulQ(*src, scale);
   1.370 +                src++;
   1.371 +                dst++;
   1.372 +                count--;
   1.373 +            }
   1.374 +
   1.375 +            const __m128i *s = reinterpret_cast<const __m128i*>(src);
   1.376 +            __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.377 +            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   1.378 +            __m128i src_scale_wide = _mm_set1_epi16(scale);
   1.379 +            __m128i color_wide = _mm_set1_epi32(color);
   1.380 +            while (count >= 4) {
   1.381 +                // Load 4 pixels each of src and dest.
   1.382 +                __m128i src_pixel = _mm_loadu_si128(s);
   1.383 +
   1.384 +                // Get red and blue pixels into lower byte of each word.
   1.385 +                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   1.386 +
   1.387 +                // Get alpha and green into lower byte of each word.
   1.388 +                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   1.389 +
   1.390 +                // Multiply by scale.
   1.391 +                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
   1.392 +                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
   1.393 +
   1.394 +                // Divide by 256.
   1.395 +                src_rb = _mm_srli_epi16(src_rb, 8);
   1.396 +                src_ag = _mm_andnot_si128(rb_mask, src_ag);
   1.397 +
   1.398 +                // Combine back into RGBA.
   1.399 +                src_pixel = _mm_or_si128(src_rb, src_ag);
   1.400 +
   1.401 +                // Add color to result.
   1.402 +                __m128i result = _mm_add_epi8(color_wide, src_pixel);
   1.403 +
   1.404 +                // Store result.
   1.405 +                _mm_store_si128(d, result);
   1.406 +                s++;
   1.407 +                d++;
   1.408 +                count -= 4;
   1.409 +            }
   1.410 +            src = reinterpret_cast<const SkPMColor*>(s);
   1.411 +            dst = reinterpret_cast<SkPMColor*>(d);
   1.412 +         }
   1.413 +
   1.414 +        while (count > 0) {
   1.415 +            *dst = color + SkAlphaMulQ(*src, scale);
   1.416 +            src += 1;
   1.417 +            dst += 1;
   1.418 +            count--;
   1.419 +        }
   1.420 +    }
   1.421 +}
   1.422 +
   1.423 +void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
   1.424 +                               size_t maskRB, SkColor origColor,
   1.425 +                               int width, int height) {
   1.426 +    SkPMColor color = SkPreMultiplyColor(origColor);
   1.427 +    size_t dstOffset = dstRB - (width << 2);
   1.428 +    size_t maskOffset = maskRB - width;
   1.429 +    SkPMColor* dst = (SkPMColor *)device;
   1.430 +    const uint8_t* mask = (const uint8_t*)maskPtr;
   1.431 +    do {
   1.432 +        int count = width;
   1.433 +        if (count >= 4) {
   1.434 +            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
   1.435 +                *dst = SkBlendARGB32(color, *dst, *mask);
   1.436 +                mask++;
   1.437 +                dst++;
   1.438 +                count--;
   1.439 +            }
   1.440 +            __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.441 +            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   1.442 +            __m128i c_256 = _mm_set1_epi16(256);
   1.443 +            __m128i c_1 = _mm_set1_epi16(1);
   1.444 +            __m128i src_pixel = _mm_set1_epi32(color);
   1.445 +            while (count >= 4) {
   1.446 +                // Load 4 pixels each of src and dest.
   1.447 +                __m128i dst_pixel = _mm_load_si128(d);
   1.448 +
   1.449 +                //set the aphla value
   1.450 +                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
   1.451 +                                0, *(mask+3),0, \
   1.452 +                                *(mask+2),0, *(mask+2),\
   1.453 +                                0,*(mask+1), 0,*(mask+1),\
   1.454 +                                0, *mask,0,*mask);
   1.455 +
   1.456 +                //call SkAlpha255To256()
   1.457 +                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
   1.458 +
   1.459 +                // Get red and blue pixels into lower byte of each word.
   1.460 +                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   1.461 +                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   1.462 +
   1.463 +                // Get alpha and green into lower byte of each word.
   1.464 +                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   1.465 +                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   1.466 +
   1.467 +                // Put per-pixel alpha in low byte of each word.
   1.468 +                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
   1.469 +                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
   1.470 +
   1.471 +                // dst_alpha = dst_alpha * src_scale
   1.472 +                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
   1.473 +
   1.474 +                // Divide by 256.
   1.475 +                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
   1.476 +
   1.477 +                // Subtract alphas from 256, to get 1..256
   1.478 +                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
   1.479 +                // Multiply red and blue by dst pixel alpha.
   1.480 +                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
   1.481 +                // Multiply alpha and green by dst pixel alpha.
   1.482 +                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
   1.483 +
   1.484 +                // Multiply red and blue by global alpha.
   1.485 +                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
   1.486 +                // Multiply alpha and green by global alpha.
   1.487 +                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
   1.488 +                // Divide by 256.
   1.489 +                dst_rb = _mm_srli_epi16(dst_rb, 8);
   1.490 +                src_rb = _mm_srli_epi16(src_rb, 8);
   1.491 +
   1.492 +                // Mask out low bits (goodies already in the right place; no need to divide)
   1.493 +                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   1.494 +                src_ag = _mm_andnot_si128(rb_mask, src_ag);
   1.495 +
   1.496 +                // Combine back into RGBA.
   1.497 +                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   1.498 +                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
   1.499 +
   1.500 +                // Add two pixels into result.
   1.501 +                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
   1.502 +                _mm_store_si128(d, result);
   1.503 +                // load the next 4 pixel
   1.504 +                mask = mask + 4;
   1.505 +                d++;
   1.506 +                count -= 4;
   1.507 +            }
   1.508 +            dst = reinterpret_cast<SkPMColor *>(d);
   1.509 +        }
   1.510 +        while(count > 0) {
   1.511 +            *dst= SkBlendARGB32(color, *dst, *mask);
   1.512 +            dst += 1;
   1.513 +            mask++;
   1.514 +            count --;
   1.515 +        }
   1.516 +        dst = (SkPMColor *)((char*)dst + dstOffset);
   1.517 +        mask += maskOffset;
   1.518 +    } while (--height != 0);
   1.519 +}
   1.520 +
   // Per-channel shift amounts that move the top 5 bits of a component of the
   // 16-bit mask onto the corresponding component of an SkPMColor. A positive
   // amount is a left shift and a negative one a right shift.
   // Note that the mask's RGB16 order may differ from the SkPMColor order.
   //
   // Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) macro applies that shift to all
   // four 32-bit lanes of x; it does not mask off the neighbouring bits.

   // Red
   #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
   #if SK_R16x5_R32x5_SHIFT > 0
       #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
   #elif SK_R16x5_R32x5_SHIFT < 0
       #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
   #else
       #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
   #endif

   // Green
   #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
   #if SK_G16x5_G32x5_SHIFT > 0
       #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
   #elif SK_G16x5_G32x5_SHIFT < 0
       #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
   #else
       #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
   #endif

   // Blue
   #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
   #if SK_B16x5_B32x5_SHIFT > 0
       #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
   #elif SK_B16x5_B32x5_SHIFT < 0
       #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
   #else
       #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
   #endif
   1.551 +
   1.552 +static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
   1.553 +                                 __m128i &mask, __m128i &srcA) {
   1.554 +    // In the following comments, the components of src, dst and mask are
   1.555 +    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
   1.556 +    // by an R, G, B, or A suffix. Components of one of the four pixels that
   1.557 +    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
   1.558 +    // example is the blue channel of the second destination pixel. Memory
   1.559 +    // layout is shown for an ARGB byte order in a color value.
   1.560 +
   1.561 +    // src and srcA store 8-bit values interleaved with zeros.
   1.562 +    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   1.563 +    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
   1.564 +    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
   1.565 +    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
   1.566 +    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
   1.567 +    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   1.568 +    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   1.569 +
   1.570 +    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
   1.571 +    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
   1.572 +    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
   1.573 +                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
   1.574 +
   1.575 +    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
   1.576 +    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
   1.577 +                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
   1.578 +
   1.579 +    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
   1.580 +    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
   1.581 +                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
   1.582 +
   1.583 +    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
   1.584 +    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
   1.585 +    // 8-bit position
   1.586 +    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
   1.587 +    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
   1.588 +    mask = _mm_or_si128(_mm_or_si128(r, g), b);
   1.589 +
   1.590 +    // Interleave R,G,B into the lower byte of word.
   1.591 +    // i.e. split the sixteen 8-bit values from mask into two sets of eight
   1.592 +    // 16-bit values, padded by zero.
   1.593 +    __m128i maskLo, maskHi;
   1.594 +    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
   1.595 +    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
   1.596 +    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
   1.597 +    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
   1.598 +
   1.599 +    // Upscale from 0..31 to 0..32
   1.600 +    // (allows to replace division by left-shift further down)
   1.601 +    // Left-shift each component by 4 and add the result back to that component,
   1.602 +    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
   1.603 +    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
   1.604 +    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
   1.605 +
   1.606 +    // Multiply each component of maskLo and maskHi by srcA
   1.607 +    maskLo = _mm_mullo_epi16(maskLo, srcA);
   1.608 +    maskHi = _mm_mullo_epi16(maskHi, srcA);
   1.609 +
   1.610 +    // Left shift mask components by 8 (divide by 256)
   1.611 +    maskLo = _mm_srli_epi16(maskLo, 8);
   1.612 +    maskHi = _mm_srli_epi16(maskHi, 8);
   1.613 +
   1.614 +    // Interleave R,G,B into the lower byte of the word
   1.615 +    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
   1.616 +    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
   1.617 +    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
   1.618 +    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
   1.619 +
   1.620 +    // mask = (src - dst) * mask
   1.621 +    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
   1.622 +    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
   1.623 +
   1.624 +    // mask = (src - dst) * mask >> 5
   1.625 +    maskLo = _mm_srai_epi16(maskLo, 5);
   1.626 +    maskHi = _mm_srai_epi16(maskHi, 5);
   1.627 +
   1.628 +    // Add two pixels into result.
   1.629 +    // result = dst + ((src - dst) * mask >> 5)
   1.630 +    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
   1.631 +    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
   1.632 +
   1.633 +    // Pack into 4 32bit dst pixels.
   1.634 +    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
   1.635 +    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
   1.636 +    // clamping to 255 if necessary.
   1.637 +    return _mm_packus_epi16(resultLo, resultHi);
   1.638 +}
   1.639 +
   1.640 +static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
   1.641 +                                       __m128i &mask) {
   1.642 +    // Blends four dst pixels toward src using the per-channel LCD coverage in
   1.643 +    // mask and returns the results with alpha forced to 0xFF. mask is passed
   1.644 +    // by reference and is overwritten with the expanded coverage. Below,
   1.645 +    // (s)rc, (d)st and (m)ask components carry an R, G, B or A suffix and a
   1.646 +    // pixel index 0..3; "d1B", for example, is the blue channel of the
   1.647 +    // second destination pixel. Layouts are shown for an ARGB byte order.
   1.648 +
   1.649 +    // src stores 8-bit values interleaved with zeros.
   1.650 +    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   1.651 +    // mask stores 16-bit values (shown as high and low bytes) interleaved with
   1.652 +    // zeros
   1.653 +    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   1.654 +    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   1.655 +
   1.656 +    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
   1.657 +    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
   1.658 +    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
   1.659 +                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
   1.660 +
   1.661 +    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
   1.662 +    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
   1.663 +                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
   1.664 +
   1.665 +    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
   1.666 +    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
   1.667 +                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
   1.668 +
   1.669 +    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
   1.670 +    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
   1.671 +    // 8-bit position
   1.672 +    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
   1.673 +    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
   1.674 +    mask = _mm_or_si128(_mm_or_si128(r, g), b);
   1.675 +
   1.676 +    // Interleave R,G,B into the lower byte of word.
   1.677 +    // i.e. split the sixteen 8-bit values from mask into two sets of eight
   1.678 +    // 16-bit values, padded by zero.
   1.679 +    __m128i maskLo, maskHi;
   1.680 +    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
   1.681 +    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
   1.682 +    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
   1.683 +    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
   1.684 +
   1.685 +    // Upscale from 0..31 to 0..32
   1.686 +    // (allows to replace division by right-shift further down)
   1.687 +    // Right-shift each component by 4 and add the result back to that
   1.688 +    // component, mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
   1.689 +    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
   1.690 +    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
   1.691 +
   1.692 +    // Interleave R,G,B into the lower byte of the word
   1.693 +    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
   1.694 +    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
   1.695 +    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
   1.696 +    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
   1.697 +
   1.698 +    // mask = (src - dst) * mask
   1.699 +    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
   1.700 +    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
   1.701 +
   1.702 +    // mask = (src - dst) * mask >> 5 (arithmetic shift: the product is signed)
   1.703 +    maskLo = _mm_srai_epi16(maskLo, 5);
   1.704 +    maskHi = _mm_srai_epi16(maskHi, 5);
   1.705 +
   1.706 +    // Add two pixels into result.
   1.707 +    // result = dst + ((src - dst) * mask >> 5)
   1.708 +    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
   1.709 +    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
   1.710 +
   1.711 +    // Pack into 4 32bit dst pixels and force opaque.
   1.712 +    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
   1.713 +    // Merge into one SSE register with sixteen 8-bit values (four pixels),
   1.714 +    // clamping to 255 if necessary. Set alpha components to 0xFF.
   1.715 +    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
   1.716 +                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
   1.717 +}
   1.718 +
   1.719 +void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
   1.720 +                         SkColor src, int width, SkPMColor) {
   1.721 +    if (width <= 0) {
   1.722 +        return;
   1.723 +    }
   1.724 +    // Blend color src into dst[0..width) via the LCD16 mask; last param unused.
   1.725 +    int srcA = SkColorGetA(src);
   1.726 +    int srcR = SkColorGetR(src);
   1.727 +    int srcG = SkColorGetG(src);
   1.728 +    int srcB = SkColorGetB(src);
   1.729 +
   1.730 +    srcA = SkAlpha255To256(srcA);
   1.731 +
   1.732 +    if (width >= 4) {
   1.733 +        SkASSERT(((size_t)dst & 0x03) == 0);
   1.734 +        while (((size_t)dst & 0x0F) != 0) {  // scalar until dst is 16-byte aligned
   1.735 +            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
   1.736 +            mask++;
   1.737 +            dst++;
   1.738 +            width--;
   1.739 +        }
   1.740 +
   1.741 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.742 +        // Set alpha to 0xFF and replicate source four times in SSE register.
   1.743 +        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
   1.744 +        // Interleave with zeros to get two sets of four 16-bit values.
   1.745 +        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
   1.746 +        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   1.747 +        // Replicate srcA into all eight 16-bit lanes of srcA_sse.
   1.748 +        __m128i srcA_sse = _mm_set1_epi16(srcA);
   1.749 +        while (width >= 4) {
   1.750 +            // Load four destination pixels into dst_sse (aligned load).
   1.751 +            __m128i dst_sse = _mm_load_si128(d);
   1.752 +            // Load four 16-bit masks into lower half of mask_sse.
   1.753 +            __m128i mask_sse = _mm_loadl_epi64(
   1.754 +                                   reinterpret_cast<const __m128i*>(mask));
   1.755 +
   1.756 +            // Compare every 16-bit lane of mask_sse with zero and collect the
   1.757 +            // top bit of each result byte; pack_cmp is 0xFFFF only when all
   1.758 +            // four masks are zero.
   1.759 +            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
   1.760 +                                             _mm_setzero_si128()));
   1.761 +
   1.762 +            // if mask pixels are not all zero, we will blend the dst pixels
   1.763 +            if (pack_cmp != 0xFFFF) {
   1.764 +                // Unpack 4 16bit mask pixels to
   1.765 +                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   1.766 +                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   1.767 +                mask_sse = _mm_unpacklo_epi16(mask_sse,
   1.768 +                                              _mm_setzero_si128());
   1.769 +
   1.770 +                // Process 4 32bit dst pixels
   1.771 +                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
   1.772 +                                                   mask_sse, srcA_sse);
   1.773 +                _mm_store_si128(d, result);
   1.774 +            }
   1.775 +
   1.776 +            d++;
   1.777 +            mask += 4;
   1.778 +            width -= 4;
   1.779 +        }
   1.780 +
   1.781 +        dst = reinterpret_cast<SkPMColor*>(d);
   1.782 +    }
   1.783 +
   1.784 +    while (width > 0) {  // scalar path for the remaining (fewer than 4) pixels
   1.785 +        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
   1.786 +        mask++;
   1.787 +        dst++;
   1.788 +        width--;
   1.789 +    }
   1.790 +}
   1.791 +
   1.792 +void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
   1.793 +                               SkColor src, int width, SkPMColor opaqueDst) {
   1.794 +    if (width <= 0) {
   1.795 +        return;
   1.796 +    }
   1.797 +    // Blend the R,G,B of src into dst[0..width) via the LCD16 mask.
   1.798 +    int srcR = SkColorGetR(src);
   1.799 +    int srcG = SkColorGetG(src);
   1.800 +    int srcB = SkColorGetB(src);
   1.801 +
   1.802 +    if (width >= 4) {
   1.803 +        SkASSERT(((size_t)dst & 0x03) == 0);
   1.804 +        while (((size_t)dst & 0x0F) != 0) {  // scalar until dst is 16-byte aligned
   1.805 +            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
   1.806 +            mask++;
   1.807 +            dst++;
   1.808 +            width--;
   1.809 +        }
   1.810 +
   1.811 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
   1.812 +        // Set alpha to 0xFF and replicate source four times in SSE register.
   1.813 +        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
   1.814 +        // Interleave with zeros to get two sets of four 16-bit values.
   1.815 +        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   1.816 +        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
   1.817 +        while (width >= 4) {
   1.818 +            // Load four destination pixels into dst_sse (aligned load).
   1.819 +            __m128i dst_sse = _mm_load_si128(d);
   1.820 +            // Load four 16-bit masks into lower half of mask_sse.
   1.821 +            __m128i mask_sse = _mm_loadl_epi64(
   1.822 +                                   reinterpret_cast<const __m128i*>(mask));
   1.823 +
   1.824 +            // Compare every 16-bit lane of mask_sse with zero and collect the
   1.825 +            // top bit of each result byte; pack_cmp is 0xFFFF only when all
   1.826 +            // four masks are zero.
   1.827 +            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
   1.828 +                                             _mm_setzero_si128()));
   1.829 +
   1.830 +            // if mask pixels are not all zero, we will blend the dst pixels
   1.831 +            if (pack_cmp != 0xFFFF) {
   1.832 +                // Unpack 4 16bit mask pixels to
   1.833 +                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   1.834 +                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   1.835 +                mask_sse = _mm_unpacklo_epi16(mask_sse,
   1.836 +                                              _mm_setzero_si128());
   1.837 +
   1.838 +                // Process 4 32bit dst pixels (opaqueDst is not used on this path)
   1.839 +                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
   1.840 +                                                         mask_sse);
   1.841 +                _mm_store_si128(d, result);
   1.842 +            }
   1.843 +
   1.844 +            d++;
   1.845 +            mask += 4;
   1.846 +            width -= 4;
   1.847 +        }
   1.848 +
   1.849 +        dst = reinterpret_cast<SkPMColor*>(d);
   1.850 +    }
   1.851 +
   1.852 +    while (width > 0) {  // scalar path for the remaining (fewer than 4) pixels
   1.853 +        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
   1.854 +        mask++;
   1.855 +        dst++;
   1.856 +        width--;
   1.857 +    }
   1.858 +}
   1.859 +
   1.860 +/* SSE2 version of S32_D565_Opaque(): converts count 32-bit src pixels to
   1.861 + * 16-bit dst pixels. The portable version is in core/SkBlitRow_D16.cpp
   1.862 + */
   1.863 +void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
   1.864 +                          const SkPMColor* SK_RESTRICT src, int count,
   1.865 +                          U8CPU alpha, int /*x*/, int /*y*/) {
   1.866 +    SkASSERT(255 == alpha);
   1.867 +
   1.868 +    if (count <= 0) {
   1.869 +        return;
   1.870 +    }
   1.871 +
   1.872 +    if (count >= 8) {
   1.873 +        while (((size_t)dst & 0x0F) != 0) {  // scalar until dst is 16-byte aligned
   1.874 +            SkPMColor c = *src++;
   1.875 +            SkPMColorAssert(c);
   1.876 +
   1.877 +            *dst++ = SkPixel32ToPixel16_ToU16(c);
   1.878 +            count--;
   1.879 +        }
   1.880 +
   1.881 +        const __m128i* s = reinterpret_cast<const __m128i*>(src);
   1.882 +        __m128i* d = reinterpret_cast<__m128i*>(dst);
   1.883 +        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
   1.884 +        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
   1.885 +        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
   1.886 +
   1.887 +        while (count >= 8) {
   1.888 +            // Load 8 pixels of src (unaligned loads; only dst was aligned).
   1.889 +            __m128i src_pixel1 = _mm_loadu_si128(s++);
   1.890 +            __m128i src_pixel2 = _mm_loadu_si128(s++);
   1.891 +
   1.892 +            // Calculate result r: shift, mask, then pack to eight 16-bit lanes.
   1.893 +            __m128i r1 = _mm_srli_epi32(src_pixel1,
   1.894 +                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
   1.895 +            r1 = _mm_and_si128(r1, r16_mask);
   1.896 +            __m128i r2 = _mm_srli_epi32(src_pixel2,
   1.897 +                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
   1.898 +            r2 = _mm_and_si128(r2, r16_mask);
   1.899 +            __m128i r = _mm_packs_epi32(r1, r2);
   1.900 +
   1.901 +            // Calculate result g (same steps with the green shift and mask).
   1.902 +            __m128i g1 = _mm_srli_epi32(src_pixel1,
   1.903 +                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
   1.904 +            g1 = _mm_and_si128(g1, g16_mask);
   1.905 +            __m128i g2 = _mm_srli_epi32(src_pixel2,
   1.906 +                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
   1.907 +            g2 = _mm_and_si128(g2, g16_mask);
   1.908 +            __m128i g = _mm_packs_epi32(g1, g2);
   1.909 +
   1.910 +            // Calculate result b (same steps with the blue shift and mask).
   1.911 +            __m128i b1 = _mm_srli_epi32(src_pixel1,
   1.912 +                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
   1.913 +            b1 = _mm_and_si128(b1, b16_mask);
   1.914 +            __m128i b2 = _mm_srli_epi32(src_pixel2,
   1.915 +                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
   1.916 +            b2 = _mm_and_si128(b2, b16_mask);
   1.917 +            __m128i b = _mm_packs_epi32(b1, b2);
   1.918 +
   1.919 +            // Store 8 16-bit colors in dst (aligned store).
   1.920 +            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
   1.921 +            _mm_store_si128(d++, d_pixel);
   1.922 +            count -= 8;
   1.923 +        }
   1.924 +        src = reinterpret_cast<const SkPMColor*>(s);
   1.925 +        dst = reinterpret_cast<uint16_t*>(d);
   1.926 +    }
   1.927 +
   1.928 +    if (count > 0) {  // scalar path for the remaining (fewer than 8) pixels
   1.929 +        do {
   1.930 +            SkPMColor c = *src++;
   1.931 +            SkPMColorAssert(c);
   1.932 +            *dst++ = SkPixel32ToPixel16_ToU16(c);
   1.933 +        } while (--count != 0);
   1.934 +    }
   1.935 +}
   1.936 +
   1.937 +/* SSE2 version of S32A_D565_Opaque(): blends count 32-bit src pixels over
   1.938 + * 16-bit dst pixels. The portable version is in core/SkBlitRow_D16.cpp
   1.939 + */
   1.940 +void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
   1.941 +                           const SkPMColor* SK_RESTRICT src,
   1.942 +                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
   1.943 +    SkASSERT(255 == alpha);
   1.944 +
   1.945 +    if (count <= 0) {
   1.946 +        return;
   1.947 +    }
   1.948 +
   1.949 +    if (count >= 8) {
   1.950 +        // Process pixels one at a time until dst is 16-byte aligned.
   1.951 +        while (((size_t)dst & 0x0F) != 0) {
   1.952 +            SkPMColor c = *src++;
   1.953 +            if (c) {
   1.954 +              *dst = SkSrcOver32To16(c, *dst);
   1.955 +            }
   1.956 +            dst += 1;
   1.957 +            count--;
   1.958 +        }
   1.959 +
   1.960 +        const __m128i* s = reinterpret_cast<const __m128i*>(src);
   1.961 +        __m128i* d = reinterpret_cast<__m128i*>(dst);
   1.962 +        __m128i var255 = _mm_set1_epi16(255);
   1.963 +        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
   1.964 +        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
   1.965 +        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
   1.966 +
   1.967 +        while (count >= 8) {
   1.968 +            // Load 8 pixels of src (unaligned loads; only dst was aligned).
   1.969 +            __m128i src_pixel1 = _mm_loadu_si128(s++);
   1.970 +            __m128i src_pixel2 = _mm_loadu_si128(s++);
   1.971 +
   1.972 +            // Check whether src pixels are equal to 0 and get the highest bit
   1.973 +            // of each byte of result, if src pixels are all zero, src_cmp1 and
   1.974 +            // src_cmp2 will be 0xFFFF.
   1.975 +            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
   1.976 +                                             _mm_setzero_si128()));
   1.977 +            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
   1.978 +                                             _mm_setzero_si128()));
   1.979 +            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
   1.980 +                d++;  // all eight src pixels are zero: leave dst untouched
   1.981 +                count -= 8;
   1.982 +                continue;
   1.983 +            }
   1.984 +
   1.985 +            // Load 8 pixels of dst (aligned load).
   1.986 +            __m128i dst_pixel = _mm_load_si128(d);
   1.987 +
   1.988 +            // Extract A from src: shift the byte to the top, then down to bit 0.
   1.989 +            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
   1.990 +            sa1 = _mm_srli_epi32(sa1, 24);
   1.991 +            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
   1.992 +            sa2 = _mm_srli_epi32(sa2, 24);
   1.993 +            __m128i sa = _mm_packs_epi32(sa1, sa2);
   1.994 +
   1.995 +            // Extract R from src.
   1.996 +            __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
   1.997 +            sr1 = _mm_srli_epi32(sr1, 24);
   1.998 +            __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
   1.999 +            sr2 = _mm_srli_epi32(sr2, 24);
  1.1000 +            __m128i sr = _mm_packs_epi32(sr1, sr2);
  1.1001 +
  1.1002 +            // Extract G from src.
  1.1003 +            __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
  1.1004 +            sg1 = _mm_srli_epi32(sg1, 24);
  1.1005 +            __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
  1.1006 +            sg2 = _mm_srli_epi32(sg2, 24);
  1.1007 +            __m128i sg = _mm_packs_epi32(sg1, sg2);
  1.1008 +
  1.1009 +            // Extract B from src.
  1.1010 +            __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
  1.1011 +            sb1 = _mm_srli_epi32(sb1, 24);
  1.1012 +            __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
  1.1013 +            sb2 = _mm_srli_epi32(sb2, 24);
  1.1014 +            __m128i sb = _mm_packs_epi32(sb1, sb2);
  1.1015 +
  1.1016 +            // Extract R G B from dst.
  1.1017 +            __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
  1.1018 +            dr = _mm_and_si128(dr, r16_mask);
  1.1019 +            __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
  1.1020 +            dg = _mm_and_si128(dg, g16_mask);
  1.1021 +            __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
  1.1022 +            db = _mm_and_si128(db, b16_mask);
  1.1023 +
  1.1024 +            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa
  1.1025 +
  1.1026 +            // Calculate R G B of result.
  1.1027 +            // Original algorithm is in SkSrcOver32To16().
  1.1028 +            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
  1.1029 +            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
  1.1030 +            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
  1.1031 +            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
  1.1032 +            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
  1.1033 +            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
  1.1034 +
  1.1035 +            // Pack R G B into 16-bit color.
  1.1036 +            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
  1.1037 +
  1.1038 +            // Store 8 16-bit colors in dst (aligned store).
  1.1039 +            _mm_store_si128(d++, d_pixel);
  1.1040 +            count -= 8;
  1.1041 +        }
  1.1042 +
  1.1043 +        src = reinterpret_cast<const SkPMColor*>(s);
  1.1044 +        dst = reinterpret_cast<uint16_t*>(d);
  1.1045 +    }
  1.1046 +
  1.1047 +    if (count > 0) {  // scalar path for the remaining (fewer than 8) pixels
  1.1048 +        do {
  1.1049 +            SkPMColor c = *src++;
  1.1050 +            SkPMColorAssert(c);
  1.1051 +            if (c) {
  1.1052 +                *dst = SkSrcOver32To16(c, *dst);
  1.1053 +            }
  1.1054 +            dst += 1;
  1.1055 +        } while (--count != 0);
  1.1056 +    }
  1.1057 +}
  1.1058 +
  1.1059 +void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
  1.1060 +                                 const SkPMColor* SK_RESTRICT src,
  1.1061 +                                 int count, U8CPU alpha, int x, int y) {
  1.1062 +    SkASSERT(255 == alpha);
  1.1063 +
  1.1064 +    if (count <= 0) {
  1.1065 +        return;
  1.1066 +    }
  1.1067 +    // Convert count 32-bit src pixels to dithered 16-bit dst pixels; x and y
  1.1068 +    if (count >= 8) {  // select the dither matrix column and row.
  1.1069 +        while (((size_t)dst & 0x0F) != 0) {  // scalar until dst is 16-byte aligned
  1.1070 +            DITHER_565_SCAN(y);
  1.1071 +            SkPMColor c = *src++;
  1.1072 +            SkPMColorAssert(c);
  1.1073 +
  1.1074 +            unsigned dither = DITHER_VALUE(x);
  1.1075 +            *dst++ = SkDitherRGB32To565(c, dither);
  1.1076 +            DITHER_INC_X(x);
  1.1077 +            count--;
  1.1078 +        }
  1.1079 +        // Dither values for columns x..x+3, repeated to fill eight 16-bit lanes.
  1.1080 +        unsigned short dither_value[8];
  1.1081 +        __m128i dither;
  1.1082 +#ifdef ENABLE_DITHER_MATRIX_4X4
  1.1083 +        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
  1.1084 +        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
  1.1085 +        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
  1.1086 +        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
  1.1087 +        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
  1.1088 +#else
  1.1089 +        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
  1.1090 +        dither_value[0] = dither_value[4] = (dither_scan
  1.1091 +                                             >> (((x) & 3) << 2)) & 0xF;
  1.1092 +        dither_value[1] = dither_value[5] = (dither_scan
  1.1093 +                                             >> (((x + 1) & 3) << 2)) & 0xF;
  1.1094 +        dither_value[2] = dither_value[6] = (dither_scan
  1.1095 +                                             >> (((x + 2) & 3) << 2)) & 0xF;
  1.1096 +        dither_value[3] = dither_value[7] = (dither_scan
  1.1097 +                                             >> (((x + 3) & 3) << 2)) & 0xF;
  1.1098 +#endif
  1.1099 +        dither = _mm_loadu_si128((__m128i*) dither_value);
  1.1100 +
  1.1101 +        const __m128i* s = reinterpret_cast<const __m128i*>(src);
  1.1102 +        __m128i* d = reinterpret_cast<__m128i*>(dst);
  1.1103 +
  1.1104 +        while (count >= 8) {
  1.1105 +            // Load 8 pixels of src (unaligned loads; only dst was aligned).
  1.1106 +            __m128i src_pixel1 = _mm_loadu_si128(s++);
  1.1107 +            __m128i src_pixel2 = _mm_loadu_si128(s++);
  1.1108 +
  1.1109 +            // Extract R from src.
  1.1110 +            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
  1.1111 +            sr1 = _mm_srli_epi32(sr1, 24);
  1.1112 +            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
  1.1113 +            sr2 = _mm_srli_epi32(sr2, 24);
  1.1114 +            __m128i sr = _mm_packs_epi32(sr1, sr2);
  1.1115 +
  1.1116 +            // SkDITHER_R32To565(sr, dither)
  1.1117 +            __m128i sr_offset = _mm_srli_epi16(sr, 5);
  1.1118 +            sr = _mm_add_epi16(sr, dither);
  1.1119 +            sr = _mm_sub_epi16(sr, sr_offset);
  1.1120 +            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
  1.1121 +
  1.1122 +            // Extract G from src.
  1.1123 +            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
  1.1124 +            sg1 = _mm_srli_epi32(sg1, 24);
  1.1125 +            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
  1.1126 +            sg2 = _mm_srli_epi32(sg2, 24);
  1.1127 +            __m128i sg = _mm_packs_epi32(sg1, sg2);
  1.1128 +
  1.1129 +            // Same dither step for green (uses dither >> 1 and sg >> 6).
  1.1130 +            __m128i sg_offset = _mm_srli_epi16(sg, 6);
  1.1131 +            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
  1.1132 +            sg = _mm_sub_epi16(sg, sg_offset);
  1.1133 +            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
  1.1134 +
  1.1135 +            // Extract B from src.
  1.1136 +            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
  1.1137 +            sb1 = _mm_srli_epi32(sb1, 24);
  1.1138 +            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
  1.1139 +            sb2 = _mm_srli_epi32(sb2, 24);
  1.1140 +            __m128i sb = _mm_packs_epi32(sb1, sb2);
  1.1141 +
  1.1142 +            // Same dither step for blue (uses dither and sb >> 5, like red).
  1.1143 +            __m128i sb_offset = _mm_srli_epi16(sb, 5);
  1.1144 +            sb = _mm_add_epi16(sb, dither);
  1.1145 +            sb = _mm_sub_epi16(sb, sb_offset);
  1.1146 +            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
  1.1147 +
  1.1148 +            // Pack and store 16-bit dst pixel.
  1.1149 +            __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
  1.1150 +            _mm_store_si128(d++, d_pixel);
  1.1151 +
  1.1152 +            count -= 8;
  1.1153 +            x += 8;  // multiple of 4, so the (x & 3) dither phase is unchanged
  1.1154 +        }
  1.1155 +
  1.1156 +        src = reinterpret_cast<const SkPMColor*>(s);
  1.1157 +        dst = reinterpret_cast<uint16_t*>(d);
  1.1158 +    }
  1.1159 +
  1.1160 +    if (count > 0) {  // scalar path for the remaining (fewer than 8) pixels
  1.1161 +        DITHER_565_SCAN(y);
  1.1162 +        do {
  1.1163 +            SkPMColor c = *src++;
  1.1164 +            SkPMColorAssert(c);
  1.1165 +
  1.1166 +            unsigned dither = DITHER_VALUE(x);
  1.1167 +            *dst++ = SkDitherRGB32To565(c, dither);
  1.1168 +            DITHER_INC_X(x);
  1.1169 +        } while (--count != 0);
  1.1170 +    }
  1.1171 +}
  1.1172 +
  1.1173 +/* SSE2 version of S32A_D565_Opaque_Dither()
  1.1174 + * portable version is in core/SkBlitRow_D16.cpp
  1.1175 + */
  1.1176 +void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
  1.1177 +                                  const SkPMColor* SK_RESTRICT src,
  1.1178 +                                  int count, U8CPU alpha, int x, int y) {
  1.1179 +    SkASSERT(255 == alpha);
  1.1180 +
  1.1181 +    if (count <= 0) {
  1.1182 +        return;
  1.1183 +    }
  1.1184 +
  1.1185 +    if (count >= 8) {
  1.1186 +        while (((size_t)dst & 0x0F) != 0) {
  1.1187 +            DITHER_565_SCAN(y);
  1.1188 +            SkPMColor c = *src++;
  1.1189 +            SkPMColorAssert(c);
  1.1190 +            if (c) {
  1.1191 +                unsigned a = SkGetPackedA32(c);
  1.1192 +
  1.1193 +                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
  1.1194 +
  1.1195 +                unsigned sr = SkGetPackedR32(c);
  1.1196 +                unsigned sg = SkGetPackedG32(c);
  1.1197 +                unsigned sb = SkGetPackedB32(c);
  1.1198 +                sr = SkDITHER_R32_FOR_565(sr, d);
  1.1199 +                sg = SkDITHER_G32_FOR_565(sg, d);
  1.1200 +                sb = SkDITHER_B32_FOR_565(sb, d);
  1.1201 +
  1.1202 +                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1.1203 +                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
  1.1204 +                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1.1205 +                // now src and dst expanded are in g:11 r:10 x:1 b:10
  1.1206 +                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1.1207 +            }
  1.1208 +            dst += 1;
  1.1209 +            DITHER_INC_X(x);
  1.1210 +            count--;
  1.1211 +        }
  1.1212 +
  1.1213 +        unsigned short dither_value[8];
  1.1214 +        __m128i dither, dither_cur;
  1.1215 +#ifdef ENABLE_DITHER_MATRIX_4X4
  1.1216 +        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
  1.1217 +        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
  1.1218 +        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
  1.1219 +        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
  1.1220 +        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
  1.1221 +#else
  1.1222 +        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
  1.1223 +        dither_value[0] = dither_value[4] = (dither_scan
  1.1224 +                                             >> (((x) & 3) << 2)) & 0xF;
  1.1225 +        dither_value[1] = dither_value[5] = (dither_scan
  1.1226 +                                             >> (((x + 1) & 3) << 2)) & 0xF;
  1.1227 +        dither_value[2] = dither_value[6] = (dither_scan
  1.1228 +                                             >> (((x + 2) & 3) << 2)) & 0xF;
  1.1229 +        dither_value[3] = dither_value[7] = (dither_scan
  1.1230 +                                             >> (((x + 3) & 3) << 2)) & 0xF;
  1.1231 +#endif
  1.1232 +        dither = _mm_loadu_si128((__m128i*) dither_value);
  1.1233 +
  1.1234 +        const __m128i* s = reinterpret_cast<const __m128i*>(src);
  1.1235 +        __m128i* d = reinterpret_cast<__m128i*>(dst);
  1.1236 +        __m128i var256 = _mm_set1_epi16(256);
  1.1237 +        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
  1.1238 +        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
  1.1239 +        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
  1.1240 +
  1.1241 +        while (count >= 8) {
  1.1242 +            // Load 8 pixels of src and dst.
  1.1243 +            __m128i src_pixel1 = _mm_loadu_si128(s++);
  1.1244 +            __m128i src_pixel2 = _mm_loadu_si128(s++);
  1.1245 +            __m128i dst_pixel = _mm_load_si128(d);
  1.1246 +
  1.1247 +            // Extract A from src.
  1.1248 +            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
  1.1249 +            sa1 = _mm_srli_epi32(sa1, 24);
  1.1250 +            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
  1.1251 +            sa2 = _mm_srli_epi32(sa2, 24);
  1.1252 +            __m128i sa = _mm_packs_epi32(sa1, sa2);
  1.1253 +
  1.1254 +            // Calculate current dither value.
  1.1255 +            dither_cur = _mm_mullo_epi16(dither,
  1.1256 +                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
  1.1257 +            dither_cur = _mm_srli_epi16(dither_cur, 8);
  1.1258 +
  1.1259 +            // Extract R from src.
  1.1260 +            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
  1.1261 +            sr1 = _mm_srli_epi32(sr1, 24);
  1.1262 +            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
  1.1263 +            sr2 = _mm_srli_epi32(sr2, 24);
  1.1264 +            __m128i sr = _mm_packs_epi32(sr1, sr2);
  1.1265 +
  1.1266 +            // SkDITHER_R32_FOR_565(sr, d)
  1.1267 +            __m128i sr_offset = _mm_srli_epi16(sr, 5);
  1.1268 +            sr = _mm_add_epi16(sr, dither_cur);
  1.1269 +            sr = _mm_sub_epi16(sr, sr_offset);
  1.1270 +
  1.1271 +            // Expand sr.
  1.1272 +            sr = _mm_slli_epi16(sr, 2);
  1.1273 +
  1.1274 +            // Extract G from src.
  1.1275 +            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
  1.1276 +            sg1 = _mm_srli_epi32(sg1, 24);
  1.1277 +            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
  1.1278 +            sg2 = _mm_srli_epi32(sg2, 24);
  1.1279 +            __m128i sg = _mm_packs_epi32(sg1, sg2);
  1.1280 +
  1.1281 +            // sg = SkDITHER_G32_FOR_565(sg, d).
  1.1282 +            __m128i sg_offset = _mm_srli_epi16(sg, 6);
  1.1283 +            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
  1.1284 +            sg = _mm_sub_epi16(sg, sg_offset);
  1.1285 +
  1.1286 +            // Expand sg.
  1.1287 +            sg = _mm_slli_epi16(sg, 3);
  1.1288 +
  1.1289 +            // Extract B from src.
  1.1290 +            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
  1.1291 +            sb1 = _mm_srli_epi32(sb1, 24);
  1.1292 +            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
  1.1293 +            sb2 = _mm_srli_epi32(sb2, 24);
  1.1294 +            __m128i sb = _mm_packs_epi32(sb1, sb2);
  1.1295 +
  1.1296 +            // sb = SkDITHER_B32_FOR_565(sb, d).
  1.1297 +            __m128i sb_offset = _mm_srli_epi16(sb, 5);
  1.1298 +            sb = _mm_add_epi16(sb, dither_cur);
  1.1299 +            sb = _mm_sub_epi16(sb, sb_offset);
  1.1300 +
  1.1301 +            // Expand sb.
  1.1302 +            sb = _mm_slli_epi16(sb, 2);
  1.1303 +
  1.1304 +            // Extract R G B from dst.
  1.1305 +            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
  1.1306 +            dr = _mm_and_si128(dr, r16_mask);
  1.1307 +            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
  1.1308 +            dg = _mm_and_si128(dg, g16_mask);
  1.1309 +            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
  1.1310 +            db = _mm_and_si128(db, b16_mask);
  1.1311 +
  1.1312 +            // SkAlpha255To256(255 - a) >> 3
  1.1313 +            __m128i isa = _mm_sub_epi16(var256, sa);
  1.1314 +            isa = _mm_srli_epi16(isa, 3);
  1.1315 +
  1.1316 +            dr = _mm_mullo_epi16(dr, isa);
  1.1317 +            dr = _mm_add_epi16(dr, sr);
  1.1318 +            dr = _mm_srli_epi16(dr, 5);
  1.1319 +
  1.1320 +            dg = _mm_mullo_epi16(dg, isa);
  1.1321 +            dg = _mm_add_epi16(dg, sg);
  1.1322 +            dg = _mm_srli_epi16(dg, 5);
  1.1323 +
  1.1324 +            db = _mm_mullo_epi16(db, isa);
  1.1325 +            db = _mm_add_epi16(db, sb);
  1.1326 +            db = _mm_srli_epi16(db, 5);
  1.1327 +
  1.1328 +            // Package and store dst pixel.
  1.1329 +            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
  1.1330 +            _mm_store_si128(d++, d_pixel);
  1.1331 +
  1.1332 +            count -= 8;
  1.1333 +            x += 8;
  1.1334 +        }
  1.1335 +
  1.1336 +        src = reinterpret_cast<const SkPMColor*>(s);
  1.1337 +        dst = reinterpret_cast<uint16_t*>(d);
  1.1338 +    }
  1.1339 +
  1.1340 +    if (count > 0) {
  1.1341 +        DITHER_565_SCAN(y);
  1.1342 +        do {
  1.1343 +            SkPMColor c = *src++;
  1.1344 +            SkPMColorAssert(c);
  1.1345 +            if (c) {
  1.1346 +                unsigned a = SkGetPackedA32(c);
  1.1347 +
  1.1348 +                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
  1.1349 +
  1.1350 +                unsigned sr = SkGetPackedR32(c);
  1.1351 +                unsigned sg = SkGetPackedG32(c);
  1.1352 +                unsigned sb = SkGetPackedB32(c);
  1.1353 +                sr = SkDITHER_R32_FOR_565(sr, d);
  1.1354 +                sg = SkDITHER_G32_FOR_565(sg, d);
  1.1355 +                sb = SkDITHER_B32_FOR_565(sb, d);
  1.1356 +
  1.1357 +                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1.1358 +                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
  1.1359 +                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1.1360 +                // now src and dst expanded are in g:11 r:10 x:1 b:10
  1.1361 +                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1.1362 +            }
  1.1363 +            dst += 1;
  1.1364 +            DITHER_INC_X(x);
  1.1365 +        } while (--count != 0);
  1.1366 +    }
  1.1367 +}

mercurial