gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /*
     2  * Copyright 2012 The Android Open Source Project
     3  *
     4  * Use of this source code is governed by a BSD-style license that can be
     5  * found in the LICENSE file.
     6  */
     9 #include "SkBlitRow_opts_SSE2.h"
    10 #include "SkBitmapProcState_opts_SSE2.h"
    11 #include "SkColorPriv.h"
    12 #include "SkColor_opts_SSE2.h"
    13 #include "SkDither.h"
    14 #include "SkUtils.h"
    16 #include <emmintrin.h>
    18 /* SSE2 version of S32_Blend_BlitRow32()
    19  * portable version is in core/SkBlitRow_D32.cpp
        *
        * For each of the `count` pixels the scalar loops store
        *     SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale)
        * into dst, where src_scale = SkAlpha255To256(alpha) and
        * dst_scale = 256 - src_scale. Rows of four or more pixels are split
        * into a scalar prologue (until dst is 16-byte aligned), an SSE2 loop
        * that applies the same two scale factors to four pixels per
        * iteration, and a scalar tail for whatever is left.
    20  */
    21 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    22                               const SkPMColor* SK_RESTRICT src,
    23                               int count, U8CPU alpha) {
    24     SkASSERT(alpha <= 255);
    25     if (count <= 0) {
    26         return;
    27     }
    29     uint32_t src_scale = SkAlpha255To256(alpha);
    30     uint32_t dst_scale = 256 - src_scale;
    32     if (count >= 4) {
    33         SkASSERT(((size_t)dst & 0x03) == 0);
               // Scalar prologue: step one pixel at a time until dst sits on a
               // 16-byte boundary, because the vector loop uses aligned
               // load/store on dst. The loop does not test count; it relies on
               // the 4-byte alignment asserted above to finish within three
               // iterations (assuming 4-byte pixels).
    34         while (((size_t)dst & 0x0F) != 0) {
    35             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    36             src++;
    37             dst++;
    38             count--;
    39         }
    41         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    42         __m128i *d = reinterpret_cast<__m128i*>(dst);
    43         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    44         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
    46         // Move scale factors to upper byte of word
               // NOTE(review): _mm_set1_epi16 keeps only the low 16 bits, so a
               // scale of 256 shifted left by 8 would become 0. Presumably
               // callers never reach this routine with alpha == 255 -- confirm.
    47         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
    48         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
    49         while (count >= 4) {
    50             // Load 4 pixels each of src and dest.
                   // src gets an unaligned load; dst was aligned by the prologue.
    51             __m128i src_pixel = _mm_loadu_si128(s);
    52             __m128i dst_pixel = _mm_load_si128(d);
    54             // Interleave Atom port 0/1 operations based on the execution port
    55             // constraints that multiply can only be executed on port 0 (while
    56             // boolean operations can be executed on either port 0 or port 1)
    57             // because GCC currently doesn't do a good job scheduling
    58             // instructions based on these constraints.
    60             // Get red and blue pixels into lower byte of each word.
    61             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
    62             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    64             // Multiply by scale.
    65             // (4 x (0, rs.h, 0, bs.h))
    66             // where rs.h stands for the higher byte of r * scale, and
    67             // bs.h the higher byte of b * scale.
    68             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
    70             // Get alpha and green pixels into higher byte of each word.
    71             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
    72             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
    74             // Multiply by scale.
    75             // (4 x (as.h, as.l, gs.h, gs.l))
    76             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
    78             // Clear the lower byte of the a*scale and g*scale results
    79             // (4 x (as.h, 0, gs.h, 0))
    80             src_ag = _mm_and_si128(src_ag, ag_mask);
    82             // Operations on the destination pixels are the same as on the
    83             // source pixels. See the comments above.
    84             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    85             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
    86             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
    87             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
    88             dst_ag = _mm_and_si128(dst_ag, ag_mask);
    90             // Combine back into RGBA.
    91             // (4 x (as.h, rs.h, gs.h, bs.h))
    92             src_pixel = _mm_or_si128(src_rb, src_ag);
    93             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    95             // Add result
                   // (independent 8-bit lane adds; no saturation is applied)
    96             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    97             _mm_store_si128(d, result);
    98             s++;
    99             d++;
   100             count -= 4;
   101         }
               // Hand the advanced positions back to the scalar pointers so the
               // tail loop below continues where the vector loop stopped.
   102         src = reinterpret_cast<const SkPMColor*>(s);
   103         dst = reinterpret_cast<SkPMColor*>(d);
   104     }
           // Scalar tail (and the whole row when count < 4).
   106     while (count > 0) {
   107         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
   108         src++;
   109         dst++;
   110         count--;
   111     }
   112 }
   114 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
   115                                 const SkPMColor* SK_RESTRICT src,
   116                                 int count, U8CPU alpha) {
           // Combines `count` src pixels with dst; the scalar loops do this with
           // SkPMSrcOver(*src, *dst). `alpha` is only checked by the assert
           // below (it must be 255) and is otherwise unused.
           //
           // Rows of four or more pixels run a scalar prologue until dst is
           // 16-byte aligned, then an SSE2 loop over four pixels at a time,
           // then a scalar tail. SK_USE_ACCURATE_BLENDING selects between two
           // vector loops that differ in how the dst scaling is rounded.
   117     SkASSERT(alpha == 255);
   118     if (count <= 0) {
   119         return;
   120     }
   122     if (count >= 4) {
   123         SkASSERT(((size_t)dst & 0x03) == 0);
               // Align dst to 16 bytes for the aligned load/store below. The loop
               // does not test count; it relies on the 4-byte alignment asserted
               // above to finish within three iterations (assuming 4-byte
               // pixels).
   124         while (((size_t)dst & 0x0F) != 0) {
   125             *dst = SkPMSrcOver(*src, *dst);
   126             src++;
   127             dst++;
   128             count--;
   129         }
   131         const __m128i *s = reinterpret_cast<const __m128i*>(src);
   132         __m128i *d = reinterpret_cast<__m128i*>(dst);
   133 #ifdef SK_USE_ACCURATE_BLENDING
               // Accurate variant: scale dst by (255 - src alpha) and divide by
               // 255 using the (x + (x >> 8) + 128) >> 8 sequence spelled out in
               // the comments below.
   134         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   135         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
   136         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
   137         while (count >= 4) {
   138             // Load 4 pixels
   139             __m128i src_pixel = _mm_loadu_si128(s);
   140             __m128i dst_pixel = _mm_load_si128(d);
   142             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   143             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   144             // Shift alphas down to lower 8 bits of each quad.
   145             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
   147             // Copy alpha to upper 3rd byte of each quad
   148             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
   150             // Subtract alphas from 255, to get 0..255
   151             alpha = _mm_sub_epi16(c_255, alpha);
   153             // Multiply red and blue by (255 - src alpha).
   154             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
   155             // Multiply alpha and green by (255 - src alpha).
   156             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
   158             // dst_rb_low = (dst_rb >> 8)
   159             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
   160             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
   162             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
   163             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
   164             dst_rb = _mm_add_epi16(dst_rb, c_128);
   165             dst_rb = _mm_srli_epi16(dst_rb, 8);
   167             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
   168             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
   169             dst_ag = _mm_add_epi16(dst_ag, c_128);
   170             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   172             // Combine back into RGBA.
   173             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   175             // Add result
   176             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   177             _mm_store_si128(d, result);
   178             s++;
   179             d++;
   180             count -= 4;
   181         }
   182     #else
               // Default variant: scale dst by (256 - src alpha) and divide by
               // 256 with a plain shift.
   183         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   184         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
   185         while (count >= 4) {
   186             // Load 4 pixels
   187             __m128i src_pixel = _mm_loadu_si128(s);
   188             __m128i dst_pixel = _mm_load_si128(d);
   190             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   191             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   193             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
   194             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
   196             // (a0, a0, a1, a1, a2, g2, a3, g3)
   197             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
   199             // (a0, a0, a1, a1, a2, a2, a3, a3)
   200             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
   202             // Subtract alphas from 256, to get 1..256
   203             alpha = _mm_sub_epi16(c_256, alpha);
   205             // Multiply red and blue by (256 - src alpha).
   206             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
   207             // Multiply alpha and green by (256 - src alpha).
   208             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
   210             // Divide by 256.
   211             dst_rb = _mm_srli_epi16(dst_rb, 8);
   213             // Mask out high bits (already in the right place)
   214             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   216             // Combine back into RGBA.
   217             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   219             // Add result
   220             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   221             _mm_store_si128(d, result);
   222             s++;
   223             d++;
   224             count -= 4;
   225         }
   226 #endif
               // Resume scalar processing where the vector loop stopped.
   227         src = reinterpret_cast<const SkPMColor*>(s);
   228         dst = reinterpret_cast<SkPMColor*>(d);
   229     }
           // Scalar tail (and the whole row when count < 4).
   231     while (count > 0) {
   232         *dst = SkPMSrcOver(*src, *dst);
   233         src++;
   234         dst++;
   235         count--;
   236     }
   237 }
   239 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
   240                                const SkPMColor* SK_RESTRICT src,
   241                                int count, U8CPU alpha) {
           // Blends `count` src pixels into dst using both the per-pixel src
           // alpha and the global `alpha`; the scalar loops do this with
           // SkBlendARGB32(*src, *dst, alpha).
           //
           // Rows of four or more pixels run a scalar prologue until dst is
           // 16-byte aligned, then an SSE2 loop over four pixels at a time,
           // then a scalar tail.
   242     SkASSERT(alpha <= 255);
   243     if (count <= 0) {
   244         return;
   245     }
   247     if (count >= 4) {
               // NOTE(review): the sibling routines assert that dst is 4-byte
               // aligned before this loop; there is no such assert here. The
               // loop does not test count and terminates only if stepping by
               // whole pixels can reach a 16-byte boundary.
   248         while (((size_t)dst & 0x0F) != 0) {
   249             *dst = SkBlendARGB32(*src, *dst, alpha);
   250             src++;
   251             dst++;
   252             count--;
   253         }
   255         uint32_t src_scale = SkAlpha255To256(alpha);
   257         const __m128i *s = reinterpret_cast<const __m128i*>(src);
   258         __m128i *d = reinterpret_cast<__m128i*>(dst);
               // NOTE(review): _mm_set1_epi16 keeps only the low 16 bits, so a
               // scale of 256 shifted left by 8 would become 0. Presumably
               // callers never reach this routine with alpha == 255 -- confirm.
   259         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
   260         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   261         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
   262         while (count >= 4) {
   263             // Load 4 pixels each of src and dest.
   264             __m128i src_pixel = _mm_loadu_si128(s);
   265             __m128i dst_pixel = _mm_load_si128(d);
   267             // Get red and blue pixels into lower byte of each word.
   268             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   269             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   271             // Get alpha and green into lower byte of each word.
   272             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   273             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   275             // Put per-pixel alpha in low byte of each word.
   276             // After the following two statements, the dst_alpha looks like
   277             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
   278             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
   279             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
   281             // dst_alpha = dst_alpha * src_scale
   282             // Because src_scales are in the higher byte of each word and
   283             // we use mulhi here, the resulting alpha values are already
   284             // in the right place and don't need to be divided by 256.
   285             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
   286             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
   288             // Subtract alphas from 256, to get 1..256
   289             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
   291             // Multiply red and blue by dst pixel alpha.
   292             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
   293             // Multiply alpha and green by dst pixel alpha.
   294             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
   296             // Multiply red and blue by global alpha.
   297             // (4 x (0, rs.h, 0, bs.h))
   298             // where rs.h stands for the higher byte of r * src_scale,
   299             // and bs.h the higher byte of b * src_scale.
   300             // Again, because we use mulhi, the resulting red and blue
   301             // values are already in the right place and don't need to
   302             // be divided by 256.
   303             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
   304             // Multiply alpha and green by global alpha.
   305             // (4 x (0, as.h, 0, gs.h))
   306             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
   308             // Divide by 256.
   309             dst_rb = _mm_srli_epi16(dst_rb, 8);
   311             // Mask out low bits (goodies already in the right place; no need to divide)
   312             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   313             // Shift alpha and green to higher byte of each word.
   314             // (4 x (as.h, 0, gs.h, 0))
   315             src_ag = _mm_slli_epi16(src_ag, 8);
   317             // Combine back into RGBA.
   318             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   319             src_pixel = _mm_or_si128(src_rb, src_ag);
   321             // Add two pixels into result.
   322             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
   323             _mm_store_si128(d, result);
   324             s++;
   325             d++;
   326             count -= 4;
   327         }
               // Resume scalar processing where the vector loop stopped.
   328         src = reinterpret_cast<const SkPMColor*>(s);
   329         dst = reinterpret_cast<SkPMColor*>(d);
   330     }
           // Scalar tail (and the whole row when count < 4).
   332     while (count > 0) {
   333         *dst = SkBlendARGB32(*src, *dst, alpha);
   334         src++;
   335         dst++;
   336         count--;
   337     }
   338 }
   340 /* SSE2 version of Color32()
   341  * portable version is in core/SkBlitRow_D32.cpp
        *
        * Three cases, chosen from `color`:
        *  - color == 0: dst receives a copy of src (skipped when both are
        *    the same pointer);
        *  - alpha of color == 255: dst is filled with color via sk_memset32();
        *  - otherwise: per pixel, dst = color + SkAlphaMulQ(src, scale) with
        *    scale = 256 - SkAlpha255To256(alpha of color). Rows of four or
        *    more pixels use a scalar prologue, an SSE2 loop and a scalar tail.
   342  */
   343 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
   344                   SkPMColor color) {
   346     if (count <= 0) {
   347         return;
   348     }
   350     if (0 == color) {
   351         if (src != dst) {
   352             memcpy(dst, src, count * sizeof(SkPMColor));
   353         }
   354         return;
   355     }
   357     unsigned colorA = SkGetPackedA32(color);
   358     if (255 == colorA) {
   359         sk_memset32(dst, color, count);
   360     } else {
   361         unsigned scale = 256 - SkAlpha255To256(colorA);
   363         if (count >= 4) {
   364             SkASSERT(((size_t)dst & 0x03) == 0);
                   // Align dst to 16 bytes for the aligned store below. The loop
                   // does not test count; it relies on the 4-byte alignment
                   // asserted above to finish within three iterations
                   // (assuming 4-byte pixels).
   365             while (((size_t)dst & 0x0F) != 0) {
   366                 *dst = color + SkAlphaMulQ(*src, scale);
   367                 src++;
   368                 dst++;
   369                 count--;
   370             }
   372             const __m128i *s = reinterpret_cast<const __m128i*>(src);
   373             __m128i *d = reinterpret_cast<__m128i*>(dst);
   374             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   375             __m128i src_scale_wide = _mm_set1_epi16(scale);
   376             __m128i color_wide = _mm_set1_epi32(color);
   377             while (count >= 4) {
   378                 // Load 4 src pixels (dst is only written, never read, here).
   379                 __m128i src_pixel = _mm_loadu_si128(s);
   381                 // Get red and blue pixels into lower byte of each word.
   382                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   384                 // Get alpha and green into lower byte of each word.
   385                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   387                 // Multiply by scale.
   388                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
   389                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
   391                 // Divide by 256.
                       // (red/blue are shifted down; alpha/green keep their high
                       // byte in place and only have the low byte masked off)
   392                 src_rb = _mm_srli_epi16(src_rb, 8);
   393                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
   395                 // Combine back into RGBA.
   396                 src_pixel = _mm_or_si128(src_rb, src_ag);
   398                 // Add color to result.
   399                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
   401                 // Store result.
   402                 _mm_store_si128(d, result);
   403                 s++;
   404                 d++;
   405                 count -= 4;
   406             }
                   // Resume scalar processing where the vector loop stopped.
   407             src = reinterpret_cast<const SkPMColor*>(s);
   408             dst = reinterpret_cast<SkPMColor*>(d);
   409          }
               // Scalar tail (and the whole row when count < 4).
   411         while (count > 0) {
   412             *dst = color + SkAlphaMulQ(*src, scale);
   413             src += 1;
   414             dst += 1;
   415             count--;
   416         }
   417     }
   418 }
   420 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
   421                                size_t maskRB, SkColor origColor,
   422                                int width, int height) {
           // Blends one premultiplied color into a width x height area of
           // `device`, using one mask byte per pixel as the blend amount; the
           // scalar loops do this with SkBlendARGB32(color, *dst, *mask).
           //
           // dstRB and maskRB are used as the byte distance between the starts
           // of consecutive rows of the destination and the mask.
           //
           // NOTE(review): the row loop is a do/while that tests --height != 0
           // after the body, so the body always runs once; callers must pass
           // height >= 1.
   423     SkPMColor color = SkPreMultiplyColor(origColor);
           // Bytes to skip after the last pixel of a row to reach the next row.
   424     size_t dstOffset = dstRB - (width << 2);
   425     size_t maskOffset = maskRB - width;
   426     SkPMColor* dst = (SkPMColor *)device;
   427     const uint8_t* mask = (const uint8_t*)maskPtr;
   428     do {
   429         int count = width;
   430         if (count >= 4) {
                   // Scalar prologue until dst is 16-byte aligned.
   431             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
   432                 *dst = SkBlendARGB32(color, *dst, *mask);
   433                 mask++;
   434                 dst++;
   435                 count--;
   436             }
   437             __m128i *d = reinterpret_cast<__m128i*>(dst);
   438             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   439             __m128i c_256 = _mm_set1_epi16(256);
   440             __m128i c_1 = _mm_set1_epi16(1);
   441             __m128i src_pixel = _mm_set1_epi32(color);
   442             while (count >= 4) {
   443                 // Load 4 dest pixels (src is the constant color set up above).
   444                 __m128i dst_pixel = _mm_load_si128(d);
   446                 //set the alpha value
                       // Each of the next four mask bytes goes into the low byte
                       // of two adjacent 16-bit lanes, one pair per pixel.
   447                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
   448                                 0, *(mask+3),0, \
   449                                 *(mask+2),0, *(mask+2),\
   450                                 0,*(mask+1), 0,*(mask+1),\
   451                                 0, *mask,0,*mask);
   453                 //call SkAlpha255To256()
                       // (done here by adding 1 to every 16-bit lane)
   454                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
   456                 // Get red and blue pixels into lower byte of each word.
   457                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
   458                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
   460                 // Get alpha and green into lower byte of each word.
   461                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
   462                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
   464                 // Put per-pixel alpha in low byte of each word.
   465                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
   466                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
   468                 // dst_alpha = dst_alpha * src_scale
   469                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
   471                 // Divide by 256.
   472                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
   474                 // Subtract alphas from 256, to get 1..256
   475                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
   476                 // Multiply red and blue by dst pixel alpha.
   477                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
   478                 // Multiply alpha and green by dst pixel alpha.
   479                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
   481                 // Multiply red and blue by global alpha.
   482                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
   483                 // Multiply alpha and green by global alpha.
   484                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
   485                 // Divide by 256.
   486                 dst_rb = _mm_srli_epi16(dst_rb, 8);
   487                 src_rb = _mm_srli_epi16(src_rb, 8);
   489                 // Mask out low bits (goodies already in the right place; no need to divide)
   490                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
   491                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
   493                 // Combine back into RGBA.
   494                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
   495                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
   497                 // Add two pixels into result.
   498                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
   499                 _mm_store_si128(d, result);
   500                 // load the next 4 pixel
   501                 mask = mask + 4;
   502                 d++;
   503                 count -= 4;
   504             }
   505             dst = reinterpret_cast<SkPMColor *>(d);
   506         }
               // Scalar tail (and the whole row when width < 4).
   507         while(count > 0) {
   508             *dst= SkBlendARGB32(color, *dst, *mask);
   509             dst += 1;
   510             mask++;
   511             count --;
   512         }
               // Step both cursors from the end of this row to the next row.
   513         dst = (SkPMColor *)((char*)dst + dstOffset);
   514         mask += maskOffset;
   515     } while (--height != 0);
   516 }
   518 // The following (left) shifts cause the top 5 bits of the mask components to
   519 // line up with the corresponding components in an SkPMColor.
   520 // Note that the mask's RGB16 order may differ from the SkPMColor order.
       //
       // Each shift amount is a compile-time constant that can be zero, positive
       // or negative, so each SkPacked?16x5ToUnmasked?32x5_SSE2 macro below is
       // picked by #if: identity for 0, a left shift for a positive amount, and
       // a right shift by the negated amount otherwise. The shifts operate on
       // every 32-bit lane of the __m128i argument.
   521 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
   522 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
   523 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
   525 #if SK_R16x5_R32x5_SHIFT == 0
   526     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
   527 #elif SK_R16x5_R32x5_SHIFT > 0
   528     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
   529 #else
   530     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
   531 #endif
   533 #if SK_G16x5_G32x5_SHIFT == 0
   534     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
   535 #elif SK_G16x5_G32x5_SHIFT > 0
   536     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
   537 #else
   538     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
   539 #endif
   541 #if SK_B16x5_B32x5_SHIFT == 0
   542     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
   543 #elif SK_B16x5_B32x5_SHIFT > 0
   544     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
   545 #else
   546     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
   547 #endif
   549 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
   550                                  __m128i &mask, __m128i &srcA) {
           // Blends four dst pixels toward src, weighting each color channel
           // by its own mask value scaled by srcA, and returns the four
           // resulting pixels packed into one register.
           //
           // All arguments are references; src, dst and srcA are only read,
           // while mask is overwritten with its unpacked per-channel form.
           //
   551     // In the following comments, the components of src, dst and mask are
   552     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
   553     // by an R, G, B, or A suffix. Components of one of the four pixels that
   554     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
   555     // example is the blue channel of the second destination pixel. Memory
   556     // layout is shown for an ARGB byte order in a color value.
   558     // src and srcA store 8-bit values interleaved with zeros.
   559     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   560     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
   561     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
   562     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
   563     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
   564     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   565     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   567     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
   568     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
   569     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
   570                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
   572     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
   573     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
   574                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
   576     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
   577     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
   578                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
   580     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
   581     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
   582     // 8-bit position
   583     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
   584     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
   585     mask = _mm_or_si128(_mm_or_si128(r, g), b);
   587     // Interleave R,G,B into the lower byte of word.
   588     // i.e. split the sixteen 8-bit values from mask into two sets of eight
   589     // 16-bit values, padded by zero.
   590     __m128i maskLo, maskHi;
   591     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
   592     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
   593     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
   594     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
   596     // Upscale from 0..31 to 0..32
   597     // (allows to replace division by left-shift further down)
   598     // Right-shift each component by 4 and add the result back to that component,
   599     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
   600     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
   601     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
   603     // Multiply each component of maskLo and maskHi by srcA
   604     maskLo = _mm_mullo_epi16(maskLo, srcA);
   605     maskHi = _mm_mullo_epi16(maskHi, srcA);
   607     // Right shift mask components by 8 (divide by 256)
   608     maskLo = _mm_srli_epi16(maskLo, 8);
   609     maskHi = _mm_srli_epi16(maskHi, 8);
   611     // Interleave R,G,B into the lower byte of the word
   612     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
   613     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
   614     // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
   615     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
   617     // mask = (src - dst) * mask
   618     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
   619     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
   621     // mask = (src - dst) * mask >> 5
           // (arithmetic shift: src - dst can be negative, so the sign is kept)
   622     maskLo = _mm_srai_epi16(maskLo, 5);
   623     maskHi = _mm_srai_epi16(maskHi, 5);
   625     // Add two pixels into result.
   626     // result = dst + ((src - dst) * mask >> 5)
   627     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
   628     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
   630     // Pack into 4 32bit dst pixels.
   631     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
   632     // Merge into one SSE register with sixteen 8-bit values (four pixels),
   633     // clamping to 255 if necessary.
   634     return _mm_packus_epi16(resultLo, resultHi);
   635 }
   637 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
   638                                        __m128i &mask) {
   639     // In the following comments, the components of src, dst and mask are
   640     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
   641     // by an R, G, B, or A suffix. Components of one of the four pixels that
   642     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
   643     // example is the blue channel of the second destination pixel. Memory
   644     // layout is shown for an ARGB byte order in a color value.
   646     // src and srcA store 8-bit values interleaved with zeros.
   647     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   648     // mask stores 16-bit values (shown as high and low bytes) interleaved with
   649     // zeros
   650     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   651     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   653     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
   654     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
   655     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
   656                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
   658     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
   659     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
   660                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
   662     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
   663     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
   664                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
   666     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
   667     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
   668     // 8-bit position
   669     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
   670     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
   671     mask = _mm_or_si128(_mm_or_si128(r, g), b);
   673     // Interleave R,G,B into the lower byte of word.
   674     // i.e. split the sixteen 8-bit values from mask into two sets of eight
   675     // 16-bit values, padded by zero.
   676     __m128i maskLo, maskHi;
   677     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
   678     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
   679     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
   680     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
   682     // Upscale from 0..31 to 0..32
   683     // (allows to replace division by left-shift further down)
   684     // Left-shift each component by 4 and add the result back to that component,
   685     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
   686     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
   687     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
   689     // Interleave R,G,B into the lower byte of the word
   690     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
   691     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
   692     // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
   693     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
   695     // mask = (src - dst) * mask
   696     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
   697     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
   699     // mask = (src - dst) * mask >> 5
   700     maskLo = _mm_srai_epi16(maskLo, 5);
   701     maskHi = _mm_srai_epi16(maskHi, 5);
   703     // Add two pixels into result.
   704     // result = dst + ((src - dst) * mask >> 5)
   705     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
   706     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
   708     // Pack into 4 32bit dst pixels and force opaque.
   709     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
   710     // Merge into one SSE register with sixteen 8-bit values (four pixels),
   711     // clamping to 255 if necessary. Set alpha components to 0xFF.
   712     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
   713                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
   714 }
   716 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
   717                          SkColor src, int width, SkPMColor) {
   718     if (width <= 0) {
   719         return;
   720     }
   722     int srcA = SkColorGetA(src);
   723     int srcR = SkColorGetR(src);
   724     int srcG = SkColorGetG(src);
   725     int srcB = SkColorGetB(src);
   727     srcA = SkAlpha255To256(srcA);
   729     if (width >= 4) {
   730         SkASSERT(((size_t)dst & 0x03) == 0);
   731         while (((size_t)dst & 0x0F) != 0) {
   732             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
   733             mask++;
   734             dst++;
   735             width--;
   736         }
   738         __m128i *d = reinterpret_cast<__m128i*>(dst);
   739         // Set alpha to 0xFF and replicate source four times in SSE register.
   740         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
   741         // Interleave with zeros to get two sets of four 16-bit values.
   742         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
   743         // Set srcA_sse to contain eight copies of srcA, padded with zero.
   744         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   745         __m128i srcA_sse = _mm_set1_epi16(srcA);
   746         while (width >= 4) {
   747             // Load four destination pixels into dst_sse.
   748             __m128i dst_sse = _mm_load_si128(d);
   749             // Load four 16-bit masks into lower half of mask_sse.
   750             __m128i mask_sse = _mm_loadl_epi64(
   751                                    reinterpret_cast<const __m128i*>(mask));
   753             // Check whether masks are equal to 0 and get the highest bit
   754             // of each byte of result, if masks are all zero, we will get
   755             // pack_cmp to 0xFFFF
   756             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
   757                                              _mm_setzero_si128()));
   759             // if mask pixels are not all zero, we will blend the dst pixels
   760             if (pack_cmp != 0xFFFF) {
   761                 // Unpack 4 16bit mask pixels to
   762                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   763                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   764                 mask_sse = _mm_unpacklo_epi16(mask_sse,
   765                                               _mm_setzero_si128());
   767                 // Process 4 32bit dst pixels
   768                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
   769                                                    mask_sse, srcA_sse);
   770                 _mm_store_si128(d, result);
   771             }
   773             d++;
   774             mask += 4;
   775             width -= 4;
   776         }
   778         dst = reinterpret_cast<SkPMColor*>(d);
   779     }
   781     while (width > 0) {
   782         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
   783         mask++;
   784         dst++;
   785         width--;
   786     }
   787 }
   789 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
   790                                SkColor src, int width, SkPMColor opaqueDst) {
   791     if (width <= 0) {
   792         return;
   793     }
   795     int srcR = SkColorGetR(src);
   796     int srcG = SkColorGetG(src);
   797     int srcB = SkColorGetB(src);
   799     if (width >= 4) {
   800         SkASSERT(((size_t)dst & 0x03) == 0);
   801         while (((size_t)dst & 0x0F) != 0) {
   802             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
   803             mask++;
   804             dst++;
   805             width--;
   806         }
   808         __m128i *d = reinterpret_cast<__m128i*>(dst);
   809         // Set alpha to 0xFF and replicate source four times in SSE register.
   810         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
   811         // Set srcA_sse to contain eight copies of srcA, padded with zero.
   812         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
   813         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
   814         while (width >= 4) {
   815             // Load four destination pixels into dst_sse.
   816             __m128i dst_sse = _mm_load_si128(d);
   817             // Load four 16-bit masks into lower half of mask_sse.
   818             __m128i mask_sse = _mm_loadl_epi64(
   819                                    reinterpret_cast<const __m128i*>(mask));
   821             // Check whether masks are equal to 0 and get the highest bit
   822             // of each byte of result, if masks are all zero, we will get
   823             // pack_cmp to 0xFFFF
   824             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
   825                                              _mm_setzero_si128()));
   827             // if mask pixels are not all zero, we will blend the dst pixels
   828             if (pack_cmp != 0xFFFF) {
   829                 // Unpack 4 16bit mask pixels to
   830                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
   831                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
   832                 mask_sse = _mm_unpacklo_epi16(mask_sse,
   833                                               _mm_setzero_si128());
   835                 // Process 4 32bit dst pixels
   836                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
   837                                                          mask_sse);
   838                 _mm_store_si128(d, result);
   839             }
   841             d++;
   842             mask += 4;
   843             width -= 4;
   844         }
   846         dst = reinterpret_cast<SkPMColor*>(d);
   847     }
   849     while (width > 0) {
   850         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
   851         mask++;
   852         dst++;
   853         width--;
   854     }
   855 }
   857 /* SSE2 version of S32_D565_Opaque()
   858  * portable version is in core/SkBlitRow_D16.cpp
   859  */
   860 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
   861                           const SkPMColor* SK_RESTRICT src, int count,
   862                           U8CPU alpha, int /*x*/, int /*y*/) {
   863     SkASSERT(255 == alpha);
   865     if (count <= 0) {
   866         return;
   867     }
   869     if (count >= 8) {
   870         while (((size_t)dst & 0x0F) != 0) {
   871             SkPMColor c = *src++;
   872             SkPMColorAssert(c);
   874             *dst++ = SkPixel32ToPixel16_ToU16(c);
   875             count--;
   876         }
   878         const __m128i* s = reinterpret_cast<const __m128i*>(src);
   879         __m128i* d = reinterpret_cast<__m128i*>(dst);
   880         __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
   881         __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
   882         __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
   884         while (count >= 8) {
   885             // Load 8 pixels of src.
   886             __m128i src_pixel1 = _mm_loadu_si128(s++);
   887             __m128i src_pixel2 = _mm_loadu_si128(s++);
   889             // Calculate result r.
   890             __m128i r1 = _mm_srli_epi32(src_pixel1,
   891                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
   892             r1 = _mm_and_si128(r1, r16_mask);
   893             __m128i r2 = _mm_srli_epi32(src_pixel2,
   894                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
   895             r2 = _mm_and_si128(r2, r16_mask);
   896             __m128i r = _mm_packs_epi32(r1, r2);
   898             // Calculate result g.
   899             __m128i g1 = _mm_srli_epi32(src_pixel1,
   900                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
   901             g1 = _mm_and_si128(g1, g16_mask);
   902             __m128i g2 = _mm_srli_epi32(src_pixel2,
   903                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
   904             g2 = _mm_and_si128(g2, g16_mask);
   905             __m128i g = _mm_packs_epi32(g1, g2);
   907             // Calculate result b.
   908             __m128i b1 = _mm_srli_epi32(src_pixel1,
   909                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
   910             b1 = _mm_and_si128(b1, b16_mask);
   911             __m128i b2 = _mm_srli_epi32(src_pixel2,
   912                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
   913             b2 = _mm_and_si128(b2, b16_mask);
   914             __m128i b = _mm_packs_epi32(b1, b2);
   916             // Store 8 16-bit colors in dst.
   917             __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
   918             _mm_store_si128(d++, d_pixel);
   919             count -= 8;
   920         }
   921         src = reinterpret_cast<const SkPMColor*>(s);
   922         dst = reinterpret_cast<uint16_t*>(d);
   923     }
   925     if (count > 0) {
   926         do {
   927             SkPMColor c = *src++;
   928             SkPMColorAssert(c);
   929             *dst++ = SkPixel32ToPixel16_ToU16(c);
   930         } while (--count != 0);
   931     }
   932 }
   934 /* SSE2 version of S32A_D565_Opaque()
   935  * portable version is in core/SkBlitRow_D16.cpp
   936  */
   937 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
   938                            const SkPMColor* SK_RESTRICT src,
   939                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
   940     SkASSERT(255 == alpha);
   942     if (count <= 0) {
   943         return;
   944     }
   946     if (count >= 8) {
   947         // Make dst 16 bytes alignment
   948         while (((size_t)dst & 0x0F) != 0) {
   949             SkPMColor c = *src++;
   950             if (c) {
   951               *dst = SkSrcOver32To16(c, *dst);
   952             }
   953             dst += 1;
   954             count--;
   955         }
   957         const __m128i* s = reinterpret_cast<const __m128i*>(src);
   958         __m128i* d = reinterpret_cast<__m128i*>(dst);
   959         __m128i var255 = _mm_set1_epi16(255);
   960         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
   961         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
   962         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
   964         while (count >= 8) {
   965             // Load 8 pixels of src.
   966             __m128i src_pixel1 = _mm_loadu_si128(s++);
   967             __m128i src_pixel2 = _mm_loadu_si128(s++);
   969             // Check whether src pixels are equal to 0 and get the highest bit
   970             // of each byte of result, if src pixels are all zero, src_cmp1 and
   971             // src_cmp2 will be 0xFFFF.
   972             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
   973                                              _mm_setzero_si128()));
   974             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
   975                                              _mm_setzero_si128()));
   976             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
   977                 d++;
   978                 count -= 8;
   979                 continue;
   980             }
   982             // Load 8 pixels of dst.
   983             __m128i dst_pixel = _mm_load_si128(d);
   985             // Extract A from src.
   986             __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
   987             sa1 = _mm_srli_epi32(sa1, 24);
   988             __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
   989             sa2 = _mm_srli_epi32(sa2, 24);
   990             __m128i sa = _mm_packs_epi32(sa1, sa2);
   992             // Extract R from src.
   993             __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
   994             sr1 = _mm_srli_epi32(sr1, 24);
   995             __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
   996             sr2 = _mm_srli_epi32(sr2, 24);
   997             __m128i sr = _mm_packs_epi32(sr1, sr2);
   999             // Extract G from src.
  1000             __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
  1001             sg1 = _mm_srli_epi32(sg1, 24);
  1002             __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
  1003             sg2 = _mm_srli_epi32(sg2, 24);
  1004             __m128i sg = _mm_packs_epi32(sg1, sg2);
  1006             // Extract B from src.
  1007             __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
  1008             sb1 = _mm_srli_epi32(sb1, 24);
  1009             __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
  1010             sb2 = _mm_srli_epi32(sb2, 24);
  1011             __m128i sb = _mm_packs_epi32(sb1, sb2);
  1013             // Extract R G B from dst.
  1014             __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
  1015             dr = _mm_and_si128(dr, r16_mask);
  1016             __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
  1017             dg = _mm_and_si128(dg, g16_mask);
  1018             __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
  1019             db = _mm_and_si128(db, b16_mask);
  1021             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
  1023             // Calculate R G B of result.
  1024             // Original algorithm is in SkSrcOver32To16().
  1025             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
  1026             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
  1027             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
  1028             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
  1029             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
  1030             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
  1032             // Pack R G B into 16-bit color.
  1033             __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
  1035             // Store 8 16-bit colors in dst.
  1036             _mm_store_si128(d++, d_pixel);
  1037             count -= 8;
  1040         src = reinterpret_cast<const SkPMColor*>(s);
  1041         dst = reinterpret_cast<uint16_t*>(d);
  1044     if (count > 0) {
  1045         do {
  1046             SkPMColor c = *src++;
  1047             SkPMColorAssert(c);
  1048             if (c) {
  1049                 *dst = SkSrcOver32To16(c, *dst);
  1051             dst += 1;
  1052         } while (--count != 0);
  1056 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
  1057                                  const SkPMColor* SK_RESTRICT src,
  1058                                  int count, U8CPU alpha, int x, int y) {
  1059     SkASSERT(255 == alpha);
  1061     if (count <= 0) {
  1062         return;
  1065     if (count >= 8) {
  1066         while (((size_t)dst & 0x0F) != 0) {
  1067             DITHER_565_SCAN(y);
  1068             SkPMColor c = *src++;
  1069             SkPMColorAssert(c);
  1071             unsigned dither = DITHER_VALUE(x);
  1072             *dst++ = SkDitherRGB32To565(c, dither);
  1073             DITHER_INC_X(x);
  1074             count--;
  1077         unsigned short dither_value[8];
  1078         __m128i dither;
  1079 #ifdef ENABLE_DITHER_MATRIX_4X4
  1080         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
  1081         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
  1082         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
  1083         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
  1084         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
  1085 #else
  1086         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
  1087         dither_value[0] = dither_value[4] = (dither_scan
  1088                                              >> (((x) & 3) << 2)) & 0xF;
  1089         dither_value[1] = dither_value[5] = (dither_scan
  1090                                              >> (((x + 1) & 3) << 2)) & 0xF;
  1091         dither_value[2] = dither_value[6] = (dither_scan
  1092                                              >> (((x + 2) & 3) << 2)) & 0xF;
  1093         dither_value[3] = dither_value[7] = (dither_scan
  1094                                              >> (((x + 3) & 3) << 2)) & 0xF;
  1095 #endif
  1096         dither = _mm_loadu_si128((__m128i*) dither_value);
  1098         const __m128i* s = reinterpret_cast<const __m128i*>(src);
  1099         __m128i* d = reinterpret_cast<__m128i*>(dst);
  1101         while (count >= 8) {
  1102             // Load 8 pixels of src.
  1103             __m128i src_pixel1 = _mm_loadu_si128(s++);
  1104             __m128i src_pixel2 = _mm_loadu_si128(s++);
  1106             // Extract R from src.
  1107             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
  1108             sr1 = _mm_srli_epi32(sr1, 24);
  1109             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
  1110             sr2 = _mm_srli_epi32(sr2, 24);
  1111             __m128i sr = _mm_packs_epi32(sr1, sr2);
  1113             // SkDITHER_R32To565(sr, dither)
  1114             __m128i sr_offset = _mm_srli_epi16(sr, 5);
  1115             sr = _mm_add_epi16(sr, dither);
  1116             sr = _mm_sub_epi16(sr, sr_offset);
  1117             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
  1119             // Extract G from src.
  1120             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
  1121             sg1 = _mm_srli_epi32(sg1, 24);
  1122             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
  1123             sg2 = _mm_srli_epi32(sg2, 24);
  1124             __m128i sg = _mm_packs_epi32(sg1, sg2);
  1126             // SkDITHER_R32To565(sg, dither)
  1127             __m128i sg_offset = _mm_srli_epi16(sg, 6);
  1128             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
  1129             sg = _mm_sub_epi16(sg, sg_offset);
  1130             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
  1132             // Extract B from src.
  1133             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
  1134             sb1 = _mm_srli_epi32(sb1, 24);
  1135             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
  1136             sb2 = _mm_srli_epi32(sb2, 24);
  1137             __m128i sb = _mm_packs_epi32(sb1, sb2);
  1139             // SkDITHER_R32To565(sb, dither)
  1140             __m128i sb_offset = _mm_srli_epi16(sb, 5);
  1141             sb = _mm_add_epi16(sb, dither);
  1142             sb = _mm_sub_epi16(sb, sb_offset);
  1143             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
  1145             // Pack and store 16-bit dst pixel.
  1146             __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
  1147             _mm_store_si128(d++, d_pixel);
  1149             count -= 8;
  1150             x += 8;
  1153         src = reinterpret_cast<const SkPMColor*>(s);
  1154         dst = reinterpret_cast<uint16_t*>(d);
  1157     if (count > 0) {
  1158         DITHER_565_SCAN(y);
  1159         do {
  1160             SkPMColor c = *src++;
  1161             SkPMColorAssert(c);
  1163             unsigned dither = DITHER_VALUE(x);
  1164             *dst++ = SkDitherRGB32To565(c, dither);
  1165             DITHER_INC_X(x);
  1166         } while (--count != 0);
  1170 /* SSE2 version of S32A_D565_Opaque_Dither()
  1171  * portable version is in core/SkBlitRow_D16.cpp
  1172  */
  1173 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
  1174                                   const SkPMColor* SK_RESTRICT src,
  1175                                   int count, U8CPU alpha, int x, int y) {
  1176     SkASSERT(255 == alpha);
  1178     if (count <= 0) {
  1179         return;
  1182     if (count >= 8) {
  1183         while (((size_t)dst & 0x0F) != 0) {
  1184             DITHER_565_SCAN(y);
  1185             SkPMColor c = *src++;
  1186             SkPMColorAssert(c);
  1187             if (c) {
  1188                 unsigned a = SkGetPackedA32(c);
  1190                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
  1192                 unsigned sr = SkGetPackedR32(c);
  1193                 unsigned sg = SkGetPackedG32(c);
  1194                 unsigned sb = SkGetPackedB32(c);
  1195                 sr = SkDITHER_R32_FOR_565(sr, d);
  1196                 sg = SkDITHER_G32_FOR_565(sg, d);
  1197                 sb = SkDITHER_B32_FOR_565(sb, d);
  1199                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1200                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
  1201                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1202                 // now src and dst expanded are in g:11 r:10 x:1 b:10
  1203                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1205             dst += 1;
  1206             DITHER_INC_X(x);
  1207             count--;
  1210         unsigned short dither_value[8];
  1211         __m128i dither, dither_cur;
  1212 #ifdef ENABLE_DITHER_MATRIX_4X4
  1213         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
  1214         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
  1215         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
  1216         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
  1217         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
  1218 #else
  1219         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
  1220         dither_value[0] = dither_value[4] = (dither_scan
  1221                                              >> (((x) & 3) << 2)) & 0xF;
  1222         dither_value[1] = dither_value[5] = (dither_scan
  1223                                              >> (((x + 1) & 3) << 2)) & 0xF;
  1224         dither_value[2] = dither_value[6] = (dither_scan
  1225                                              >> (((x + 2) & 3) << 2)) & 0xF;
  1226         dither_value[3] = dither_value[7] = (dither_scan
  1227                                              >> (((x + 3) & 3) << 2)) & 0xF;
  1228 #endif
  1229         dither = _mm_loadu_si128((__m128i*) dither_value);
  1231         const __m128i* s = reinterpret_cast<const __m128i*>(src);
  1232         __m128i* d = reinterpret_cast<__m128i*>(dst);
  1233         __m128i var256 = _mm_set1_epi16(256);
  1234         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
  1235         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
  1236         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
  1238         while (count >= 8) {
  1239             // Load 8 pixels of src and dst.
  1240             __m128i src_pixel1 = _mm_loadu_si128(s++);
  1241             __m128i src_pixel2 = _mm_loadu_si128(s++);
  1242             __m128i dst_pixel = _mm_load_si128(d);
  1244             // Extract A from src.
  1245             __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
  1246             sa1 = _mm_srli_epi32(sa1, 24);
  1247             __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
  1248             sa2 = _mm_srli_epi32(sa2, 24);
  1249             __m128i sa = _mm_packs_epi32(sa1, sa2);
  1251             // Calculate current dither value.
  1252             dither_cur = _mm_mullo_epi16(dither,
  1253                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
  1254             dither_cur = _mm_srli_epi16(dither_cur, 8);
  1256             // Extract R from src.
  1257             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
  1258             sr1 = _mm_srli_epi32(sr1, 24);
  1259             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
  1260             sr2 = _mm_srli_epi32(sr2, 24);
  1261             __m128i sr = _mm_packs_epi32(sr1, sr2);
  1263             // SkDITHER_R32_FOR_565(sr, d)
  1264             __m128i sr_offset = _mm_srli_epi16(sr, 5);
  1265             sr = _mm_add_epi16(sr, dither_cur);
  1266             sr = _mm_sub_epi16(sr, sr_offset);
  1268             // Expand sr.
  1269             sr = _mm_slli_epi16(sr, 2);
  1271             // Extract G from src.
  1272             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
  1273             sg1 = _mm_srli_epi32(sg1, 24);
  1274             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
  1275             sg2 = _mm_srli_epi32(sg2, 24);
  1276             __m128i sg = _mm_packs_epi32(sg1, sg2);
  1278             // sg = SkDITHER_G32_FOR_565(sg, d).
  1279             __m128i sg_offset = _mm_srli_epi16(sg, 6);
  1280             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
  1281             sg = _mm_sub_epi16(sg, sg_offset);
  1283             // Expand sg.
  1284             sg = _mm_slli_epi16(sg, 3);
  1286             // Extract B from src.
  1287             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
  1288             sb1 = _mm_srli_epi32(sb1, 24);
  1289             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
  1290             sb2 = _mm_srli_epi32(sb2, 24);
  1291             __m128i sb = _mm_packs_epi32(sb1, sb2);
  1293             // sb = SkDITHER_B32_FOR_565(sb, d).
  1294             __m128i sb_offset = _mm_srli_epi16(sb, 5);
  1295             sb = _mm_add_epi16(sb, dither_cur);
  1296             sb = _mm_sub_epi16(sb, sb_offset);
  1298             // Expand sb.
  1299             sb = _mm_slli_epi16(sb, 2);
  1301             // Extract R G B from dst.
  1302             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
  1303             dr = _mm_and_si128(dr, r16_mask);
  1304             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
  1305             dg = _mm_and_si128(dg, g16_mask);
  1306             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
  1307             db = _mm_and_si128(db, b16_mask);
  1309             // SkAlpha255To256(255 - a) >> 3
  1310             __m128i isa = _mm_sub_epi16(var256, sa);
  1311             isa = _mm_srli_epi16(isa, 3);
  1313             dr = _mm_mullo_epi16(dr, isa);
  1314             dr = _mm_add_epi16(dr, sr);
  1315             dr = _mm_srli_epi16(dr, 5);
  1317             dg = _mm_mullo_epi16(dg, isa);
  1318             dg = _mm_add_epi16(dg, sg);
  1319             dg = _mm_srli_epi16(dg, 5);
  1321             db = _mm_mullo_epi16(db, isa);
  1322             db = _mm_add_epi16(db, sb);
  1323             db = _mm_srli_epi16(db, 5);
  1325             // Package and store dst pixel.
  1326             __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
  1327             _mm_store_si128(d++, d_pixel);
  1329             count -= 8;
  1330             x += 8;
  1333         src = reinterpret_cast<const SkPMColor*>(s);
  1334         dst = reinterpret_cast<uint16_t*>(d);
  1337     if (count > 0) {
  1338         DITHER_565_SCAN(y);
  1339         do {
  1340             SkPMColor c = *src++;
  1341             SkPMColorAssert(c);
  1342             if (c) {
  1343                 unsigned a = SkGetPackedA32(c);
  1345                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
  1347                 unsigned sr = SkGetPackedR32(c);
  1348                 unsigned sg = SkGetPackedG32(c);
  1349                 unsigned sb = SkGetPackedB32(c);
  1350                 sr = SkDITHER_R32_FOR_565(sr, d);
  1351                 sg = SkDITHER_G32_FOR_565(sg, d);
  1352                 sb = SkDITHER_B32_FOR_565(sb, d);
  1354                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1355                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
  1356                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1357                 // now src and dst expanded are in g:11 r:10 x:1 b:10
  1358                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1360             dst += 1;
  1361             DITHER_INC_X(x);
  1362         } while (--count != 0);

mercurial