The Tor Browser: gfx/skia/trunk/src/opts/SkBlitRow_opts

Conditionally enable double-key logic according to private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
One reservation remains: how to convince FindCookie callers to test the
condition and pass a nullptr when double-key logic is disabled.

     1 /*

     2  * Copyright 2012 The Android Open Source Project

     3  *

     4  * Use of this source code is governed by a BSD-style license that can be

     5  * found in the LICENSE file.

     6  */

     9 #include "SkBlitRow_opts_SSE2.h"

    10 #include "SkBitmapProcState_opts_SSE2.h"

    11 #include "SkColorPriv.h"

    12 #include "SkColor_opts_SSE2.h"

    13 #include "SkDither.h"

    14 #include "SkUtils.h"

    16 #include <emmintrin.h>

    18 /* SSE2 version of S32_Blend_BlitRow32()

    19  * portable version is in core/SkBlitRow_D32.cpp

    20  */

    21 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

    22                               const SkPMColor* SK_RESTRICT src,

    23                               int count, U8CPU alpha) {

    24     SkASSERT(alpha <= 255);

    25     if (count <= 0) {

    26         return;

    27     }

    29     uint32_t src_scale = SkAlpha255To256(alpha);

    30     uint32_t dst_scale = 256 - src_scale;

    32     if (count >= 4) {

    33         SkASSERT(((size_t)dst & 0x03) == 0);

    34         while (((size_t)dst & 0x0F) != 0) {

    35             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

    36             src++;

    37             dst++;

    38             count--;

    39         }

    41         const __m128i *s = reinterpret_cast<const __m128i*>(src);

    42         __m128i *d = reinterpret_cast<__m128i*>(dst);

    43         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

    44         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

    46         // Move scale factors to upper byte of word

    47         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

    48         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);

    49         while (count >= 4) {

    50             // Load 4 pixels each of src and dest.

    51             __m128i src_pixel = _mm_loadu_si128(s);

    52             __m128i dst_pixel = _mm_load_si128(d);

    54             // Interleave Atom port 0/1 operations based on the execution port

    55             // constraints that multiply can only be executed on port 0 (while

    56             // boolean operations can be executed on either port 0 or port 1)

    57             // because GCC currently doesn't do a good job scheduling

    58             // instructions based on these constraints.

    60             // Get red and blue pixels into lower byte of each word.

    61             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)

    62             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

    64             // Multiply by scale.

    65             // (4 x (0, rs.h, 0, bs.h))

    66             // where rs.h stands for the higher byte of r * scale, and

    67             // bs.h the higher byte of b * scale.

    68             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

    70             // Get alpha and green pixels into higher byte of each word.

    71             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)

    72             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

    74             // Multiply by scale.

    75             // (4 x (as.h, as.l, gs.h, gs.l))

    76             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

    78             // Clear the lower byte of the a*scale and g*scale results

    79             // (4 x (as.h, 0, gs.h, 0))

    80             src_ag = _mm_and_si128(src_ag, ag_mask);

    82             // Operations the destination pixels are the same as on the

    83             // source pixels. See the comments above.

    84             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

    85             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);

    86             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);

    87             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);

    88             dst_ag = _mm_and_si128(dst_ag, ag_mask);

    90             // Combine back into RGBA.

    91             // (4 x (as.h, rs.h, gs.h, bs.h))

    92             src_pixel = _mm_or_si128(src_rb, src_ag);

    93             dst_pixel = _mm_or_si128(dst_rb, dst_ag);

    95             // Add result

    96             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

    97             _mm_store_si128(d, result);

    98             s++;

    99             d++;

   100             count -= 4;

   101         }

   102         src = reinterpret_cast<const SkPMColor*>(s);

   103         dst = reinterpret_cast<SkPMColor*>(d);

   104     }

   106     while (count > 0) {

   107         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

   108         src++;

   109         dst++;

   110         count--;

   111     }

   112 }

   114 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

   115                                 const SkPMColor* SK_RESTRICT src,

   116                                 int count, U8CPU alpha) {

   117     SkASSERT(alpha == 255);

   118     if (count <= 0) {

   119         return;

   120     }

   122     if (count >= 4) {

   123         SkASSERT(((size_t)dst & 0x03) == 0);

   124         while (((size_t)dst & 0x0F) != 0) {

   125             *dst = SkPMSrcOver(*src, *dst);

   126             src++;

   127             dst++;

   128             count--;

   129         }

   131         const __m128i *s = reinterpret_cast<const __m128i*>(src);

   132         __m128i *d = reinterpret_cast<__m128i*>(dst);

   133 #ifdef SK_USE_ACCURATE_BLENDING

   134         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

   135         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)

   136         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)

   137         while (count >= 4) {

   138             // Load 4 pixels

   139             __m128i src_pixel = _mm_loadu_si128(s);

   140             __m128i dst_pixel = _mm_load_si128(d);

   142             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

   143             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

   144             // Shift alphas down to lower 8 bits of each quad.

   145             __m128i alpha = _mm_srli_epi32(src_pixel, 24);

   147             // Copy alpha to upper 3rd byte of each quad

   148             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

   150             // Subtract alphas from 255, to get 0..255

   151             alpha = _mm_sub_epi16(c_255, alpha);

   153             // Multiply by red and blue by src alpha.

   154             dst_rb = _mm_mullo_epi16(dst_rb, alpha);

   155             // Multiply by alpha and green by src alpha.

   156             dst_ag = _mm_mullo_epi16(dst_ag, alpha);

   158             // dst_rb_low = (dst_rb >> 8)

   159             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);

   160             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

   162             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8

   163             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);

   164             dst_rb = _mm_add_epi16(dst_rb, c_128);

   165             dst_rb = _mm_srli_epi16(dst_rb, 8);

   167             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask

   168             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);

   169             dst_ag = _mm_add_epi16(dst_ag, c_128);

   170             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

   172             // Combine back into RGBA.

   173             dst_pixel = _mm_or_si128(dst_rb, dst_ag);

   175             // Add result

   176             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

   177             _mm_store_si128(d, result);

   178             s++;

   179             d++;

   180             count -= 4;

   181         }

   182     #else

   183         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

   184         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)

   185         while (count >= 4) {

   186             // Load 4 pixels

   187             __m128i src_pixel = _mm_loadu_si128(s);

   188             __m128i dst_pixel = _mm_load_si128(d);

   190             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

   191             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

   193             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)

   194             __m128i alpha = _mm_srli_epi16(src_pixel, 8);

   196             // (a0, a0, a1, a1, a2, g2, a3, g3)

   197             alpha = _mm_shufflehi_epi16(alpha, 0xF5);

   199             // (a0, a0, a1, a1, a2, a2, a3, a3)

   200             alpha = _mm_shufflelo_epi16(alpha, 0xF5);

   202             // Subtract alphas from 256, to get 1..256

   203             alpha = _mm_sub_epi16(c_256, alpha);

   205             // Multiply by red and blue by src alpha.

   206             dst_rb = _mm_mullo_epi16(dst_rb, alpha);

   207             // Multiply by alpha and green by src alpha.

   208             dst_ag = _mm_mullo_epi16(dst_ag, alpha);

   210             // Divide by 256.

   211             dst_rb = _mm_srli_epi16(dst_rb, 8);

   213             // Mask out high bits (already in the right place)

   214             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

   216             // Combine back into RGBA.

   217             dst_pixel = _mm_or_si128(dst_rb, dst_ag);

   219             // Add result

   220             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

   221             _mm_store_si128(d, result);

   222             s++;

   223             d++;

   224             count -= 4;

   225         }

   226 #endif

   227         src = reinterpret_cast<const SkPMColor*>(s);

   228         dst = reinterpret_cast<SkPMColor*>(d);

   229     }

   231     while (count > 0) {

   232         *dst = SkPMSrcOver(*src, *dst);

   233         src++;

   234         dst++;

   235         count--;

   236     }

   237 }

   239 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

   240                                const SkPMColor* SK_RESTRICT src,

   241                                int count, U8CPU alpha) {

   242     SkASSERT(alpha <= 255);

   243     if (count <= 0) {

   244         return;

   245     }

   247     if (count >= 4) {

   248         while (((size_t)dst & 0x0F) != 0) {

   249             *dst = SkBlendARGB32(*src, *dst, alpha);

   250             src++;

   251             dst++;

   252             count--;

   253         }

   255         uint32_t src_scale = SkAlpha255To256(alpha);

   257         const __m128i *s = reinterpret_cast<const __m128i*>(src);

   258         __m128i *d = reinterpret_cast<__m128i*>(dst);

   259         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

   260         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

   261         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)

   262         while (count >= 4) {

   263             // Load 4 pixels each of src and dest.

   264             __m128i src_pixel = _mm_loadu_si128(s);

   265             __m128i dst_pixel = _mm_load_si128(d);

   267             // Get red and blue pixels into lower byte of each word.

   268             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

   269             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

   271             // Get alpha and green into lower byte of each word.

   272             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

   273             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

   275             // Put per-pixel alpha in low byte of each word.

   276             // After the following two statements, the dst_alpha looks like

   277             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)

   278             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

   279             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

   281             // dst_alpha = dst_alpha * src_scale

   282             // Because src_scales are in the higher byte of each word and

   283             // we use mulhi here, the resulting alpha values are already

   284             // in the right place and don't need to be divided by 256.

   285             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)

   286             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

   288             // Subtract alphas from 256, to get 1..256

   289             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

   291             // Multiply red and blue by dst pixel alpha.

   292             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

   293             // Multiply alpha and green by dst pixel alpha.

   294             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

   296             // Multiply red and blue by global alpha.

   297             // (4 x (0, rs.h, 0, bs.h))

   298             // where rs.h stands for the higher byte of r * src_scale,

   299             // and bs.h the higher byte of b * src_scale.

   300             // Again, because we use mulhi, the resuling red and blue

   301             // values are already in the right place and don't need to

   302             // be divided by 256.

   303             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

   304             // Multiply alpha and green by global alpha.

   305             // (4 x (0, as.h, 0, gs.h))

   306             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

   308             // Divide by 256.

   309             dst_rb = _mm_srli_epi16(dst_rb, 8);

   311             // Mask out low bits (goodies already in the right place; no need to divide)

   312             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

   313             // Shift alpha and green to higher byte of each word.

   314             // (4 x (as.h, 0, gs.h, 0))

   315             src_ag = _mm_slli_epi16(src_ag, 8);

   317             // Combine back into RGBA.

   318             dst_pixel = _mm_or_si128(dst_rb, dst_ag);

   319             src_pixel = _mm_or_si128(src_rb, src_ag);

   321             // Add two pixels into result.

   322             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

   323             _mm_store_si128(d, result);

   324             s++;

   325             d++;

   326             count -= 4;

   327         }

   328         src = reinterpret_cast<const SkPMColor*>(s);

   329         dst = reinterpret_cast<SkPMColor*>(d);

   330     }

   332     while (count > 0) {

   333         *dst = SkBlendARGB32(*src, *dst, alpha);

   334         src++;

   335         dst++;

   336         count--;

   337     }

   338 }

   340 /* SSE2 version of Color32()

   341  * portable version is in core/SkBlitRow_D32.cpp

   342  */

   343 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,

   344                   SkPMColor color) {

   346     if (count <= 0) {

   347         return;

   348     }

   350     if (0 == color) {

   351         if (src != dst) {

   352             memcpy(dst, src, count * sizeof(SkPMColor));

   353         }

   354         return;

   355     }

   357     unsigned colorA = SkGetPackedA32(color);

   358     if (255 == colorA) {

   359         sk_memset32(dst, color, count);

   360     } else {

   361         unsigned scale = 256 - SkAlpha255To256(colorA);

   363         if (count >= 4) {

   364             SkASSERT(((size_t)dst & 0x03) == 0);

   365             while (((size_t)dst & 0x0F) != 0) {

   366                 *dst = color + SkAlphaMulQ(*src, scale);

   367                 src++;

   368                 dst++;

   369                 count--;

   370             }

   372             const __m128i *s = reinterpret_cast<const __m128i*>(src);

   373             __m128i *d = reinterpret_cast<__m128i*>(dst);

   374             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

   375             __m128i src_scale_wide = _mm_set1_epi16(scale);

   376             __m128i color_wide = _mm_set1_epi32(color);

   377             while (count >= 4) {

   378                 // Load 4 pixels each of src and dest.

   379                 __m128i src_pixel = _mm_loadu_si128(s);

   381                 // Get red and blue pixels into lower byte of each word.

   382                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

   384                 // Get alpha and green into lower byte of each word.

   385                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

   387                 // Multiply by scale.

   388                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

   389                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

   391                 // Divide by 256.

   392                 src_rb = _mm_srli_epi16(src_rb, 8);

   393                 src_ag = _mm_andnot_si128(rb_mask, src_ag);

   395                 // Combine back into RGBA.

   396                 src_pixel = _mm_or_si128(src_rb, src_ag);

   398                 // Add color to result.

   399                 __m128i result = _mm_add_epi8(color_wide, src_pixel);

   401                 // Store result.

   402                 _mm_store_si128(d, result);

   403                 s++;

   404                 d++;

   405                 count -= 4;

   406             }

   407             src = reinterpret_cast<const SkPMColor*>(s);

   408             dst = reinterpret_cast<SkPMColor*>(d);

   409          }

   411         while (count > 0) {

   412             *dst = color + SkAlphaMulQ(*src, scale);

   413             src += 1;

   414             dst += 1;

   415             count--;

   416         }

   417     }

   418 }

   420 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,

   421                                size_t maskRB, SkColor origColor,

   422                                int width, int height) {

   423     SkPMColor color = SkPreMultiplyColor(origColor);

   424     size_t dstOffset = dstRB - (width << 2);

   425     size_t maskOffset = maskRB - width;

   426     SkPMColor* dst = (SkPMColor *)device;

   427     const uint8_t* mask = (const uint8_t*)maskPtr;

   428     do {

   429         int count = width;

   430         if (count >= 4) {

   431             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {

   432                 *dst = SkBlendARGB32(color, *dst, *mask);

   433                 mask++;

   434                 dst++;

   435                 count--;

   436             }

   437             __m128i *d = reinterpret_cast<__m128i*>(dst);

   438             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

   439             __m128i c_256 = _mm_set1_epi16(256);

   440             __m128i c_1 = _mm_set1_epi16(1);

   441             __m128i src_pixel = _mm_set1_epi32(color);

   442             while (count >= 4) {

   443                 // Load 4 pixels each of src and dest.

   444                 __m128i dst_pixel = _mm_load_si128(d);

   446                 //set the aphla value

   447                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\

   448                                 0, *(mask+3),0, \

   449                                 *(mask+2),0, *(mask+2),\

   450                                 0,*(mask+1), 0,*(mask+1),\

   451                                 0, *mask,0,*mask);

   453                 //call SkAlpha255To256()

   454                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

   456                 // Get red and blue pixels into lower byte of each word.

   457                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

   458                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

   460                 // Get alpha and green into lower byte of each word.

   461                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

   462                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

   464                 // Put per-pixel alpha in low byte of each word.

   465                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

   466                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

   468                 // dst_alpha = dst_alpha * src_scale

   469                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

   471                 // Divide by 256.

   472                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);

   474                 // Subtract alphas from 256, to get 1..256

   475                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

   476                 // Multiply red and blue by dst pixel alpha.

   477                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

   478                 // Multiply alpha and green by dst pixel alpha.

   479                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

   481                 // Multiply red and blue by global alpha.

   482                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

   483                 // Multiply alpha and green by global alpha.

   484                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

   485                 // Divide by 256.

   486                 dst_rb = _mm_srli_epi16(dst_rb, 8);

   487                 src_rb = _mm_srli_epi16(src_rb, 8);

   489                 // Mask out low bits (goodies already in the right place; no need to divide)

   490                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

   491                 src_ag = _mm_andnot_si128(rb_mask, src_ag);

   493                 // Combine back into RGBA.

   494                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

   495                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

   497                 // Add two pixels into result.

   498                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);

   499                 _mm_store_si128(d, result);

   500                 // load the next 4 pixel

   501                 mask = mask + 4;

   502                 d++;

   503                 count -= 4;

   504             }

   505             dst = reinterpret_cast<SkPMColor *>(d);

   506         }

   507         while(count > 0) {

   508             *dst= SkBlendARGB32(color, *dst, *mask);

   509             dst += 1;

   510             mask++;

   511             count --;

   512         }

   513         dst = (SkPMColor *)((char*)dst + dstOffset);

   514         mask += maskOffset;

   515     } while (--height != 0);

   516 }

   518 // The following (left) shifts cause the top 5 bits of the mask components to

   519 // line up with the corresponding components in an SkPMColor.

   520 // Note that the mask's RGB16 order may differ from the SkPMColor order.

   521 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)

   522 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)

   523 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

   525 #if SK_R16x5_R32x5_SHIFT == 0

   526     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)

   527 #elif SK_R16x5_R32x5_SHIFT > 0

   528     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))

   529 #else

   530     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))

   531 #endif

   533 #if SK_G16x5_G32x5_SHIFT == 0

   534     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)

   535 #elif SK_G16x5_G32x5_SHIFT > 0

   536     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))

   537 #else

   538     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))

   539 #endif

   541 #if SK_B16x5_B32x5_SHIFT == 0

   542     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)

   543 #elif SK_B16x5_B32x5_SHIFT > 0

   544     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))

   545 #else

   546     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))

   547 #endif

   549 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

   550                                  __m128i &mask, __m128i &srcA) {

   551     // In the following comments, the components of src, dst and mask are

   552     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

   553     // by an R, G, B, or A suffix. Components of one of the four pixels that

   554     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

   555     // example is the blue channel of the second destination pixel. Memory

   556     // layout is shown for an ARGB byte order in a color value.

   558     // src and srcA store 8-bit values interleaved with zeros.

   559     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

   560     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

   561     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)

   562     // mask stores 16-bit values (compressed three channels) interleaved with zeros.

   563     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

   564     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

   565     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

   567     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

   568     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

   569     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

   570                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));

   572     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

   573     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

   574                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));

   576     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

   577     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

   578                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));

   580     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

   581     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

   582     // 8-bit position

   583     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

   584     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

   585     mask = _mm_or_si128(_mm_or_si128(r, g), b);

   587     // Interleave R,G,B into the lower byte of word.

   588     // i.e. split the sixteen 8-bit values from mask into two sets of eight

   589     // 16-bit values, padded by zero.

   590     __m128i maskLo, maskHi;

   591     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

   592     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

   593     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

   594     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

   596     // Upscale from 0..31 to 0..32

   597     // (allows to replace division by left-shift further down)

   598     // Left-shift each component by 4 and add the result back to that component,

   599     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

   600     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

   601     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

   603     // Multiply each component of maskLo and maskHi by srcA

   604     maskLo = _mm_mullo_epi16(maskLo, srcA);

   605     maskHi = _mm_mullo_epi16(maskHi, srcA);

   607     // Left shift mask components by 8 (divide by 256)

   608     maskLo = _mm_srli_epi16(maskLo, 8);

   609     maskHi = _mm_srli_epi16(maskHi, 8);

   611     // Interleave R,G,B into the lower byte of the word

   612     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

   613     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

   614     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

   615     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

   617     // mask = (src - dst) * mask

   618     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

   619     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

   621     // mask = (src - dst) * mask >> 5

   622     maskLo = _mm_srai_epi16(maskLo, 5);

   623     maskHi = _mm_srai_epi16(maskHi, 5);

   625     // Add two pixels into result.

   626     // result = dst + ((src - dst) * mask >> 5)

   627     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);

   628     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

   630     // Pack into 4 32bit dst pixels.

   631     // resultLo and resultHi contain eight 16-bit components (two pixels) each.

   632     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

   633     // clamping to 255 if necessary.

   634     return _mm_packus_epi16(resultLo, resultHi);

   635 }

   637 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

   638                                        __m128i &mask) {

   639     // In the following comments, the components of src, dst and mask are

   640     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

   641     // by an R, G, B, or A suffix. Components of one of the four pixels that

   642     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

   643     // example is the blue channel of the second destination pixel. Memory

   644     // layout is shown for an ARGB byte order in a color value.

   646     // src and srcA store 8-bit values interleaved with zeros.

   647     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

   648     // mask stores 16-bit values (shown as high and low bytes) interleaved with

   649     // zeros

   650     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

   651     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

   653     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

   654     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

   655     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

   656                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));

   658     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

   659     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

   660                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));

   662     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

   663     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

   664                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));

   666     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

   667     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

   668     // 8-bit position

   669     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

   670     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

   671     mask = _mm_or_si128(_mm_or_si128(r, g), b);

   673     // Interleave R,G,B into the lower byte of word.

   674     // i.e. split the sixteen 8-bit values from mask into two sets of eight

   675     // 16-bit values, padded by zero.

   676     __m128i maskLo, maskHi;

   677     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

   678     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

   679     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

   680     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

   682     // Upscale from 0..31 to 0..32

   683     // (allows to replace division by left-shift further down)

   684     // Left-shift each component by 4 and add the result back to that component,

   685     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

   686     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

   687     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

   689     // Interleave R,G,B into the lower byte of the word

   690     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

   691     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

   692     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

   693     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

   695     // mask = (src - dst) * mask

   696     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

   697     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

   699     // mask = (src - dst) * mask >> 5

   700     maskLo = _mm_srai_epi16(maskLo, 5);

   701     maskHi = _mm_srai_epi16(maskHi, 5);

   703     // Add two pixels into result.

   704     // result = dst + ((src - dst) * mask >> 5)

   705     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);

   706     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

   708     // Pack into 4 32bit dst pixels and force opaque.

   709     // resultLo and resultHi contain eight 16-bit components (two pixels) each.

   710     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

   711     // clamping to 255 if necessary. Set alpha components to 0xFF.

   712     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

   713                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

   714 }

   716 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

   717                          SkColor src, int width, SkPMColor) {

   718     if (width <= 0) {

   719         return;

   720     }

   722     int srcA = SkColorGetA(src);

   723     int srcR = SkColorGetR(src);

   724     int srcG = SkColorGetG(src);

   725     int srcB = SkColorGetB(src);

   727     srcA = SkAlpha255To256(srcA);

   729     if (width >= 4) {

   730         SkASSERT(((size_t)dst & 0x03) == 0);

   731         while (((size_t)dst & 0x0F) != 0) {

   732             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

   733             mask++;

   734             dst++;

   735             width--;

   736         }

   738         __m128i *d = reinterpret_cast<__m128i*>(dst);

   739         // Set alpha to 0xFF and replicate source four times in SSE register.

   740         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

   741         // Interleave with zeros to get two sets of four 16-bit values.

   742         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

   743         // Set srcA_sse to contain eight copies of srcA, padded with zero.

   744         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

   745         __m128i srcA_sse = _mm_set1_epi16(srcA);

   746         while (width >= 4) {

   747             // Load four destination pixels into dst_sse.

   748             __m128i dst_sse = _mm_load_si128(d);

   749             // Load four 16-bit masks into lower half of mask_sse.

   750             __m128i mask_sse = _mm_loadl_epi64(

   751                                    reinterpret_cast<const __m128i*>(mask));

   753             // Check whether masks are equal to 0 and get the highest bit

   754             // of each byte of result, if masks are all zero, we will get

   755             // pack_cmp to 0xFFFF

   756             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

   757                                              _mm_setzero_si128()));

   759             // if mask pixels are not all zero, we will blend the dst pixels

   760             if (pack_cmp != 0xFFFF) {

   761                 // Unpack 4 16bit mask pixels to

   762                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

   763                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

   764                 mask_sse = _mm_unpacklo_epi16(mask_sse,

   765                                               _mm_setzero_si128());

   767                 // Process 4 32bit dst pixels

   768                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

   769                                                    mask_sse, srcA_sse);

   770                 _mm_store_si128(d, result);

   771             }

   773             d++;

   774             mask += 4;

   775             width -= 4;

   776         }

   778         dst = reinterpret_cast<SkPMColor*>(d);

   779     }

   781     while (width > 0) {

   782         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

   783         mask++;

   784         dst++;

   785         width--;

   786     }

   787 }

   789 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

   790                                SkColor src, int width, SkPMColor opaqueDst) {

   791     if (width <= 0) {

   792         return;

   793     }

   795     int srcR = SkColorGetR(src);

   796     int srcG = SkColorGetG(src);

   797     int srcB = SkColorGetB(src);

   799     if (width >= 4) {

   800         SkASSERT(((size_t)dst & 0x03) == 0);

   801         while (((size_t)dst & 0x0F) != 0) {

   802             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

   803             mask++;

   804             dst++;

   805             width--;

   806         }

   808         __m128i *d = reinterpret_cast<__m128i*>(dst);

   809         // Set alpha to 0xFF and replicate source four times in SSE register.

   810         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

   811         // Set srcA_sse to contain eight copies of srcA, padded with zero.

   812         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

   813         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

   814         while (width >= 4) {

   815             // Load four destination pixels into dst_sse.

   816             __m128i dst_sse = _mm_load_si128(d);

   817             // Load four 16-bit masks into lower half of mask_sse.

   818             __m128i mask_sse = _mm_loadl_epi64(

   819                                    reinterpret_cast<const __m128i*>(mask));

   821             // Check whether masks are equal to 0 and get the highest bit

   822             // of each byte of result, if masks are all zero, we will get

   823             // pack_cmp to 0xFFFF

   824             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

   825                                              _mm_setzero_si128()));

   827             // if mask pixels are not all zero, we will blend the dst pixels

   828             if (pack_cmp != 0xFFFF) {

   829                 // Unpack 4 16bit mask pixels to

   830                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

   831                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

   832                 mask_sse = _mm_unpacklo_epi16(mask_sse,

   833                                               _mm_setzero_si128());

   835                 // Process 4 32bit dst pixels

   836                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

   837                                                          mask_sse);

   838                 _mm_store_si128(d, result);

   839             }

   841             d++;

   842             mask += 4;

   843             width -= 4;

   844         }

   846         dst = reinterpret_cast<SkPMColor*>(d);

   847     }

   849     while (width > 0) {

   850         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

   851         mask++;

   852         dst++;

   853         width--;

   854     }

   855 }

   857 /* SSE2 version of S32_D565_Opaque()

   858  * portable version is in core/SkBlitRow_D16.cpp

   859  */

   860 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

   861                           const SkPMColor* SK_RESTRICT src, int count,

   862                           U8CPU alpha, int /*x*/, int /*y*/) {

   863     SkASSERT(255 == alpha);

   865     if (count <= 0) {

   866         return;

   867     }

   869     if (count >= 8) {

   870         while (((size_t)dst & 0x0F) != 0) {

   871             SkPMColor c = *src++;

   872             SkPMColorAssert(c);

   874             *dst++ = SkPixel32ToPixel16_ToU16(c);

   875             count--;

   876         }

   878         const __m128i* s = reinterpret_cast<const __m128i*>(src);

   879         __m128i* d = reinterpret_cast<__m128i*>(dst);

   880         __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);

   881         __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);

   882         __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);

   884         while (count >= 8) {

   885             // Load 8 pixels of src.

   886             __m128i src_pixel1 = _mm_loadu_si128(s++);

   887             __m128i src_pixel2 = _mm_loadu_si128(s++);

   889             // Calculate result r.

   890             __m128i r1 = _mm_srli_epi32(src_pixel1,

   891                                         SK_R32_SHIFT + (8 - SK_R16_BITS));

   892             r1 = _mm_and_si128(r1, r16_mask);

   893             __m128i r2 = _mm_srli_epi32(src_pixel2,

   894                                         SK_R32_SHIFT + (8 - SK_R16_BITS));

   895             r2 = _mm_and_si128(r2, r16_mask);

   896             __m128i r = _mm_packs_epi32(r1, r2);

   898             // Calculate result g.

   899             __m128i g1 = _mm_srli_epi32(src_pixel1,

   900                                         SK_G32_SHIFT + (8 - SK_G16_BITS));

   901             g1 = _mm_and_si128(g1, g16_mask);

   902             __m128i g2 = _mm_srli_epi32(src_pixel2,

   903                                         SK_G32_SHIFT + (8 - SK_G16_BITS));

   904             g2 = _mm_and_si128(g2, g16_mask);

   905             __m128i g = _mm_packs_epi32(g1, g2);

   907             // Calculate result b.

   908             __m128i b1 = _mm_srli_epi32(src_pixel1,

   909                                         SK_B32_SHIFT + (8 - SK_B16_BITS));

   910             b1 = _mm_and_si128(b1, b16_mask);

   911             __m128i b2 = _mm_srli_epi32(src_pixel2,

   912                                         SK_B32_SHIFT + (8 - SK_B16_BITS));

   913             b2 = _mm_and_si128(b2, b16_mask);

   914             __m128i b = _mm_packs_epi32(b1, b2);

   916             // Store 8 16-bit colors in dst.

   917             __m128i d_pixel = SkPackRGB16_SSE(r, g, b);

   918             _mm_store_si128(d++, d_pixel);

   919             count -= 8;

   920         }

   921         src = reinterpret_cast<const SkPMColor*>(s);

   922         dst = reinterpret_cast<uint16_t*>(d);

   923     }

   925     if (count > 0) {

   926         do {

   927             SkPMColor c = *src++;

   928             SkPMColorAssert(c);

   929             *dst++ = SkPixel32ToPixel16_ToU16(c);

   930         } while (--count != 0);

   931     }

   932 }

   934 /* SSE2 version of S32A_D565_Opaque()

   935  * portable version is in core/SkBlitRow_D16.cpp

   936  */

   937 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

   938                            const SkPMColor* SK_RESTRICT src,

   939                            int count, U8CPU alpha, int /*x*/, int /*y*/) {

   940     SkASSERT(255 == alpha);

   942     if (count <= 0) {

   943         return;

   944     }

   946     if (count >= 8) {

   947         // Make dst 16 bytes alignment

   948         while (((size_t)dst & 0x0F) != 0) {

   949             SkPMColor c = *src++;

   950             if (c) {

   951               *dst = SkSrcOver32To16(c, *dst);

   952             }

   953             dst += 1;

   954             count--;

   955         }

   957         const __m128i* s = reinterpret_cast<const __m128i*>(src);

   958         __m128i* d = reinterpret_cast<__m128i*>(dst);

   959         __m128i var255 = _mm_set1_epi16(255);

   960         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

   961         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

   962         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

   964         while (count >= 8) {

   965             // Load 8 pixels of src.

   966             __m128i src_pixel1 = _mm_loadu_si128(s++);

   967             __m128i src_pixel2 = _mm_loadu_si128(s++);

   969             // Check whether src pixels are equal to 0 and get the highest bit

   970             // of each byte of result, if src pixels are all zero, src_cmp1 and

   971             // src_cmp2 will be 0xFFFF.

   972             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,

   973                                              _mm_setzero_si128()));

   974             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,

   975                                              _mm_setzero_si128()));

   976             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {

   977                 d++;

   978                 count -= 8;

   979                 continue;

   980             }

   982             // Load 8 pixels of dst.

   983             __m128i dst_pixel = _mm_load_si128(d);

   985             // Extract A from src.

   986             __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));

   987             sa1 = _mm_srli_epi32(sa1, 24);

   988             __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));

   989             sa2 = _mm_srli_epi32(sa2, 24);

   990             __m128i sa = _mm_packs_epi32(sa1, sa2);

   992             // Extract R from src.

   993             __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));

   994             sr1 = _mm_srli_epi32(sr1, 24);

   995             __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));

   996             sr2 = _mm_srli_epi32(sr2, 24);

   997             __m128i sr = _mm_packs_epi32(sr1, sr2);

   999             // Extract G from src.

  1000             __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));

  1001             sg1 = _mm_srli_epi32(sg1, 24);

  1002             __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));

  1003             sg2 = _mm_srli_epi32(sg2, 24);

  1004             __m128i sg = _mm_packs_epi32(sg1, sg2);

  1006             // Extract B from src.

  1007             __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));

  1008             sb1 = _mm_srli_epi32(sb1, 24);

  1009             __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));

  1010             sb2 = _mm_srli_epi32(sb2, 24);

  1011             __m128i sb = _mm_packs_epi32(sb1, sb2);

  1013             // Extract R G B from dst.

  1014             __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);

  1015             dr = _mm_and_si128(dr, r16_mask);

  1016             __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);

  1017             dg = _mm_and_si128(dg, g16_mask);

  1018             __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);

  1019             db = _mm_and_si128(db, b16_mask);

  1021             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa

  1023             // Calculate R G B of result.

  1024             // Original algorithm is in SkSrcOver32To16().

  1025             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));

  1026             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);

  1027             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));

  1028             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);

  1029             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));

  1030             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

  1032             // Pack R G B into 16-bit color.

  1033             __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);

  1035             // Store 8 16-bit colors in dst.

  1036             _mm_store_si128(d++, d_pixel);

  1037             count -= 8;

  1038         }

  1040         src = reinterpret_cast<const SkPMColor*>(s);

  1041         dst = reinterpret_cast<uint16_t*>(d);

  1042     }

  1044     if (count > 0) {

  1045         do {

  1046             SkPMColor c = *src++;

  1047             SkPMColorAssert(c);

  1048             if (c) {

  1049                 *dst = SkSrcOver32To16(c, *dst);

  1050             }

  1051             dst += 1;

  1052         } while (--count != 0);

  1053     }

  1054 }

  1056 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

  1057                                  const SkPMColor* SK_RESTRICT src,

  1058                                  int count, U8CPU alpha, int x, int y) {

  1059     SkASSERT(255 == alpha);

  1061     if (count <= 0) {

  1062         return;

  1063     }

  1065     if (count >= 8) {

  1066         while (((size_t)dst & 0x0F) != 0) {

  1067             DITHER_565_SCAN(y);

  1068             SkPMColor c = *src++;

  1069             SkPMColorAssert(c);

  1071             unsigned dither = DITHER_VALUE(x);

  1072             *dst++ = SkDitherRGB32To565(c, dither);

  1073             DITHER_INC_X(x);

  1074             count--;

  1075         }

  1077         unsigned short dither_value[8];

  1078         __m128i dither;

  1079 #ifdef ENABLE_DITHER_MATRIX_4X4

  1080         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

  1081         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

  1082         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

  1083         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

  1084         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

  1085 #else

  1086         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

  1087         dither_value[0] = dither_value[4] = (dither_scan

  1088                                              >> (((x) & 3) << 2)) & 0xF;

  1089         dither_value[1] = dither_value[5] = (dither_scan

  1090                                              >> (((x + 1) & 3) << 2)) & 0xF;

  1091         dither_value[2] = dither_value[6] = (dither_scan

  1092                                              >> (((x + 2) & 3) << 2)) & 0xF;

  1093         dither_value[3] = dither_value[7] = (dither_scan

  1094                                              >> (((x + 3) & 3) << 2)) & 0xF;

  1095 #endif

  1096         dither = _mm_loadu_si128((__m128i*) dither_value);

  1098         const __m128i* s = reinterpret_cast<const __m128i*>(src);

  1099         __m128i* d = reinterpret_cast<__m128i*>(dst);

  1101         while (count >= 8) {

  1102             // Load 8 pixels of src.

  1103             __m128i src_pixel1 = _mm_loadu_si128(s++);

  1104             __m128i src_pixel2 = _mm_loadu_si128(s++);

  1106             // Extract R from src.

  1107             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

  1108             sr1 = _mm_srli_epi32(sr1, 24);

  1109             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

  1110             sr2 = _mm_srli_epi32(sr2, 24);

  1111             __m128i sr = _mm_packs_epi32(sr1, sr2);

  1113             // SkDITHER_R32To565(sr, dither)

  1114             __m128i sr_offset = _mm_srli_epi16(sr, 5);

  1115             sr = _mm_add_epi16(sr, dither);

  1116             sr = _mm_sub_epi16(sr, sr_offset);

  1117             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

  1119             // Extract G from src.

  1120             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

  1121             sg1 = _mm_srli_epi32(sg1, 24);

  1122             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

  1123             sg2 = _mm_srli_epi32(sg2, 24);

  1124             __m128i sg = _mm_packs_epi32(sg1, sg2);

  1126             // SkDITHER_R32To565(sg, dither)

  1127             __m128i sg_offset = _mm_srli_epi16(sg, 6);

  1128             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));

  1129             sg = _mm_sub_epi16(sg, sg_offset);

  1130             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

  1132             // Extract B from src.

  1133             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

  1134             sb1 = _mm_srli_epi32(sb1, 24);

  1135             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

  1136             sb2 = _mm_srli_epi32(sb2, 24);

  1137             __m128i sb = _mm_packs_epi32(sb1, sb2);

  1139             // SkDITHER_R32To565(sb, dither)

  1140             __m128i sb_offset = _mm_srli_epi16(sb, 5);

  1141             sb = _mm_add_epi16(sb, dither);

  1142             sb = _mm_sub_epi16(sb, sb_offset);

  1143             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

  1145             // Pack and store 16-bit dst pixel.

  1146             __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);

  1147             _mm_store_si128(d++, d_pixel);

  1149             count -= 8;

  1150             x += 8;

  1151         }

  1153         src = reinterpret_cast<const SkPMColor*>(s);

  1154         dst = reinterpret_cast<uint16_t*>(d);

  1155     }

  1157     if (count > 0) {

  1158         DITHER_565_SCAN(y);

  1159         do {

  1160             SkPMColor c = *src++;

  1161             SkPMColorAssert(c);

  1163             unsigned dither = DITHER_VALUE(x);

  1164             *dst++ = SkDitherRGB32To565(c, dither);

  1165             DITHER_INC_X(x);

  1166         } while (--count != 0);

  1167     }

  1168 }

  1170 /* SSE2 version of S32A_D565_Opaque_Dither()

  1171  * portable version is in core/SkBlitRow_D16.cpp

  1172  */

  1173 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

  1174                                   const SkPMColor* SK_RESTRICT src,

  1175                                   int count, U8CPU alpha, int x, int y) {

  1176     SkASSERT(255 == alpha);

  1178     if (count <= 0) {

  1179         return;

  1180     }

  1182     if (count >= 8) {

  1183         while (((size_t)dst & 0x0F) != 0) {

  1184             DITHER_565_SCAN(y);

  1185             SkPMColor c = *src++;

  1186             SkPMColorAssert(c);

  1187             if (c) {

  1188                 unsigned a = SkGetPackedA32(c);

  1190                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

  1192                 unsigned sr = SkGetPackedR32(c);

  1193                 unsigned sg = SkGetPackedG32(c);

  1194                 unsigned sb = SkGetPackedB32(c);

  1195                 sr = SkDITHER_R32_FOR_565(sr, d);

  1196                 sg = SkDITHER_G32_FOR_565(sg, d);

  1197                 sb = SkDITHER_B32_FOR_565(sb, d);

  1199                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);

  1200                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);

  1201                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

  1202                 // now src and dst expanded are in g:11 r:10 x:1 b:10

  1203                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

  1204             }

  1205             dst += 1;

  1206             DITHER_INC_X(x);

  1207             count--;

  1208         }

  1210         unsigned short dither_value[8];

  1211         __m128i dither, dither_cur;

  1212 #ifdef ENABLE_DITHER_MATRIX_4X4

  1213         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

  1214         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

  1215         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

  1216         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

  1217         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

  1218 #else

  1219         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

  1220         dither_value[0] = dither_value[4] = (dither_scan

  1221                                              >> (((x) & 3) << 2)) & 0xF;

  1222         dither_value[1] = dither_value[5] = (dither_scan

  1223                                              >> (((x + 1) & 3) << 2)) & 0xF;

  1224         dither_value[2] = dither_value[6] = (dither_scan

  1225                                              >> (((x + 2) & 3) << 2)) & 0xF;

  1226         dither_value[3] = dither_value[7] = (dither_scan

  1227                                              >> (((x + 3) & 3) << 2)) & 0xF;

  1228 #endif

  1229         dither = _mm_loadu_si128((__m128i*) dither_value);

  1231         const __m128i* s = reinterpret_cast<const __m128i*>(src);

  1232         __m128i* d = reinterpret_cast<__m128i*>(dst);

  1233         __m128i var256 = _mm_set1_epi16(256);

  1234         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

  1235         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

  1236         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

  1238         while (count >= 8) {

  1239             // Load 8 pixels of src and dst.

  1240             __m128i src_pixel1 = _mm_loadu_si128(s++);

  1241             __m128i src_pixel2 = _mm_loadu_si128(s++);

  1242             __m128i dst_pixel = _mm_load_si128(d);

  1244             // Extract A from src.

  1245             __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));

  1246             sa1 = _mm_srli_epi32(sa1, 24);

  1247             __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));

  1248             sa2 = _mm_srli_epi32(sa2, 24);

  1249             __m128i sa = _mm_packs_epi32(sa1, sa2);

  1251             // Calculate current dither value.

  1252             dither_cur = _mm_mullo_epi16(dither,

  1253                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));

  1254             dither_cur = _mm_srli_epi16(dither_cur, 8);

  1256             // Extract R from src.

  1257             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

  1258             sr1 = _mm_srli_epi32(sr1, 24);

  1259             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

  1260             sr2 = _mm_srli_epi32(sr2, 24);

  1261             __m128i sr = _mm_packs_epi32(sr1, sr2);

  1263             // SkDITHER_R32_FOR_565(sr, d)

  1264             __m128i sr_offset = _mm_srli_epi16(sr, 5);

  1265             sr = _mm_add_epi16(sr, dither_cur);

  1266             sr = _mm_sub_epi16(sr, sr_offset);

  1268             // Expand sr.

  1269             sr = _mm_slli_epi16(sr, 2);

  1271             // Extract G from src.

  1272             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

  1273             sg1 = _mm_srli_epi32(sg1, 24);

  1274             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

  1275             sg2 = _mm_srli_epi32(sg2, 24);

  1276             __m128i sg = _mm_packs_epi32(sg1, sg2);

  1278             // sg = SkDITHER_G32_FOR_565(sg, d).

  1279             __m128i sg_offset = _mm_srli_epi16(sg, 6);

  1280             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));

  1281             sg = _mm_sub_epi16(sg, sg_offset);

  1283             // Expand sg.

  1284             sg = _mm_slli_epi16(sg, 3);

  1286             // Extract B from src.

  1287             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

  1288             sb1 = _mm_srli_epi32(sb1, 24);

  1289             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

  1290             sb2 = _mm_srli_epi32(sb2, 24);

  1291             __m128i sb = _mm_packs_epi32(sb1, sb2);

  1293             // sb = SkDITHER_B32_FOR_565(sb, d).

  1294             __m128i sb_offset = _mm_srli_epi16(sb, 5);

  1295             sb = _mm_add_epi16(sb, dither_cur);

  1296             sb = _mm_sub_epi16(sb, sb_offset);

  1298             // Expand sb.

  1299             sb = _mm_slli_epi16(sb, 2);

  1301             // Extract R G B from dst.

  1302             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);

  1303             dr = _mm_and_si128(dr, r16_mask);

  1304             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);

  1305             dg = _mm_and_si128(dg, g16_mask);

  1306             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);

  1307             db = _mm_and_si128(db, b16_mask);

  1309             // SkAlpha255To256(255 - a) >> 3

  1310             __m128i isa = _mm_sub_epi16(var256, sa);

  1311             isa = _mm_srli_epi16(isa, 3);

  1313             dr = _mm_mullo_epi16(dr, isa);

  1314             dr = _mm_add_epi16(dr, sr);

  1315             dr = _mm_srli_epi16(dr, 5);

  1317             dg = _mm_mullo_epi16(dg, isa);

  1318             dg = _mm_add_epi16(dg, sg);

  1319             dg = _mm_srli_epi16(dg, 5);

  1321             db = _mm_mullo_epi16(db, isa);

  1322             db = _mm_add_epi16(db, sb);

  1323             db = _mm_srli_epi16(db, 5);

  1325             // Package and store dst pixel.

  1326             __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);

  1327             _mm_store_si128(d++, d_pixel);

  1329             count -= 8;

  1330             x += 8;

  1331         }

  1333         src = reinterpret_cast<const SkPMColor*>(s);

  1334         dst = reinterpret_cast<uint16_t*>(d);

  1335     }

  1337     if (count > 0) {

  1338         DITHER_565_SCAN(y);

  1339         do {

  1340             SkPMColor c = *src++;

  1341             SkPMColorAssert(c);

  1342             if (c) {

  1343                 unsigned a = SkGetPackedA32(c);

  1345                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

  1347                 unsigned sr = SkGetPackedR32(c);

  1348                 unsigned sg = SkGetPackedG32(c);

  1349                 unsigned sb = SkGetPackedB32(c);

  1350                 sr = SkDITHER_R32_FOR_565(sr, d);

  1351                 sg = SkDITHER_G32_FOR_565(sg, d);

  1352                 sb = SkDITHER_B32_FOR_565(sb, d);

  1354                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);

  1355                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);

  1356                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

  1357                 // now src and dst expanded are in g:11 r:10 x:1 b:10

  1358                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

  1359             }

  1360             dst += 1;

  1361             DITHER_INC_X(x);

  1362         } while (--count != 0);

  1363     }

  1364 }

The Tor Browser / file revision

gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp@129ffea94266

gfx/skia/trunk/src/opts/SkBlitRow_opts_SSE2.cpp