gfx/skia/trunk/src/opts/SkBitmapProcState_opts_SSE2.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     2 /*
     3  * Copyright 2009 The Android Open Source Project
     4  *
     5  * Use of this source code is governed by a BSD-style license that can be
     6  * found in the LICENSE file.
     7  */
    10 #include <emmintrin.h>
    11 #include "SkBitmapProcState_opts_SSE2.h"
    12 #include "SkPaint.h"
    13 #include "SkUtils.h"
    15 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
    16                                    const uint32_t* xy,
    17                                    int count, uint32_t* colors) {
    18     SkASSERT(count > 0 && colors != NULL);
    19     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
    20     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    21     SkASSERT(s.fAlphaScale == 256);
    23     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    24     size_t rb = s.fBitmap->rowBytes();
    25     uint32_t XY = *xy++;
    26     unsigned y0 = XY >> 14;
    27     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    28     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    29     unsigned subY = y0 & 0xF;
    31     // ( 0,  0,  0,  0,  0,  0,  0, 16)
    32     __m128i sixteen = _mm_cvtsi32_si128(16);
    34     // ( 0,  0,  0,  0, 16, 16, 16, 16)
    35     sixteen = _mm_shufflelo_epi16(sixteen, 0);
    37     // ( 0,  0,  0,  0,  0,  0,  0,  y)
    38     __m128i allY = _mm_cvtsi32_si128(subY);
    40     // ( 0,  0,  0,  0,  y,  y,  y,  y)
    41     allY = _mm_shufflelo_epi16(allY, 0);
    43     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
    44     __m128i negY = _mm_sub_epi16(sixteen, allY);
    46     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    47     allY = _mm_unpacklo_epi64(allY, negY);
    49     // (16, 16, 16, 16, 16, 16, 16, 16 )
    50     sixteen = _mm_shuffle_epi32(sixteen, 0);
    52     // ( 0,  0,  0,  0,  0,  0,  0,  0)
    53     __m128i zero = _mm_setzero_si128();
    54     do {
    55         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
    56         unsigned x0 = XX >> 18;
    57         unsigned x1 = XX & 0x3FFF;
    59         // (0, 0, 0, 0, 0, 0, 0, x)
    60         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
    62         // (0, 0, 0, 0, x, x, x, x)
    63         allX = _mm_shufflelo_epi16(allX, 0);
    65         // (x, x, x, x, x, x, x, x)
    66         allX = _mm_shuffle_epi32(allX, 0);
    68         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
    69         __m128i negX = _mm_sub_epi16(sixteen, allX);
    71         // Load 4 samples (pixels).
    72         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
    73         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
    74         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
    75         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
    77         // (0, 0, a00, a10)
    78         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
    80         // Expand to 16 bits per component.
    81         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
    83         // ((a00 * (16-y)), (a10 * y)).
    84         a00a10 = _mm_mullo_epi16(a00a10, allY);
    86         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
    87         a00a10 = _mm_mullo_epi16(a00a10, negX);
    89         // (0, 0, a01, a10)
    90         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
    92         // Expand to 16 bits per component.
    93         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
    95         // (a01 * (16-y)), (a11 * y)
    96         a01a11 = _mm_mullo_epi16(a01a11, allY);
    98         // (a01 * (16-y) * x), (a11 * y * x)
    99         a01a11 = _mm_mullo_epi16(a01a11, allX);
   101         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   102         __m128i sum = _mm_add_epi16(a00a10, a01a11);
   104         // (DC, a00*w00 + a01*w01)
   105         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   107         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   108         sum = _mm_add_epi16(sum, shifted);
   110         // Divide each 16 bit component by 256.
   111         sum = _mm_srli_epi16(sum, 8);
   113         // Pack lower 4 16 bit values of sum into lower 4 bytes.
   114         sum = _mm_packus_epi16(sum, zero);
   116         // Extract low int and store.
   117         *colors++ = _mm_cvtsi128_si32(sum);
   118     } while (--count > 0);
   119 }
   121 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
   122                                   const uint32_t* xy,
   123                                   int count, uint32_t* colors) {
   124     SkASSERT(count > 0 && colors != NULL);
   125     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
   126     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
   127     SkASSERT(s.fAlphaScale < 256);
   129     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
   130     size_t rb = s.fBitmap->rowBytes();
   131     uint32_t XY = *xy++;
   132     unsigned y0 = XY >> 14;
   133     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
   134     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
   135     unsigned subY = y0 & 0xF;
   137     // ( 0,  0,  0,  0,  0,  0,  0, 16)
   138     __m128i sixteen = _mm_cvtsi32_si128(16);
   140     // ( 0,  0,  0,  0, 16, 16, 16, 16)
   141     sixteen = _mm_shufflelo_epi16(sixteen, 0);
   143     // ( 0,  0,  0,  0,  0,  0,  0,  y)
   144     __m128i allY = _mm_cvtsi32_si128(subY);
   146     // ( 0,  0,  0,  0,  y,  y,  y,  y)
   147     allY = _mm_shufflelo_epi16(allY, 0);
   149     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
   150     __m128i negY = _mm_sub_epi16(sixteen, allY);
   152     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
   153     allY = _mm_unpacklo_epi64(allY, negY);
   155     // (16, 16, 16, 16, 16, 16, 16, 16 )
   156     sixteen = _mm_shuffle_epi32(sixteen, 0);
   158     // ( 0,  0,  0,  0,  0,  0,  0,  0)
   159     __m128i zero = _mm_setzero_si128();
   161     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
   162     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
   164     do {
   165         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
   166         unsigned x0 = XX >> 18;
   167         unsigned x1 = XX & 0x3FFF;
   169         // (0, 0, 0, 0, 0, 0, 0, x)
   170         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
   172         // (0, 0, 0, 0, x, x, x, x)
   173         allX = _mm_shufflelo_epi16(allX, 0);
   175         // (x, x, x, x, x, x, x, x)
   176         allX = _mm_shuffle_epi32(allX, 0);
   178         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
   179         __m128i negX = _mm_sub_epi16(sixteen, allX);
   181         // Load 4 samples (pixels).
   182         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
   183         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
   184         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
   185         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
   187         // (0, 0, a00, a10)
   188         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
   190         // Expand to 16 bits per component.
   191         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
   193         // ((a00 * (16-y)), (a10 * y)).
   194         a00a10 = _mm_mullo_epi16(a00a10, allY);
   196         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
   197         a00a10 = _mm_mullo_epi16(a00a10, negX);
   199         // (0, 0, a01, a10)
   200         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
   202         // Expand to 16 bits per component.
   203         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
   205         // (a01 * (16-y)), (a11 * y)
   206         a01a11 = _mm_mullo_epi16(a01a11, allY);
   208         // (a01 * (16-y) * x), (a11 * y * x)
   209         a01a11 = _mm_mullo_epi16(a01a11, allX);
   211         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   212         __m128i sum = _mm_add_epi16(a00a10, a01a11);
   214         // (DC, a00*w00 + a01*w01)
   215         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   217         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   218         sum = _mm_add_epi16(sum, shifted);
   220         // Divide each 16 bit component by 256.
   221         sum = _mm_srli_epi16(sum, 8);
   223         // Multiply by alpha.
   224         sum = _mm_mullo_epi16(sum, alpha);
   226         // Divide each 16 bit component by 256.
   227         sum = _mm_srli_epi16(sum, 8);
   229         // Pack lower 4 16 bit values of sum into lower 4 bytes.
   230         sum = _mm_packus_epi16(sum, zero);
   232         // Extract low int and store.
   233         *colors++ = _mm_cvtsi128_si32(sum);
   234     } while (--count > 0);
   235 }
   237 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
   238                                                  SkFixed one) {
   239     unsigned i = SkClampMax(f >> 16, max);
   240     i = (i << 4) | ((f >> 12) & 0xF);
   241     return (i << 14) | SkClampMax((f + one) >> 16, max);
   242 }
   244 /*  SSE version of ClampX_ClampY_filter_scale()
   245  *  portable version is in core/SkBitmapProcState_matrix.h
   246  */
   247 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
   248                                      int count, int x, int y) {
   249     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   250                              SkMatrix::kScale_Mask)) == 0);
   251     SkASSERT(s.fInvKy == 0);
   253     const unsigned maxX = s.fBitmap->width() - 1;
   254     const SkFixed one = s.fFilterOneX;
   255     const SkFixed dx = s.fInvSx;
   256     SkFixed fx;
   258     SkPoint pt;
   259     s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
   260                              SkIntToScalar(y) + SK_ScalarHalf, &pt);
   261     const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
   262     const unsigned maxY = s.fBitmap->height() - 1;
   263     // compute our two Y values up front
   264     *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
   265     // now initialize fx
   266     fx = SkScalarToFixed(pt.fX) - (one >> 1);
   268     // test if we don't need to apply the tile proc
   269     if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
   270         (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
   271         if (count >= 4) {
   272             // SSE version of decal_filter_scale
   273             while ((size_t(xy) & 0x0F) != 0) {
   274                 SkASSERT((fx >> (16 + 14)) == 0);
   275                 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   276                 fx += dx;
   277                 count--;
   278             }
   280             __m128i wide_1    = _mm_set1_epi32(1);
   281             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
   282             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   283                                               fx + dx, fx);
   285             while (count >= 4) {
   286                 __m128i wide_out;
   288                 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
   289                 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
   290                                         _mm_srai_epi32(wide_fx, 16), wide_1));
   292                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
   294                 xy += 4;
   295                 fx += dx * 4;
   296                 wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
   297                 count -= 4;
   298             } // while count >= 4
   299         } // if count >= 4
   301         while (count-- > 0) {
   302             SkASSERT((fx >> (16 + 14)) == 0);
   303             *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   304             fx += dx;
   305         }
   306     } else {
   307         // SSE2 only support 16bit interger max & min, so only process the case
   308         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
   309         // height, there should be rare bitmap whose height will be greater
   310         // than max 16bit interger in the real world.
   311         if ((count >= 4) && (maxX <= 0xFFFF)) {
   312             while (((size_t)xy & 0x0F) != 0) {
   313                 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
   314                 fx += dx;
   315                 count--;
   316             }
   318             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   319                                               fx + dx, fx);
   320             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
   321             __m128i wide_one  = _mm_set1_epi32(one);
   322             __m128i wide_maxX = _mm_set1_epi32(maxX);
   323             __m128i wide_mask = _mm_set1_epi32(0xF);
   325              while (count >= 4) {
   326                 __m128i wide_i;
   327                 __m128i wide_lo;
   328                 __m128i wide_fx1;
   330                 // i = SkClampMax(f>>16,maxX)
   331                 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
   332                                        _mm_setzero_si128());
   333                 wide_i = _mm_min_epi16(wide_i, wide_maxX);
   335                 // i<<4 | TILEX_LOW_BITS(fx)
   336                 wide_lo = _mm_srli_epi32(wide_fx, 12);
   337                 wide_lo = _mm_and_si128(wide_lo, wide_mask);
   338                 wide_i  = _mm_slli_epi32(wide_i, 4);
   339                 wide_i  = _mm_or_si128(wide_i, wide_lo);
   341                 // i<<14
   342                 wide_i = _mm_slli_epi32(wide_i, 14);
   344                 // SkClampMax(((f+one))>>16,max)
   345                 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
   346                 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
   347                                                         _mm_setzero_si128());
   348                 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
   350                 // final combination
   351                 wide_i = _mm_or_si128(wide_i, wide_fx1);
   352                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   354                 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
   355                 fx += dx * 4;
   356                 xy += 4;
   357                 count -= 4;
   358             } // while count >= 4
   359         } // if count >= 4
   361         while (count-- > 0) {
   362             *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
   363             fx += dx;
   364         }
   365     }
   366 }
   368 /*  SSE version of ClampX_ClampY_nofilter_scale()
   369  *  portable version is in core/SkBitmapProcState_matrix.h
   370  */
   371 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
   372                                     uint32_t xy[], int count, int x, int y) {
   373     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   374                              SkMatrix::kScale_Mask)) == 0);
   376     // we store y, x, x, x, x, x
   377     const unsigned maxX = s.fBitmap->width() - 1;
   378     SkFixed fx;
   379     SkPoint pt;
   380     s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
   381                              SkIntToScalar(y) + SK_ScalarHalf, &pt);
   382     fx = SkScalarToFixed(pt.fY);
   383     const unsigned maxY = s.fBitmap->height() - 1;
   384     *xy++ = SkClampMax(fx >> 16, maxY);
   385     fx = SkScalarToFixed(pt.fX);
   387     if (0 == maxX) {
   388         // all of the following X values must be 0
   389         memset(xy, 0, count * sizeof(uint16_t));
   390         return;
   391     }
   393     const SkFixed dx = s.fInvSx;
   395     // test if we don't need to apply the tile proc
   396     if ((unsigned)(fx >> 16) <= maxX &&
   397         (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
   398         // SSE version of decal_nofilter_scale
   399         if (count >= 8) {
   400             while (((size_t)xy & 0x0F) != 0) {
   401                 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
   402                 fx += 2 * dx;
   403                 count -= 2;
   404             }
   406             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
   407             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
   409             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   410                                              fx + dx, fx);
   411             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
   413             while (count >= 8) {
   414                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
   415                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
   417                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
   418                                                       wide_out_high);
   419                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
   421                 wide_low = _mm_add_epi32(wide_low, wide_dx8);
   422                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
   424                 xy += 4;
   425                 fx += dx * 8;
   426                 count -= 8;
   427             }
   428         } // if count >= 8
   430         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
   431         while (count-- > 0) {
   432             *xx++ = SkToU16(fx >> 16);
   433             fx += dx;
   434         }
   435     } else {
   436         // SSE2 only support 16bit interger max & min, so only process the case
   437         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
   438         // height, there should be rare bitmap whose height will be greater
   439         // than max 16bit interger in the real world.
   440         if ((count >= 8) && (maxX <= 0xFFFF)) {
   441             while (((size_t)xy & 0x0F) != 0) {
   442                 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
   443                                         SkClampMax(fx >> 16, maxX));
   444                 fx += 2 * dx;
   445                 count -= 2;
   446             }
   448             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
   449             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
   451             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   452                                              fx + dx, fx);
   453             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
   454             __m128i wide_maxX = _mm_set1_epi32(maxX);
   456             while (count >= 8) {
   457                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
   458                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
   460                 wide_out_low  = _mm_max_epi16(wide_out_low,
   461                                               _mm_setzero_si128());
   462                 wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
   463                 wide_out_high = _mm_max_epi16(wide_out_high,
   464                                               _mm_setzero_si128());
   465                 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
   467                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
   468                                                       wide_out_high);
   469                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
   471                 wide_low  = _mm_add_epi32(wide_low, wide_dx8);
   472                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
   474                 xy += 4;
   475                 fx += dx * 8;
   476                 count -= 8;
   477             }
   478         } // if count >= 8
   480         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
   481         while (count-- > 0) {
   482             *xx++ = SkClampMax(fx >> 16, maxX);
   483             fx += dx;
   484         }
   485     }
   486 }
   488 /*  SSE version of ClampX_ClampY_filter_affine()
   489  *  portable version is in core/SkBitmapProcState_matrix.h
   490  */
   491 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
   492                                       uint32_t xy[], int count, int x, int y) {
   493     SkPoint srcPt;
   494     s.fInvProc(s.fInvMatrix,
   495                SkIntToScalar(x) + SK_ScalarHalf,
   496                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   498     SkFixed oneX = s.fFilterOneX;
   499     SkFixed oneY = s.fFilterOneY;
   500     SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
   501     SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
   502     SkFixed dx = s.fInvSx;
   503     SkFixed dy = s.fInvKy;
   504     unsigned maxX = s.fBitmap->width() - 1;
   505     unsigned maxY = s.fBitmap->height() - 1;
   507     if (count >= 2 && (maxX <= 0xFFFF)) {
   508         SkFixed dx2 = dx + dx;
   509         SkFixed dy2 = dy + dy;
   511         __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
   512         __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
   513         __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
   514         __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
   515         __m128i wide_mask = _mm_set1_epi32(0xF);
   517         while (count >= 2) {
   518             // i = SkClampMax(f>>16,maxX)
   519             __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
   520                                            _mm_setzero_si128());
   521             wide_i = _mm_min_epi16(wide_i, wide_max);
   523             // i<<4 | TILEX_LOW_BITS(f)
   524             __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
   525             wide_lo = _mm_and_si128(wide_lo, wide_mask);
   526             wide_i  = _mm_slli_epi32(wide_i, 4);
   527             wide_i  = _mm_or_si128(wide_i, wide_lo);
   529             // i<<14
   530             wide_i = _mm_slli_epi32(wide_i, 14);
   532             // SkClampMax(((f+one))>>16,max)
   533             __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
   534             wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
   535                                                    _mm_setzero_si128());
   536             wide_f1 = _mm_min_epi16(wide_f1, wide_max);
   538             // final combination
   539             wide_i = _mm_or_si128(wide_i, wide_f1);
   540             _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   542             wide_f = _mm_add_epi32(wide_f, wide_d2);
   544             fx += dx2;
   545             fy += dy2;
   546             xy += 4;
   547             count -= 2;
   548         } // while count >= 2
   549     } // if count >= 2
   551     while (count-- > 0) {
   552         *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
   553         fy += dy;
   554         *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
   555         fx += dx;
   556     }
   557 }
   559 /*  SSE version of ClampX_ClampY_nofilter_affine()
   560  *  portable version is in core/SkBitmapProcState_matrix.h
   561  */
   562 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
   563                                       uint32_t xy[], int count, int x, int y) {
   564     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
   565     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   566                              SkMatrix::kScale_Mask |
   567                              SkMatrix::kAffine_Mask)) == 0);
   569     SkPoint srcPt;
   570     s.fInvProc(s.fInvMatrix,
   571                SkIntToScalar(x) + SK_ScalarHalf,
   572                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   574     SkFixed fx = SkScalarToFixed(srcPt.fX);
   575     SkFixed fy = SkScalarToFixed(srcPt.fY);
   576     SkFixed dx = s.fInvSx;
   577     SkFixed dy = s.fInvKy;
   578     int maxX = s.fBitmap->width() - 1;
   579     int maxY = s.fBitmap->height() - 1;
   581     if (count >= 4 && (maxX <= 0xFFFF)) {
   582         while (((size_t)xy & 0x0F) != 0) {
   583             *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
   584                                   SkClampMax(fx >> 16, maxX);
   585             fx += dx;
   586             fy += dy;
   587             count--;
   588         }
   590         SkFixed dx4 = dx * 4;
   591         SkFixed dy4 = dy * 4;
   593         __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   594                                           fx + dx, fx);
   595         __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
   596                                           fy + dy, fy);
   597         __m128i wide_dx4  = _mm_set1_epi32(dx4);
   598         __m128i wide_dy4  = _mm_set1_epi32(dy4);
   600         __m128i wide_maxX = _mm_set1_epi32(maxX);
   601         __m128i wide_maxY = _mm_set1_epi32(maxY);
   603         while (count >= 4) {
   604             // SkClampMax(fx>>16,maxX)
   605             __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
   606                                             _mm_setzero_si128());
   607             wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
   609             // SkClampMax(fy>>16,maxY)
   610             __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
   611                                             _mm_setzero_si128());
   612             wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
   614             // final combination
   615             __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
   616                                           wide_lo);
   617             _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   619             wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
   620             wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
   622             fx += dx4;
   623             fy += dy4;
   624             xy += 4;
   625             count -= 4;
   626         } // while count >= 4
   627     } // if count >= 4
   629     while (count-- > 0) {
   630         *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
   631                               SkClampMax(fx >> 16, maxX);
   632         fx += dx;
   633         fy += dy;
   634     }
   635 }
   637 /*  SSE version of S32_D16_filter_DX_SSE2
   638  *  Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
   639  *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
   640  */
   641 void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
   642                                    const uint32_t* xy,
   643                                    int count, uint16_t* colors) {
   644     SkASSERT(count > 0 && colors != NULL);
   645     SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
   646     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
   647     SkASSERT(s.fBitmap->isOpaque());
   649     SkPMColor dstColor;
   650     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
   651     size_t rb = s.fBitmap->rowBytes();
   652     uint32_t XY = *xy++;
   653     unsigned y0 = XY >> 14;
   654     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
   655     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
   656     unsigned subY = y0 & 0xF;
   658     // ( 0,  0,  0,  0,  0,  0,  0, 16)
   659     __m128i sixteen = _mm_cvtsi32_si128(16);
   661     // ( 0,  0,  0,  0, 16, 16, 16, 16)
   662     sixteen = _mm_shufflelo_epi16(sixteen, 0);
   664     // ( 0,  0,  0,  0,  0,  0,  0,  y)
   665     __m128i allY = _mm_cvtsi32_si128(subY);
   667     // ( 0,  0,  0,  0,  y,  y,  y,  y)
   668     allY = _mm_shufflelo_epi16(allY, 0);
   670     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
   671     __m128i negY = _mm_sub_epi16(sixteen, allY);
   673     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
   674     allY = _mm_unpacklo_epi64(allY, negY);
   676     // (16, 16, 16, 16, 16, 16, 16, 16 )
   677     sixteen = _mm_shuffle_epi32(sixteen, 0);
   679     // ( 0,  0,  0,  0,  0,  0,  0,  0)
   680     __m128i zero = _mm_setzero_si128();
   682     do {
   683         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
   684         unsigned x0 = XX >> 18;
   685         unsigned x1 = XX & 0x3FFF;
   687         // (0, 0, 0, 0, 0, 0, 0, x)
   688         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
   690         // (0, 0, 0, 0, x, x, x, x)
   691         allX = _mm_shufflelo_epi16(allX, 0);
   693         // (x, x, x, x, x, x, x, x)
   694         allX = _mm_shuffle_epi32(allX, 0);
   696         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
   697         __m128i negX = _mm_sub_epi16(sixteen, allX);
   699         // Load 4 samples (pixels).
   700         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
   701         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
   702         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
   703         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
   705         // (0, 0, a00, a10)
   706         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
   708         // Expand to 16 bits per component.
   709         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
   711         // ((a00 * (16-y)), (a10 * y)).
   712         a00a10 = _mm_mullo_epi16(a00a10, allY);
   714         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
   715         a00a10 = _mm_mullo_epi16(a00a10, negX);
   717         // (0, 0, a01, a10)
   718         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
   720         // Expand to 16 bits per component.
   721         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
   723         // (a01 * (16-y)), (a11 * y)
   724         a01a11 = _mm_mullo_epi16(a01a11, allY);
   726         // (a01 * (16-y) * x), (a11 * y * x)
   727         a01a11 = _mm_mullo_epi16(a01a11, allX);
   729         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   730         __m128i sum = _mm_add_epi16(a00a10, a01a11);
   732         // (DC, a00*w00 + a01*w01)
   733         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   735         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   736         sum = _mm_add_epi16(sum, shifted);
   738         // Divide each 16 bit component by 256.
   739         sum = _mm_srli_epi16(sum, 8);
   741         // Pack lower 4 16 bit values of sum into lower 4 bytes.
   742         sum = _mm_packus_epi16(sum, zero);
   744         // Extract low int and store.
   745         dstColor = _mm_cvtsi128_si32(sum);
   747         //*colors++ = SkPixel32ToPixel16(dstColor);
   748         // below is much faster than the above. It's tested for Android benchmark--Softweg
   749         __m128i _m_temp1 = _mm_set1_epi32(dstColor);
   750         __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
   752         unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
   753         unsigned r = (r32 & ((1<<5) -1)) << 11;
   755         _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
   756         unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
   757         unsigned g = (g32 & ((1<<6) -1)) << 5;
   759         _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
   760         unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
   761         unsigned b = (b32 & ((1<<5) -1));
   763         *colors++ = r | g | b;
   765     } while (--count > 0);
   766 }

mercurial