gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /*
     2  * Copyright 2011 Google Inc.
     3  *
     4  * Use of this source code is governed by a BSD-style license that can be
     5  * found in the LICENSE file.
     6  */
     8 #include "SkBlitRect_opts_SSE2.h"
     9 #include "SkBlitRow.h"
    10 #include "SkColorPriv.h"
    12 #include <emmintrin.h>
    14 /** Simple blitting of opaque rectangles less than 31 pixels wide:
    15     inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    16 */
    17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
    18                                   int width, int height,
    19                                   size_t rowBytes, uint32_t color) {
    20     SkASSERT(255 == SkGetPackedA32(color));
    21     SkASSERT(width > 0);
    22     SkASSERT(width < 31);
    24     while (--height >= 0) {
    25         SkPMColor* dst = destination;
    26         int count = width;
    28         while (count > 4) {
    29             *dst++ = color;
    30             *dst++ = color;
    31             *dst++ = color;
    32             *dst++ = color;
    33             count -= 4;
    34         }
    36         while (count > 0) {
    37             *dst++ = color;
    38             --count;
    39         }
    41         destination = (uint32_t*)((char*)destination + rowBytes);
    42     }
    43 }
    45 /**
    46   Fast blitting of opaque rectangles at least 31 pixels wide:
    47   inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    48   A 31 pixel rectangle is guaranteed to have at least one
    49   16-pixel aligned span that can take advantage of mm_store.
    50 */
    51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
    52                                 int width, int height,
    53                                 size_t rowBytes, uint32_t color) {
    54     SkASSERT(255 == SkGetPackedA32(color));
    55     SkASSERT(width >= 31);
    57     __m128i color_wide = _mm_set1_epi32(color);
    58     while (--height >= 0) {
    59         // Prefetching one row ahead to L1 cache can equal hardware
    60         // performance for large/tall rects, but never *beats*
    61         // hardware performance.
    62         SkPMColor* dst = destination;
    63         int count = width;
    65         while (((size_t)dst) & 0x0F) {
    66             *dst++ = color;
    67             --count;
    68         }
    69         __m128i *d = reinterpret_cast<__m128i*>(dst);
    71         // Googling suggests _mm_stream is only going to beat _mm_store
    72         // for things that wouldn't fit in L2 cache anyway, typically
    73         // >500kB, and precisely fill cache lines.  For us, with
    74         // arrays > 100k elements _mm_stream is still 100%+ slower than
    75         // mm_store.
    77         // Unrolling to count >= 64 is a break-even for most
    78         // input patterns; we seem to be saturating the bus and having
    79         // low enough overhead at 32.
    81         while (count >= 32) {
    82             _mm_store_si128(d++, color_wide);
    83             _mm_store_si128(d++, color_wide);
    84             _mm_store_si128(d++, color_wide);
    85             _mm_store_si128(d++, color_wide);
    86             _mm_store_si128(d++, color_wide);
    87             _mm_store_si128(d++, color_wide);
    88             _mm_store_si128(d++, color_wide);
    89             _mm_store_si128(d++, color_wide);
    90             count -= 32;
    91         }
    92         if (count >= 16) {
    93             _mm_store_si128(d++, color_wide);
    94             _mm_store_si128(d++, color_wide);
    95             _mm_store_si128(d++, color_wide);
    96             _mm_store_si128(d++, color_wide);
    97             count -= 16;
    98         }
    99         dst = reinterpret_cast<uint32_t*>(d);
   101         // Unrolling the loop in the Narrow code is a significant performance
   102         // gain, but unrolling this loop appears to make no difference in
   103         // benchmarks with either mm_store_si128 or individual sets.
   105         while (count > 0) {
   106             *dst++ = color;
   107             --count;
   108         }
   110         destination = (uint32_t*)((char*)destination + rowBytes);
   111     }
   112 }
   114 void ColorRect32_SSE2(SkPMColor* destination,
   115                       int width, int height,
   116                       size_t rowBytes, uint32_t color) {
   117     if (0 == height || 0 == width || 0 == color) {
   118         return;
   119     }
   120     unsigned colorA = SkGetPackedA32(color);
   121     colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
   122     if (255 == colorA) {
   123         if (width < 31) {
   124             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
   125                                          rowBytes, color);
   126         } else {
   127             BlitRect32_OpaqueWide_SSE2(destination, width, height,
   128                                        rowBytes, color);
   129         }
   130     } else {
   131         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
   132     }
   133 }

mercurial