The Tor Browser: gfx/skia/trunk/src/opts/SkBlitRect_opts

Conditionally enable double-key logic when private browsing mode or the
privacy.thirdparty.isolate preference is active, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
One reservation remains: how to convince FindCookie users to test the
condition and pass a nullptr when double-key logic is disabled.

     1 /*

     2  * Copyright 2011 Google Inc.

     3  *

     4  * Use of this source code is governed by a BSD-style license that can be

     5  * found in the LICENSE file.

     6  */

     8 #include "SkBlitRect_opts_SSE2.h"

     9 #include "SkBlitRow.h"

    10 #include "SkColorPriv.h"

    12 #include <emmintrin.h>

    14 /** Simple blitting of opaque rectangles less than 31 pixels wide:

    15     inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.

    16 */

    17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,

    18                                   int width, int height,

    19                                   size_t rowBytes, uint32_t color) {

    20     SkASSERT(255 == SkGetPackedA32(color));

    21     SkASSERT(width > 0);

    22     SkASSERT(width < 31);

    24     while (--height >= 0) {

    25         SkPMColor* dst = destination;

    26         int count = width;

    28         while (count > 4) {

    29             *dst++ = color;

    30             *dst++ = color;

    31             *dst++ = color;

    32             *dst++ = color;

    33             count -= 4;

    34         }

    36         while (count > 0) {

    37             *dst++ = color;

    38             --count;

    39         }

    41         destination = (uint32_t*)((char*)destination + rowBytes);

    42     }

    43 }

    45 /**

    46   Fast blitting of opaque rectangles at least 31 pixels wide:

    47   inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.

    48   A 31 pixel rectangle is guaranteed to have at least one

    49   16-pixel aligned span that can take advantage of mm_store.

    50 */

    51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,

    52                                 int width, int height,

    53                                 size_t rowBytes, uint32_t color) {

    54     SkASSERT(255 == SkGetPackedA32(color));

    55     SkASSERT(width >= 31);

    57     __m128i color_wide = _mm_set1_epi32(color);

    58     while (--height >= 0) {

    59         // Prefetching one row ahead to L1 cache can equal hardware

    60         // performance for large/tall rects, but never *beats*

    61         // hardware performance.

    62         SkPMColor* dst = destination;

    63         int count = width;

    65         while (((size_t)dst) & 0x0F) {

    66             *dst++ = color;

    67             --count;

    68         }

    69         __m128i *d = reinterpret_cast<__m128i*>(dst);

    71         // Googling suggests _mm_stream is only going to beat _mm_store

    72         // for things that wouldn't fit in L2 cache anyway, typically

    73         // >500kB, and precisely fill cache lines.  For us, with

    74         // arrays > 100k elements _mm_stream is still 100%+ slower than

    75         // mm_store.

    77         // Unrolling to count >= 64 is a break-even for most

    78         // input patterns; we seem to be saturating the bus and having

    79         // low enough overhead at 32.

    81         while (count >= 32) {

    82             _mm_store_si128(d++, color_wide);

    83             _mm_store_si128(d++, color_wide);

    84             _mm_store_si128(d++, color_wide);

    85             _mm_store_si128(d++, color_wide);

    86             _mm_store_si128(d++, color_wide);

    87             _mm_store_si128(d++, color_wide);

    88             _mm_store_si128(d++, color_wide);

    89             _mm_store_si128(d++, color_wide);

    90             count -= 32;

    91         }

    92         if (count >= 16) {

    93             _mm_store_si128(d++, color_wide);

    94             _mm_store_si128(d++, color_wide);

    95             _mm_store_si128(d++, color_wide);

    96             _mm_store_si128(d++, color_wide);

    97             count -= 16;

    98         }

    99         dst = reinterpret_cast<uint32_t*>(d);

   101         // Unrolling the loop in the Narrow code is a significant performance

   102         // gain, but unrolling this loop appears to make no difference in

   103         // benchmarks with either mm_store_si128 or individual sets.

   105         while (count > 0) {

   106             *dst++ = color;

   107             --count;

   108         }

   110         destination = (uint32_t*)((char*)destination + rowBytes);

   111     }

   112 }

   114 void ColorRect32_SSE2(SkPMColor* destination,

   115                       int width, int height,

   116                       size_t rowBytes, uint32_t color) {

   117     if (0 == height || 0 == width || 0 == color) {

   118         return;

   119     }

   120     unsigned colorA = SkGetPackedA32(color);

   121     colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).

   122     if (255 == colorA) {

   123         if (width < 31) {

   124             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,

   125                                          rowBytes, color);

   126         } else {

   127             BlitRect32_OpaqueWide_SSE2(destination, width, height,

   128                                        rowBytes, color);

   129         }

   130     } else {

   131         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);

   132     }

   133 }

The Tor Browser / file revision

gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp@129ffea94266

gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp