gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic, depending on private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts. Some reservations
remain about how to convince FindCookie callers to test the condition and
pass a nullptr when double-key logic is disabled.

michael@0 1 /*
michael@0 2 * Copyright 2011 Google Inc.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license that can be
michael@0 5 * found in the LICENSE file.
michael@0 6 */
michael@0 7
michael@0 8 #include "SkBlitRect_opts_SSE2.h"
michael@0 9 #include "SkBlitRow.h"
michael@0 10 #include "SkColorPriv.h"
michael@0 11
michael@0 12 #include <emmintrin.h>
michael@0 13
michael@0 14 /** Simple blitting of opaque rectangles less than 31 pixels wide:
michael@0 15 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
michael@0 16 */
michael@0 17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
michael@0 18 int width, int height,
michael@0 19 size_t rowBytes, uint32_t color) {
michael@0 20 SkASSERT(255 == SkGetPackedA32(color));
michael@0 21 SkASSERT(width > 0);
michael@0 22 SkASSERT(width < 31);
michael@0 23
michael@0 24 while (--height >= 0) {
michael@0 25 SkPMColor* dst = destination;
michael@0 26 int count = width;
michael@0 27
michael@0 28 while (count > 4) {
michael@0 29 *dst++ = color;
michael@0 30 *dst++ = color;
michael@0 31 *dst++ = color;
michael@0 32 *dst++ = color;
michael@0 33 count -= 4;
michael@0 34 }
michael@0 35
michael@0 36 while (count > 0) {
michael@0 37 *dst++ = color;
michael@0 38 --count;
michael@0 39 }
michael@0 40
michael@0 41 destination = (uint32_t*)((char*)destination + rowBytes);
michael@0 42 }
michael@0 43 }
michael@0 44
michael@0 45 /**
michael@0 46 Fast blitting of opaque rectangles at least 31 pixels wide:
michael@0 47 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
michael@0 48 A 31 pixel rectangle is guaranteed to have at least one
michael@0 49 16-pixel aligned span that can take advantage of mm_store.
michael@0 50 */
michael@0 51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
michael@0 52 int width, int height,
michael@0 53 size_t rowBytes, uint32_t color) {
michael@0 54 SkASSERT(255 == SkGetPackedA32(color));
michael@0 55 SkASSERT(width >= 31);
michael@0 56
michael@0 57 __m128i color_wide = _mm_set1_epi32(color);
michael@0 58 while (--height >= 0) {
michael@0 59 // Prefetching one row ahead to L1 cache can equal hardware
michael@0 60 // performance for large/tall rects, but never *beats*
michael@0 61 // hardware performance.
michael@0 62 SkPMColor* dst = destination;
michael@0 63 int count = width;
michael@0 64
michael@0 65 while (((size_t)dst) & 0x0F) {
michael@0 66 *dst++ = color;
michael@0 67 --count;
michael@0 68 }
michael@0 69 __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0 70
michael@0 71 // Googling suggests _mm_stream is only going to beat _mm_store
michael@0 72 // for things that wouldn't fit in L2 cache anyway, typically
michael@0 73 // >500kB, and precisely fill cache lines. For us, with
michael@0 74 // arrays > 100k elements _mm_stream is still 100%+ slower than
michael@0 75 // mm_store.
michael@0 76
michael@0 77 // Unrolling to count >= 64 is a break-even for most
michael@0 78 // input patterns; we seem to be saturating the bus and having
michael@0 79 // low enough overhead at 32.
michael@0 80
michael@0 81 while (count >= 32) {
michael@0 82 _mm_store_si128(d++, color_wide);
michael@0 83 _mm_store_si128(d++, color_wide);
michael@0 84 _mm_store_si128(d++, color_wide);
michael@0 85 _mm_store_si128(d++, color_wide);
michael@0 86 _mm_store_si128(d++, color_wide);
michael@0 87 _mm_store_si128(d++, color_wide);
michael@0 88 _mm_store_si128(d++, color_wide);
michael@0 89 _mm_store_si128(d++, color_wide);
michael@0 90 count -= 32;
michael@0 91 }
michael@0 92 if (count >= 16) {
michael@0 93 _mm_store_si128(d++, color_wide);
michael@0 94 _mm_store_si128(d++, color_wide);
michael@0 95 _mm_store_si128(d++, color_wide);
michael@0 96 _mm_store_si128(d++, color_wide);
michael@0 97 count -= 16;
michael@0 98 }
michael@0 99 dst = reinterpret_cast<uint32_t*>(d);
michael@0 100
michael@0 101 // Unrolling the loop in the Narrow code is a significant performance
michael@0 102 // gain, but unrolling this loop appears to make no difference in
michael@0 103 // benchmarks with either mm_store_si128 or individual sets.
michael@0 104
michael@0 105 while (count > 0) {
michael@0 106 *dst++ = color;
michael@0 107 --count;
michael@0 108 }
michael@0 109
michael@0 110 destination = (uint32_t*)((char*)destination + rowBytes);
michael@0 111 }
michael@0 112 }
michael@0 113
michael@0 114 void ColorRect32_SSE2(SkPMColor* destination,
michael@0 115 int width, int height,
michael@0 116 size_t rowBytes, uint32_t color) {
michael@0 117 if (0 == height || 0 == width || 0 == color) {
michael@0 118 return;
michael@0 119 }
michael@0 120 unsigned colorA = SkGetPackedA32(color);
michael@0 121 colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
michael@0 122 if (255 == colorA) {
michael@0 123 if (width < 31) {
michael@0 124 BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
michael@0 125 rowBytes, color);
michael@0 126 } else {
michael@0 127 BlitRect32_OpaqueWide_SSE2(destination, width, height,
michael@0 128 rowBytes, color);
michael@0 129 }
michael@0 130 } else {
michael@0 131 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
michael@0 132 }
michael@0 133 }

mercurial