1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,133 @@ 1.4 +/* 1.5 + * Copyright 2011 Google Inc. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license that can be 1.8 + * found in the LICENSE file. 1.9 + */ 1.10 + 1.11 +#include "SkBlitRect_opts_SSE2.h" 1.12 +#include "SkBlitRow.h" 1.13 +#include "SkColorPriv.h" 1.14 + 1.15 +#include <emmintrin.h> 1.16 + 1.17 +/** Simple blitting of opaque rectangles less than 31 pixels wide: 1.18 + inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 1.19 +*/ 1.20 +static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, 1.21 + int width, int height, 1.22 + size_t rowBytes, uint32_t color) { 1.23 + SkASSERT(255 == SkGetPackedA32(color)); 1.24 + SkASSERT(width > 0); 1.25 + SkASSERT(width < 31); 1.26 + 1.27 + while (--height >= 0) { 1.28 + SkPMColor* dst = destination; 1.29 + int count = width; 1.30 + 1.31 + while (count > 4) { 1.32 + *dst++ = color; 1.33 + *dst++ = color; 1.34 + *dst++ = color; 1.35 + *dst++ = color; 1.36 + count -= 4; 1.37 + } 1.38 + 1.39 + while (count > 0) { 1.40 + *dst++ = color; 1.41 + --count; 1.42 + } 1.43 + 1.44 + destination = (uint32_t*)((char*)destination + rowBytes); 1.45 + } 1.46 +} 1.47 + 1.48 +/** 1.49 + Fast blitting of opaque rectangles at least 31 pixels wide: 1.50 + inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 1.51 + A 31 pixel rectangle is guaranteed to have at least one 1.52 + 16-pixel aligned span that can take advantage of mm_store. 1.53 +*/ 1.54 +static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, 1.55 + int width, int height, 1.56 + size_t rowBytes, uint32_t color) { 1.57 + SkASSERT(255 == SkGetPackedA32(color)); 1.58 + SkASSERT(width >= 31); 1.59 + 1.60 + __m128i color_wide = _mm_set1_epi32(color); 1.61 + while (--height >= 0) { 1.62 + // Prefetching one row ahead to L1 cache can equal hardware 1.63 + // performance for large/tall rects, but never *beats* 1.64 + // hardware performance. 1.65 + SkPMColor* dst = destination; 1.66 + int count = width; 1.67 + 1.68 + while (((size_t)dst) & 0x0F) { 1.69 + *dst++ = color; 1.70 + --count; 1.71 + } 1.72 + __m128i *d = reinterpret_cast<__m128i*>(dst); 1.73 + 1.74 + // Googling suggests _mm_stream is only going to beat _mm_store 1.75 + // for things that wouldn't fit in L2 cache anyway, typically 1.76 + // >500kB, and precisely fill cache lines. For us, with 1.77 + // arrays > 100k elements _mm_stream is still 100%+ slower than 1.78 + // mm_store. 1.79 + 1.80 + // Unrolling to count >= 64 is a break-even for most 1.81 + // input patterns; we seem to be saturating the bus and having 1.82 + // low enough overhead at 32. 1.83 + 1.84 + while (count >= 32) { 1.85 + _mm_store_si128(d++, color_wide); 1.86 + _mm_store_si128(d++, color_wide); 1.87 + _mm_store_si128(d++, color_wide); 1.88 + _mm_store_si128(d++, color_wide); 1.89 + _mm_store_si128(d++, color_wide); 1.90 + _mm_store_si128(d++, color_wide); 1.91 + _mm_store_si128(d++, color_wide); 1.92 + _mm_store_si128(d++, color_wide); 1.93 + count -= 32; 1.94 + } 1.95 + if (count >= 16) { 1.96 + _mm_store_si128(d++, color_wide); 1.97 + _mm_store_si128(d++, color_wide); 1.98 + _mm_store_si128(d++, color_wide); 1.99 + _mm_store_si128(d++, color_wide); 1.100 + count -= 16; 1.101 + } 1.102 + dst = reinterpret_cast<uint32_t*>(d); 1.103 + 1.104 + // Unrolling the loop in the Narrow code is a significant performance 1.105 + // gain, but unrolling this loop appears to make no difference in 1.106 + // benchmarks with either mm_store_si128 or individual sets. 1.107 + 1.108 + while (count > 0) { 1.109 + *dst++ = color; 1.110 + --count; 1.111 + } 1.112 + 1.113 + destination = (uint32_t*)((char*)destination + rowBytes); 1.114 + } 1.115 +} 1.116 + 1.117 +void ColorRect32_SSE2(SkPMColor* destination, 1.118 + int width, int height, 1.119 + size_t rowBytes, uint32_t color) { 1.120 + if (0 == height || 0 == width || 0 == color) { 1.121 + return; 1.122 + } 1.123 + unsigned colorA = SkGetPackedA32(color); 1.124 + colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423). 1.125 + if (255 == colorA) { 1.126 + if (width < 31) { 1.127 + BlitRect32_OpaqueNarrow_SSE2(destination, width, height, 1.128 + rowBytes, color); 1.129 + } else { 1.130 + BlitRect32_OpaqueWide_SSE2(destination, width, height, 1.131 + rowBytes, color); 1.132 + } 1.133 + } else { 1.134 + SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); 1.135 + } 1.136 +}