michael@0: /* michael@0: * Copyright 2011 Google Inc. michael@0: * michael@0: * Use of this source code is governed by a BSD-style license that can be michael@0: * found in the LICENSE file. michael@0: */ michael@0: michael@0: #include "SkBlitRect_opts_SSE2.h" michael@0: #include "SkBlitRow.h" michael@0: #include "SkColorPriv.h" michael@0: michael@0: #include michael@0: michael@0: /** Simple blitting of opaque rectangles less than 31 pixels wide: michael@0: inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. michael@0: */ michael@0: static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, michael@0: int width, int height, michael@0: size_t rowBytes, uint32_t color) { michael@0: SkASSERT(255 == SkGetPackedA32(color)); michael@0: SkASSERT(width > 0); michael@0: SkASSERT(width < 31); michael@0: michael@0: while (--height >= 0) { michael@0: SkPMColor* dst = destination; michael@0: int count = width; michael@0: michael@0: while (count > 4) { michael@0: *dst++ = color; michael@0: *dst++ = color; michael@0: *dst++ = color; michael@0: *dst++ = color; michael@0: count -= 4; michael@0: } michael@0: michael@0: while (count > 0) { michael@0: *dst++ = color; michael@0: --count; michael@0: } michael@0: michael@0: destination = (uint32_t*)((char*)destination + rowBytes); michael@0: } michael@0: } michael@0: michael@0: /** michael@0: Fast blitting of opaque rectangles at least 31 pixels wide: michael@0: inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. michael@0: A 31 pixel rectangle is guaranteed to have at least one michael@0: 16-pixel aligned span that can take advantage of mm_store. michael@0: */ michael@0: static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, michael@0: int width, int height, michael@0: size_t rowBytes, uint32_t color) { michael@0: SkASSERT(255 == SkGetPackedA32(color)); michael@0: SkASSERT(width >= 31); michael@0: michael@0: __m128i color_wide = _mm_set1_epi32(color); michael@0: while (--height >= 0) { michael@0: // Prefetching one row ahead to L1 cache can equal hardware michael@0: // performance for large/tall rects, but never *beats* michael@0: // hardware performance. michael@0: SkPMColor* dst = destination; michael@0: int count = width; michael@0: michael@0: while (((size_t)dst) & 0x0F) { michael@0: *dst++ = color; michael@0: --count; michael@0: } michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: michael@0: // Googling suggests _mm_stream is only going to beat _mm_store michael@0: // for things that wouldn't fit in L2 cache anyway, typically michael@0: // >500kB, and precisely fill cache lines. For us, with michael@0: // arrays > 100k elements _mm_stream is still 100%+ slower than michael@0: // mm_store. michael@0: michael@0: // Unrolling to count >= 64 is a break-even for most michael@0: // input patterns; we seem to be saturating the bus and having michael@0: // low enough overhead at 32. michael@0: michael@0: while (count >= 32) { michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: count -= 32; michael@0: } michael@0: if (count >= 16) { michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: _mm_store_si128(d++, color_wide); michael@0: count -= 16; michael@0: } michael@0: dst = reinterpret_cast(d); michael@0: michael@0: // Unrolling the loop in the Narrow code is a significant performance michael@0: // gain, but unrolling this loop appears to make no difference in michael@0: // benchmarks with either mm_store_si128 or individual sets. michael@0: michael@0: while (count > 0) { michael@0: *dst++ = color; michael@0: --count; michael@0: } michael@0: michael@0: destination = (uint32_t*)((char*)destination + rowBytes); michael@0: } michael@0: } michael@0: michael@0: void ColorRect32_SSE2(SkPMColor* destination, michael@0: int width, int height, michael@0: size_t rowBytes, uint32_t color) { michael@0: if (0 == height || 0 == width || 0 == color) { michael@0: return; michael@0: } michael@0: unsigned colorA = SkGetPackedA32(color); michael@0: colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423). michael@0: if (255 == colorA) { michael@0: if (width < 31) { michael@0: BlitRect32_OpaqueNarrow_SSE2(destination, width, height, michael@0: rowBytes, color); michael@0: } else { michael@0: BlitRect32_OpaqueWide_SSE2(destination, width, height, michael@0: rowBytes, color); michael@0: } michael@0: } else { michael@0: SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); michael@0: } michael@0: }