michael@0: /*
michael@0:  * Copyright 2011 Google Inc.
michael@0:  *
michael@0:  * Use of this source code is governed by a BSD-style license that can be
michael@0:  * found in the LICENSE file.
michael@0:  */
michael@0: 
michael@0: #include "SkBlitRect_opts_SSE2.h"
michael@0: #include "SkBlitRow.h"
michael@0: #include "SkColorPriv.h"
michael@0: 
michael@0: #include <emmintrin.h>
michael@0: 
michael@0: /** Simple blitting of opaque rectangles less than 31 pixels wide:
michael@0:     inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
michael@0: */
michael@0: static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
michael@0:                                   int width, int height,
michael@0:                                   size_t rowBytes, uint32_t color) {
michael@0:     SkASSERT(255 == SkGetPackedA32(color));
michael@0:     SkASSERT(width > 0);
michael@0:     SkASSERT(width < 31);
michael@0: 
michael@0:     while (--height >= 0) {
michael@0:         SkPMColor* dst = destination;
michael@0:         int count = width;
michael@0: 
michael@0:         while (count > 4) {
michael@0:             *dst++ = color;
michael@0:             *dst++ = color;
michael@0:             *dst++ = color;
michael@0:             *dst++ = color;
michael@0:             count -= 4;
michael@0:         }
michael@0: 
michael@0:         while (count > 0) {
michael@0:             *dst++ = color;
michael@0:             --count;
michael@0:         }
michael@0: 
michael@0:         destination = (uint32_t*)((char*)destination + rowBytes);
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:   Fast blitting of opaque rectangles at least 31 pixels wide:
michael@0:   inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
michael@0:   A 31 pixel rectangle is guaranteed to have at least one
michael@0:   16-pixel aligned span that can take advantage of mm_store.
michael@0: */
michael@0: static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
michael@0:                                 int width, int height,
michael@0:                                 size_t rowBytes, uint32_t color) {
michael@0:     SkASSERT(255 == SkGetPackedA32(color));
michael@0:     SkASSERT(width >= 31);
michael@0: 
michael@0:     __m128i color_wide = _mm_set1_epi32(color);
michael@0:     while (--height >= 0) {
michael@0:         // Prefetching one row ahead to L1 cache can equal hardware
michael@0:         // performance for large/tall rects, but never *beats*
michael@0:         // hardware performance.
michael@0:         SkPMColor* dst = destination;
michael@0:         int count = width;
michael@0: 
michael@0:         while (((size_t)dst) & 0x0F) {
michael@0:             *dst++ = color;
michael@0:             --count;
michael@0:         }
michael@0:         __m128i *d = reinterpret_cast<__m128i*>(dst);
michael@0: 
michael@0:         // Googling suggests _mm_stream is only going to beat _mm_store
michael@0:         // for things that wouldn't fit in L2 cache anyway, typically
michael@0:         // >500kB, and precisely fill cache lines.  For us, with
michael@0:         // arrays > 100k elements _mm_stream is still 100%+ slower than
michael@0:         // mm_store.
michael@0: 
michael@0:         // Unrolling to count >= 64 is a break-even for most
michael@0:         // input patterns; we seem to be saturating the bus and having
michael@0:         // low enough overhead at 32.
michael@0: 
michael@0:         while (count >= 32) {
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             count -= 32;
michael@0:         }
michael@0:         if (count >= 16) {
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             _mm_store_si128(d++, color_wide);
michael@0:             count -= 16;
michael@0:         }
michael@0:         dst = reinterpret_cast<uint32_t*>(d);
michael@0: 
michael@0:         // Unrolling the loop in the Narrow code is a significant performance
michael@0:         // gain, but unrolling this loop appears to make no difference in
michael@0:         // benchmarks with either mm_store_si128 or individual sets.
michael@0: 
michael@0:         while (count > 0) {
michael@0:             *dst++ = color;
michael@0:             --count;
michael@0:         }
michael@0: 
michael@0:         destination = (uint32_t*)((char*)destination + rowBytes);
michael@0:     }
michael@0: }
michael@0: 
michael@0: void ColorRect32_SSE2(SkPMColor* destination,
michael@0:                       int width, int height,
michael@0:                       size_t rowBytes, uint32_t color) {
michael@0:     if (0 == height || 0 == width || 0 == color) {
michael@0:         return;
michael@0:     }
michael@0:     unsigned colorA = SkGetPackedA32(color);
michael@0:     colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
michael@0:     if (255 == colorA) {
michael@0:         if (width < 31) {
michael@0:             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
michael@0:                                          rowBytes, color);
michael@0:         } else {
michael@0:             BlitRect32_OpaqueWide_SSE2(destination, width, height,
michael@0:                                        rowBytes, color);
michael@0:         }
michael@0:     } else {
michael@0:         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
michael@0:     }
michael@0: }