gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRect_opts_SSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,133 @@
     1.4 +/*
     1.5 + * Copyright 2011 Google Inc.
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + */
    1.10 +
    1.11 +#include "SkBlitRect_opts_SSE2.h"
    1.12 +#include "SkBlitRow.h"
    1.13 +#include "SkColorPriv.h"
    1.14 +
    1.15 +#include <emmintrin.h>
    1.16 +
    1.17 +/** Simple blitting of opaque rectangles less than 31 pixels wide:
    1.18 +    inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    1.19 +*/
    1.20 +static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
    1.21 +                                  int width, int height,
    1.22 +                                  size_t rowBytes, uint32_t color) {
    1.23 +    SkASSERT(255 == SkGetPackedA32(color));
    1.24 +    SkASSERT(width > 0);
    1.25 +    SkASSERT(width < 31);
    1.26 +
    1.27 +    while (--height >= 0) {
    1.28 +        SkPMColor* dst = destination;
    1.29 +        int count = width;
    1.30 +
    1.31 +        while (count > 4) {
    1.32 +            *dst++ = color;
    1.33 +            *dst++ = color;
    1.34 +            *dst++ = color;
    1.35 +            *dst++ = color;
    1.36 +            count -= 4;
    1.37 +        }
    1.38 +
    1.39 +        while (count > 0) {
    1.40 +            *dst++ = color;
    1.41 +            --count;
    1.42 +        }
    1.43 +
    1.44 +        destination = (uint32_t*)((char*)destination + rowBytes);
    1.45 +    }
    1.46 +}
    1.47 +
    1.48 +/**
    1.49 +  Fast blitting of opaque rectangles at least 31 pixels wide:
    1.50 +  inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
    1.51 +  A 31 pixel rectangle is guaranteed to have at least one
    1.52 +  16-pixel aligned span that can take advantage of mm_store.
    1.53 +*/
    1.54 +static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
    1.55 +                                int width, int height,
    1.56 +                                size_t rowBytes, uint32_t color) {
    1.57 +    SkASSERT(255 == SkGetPackedA32(color));
    1.58 +    SkASSERT(width >= 31);
    1.59 +
    1.60 +    __m128i color_wide = _mm_set1_epi32(color);
    1.61 +    while (--height >= 0) {
    1.62 +        // Prefetching one row ahead to L1 cache can equal hardware
    1.63 +        // performance for large/tall rects, but never *beats*
    1.64 +        // hardware performance.
    1.65 +        SkPMColor* dst = destination;
    1.66 +        int count = width;
    1.67 +
    1.68 +        while (((size_t)dst) & 0x0F) {
    1.69 +            *dst++ = color;
    1.70 +            --count;
    1.71 +        }
    1.72 +        __m128i *d = reinterpret_cast<__m128i*>(dst);
    1.73 +
    1.74 +        // Googling suggests _mm_stream is only going to beat _mm_store
    1.75 +        // for things that wouldn't fit in L2 cache anyway, typically
    1.76 +        // >500kB, and precisely fill cache lines.  For us, with
    1.77 +        // arrays > 100k elements _mm_stream is still 100%+ slower than
    1.78 +        // mm_store.
    1.79 +
    1.80 +        // Unrolling to count >= 64 is a break-even for most
    1.81 +        // input patterns; we seem to be saturating the bus and having
    1.82 +        // low enough overhead at 32.
    1.83 +
    1.84 +        while (count >= 32) {
    1.85 +            _mm_store_si128(d++, color_wide);
    1.86 +            _mm_store_si128(d++, color_wide);
    1.87 +            _mm_store_si128(d++, color_wide);
    1.88 +            _mm_store_si128(d++, color_wide);
    1.89 +            _mm_store_si128(d++, color_wide);
    1.90 +            _mm_store_si128(d++, color_wide);
    1.91 +            _mm_store_si128(d++, color_wide);
    1.92 +            _mm_store_si128(d++, color_wide);
    1.93 +            count -= 32;
    1.94 +        }
    1.95 +        if (count >= 16) {
    1.96 +            _mm_store_si128(d++, color_wide);
    1.97 +            _mm_store_si128(d++, color_wide);
    1.98 +            _mm_store_si128(d++, color_wide);
    1.99 +            _mm_store_si128(d++, color_wide);
   1.100 +            count -= 16;
   1.101 +        }
   1.102 +        dst = reinterpret_cast<uint32_t*>(d);
   1.103 +
   1.104 +        // Unrolling the loop in the Narrow code is a significant performance
   1.105 +        // gain, but unrolling this loop appears to make no difference in
   1.106 +        // benchmarks with either mm_store_si128 or individual sets.
   1.107 +
   1.108 +        while (count > 0) {
   1.109 +            *dst++ = color;
   1.110 +            --count;
   1.111 +        }
   1.112 +
   1.113 +        destination = (uint32_t*)((char*)destination + rowBytes);
   1.114 +    }
   1.115 +}
   1.116 +
   1.117 +void ColorRect32_SSE2(SkPMColor* destination,
   1.118 +                      int width, int height,
   1.119 +                      size_t rowBytes, uint32_t color) {
   1.120 +    if (0 == height || 0 == width || 0 == color) {
   1.121 +        return;
   1.122 +    }
   1.123 +    unsigned colorA = SkGetPackedA32(color);
   1.124 +    colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
   1.125 +    if (255 == colorA) {
   1.126 +        if (width < 31) {
   1.127 +            BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
   1.128 +                                         rowBytes, color);
   1.129 +        } else {
   1.130 +            BlitRect32_OpaqueWide_SSE2(destination, width, height,
   1.131 +                                       rowBytes, color);
   1.132 +        }
   1.133 +    } else {
   1.134 +        SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
   1.135 +    }
   1.136 +}

mercurial