gfx/skia/trunk/src/opts/SkBitmapProcState_opts_SSE2.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_opts_SSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,766 @@
     1.4 +
     1.5 +/*
     1.6 + * Copyright 2009 The Android Open Source Project
     1.7 + *
     1.8 + * Use of this source code is governed by a BSD-style license that can be
     1.9 + * found in the LICENSE file.
    1.10 + */
    1.11 +
    1.12 +
    1.13 +#include <emmintrin.h>
    1.14 +#include "SkBitmapProcState_opts_SSE2.h"
    1.15 +#include "SkPaint.h"
    1.16 +#include "SkUtils.h"
    1.17 +
    1.18 +void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
    1.19 +                                   const uint32_t* xy,
    1.20 +                                   int count, uint32_t* colors) {
    1.21 +    SkASSERT(count > 0 && colors != NULL);
    1.22 +    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
    1.23 +    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    1.24 +    SkASSERT(s.fAlphaScale == 256);
    1.25 +
    1.26 +    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    1.27 +    size_t rb = s.fBitmap->rowBytes();
    1.28 +    uint32_t XY = *xy++;
    1.29 +    unsigned y0 = XY >> 14;
    1.30 +    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    1.31 +    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    1.32 +    unsigned subY = y0 & 0xF;
    1.33 +
    1.34 +    // ( 0,  0,  0,  0,  0,  0,  0, 16)
    1.35 +    __m128i sixteen = _mm_cvtsi32_si128(16);
    1.36 +
    1.37 +    // ( 0,  0,  0,  0, 16, 16, 16, 16)
    1.38 +    sixteen = _mm_shufflelo_epi16(sixteen, 0);
    1.39 +
    1.40 +    // ( 0,  0,  0,  0,  0,  0,  0,  y)
    1.41 +    __m128i allY = _mm_cvtsi32_si128(subY);
    1.42 +
    1.43 +    // ( 0,  0,  0,  0,  y,  y,  y,  y)
    1.44 +    allY = _mm_shufflelo_epi16(allY, 0);
    1.45 +
    1.46 +    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
    1.47 +    __m128i negY = _mm_sub_epi16(sixteen, allY);
    1.48 +
    1.49 +    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    1.50 +    allY = _mm_unpacklo_epi64(allY, negY);
    1.51 +
    1.52 +    // (16, 16, 16, 16, 16, 16, 16, 16 )
    1.53 +    sixteen = _mm_shuffle_epi32(sixteen, 0);
    1.54 +
    1.55 +    // ( 0,  0,  0,  0,  0,  0,  0,  0)
    1.56 +    __m128i zero = _mm_setzero_si128();
    1.57 +    do {
    1.58 +        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
    1.59 +        unsigned x0 = XX >> 18;
    1.60 +        unsigned x1 = XX & 0x3FFF;
    1.61 +
    1.62 +        // (0, 0, 0, 0, 0, 0, 0, x)
    1.63 +        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
    1.64 +
    1.65 +        // (0, 0, 0, 0, x, x, x, x)
    1.66 +        allX = _mm_shufflelo_epi16(allX, 0);
    1.67 +
    1.68 +        // (x, x, x, x, x, x, x, x)
    1.69 +        allX = _mm_shuffle_epi32(allX, 0);
    1.70 +
    1.71 +        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
    1.72 +        __m128i negX = _mm_sub_epi16(sixteen, allX);
    1.73 +
    1.74 +        // Load 4 samples (pixels).
    1.75 +        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
    1.76 +        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
    1.77 +        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
    1.78 +        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
    1.79 +
    1.80 +        // (0, 0, a00, a10)
    1.81 +        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
    1.82 +
    1.83 +        // Expand to 16 bits per component.
    1.84 +        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
    1.85 +
    1.86 +        // ((a00 * (16-y)), (a10 * y)).
    1.87 +        a00a10 = _mm_mullo_epi16(a00a10, allY);
    1.88 +
    1.89 +        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
    1.90 +        a00a10 = _mm_mullo_epi16(a00a10, negX);
    1.91 +
    1.92 +        // (0, 0, a01, a10)
    1.93 +        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
    1.94 +
    1.95 +        // Expand to 16 bits per component.
    1.96 +        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
    1.97 +
    1.98 +        // (a01 * (16-y)), (a11 * y)
    1.99 +        a01a11 = _mm_mullo_epi16(a01a11, allY);
   1.100 +
   1.101 +        // (a01 * (16-y) * x), (a11 * y * x)
   1.102 +        a01a11 = _mm_mullo_epi16(a01a11, allX);
   1.103 +
   1.104 +        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   1.105 +        __m128i sum = _mm_add_epi16(a00a10, a01a11);
   1.106 +
   1.107 +        // (DC, a00*w00 + a01*w01)
   1.108 +        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   1.109 +
   1.110 +        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   1.111 +        sum = _mm_add_epi16(sum, shifted);
   1.112 +
   1.113 +        // Divide each 16 bit component by 256.
   1.114 +        sum = _mm_srli_epi16(sum, 8);
   1.115 +
   1.116 +        // Pack lower 4 16 bit values of sum into lower 4 bytes.
   1.117 +        sum = _mm_packus_epi16(sum, zero);
   1.118 +
   1.119 +        // Extract low int and store.
   1.120 +        *colors++ = _mm_cvtsi128_si32(sum);
   1.121 +    } while (--count > 0);
   1.122 +}
   1.123 +
   1.124 +void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
   1.125 +                                  const uint32_t* xy,
   1.126 +                                  int count, uint32_t* colors) {
   1.127 +    SkASSERT(count > 0 && colors != NULL);
   1.128 +    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
   1.129 +    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
   1.130 +    SkASSERT(s.fAlphaScale < 256);
   1.131 +
   1.132 +    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
   1.133 +    size_t rb = s.fBitmap->rowBytes();
   1.134 +    uint32_t XY = *xy++;
   1.135 +    unsigned y0 = XY >> 14;
   1.136 +    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
   1.137 +    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
   1.138 +    unsigned subY = y0 & 0xF;
   1.139 +
   1.140 +    // ( 0,  0,  0,  0,  0,  0,  0, 16)
   1.141 +    __m128i sixteen = _mm_cvtsi32_si128(16);
   1.142 +
   1.143 +    // ( 0,  0,  0,  0, 16, 16, 16, 16)
   1.144 +    sixteen = _mm_shufflelo_epi16(sixteen, 0);
   1.145 +
   1.146 +    // ( 0,  0,  0,  0,  0,  0,  0,  y)
   1.147 +    __m128i allY = _mm_cvtsi32_si128(subY);
   1.148 +
   1.149 +    // ( 0,  0,  0,  0,  y,  y,  y,  y)
   1.150 +    allY = _mm_shufflelo_epi16(allY, 0);
   1.151 +
   1.152 +    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
   1.153 +    __m128i negY = _mm_sub_epi16(sixteen, allY);
   1.154 +
   1.155 +    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
   1.156 +    allY = _mm_unpacklo_epi64(allY, negY);
   1.157 +
   1.158 +    // (16, 16, 16, 16, 16, 16, 16, 16 )
   1.159 +    sixteen = _mm_shuffle_epi32(sixteen, 0);
   1.160 +
   1.161 +    // ( 0,  0,  0,  0,  0,  0,  0,  0)
   1.162 +    __m128i zero = _mm_setzero_si128();
   1.163 +
   1.164 +    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
   1.165 +    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
   1.166 +
   1.167 +    do {
   1.168 +        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
   1.169 +        unsigned x0 = XX >> 18;
   1.170 +        unsigned x1 = XX & 0x3FFF;
   1.171 +
   1.172 +        // (0, 0, 0, 0, 0, 0, 0, x)
   1.173 +        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
   1.174 +
   1.175 +        // (0, 0, 0, 0, x, x, x, x)
   1.176 +        allX = _mm_shufflelo_epi16(allX, 0);
   1.177 +
   1.178 +        // (x, x, x, x, x, x, x, x)
   1.179 +        allX = _mm_shuffle_epi32(allX, 0);
   1.180 +
   1.181 +        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
   1.182 +        __m128i negX = _mm_sub_epi16(sixteen, allX);
   1.183 +
   1.184 +        // Load 4 samples (pixels).
   1.185 +        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
   1.186 +        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
   1.187 +        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
   1.188 +        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
   1.189 +
   1.190 +        // (0, 0, a00, a10)
   1.191 +        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
   1.192 +
   1.193 +        // Expand to 16 bits per component.
   1.194 +        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
   1.195 +
   1.196 +        // ((a00 * (16-y)), (a10 * y)).
   1.197 +        a00a10 = _mm_mullo_epi16(a00a10, allY);
   1.198 +
   1.199 +        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
   1.200 +        a00a10 = _mm_mullo_epi16(a00a10, negX);
   1.201 +
   1.202 +        // (0, 0, a01, a10)
   1.203 +        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
   1.204 +
   1.205 +        // Expand to 16 bits per component.
   1.206 +        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
   1.207 +
   1.208 +        // (a01 * (16-y)), (a11 * y)
   1.209 +        a01a11 = _mm_mullo_epi16(a01a11, allY);
   1.210 +
   1.211 +        // (a01 * (16-y) * x), (a11 * y * x)
   1.212 +        a01a11 = _mm_mullo_epi16(a01a11, allX);
   1.213 +
   1.214 +        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   1.215 +        __m128i sum = _mm_add_epi16(a00a10, a01a11);
   1.216 +
   1.217 +        // (DC, a00*w00 + a01*w01)
   1.218 +        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   1.219 +
   1.220 +        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   1.221 +        sum = _mm_add_epi16(sum, shifted);
   1.222 +
   1.223 +        // Divide each 16 bit component by 256.
   1.224 +        sum = _mm_srli_epi16(sum, 8);
   1.225 +
   1.226 +        // Multiply by alpha.
   1.227 +        sum = _mm_mullo_epi16(sum, alpha);
   1.228 +
   1.229 +        // Divide each 16 bit component by 256.
   1.230 +        sum = _mm_srli_epi16(sum, 8);
   1.231 +
   1.232 +        // Pack lower 4 16 bit values of sum into lower 4 bytes.
   1.233 +        sum = _mm_packus_epi16(sum, zero);
   1.234 +
   1.235 +        // Extract low int and store.
   1.236 +        *colors++ = _mm_cvtsi128_si32(sum);
   1.237 +    } while (--count > 0);
   1.238 +}
   1.239 +
   1.240 +static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
   1.241 +                                                 SkFixed one) {
   1.242 +    unsigned i = SkClampMax(f >> 16, max);
   1.243 +    i = (i << 4) | ((f >> 12) & 0xF);
   1.244 +    return (i << 14) | SkClampMax((f + one) >> 16, max);
   1.245 +}
   1.246 +
   1.247 +/*  SSE version of ClampX_ClampY_filter_scale()
   1.248 + *  portable version is in core/SkBitmapProcState_matrix.h
   1.249 + */
   1.250 +void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
   1.251 +                                     int count, int x, int y) {
   1.252 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.253 +                             SkMatrix::kScale_Mask)) == 0);
   1.254 +    SkASSERT(s.fInvKy == 0);
   1.255 +
   1.256 +    const unsigned maxX = s.fBitmap->width() - 1;
   1.257 +    const SkFixed one = s.fFilterOneX;
   1.258 +    const SkFixed dx = s.fInvSx;
   1.259 +    SkFixed fx;
   1.260 +
   1.261 +    SkPoint pt;
   1.262 +    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
   1.263 +                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
   1.264 +    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
   1.265 +    const unsigned maxY = s.fBitmap->height() - 1;
   1.266 +    // compute our two Y values up front
   1.267 +    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
   1.268 +    // now initialize fx
   1.269 +    fx = SkScalarToFixed(pt.fX) - (one >> 1);
   1.270 +
   1.271 +    // test if we don't need to apply the tile proc
   1.272 +    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
   1.273 +        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
   1.274 +        if (count >= 4) {
   1.275 +            // SSE version of decal_filter_scale
   1.276 +            while ((size_t(xy) & 0x0F) != 0) {
   1.277 +                SkASSERT((fx >> (16 + 14)) == 0);
   1.278 +                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   1.279 +                fx += dx;
   1.280 +                count--;
   1.281 +            }
   1.282 +
   1.283 +            __m128i wide_1    = _mm_set1_epi32(1);
   1.284 +            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
   1.285 +            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   1.286 +                                              fx + dx, fx);
   1.287 +
   1.288 +            while (count >= 4) {
   1.289 +                __m128i wide_out;
   1.290 +
   1.291 +                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
   1.292 +                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
   1.293 +                                        _mm_srai_epi32(wide_fx, 16), wide_1));
   1.294 +
   1.295 +                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
   1.296 +
   1.297 +                xy += 4;
   1.298 +                fx += dx * 4;
   1.299 +                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
   1.300 +                count -= 4;
   1.301 +            } // while count >= 4
   1.302 +        } // if count >= 4
   1.303 +
   1.304 +        while (count-- > 0) {
   1.305 +            SkASSERT((fx >> (16 + 14)) == 0);
   1.306 +            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   1.307 +            fx += dx;
   1.308 +        }
   1.309 +    } else {
   1.310 +        // SSE2 only support 16bit interger max & min, so only process the case
   1.311 +        // maxX less than the max 16bit interger. Actually maxX is the bitmap's
   1.312 +        // height, there should be rare bitmap whose height will be greater
   1.313 +        // than max 16bit interger in the real world.
   1.314 +        if ((count >= 4) && (maxX <= 0xFFFF)) {
   1.315 +            while (((size_t)xy & 0x0F) != 0) {
   1.316 +                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
   1.317 +                fx += dx;
   1.318 +                count--;
   1.319 +            }
   1.320 +
   1.321 +            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   1.322 +                                              fx + dx, fx);
   1.323 +            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
   1.324 +            __m128i wide_one  = _mm_set1_epi32(one);
   1.325 +            __m128i wide_maxX = _mm_set1_epi32(maxX);
   1.326 +            __m128i wide_mask = _mm_set1_epi32(0xF);
   1.327 +
   1.328 +             while (count >= 4) {
   1.329 +                __m128i wide_i;
   1.330 +                __m128i wide_lo;
   1.331 +                __m128i wide_fx1;
   1.332 +
   1.333 +                // i = SkClampMax(f>>16,maxX)
   1.334 +                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
   1.335 +                                       _mm_setzero_si128());
   1.336 +                wide_i = _mm_min_epi16(wide_i, wide_maxX);
   1.337 +
   1.338 +                // i<<4 | TILEX_LOW_BITS(fx)
   1.339 +                wide_lo = _mm_srli_epi32(wide_fx, 12);
   1.340 +                wide_lo = _mm_and_si128(wide_lo, wide_mask);
   1.341 +                wide_i  = _mm_slli_epi32(wide_i, 4);
   1.342 +                wide_i  = _mm_or_si128(wide_i, wide_lo);
   1.343 +
   1.344 +                // i<<14
   1.345 +                wide_i = _mm_slli_epi32(wide_i, 14);
   1.346 +
   1.347 +                // SkClampMax(((f+one))>>16,max)
   1.348 +                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
   1.349 +                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
   1.350 +                                                        _mm_setzero_si128());
   1.351 +                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
   1.352 +
   1.353 +                // final combination
   1.354 +                wide_i = _mm_or_si128(wide_i, wide_fx1);
   1.355 +                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   1.356 +
   1.357 +                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
   1.358 +                fx += dx * 4;
   1.359 +                xy += 4;
   1.360 +                count -= 4;
   1.361 +            } // while count >= 4
   1.362 +        } // if count >= 4
   1.363 +
   1.364 +        while (count-- > 0) {
   1.365 +            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
   1.366 +            fx += dx;
   1.367 +        }
   1.368 +    }
   1.369 +}
   1.370 +
   1.371 +/*  SSE version of ClampX_ClampY_nofilter_scale()
   1.372 + *  portable version is in core/SkBitmapProcState_matrix.h
   1.373 + */
   1.374 +void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
   1.375 +                                    uint32_t xy[], int count, int x, int y) {
   1.376 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.377 +                             SkMatrix::kScale_Mask)) == 0);
   1.378 +
   1.379 +    // we store y, x, x, x, x, x
   1.380 +    const unsigned maxX = s.fBitmap->width() - 1;
   1.381 +    SkFixed fx;
   1.382 +    SkPoint pt;
   1.383 +    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
   1.384 +                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
   1.385 +    fx = SkScalarToFixed(pt.fY);
   1.386 +    const unsigned maxY = s.fBitmap->height() - 1;
   1.387 +    *xy++ = SkClampMax(fx >> 16, maxY);
   1.388 +    fx = SkScalarToFixed(pt.fX);
   1.389 +
   1.390 +    if (0 == maxX) {
   1.391 +        // all of the following X values must be 0
   1.392 +        memset(xy, 0, count * sizeof(uint16_t));
   1.393 +        return;
   1.394 +    }
   1.395 +
   1.396 +    const SkFixed dx = s.fInvSx;
   1.397 +
   1.398 +    // test if we don't need to apply the tile proc
   1.399 +    if ((unsigned)(fx >> 16) <= maxX &&
   1.400 +        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
   1.401 +        // SSE version of decal_nofilter_scale
   1.402 +        if (count >= 8) {
   1.403 +            while (((size_t)xy & 0x0F) != 0) {
   1.404 +                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
   1.405 +                fx += 2 * dx;
   1.406 +                count -= 2;
   1.407 +            }
   1.408 +
   1.409 +            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
   1.410 +            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
   1.411 +
   1.412 +            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   1.413 +                                             fx + dx, fx);
   1.414 +            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
   1.415 +
   1.416 +            while (count >= 8) {
   1.417 +                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
   1.418 +                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
   1.419 +
   1.420 +                __m128i wide_result = _mm_packs_epi32(wide_out_low,
   1.421 +                                                      wide_out_high);
   1.422 +                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
   1.423 +
   1.424 +                wide_low = _mm_add_epi32(wide_low, wide_dx8);
   1.425 +                wide_high = _mm_add_epi32(wide_high, wide_dx8);
   1.426 +
   1.427 +                xy += 4;
   1.428 +                fx += dx * 8;
   1.429 +                count -= 8;
   1.430 +            }
   1.431 +        } // if count >= 8
   1.432 +
   1.433 +        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
   1.434 +        while (count-- > 0) {
   1.435 +            *xx++ = SkToU16(fx >> 16);
   1.436 +            fx += dx;
   1.437 +        }
   1.438 +    } else {
   1.439 +        // SSE2 only support 16bit interger max & min, so only process the case
   1.440 +        // maxX less than the max 16bit interger. Actually maxX is the bitmap's
   1.441 +        // height, there should be rare bitmap whose height will be greater
   1.442 +        // than max 16bit interger in the real world.
   1.443 +        if ((count >= 8) && (maxX <= 0xFFFF)) {
   1.444 +            while (((size_t)xy & 0x0F) != 0) {
   1.445 +                *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
   1.446 +                                        SkClampMax(fx >> 16, maxX));
   1.447 +                fx += 2 * dx;
   1.448 +                count -= 2;
   1.449 +            }
   1.450 +
   1.451 +            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
   1.452 +            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
   1.453 +
   1.454 +            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   1.455 +                                             fx + dx, fx);
   1.456 +            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
   1.457 +            __m128i wide_maxX = _mm_set1_epi32(maxX);
   1.458 +
   1.459 +            while (count >= 8) {
   1.460 +                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
   1.461 +                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
   1.462 +
   1.463 +                wide_out_low  = _mm_max_epi16(wide_out_low,
   1.464 +                                              _mm_setzero_si128());
   1.465 +                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
   1.466 +                wide_out_high = _mm_max_epi16(wide_out_high,
   1.467 +                                              _mm_setzero_si128());
   1.468 +                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
   1.469 +
   1.470 +                __m128i wide_result = _mm_packs_epi32(wide_out_low,
   1.471 +                                                      wide_out_high);
   1.472 +                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
   1.473 +
   1.474 +                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
   1.475 +                wide_high = _mm_add_epi32(wide_high, wide_dx8);
   1.476 +
   1.477 +                xy += 4;
   1.478 +                fx += dx * 8;
   1.479 +                count -= 8;
   1.480 +            }
   1.481 +        } // if count >= 8
   1.482 +
   1.483 +        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
   1.484 +        while (count-- > 0) {
   1.485 +            *xx++ = SkClampMax(fx >> 16, maxX);
   1.486 +            fx += dx;
   1.487 +        }
   1.488 +    }
   1.489 +}
   1.490 +
   1.491 +/*  SSE version of ClampX_ClampY_filter_affine()
   1.492 + *  portable version is in core/SkBitmapProcState_matrix.h
   1.493 + */
   1.494 +void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
   1.495 +                                      uint32_t xy[], int count, int x, int y) {
   1.496 +    SkPoint srcPt;
   1.497 +    s.fInvProc(s.fInvMatrix,
   1.498 +               SkIntToScalar(x) + SK_ScalarHalf,
   1.499 +               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   1.500 +
   1.501 +    SkFixed oneX = s.fFilterOneX;
   1.502 +    SkFixed oneY = s.fFilterOneY;
   1.503 +    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
   1.504 +    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
   1.505 +    SkFixed dx = s.fInvSx;
   1.506 +    SkFixed dy = s.fInvKy;
   1.507 +    unsigned maxX = s.fBitmap->width() - 1;
   1.508 +    unsigned maxY = s.fBitmap->height() - 1;
   1.509 +
   1.510 +    if (count >= 2 && (maxX <= 0xFFFF)) {
   1.511 +        SkFixed dx2 = dx + dx;
   1.512 +        SkFixed dy2 = dy + dy;
   1.513 +
   1.514 +        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
   1.515 +        __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
   1.516 +        __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
   1.517 +        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
   1.518 +        __m128i wide_mask = _mm_set1_epi32(0xF);
   1.519 +
   1.520 +        while (count >= 2) {
   1.521 +            // i = SkClampMax(f>>16,maxX)
   1.522 +            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
   1.523 +                                           _mm_setzero_si128());
   1.524 +            wide_i = _mm_min_epi16(wide_i, wide_max);
   1.525 +
   1.526 +            // i<<4 | TILEX_LOW_BITS(f)
   1.527 +            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
   1.528 +            wide_lo = _mm_and_si128(wide_lo, wide_mask);
   1.529 +            wide_i  = _mm_slli_epi32(wide_i, 4);
   1.530 +            wide_i  = _mm_or_si128(wide_i, wide_lo);
   1.531 +
   1.532 +            // i<<14
   1.533 +            wide_i = _mm_slli_epi32(wide_i, 14);
   1.534 +
   1.535 +            // SkClampMax(((f+one))>>16,max)
   1.536 +            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
   1.537 +            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
   1.538 +                                                   _mm_setzero_si128());
   1.539 +            wide_f1 = _mm_min_epi16(wide_f1, wide_max);
   1.540 +
   1.541 +            // final combination
   1.542 +            wide_i = _mm_or_si128(wide_i, wide_f1);
   1.543 +            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   1.544 +
   1.545 +            wide_f = _mm_add_epi32(wide_f, wide_d2);
   1.546 +
   1.547 +            fx += dx2;
   1.548 +            fy += dy2;
   1.549 +            xy += 4;
   1.550 +            count -= 2;
   1.551 +        } // while count >= 2
   1.552 +    } // if count >= 2
   1.553 +
   1.554 +    while (count-- > 0) {
   1.555 +        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
   1.556 +        fy += dy;
   1.557 +        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
   1.558 +        fx += dx;
   1.559 +    }
   1.560 +}
   1.561 +
   1.562 +/*  SSE version of ClampX_ClampY_nofilter_affine()
   1.563 + *  portable version is in core/SkBitmapProcState_matrix.h
   1.564 + */
   1.565 +void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
   1.566 +                                      uint32_t xy[], int count, int x, int y) {
   1.567 +    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
   1.568 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.569 +                             SkMatrix::kScale_Mask |
   1.570 +                             SkMatrix::kAffine_Mask)) == 0);
   1.571 +
   1.572 +    SkPoint srcPt;
   1.573 +    s.fInvProc(s.fInvMatrix,
   1.574 +               SkIntToScalar(x) + SK_ScalarHalf,
   1.575 +               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   1.576 +
   1.577 +    SkFixed fx = SkScalarToFixed(srcPt.fX);
   1.578 +    SkFixed fy = SkScalarToFixed(srcPt.fY);
   1.579 +    SkFixed dx = s.fInvSx;
   1.580 +    SkFixed dy = s.fInvKy;
   1.581 +    int maxX = s.fBitmap->width() - 1;
   1.582 +    int maxY = s.fBitmap->height() - 1;
   1.583 +
   1.584 +    if (count >= 4 && (maxX <= 0xFFFF)) {
   1.585 +        while (((size_t)xy & 0x0F) != 0) {
   1.586 +            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
   1.587 +                                  SkClampMax(fx >> 16, maxX);
   1.588 +            fx += dx;
   1.589 +            fy += dy;
   1.590 +            count--;
   1.591 +        }
   1.592 +
   1.593 +        SkFixed dx4 = dx * 4;
   1.594 +        SkFixed dy4 = dy * 4;
   1.595 +
   1.596 +        __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
   1.597 +                                          fx + dx, fx);
   1.598 +        __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
   1.599 +                                          fy + dy, fy);
   1.600 +        __m128i wide_dx4  = _mm_set1_epi32(dx4);
   1.601 +        __m128i wide_dy4  = _mm_set1_epi32(dy4);
   1.602 +
   1.603 +        __m128i wide_maxX = _mm_set1_epi32(maxX);
   1.604 +        __m128i wide_maxY = _mm_set1_epi32(maxY);
   1.605 +
   1.606 +        while (count >= 4) {
   1.607 +            // SkClampMax(fx>>16,maxX)
   1.608 +            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
   1.609 +                                            _mm_setzero_si128());
   1.610 +            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
   1.611 +
   1.612 +            // SkClampMax(fy>>16,maxY)
   1.613 +            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
   1.614 +                                            _mm_setzero_si128());
   1.615 +            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
   1.616 +
   1.617 +            // final combination
   1.618 +            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
   1.619 +                                          wide_lo);
   1.620 +            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
   1.621 +
   1.622 +            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
   1.623 +            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
   1.624 +
   1.625 +            fx += dx4;
   1.626 +            fy += dy4;
   1.627 +            xy += 4;
   1.628 +            count -= 4;
   1.629 +        } // while count >= 4
   1.630 +    } // if count >= 4
   1.631 +
   1.632 +    while (count-- > 0) {
   1.633 +        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
   1.634 +                              SkClampMax(fx >> 16, maxX);
   1.635 +        fx += dx;
   1.636 +        fy += dy;
   1.637 +    }
   1.638 +}
   1.639 +
   1.640 +/*  SSE version of S32_D16_filter_DX_SSE2
   1.641 + *  Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
   1.642 + *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
   1.643 + */
   1.644 +void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
   1.645 +                                   const uint32_t* xy,
   1.646 +                                   int count, uint16_t* colors) {
   1.647 +    SkASSERT(count > 0 && colors != NULL);
   1.648 +    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
   1.649 +    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
   1.650 +    SkASSERT(s.fBitmap->isOpaque());
   1.651 +
   1.652 +    SkPMColor dstColor;
   1.653 +    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
   1.654 +    size_t rb = s.fBitmap->rowBytes();
   1.655 +    uint32_t XY = *xy++;
   1.656 +    unsigned y0 = XY >> 14;
   1.657 +    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
   1.658 +    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
   1.659 +    unsigned subY = y0 & 0xF;
   1.660 +
   1.661 +    // ( 0,  0,  0,  0,  0,  0,  0, 16)
   1.662 +    __m128i sixteen = _mm_cvtsi32_si128(16);
   1.663 +
   1.664 +    // ( 0,  0,  0,  0, 16, 16, 16, 16)
   1.665 +    sixteen = _mm_shufflelo_epi16(sixteen, 0);
   1.666 +
   1.667 +    // ( 0,  0,  0,  0,  0,  0,  0,  y)
   1.668 +    __m128i allY = _mm_cvtsi32_si128(subY);
   1.669 +
   1.670 +    // ( 0,  0,  0,  0,  y,  y,  y,  y)
   1.671 +    allY = _mm_shufflelo_epi16(allY, 0);
   1.672 +
   1.673 +    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
   1.674 +    __m128i negY = _mm_sub_epi16(sixteen, allY);
   1.675 +
   1.676 +    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
   1.677 +    allY = _mm_unpacklo_epi64(allY, negY);
   1.678 +
   1.679 +    // (16, 16, 16, 16, 16, 16, 16, 16 )
   1.680 +    sixteen = _mm_shuffle_epi32(sixteen, 0);
   1.681 +
   1.682 +    // ( 0,  0,  0,  0,  0,  0,  0,  0)
   1.683 +    __m128i zero = _mm_setzero_si128();
   1.684 +
   1.685 +    do {
   1.686 +        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
   1.687 +        unsigned x0 = XX >> 18;
   1.688 +        unsigned x1 = XX & 0x3FFF;
   1.689 +
   1.690 +        // (0, 0, 0, 0, 0, 0, 0, x)
   1.691 +        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
   1.692 +
   1.693 +        // (0, 0, 0, 0, x, x, x, x)
   1.694 +        allX = _mm_shufflelo_epi16(allX, 0);
   1.695 +
   1.696 +        // (x, x, x, x, x, x, x, x)
   1.697 +        allX = _mm_shuffle_epi32(allX, 0);
   1.698 +
   1.699 +        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
   1.700 +        __m128i negX = _mm_sub_epi16(sixteen, allX);
   1.701 +
   1.702 +        // Load 4 samples (pixels).
   1.703 +        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
   1.704 +        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
   1.705 +        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
   1.706 +        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
   1.707 +
   1.708 +        // (0, 0, a00, a10)
   1.709 +        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
   1.710 +
   1.711 +        // Expand to 16 bits per component.
   1.712 +        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
   1.713 +
   1.714 +        // ((a00 * (16-y)), (a10 * y)).
   1.715 +        a00a10 = _mm_mullo_epi16(a00a10, allY);
   1.716 +
   1.717 +        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
   1.718 +        a00a10 = _mm_mullo_epi16(a00a10, negX);
   1.719 +
   1.720 +        // (0, 0, a01, a10)
   1.721 +        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
   1.722 +
   1.723 +        // Expand to 16 bits per component.
   1.724 +        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
   1.725 +
   1.726 +        // (a01 * (16-y)), (a11 * y)
   1.727 +        a01a11 = _mm_mullo_epi16(a01a11, allY);
   1.728 +
   1.729 +        // (a01 * (16-y) * x), (a11 * y * x)
   1.730 +        a01a11 = _mm_mullo_epi16(a01a11, allX);
   1.731 +
   1.732 +        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
   1.733 +        __m128i sum = _mm_add_epi16(a00a10, a01a11);
   1.734 +
   1.735 +        // (DC, a00*w00 + a01*w01)
   1.736 +        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
   1.737 +
   1.738 +        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
   1.739 +        sum = _mm_add_epi16(sum, shifted);
   1.740 +
   1.741 +        // Divide each 16 bit component by 256.
   1.742 +        sum = _mm_srli_epi16(sum, 8);
   1.743 +
   1.744 +        // Pack lower 4 16 bit values of sum into lower 4 bytes.
   1.745 +        sum = _mm_packus_epi16(sum, zero);
   1.746 +
   1.747 +        // Extract low int and store.
   1.748 +        dstColor = _mm_cvtsi128_si32(sum);
   1.749 +
   1.750 +        //*colors++ = SkPixel32ToPixel16(dstColor);
   1.751 +        // below is much faster than the above. It's tested for Android benchmark--Softweg
   1.752 +        __m128i _m_temp1 = _mm_set1_epi32(dstColor);
   1.753 +        __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
   1.754 +
   1.755 +        unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
   1.756 +        unsigned r = (r32 & ((1<<5) -1)) << 11;
   1.757 +
   1.758 +        _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
   1.759 +        unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
   1.760 +        unsigned g = (g32 & ((1<<6) -1)) << 5;
   1.761 +
   1.762 +        _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
   1.763 +        unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
   1.764 +        unsigned b = (b32 & ((1<<5) -1));
   1.765 +
   1.766 +        *colors++ = r | g | b;
   1.767 +
   1.768 +    } while (--count > 0);
   1.769 +}

mercurial