1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_opts_SSE2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,766 @@ 1.4 + 1.5 +/* 1.6 + * Copyright 2009 The Android Open Source Project 1.7 + * 1.8 + * Use of this source code is governed by a BSD-style license that can be 1.9 + * found in the LICENSE file. 1.10 + */ 1.11 + 1.12 + 1.13 +#include <emmintrin.h> 1.14 +#include "SkBitmapProcState_opts_SSE2.h" 1.15 +#include "SkPaint.h" 1.16 +#include "SkUtils.h" 1.17 + 1.18 +void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, 1.19 + const uint32_t* xy, 1.20 + int count, uint32_t* colors) { 1.21 + SkASSERT(count > 0 && colors != NULL); 1.22 + SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 1.23 + SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 1.24 + SkASSERT(s.fAlphaScale == 256); 1.25 + 1.26 + const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 1.27 + size_t rb = s.fBitmap->rowBytes(); 1.28 + uint32_t XY = *xy++; 1.29 + unsigned y0 = XY >> 14; 1.30 + const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 1.31 + const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 1.32 + unsigned subY = y0 & 0xF; 1.33 + 1.34 + // ( 0, 0, 0, 0, 0, 0, 0, 16) 1.35 + __m128i sixteen = _mm_cvtsi32_si128(16); 1.36 + 1.37 + // ( 0, 0, 0, 0, 16, 16, 16, 16) 1.38 + sixteen = _mm_shufflelo_epi16(sixteen, 0); 1.39 + 1.40 + // ( 0, 0, 0, 0, 0, 0, 0, y) 1.41 + __m128i allY = _mm_cvtsi32_si128(subY); 1.42 + 1.43 + // ( 0, 0, 0, 0, y, y, y, y) 1.44 + allY = _mm_shufflelo_epi16(allY, 0); 1.45 + 1.46 + // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 1.47 + __m128i negY = _mm_sub_epi16(sixteen, allY); 1.48 + 1.49 + // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 1.50 + allY = _mm_unpacklo_epi64(allY, negY); 1.51 + 1.52 + // (16, 16, 16, 16, 16, 16, 16, 16 ) 1.53 + sixteen = _mm_shuffle_epi32(sixteen, 0); 1.54 + 1.55 + // ( 0, 0, 0, 0, 0, 0, 0, 0) 1.56 + __m128i zero = _mm_setzero_si128(); 1.57 + do { 1.58 + uint32_t XX = *xy++; // x0:14 | 4 | x1:14 1.59 + unsigned x0 = XX >> 18; 1.60 + unsigned x1 = XX & 0x3FFF; 1.61 + 1.62 + // (0, 0, 0, 0, 0, 0, 0, x) 1.63 + __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 1.64 + 1.65 + // (0, 0, 0, 0, x, x, x, x) 1.66 + allX = _mm_shufflelo_epi16(allX, 0); 1.67 + 1.68 + // (x, x, x, x, x, x, x, x) 1.69 + allX = _mm_shuffle_epi32(allX, 0); 1.70 + 1.71 + // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 1.72 + __m128i negX = _mm_sub_epi16(sixteen, allX); 1.73 + 1.74 + // Load 4 samples (pixels). 1.75 + __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 1.76 + __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 1.77 + __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 1.78 + __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 1.79 + 1.80 + // (0, 0, a00, a10) 1.81 + __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 1.82 + 1.83 + // Expand to 16 bits per component. 1.84 + a00a10 = _mm_unpacklo_epi8(a00a10, zero); 1.85 + 1.86 + // ((a00 * (16-y)), (a10 * y)). 1.87 + a00a10 = _mm_mullo_epi16(a00a10, allY); 1.88 + 1.89 + // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 1.90 + a00a10 = _mm_mullo_epi16(a00a10, negX); 1.91 + 1.92 + // (0, 0, a01, a10) 1.93 + __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 1.94 + 1.95 + // Expand to 16 bits per component. 1.96 + a01a11 = _mm_unpacklo_epi8(a01a11, zero); 1.97 + 1.98 + // (a01 * (16-y)), (a11 * y) 1.99 + a01a11 = _mm_mullo_epi16(a01a11, allY); 1.100 + 1.101 + // (a01 * (16-y) * x), (a11 * y * x) 1.102 + a01a11 = _mm_mullo_epi16(a01a11, allX); 1.103 + 1.104 + // (a00*w00 + a01*w01, a10*w10 + a11*w11) 1.105 + __m128i sum = _mm_add_epi16(a00a10, a01a11); 1.106 + 1.107 + // (DC, a00*w00 + a01*w01) 1.108 + __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 1.109 + 1.110 + // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 1.111 + sum = _mm_add_epi16(sum, shifted); 1.112 + 1.113 + // Divide each 16 bit component by 256. 1.114 + sum = _mm_srli_epi16(sum, 8); 1.115 + 1.116 + // Pack lower 4 16 bit values of sum into lower 4 bytes. 1.117 + sum = _mm_packus_epi16(sum, zero); 1.118 + 1.119 + // Extract low int and store. 1.120 + *colors++ = _mm_cvtsi128_si32(sum); 1.121 + } while (--count > 0); 1.122 +} 1.123 + 1.124 +void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, 1.125 + const uint32_t* xy, 1.126 + int count, uint32_t* colors) { 1.127 + SkASSERT(count > 0 && colors != NULL); 1.128 + SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 1.129 + SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 1.130 + SkASSERT(s.fAlphaScale < 256); 1.131 + 1.132 + const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 1.133 + size_t rb = s.fBitmap->rowBytes(); 1.134 + uint32_t XY = *xy++; 1.135 + unsigned y0 = XY >> 14; 1.136 + const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 1.137 + const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 1.138 + unsigned subY = y0 & 0xF; 1.139 + 1.140 + // ( 0, 0, 0, 0, 0, 0, 0, 16) 1.141 + __m128i sixteen = _mm_cvtsi32_si128(16); 1.142 + 1.143 + // ( 0, 0, 0, 0, 16, 16, 16, 16) 1.144 + sixteen = _mm_shufflelo_epi16(sixteen, 0); 1.145 + 1.146 + // ( 0, 0, 0, 0, 0, 0, 0, y) 1.147 + __m128i allY = _mm_cvtsi32_si128(subY); 1.148 + 1.149 + // ( 0, 0, 0, 0, y, y, y, y) 1.150 + allY = _mm_shufflelo_epi16(allY, 0); 1.151 + 1.152 + // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 1.153 + __m128i negY = _mm_sub_epi16(sixteen, allY); 1.154 + 1.155 + // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 1.156 + allY = _mm_unpacklo_epi64(allY, negY); 1.157 + 1.158 + // (16, 16, 16, 16, 16, 16, 16, 16 ) 1.159 + sixteen = _mm_shuffle_epi32(sixteen, 0); 1.160 + 1.161 + // ( 0, 0, 0, 0, 0, 0, 0, 0) 1.162 + __m128i zero = _mm_setzero_si128(); 1.163 + 1.164 + // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha ) 1.165 + __m128i alpha = _mm_set1_epi16(s.fAlphaScale); 1.166 + 1.167 + do { 1.168 + uint32_t XX = *xy++; // x0:14 | 4 | x1:14 1.169 + unsigned x0 = XX >> 18; 1.170 + unsigned x1 = XX & 0x3FFF; 1.171 + 1.172 + // (0, 0, 0, 0, 0, 0, 0, x) 1.173 + __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 1.174 + 1.175 + // (0, 0, 0, 0, x, x, x, x) 1.176 + allX = _mm_shufflelo_epi16(allX, 0); 1.177 + 1.178 + // (x, x, x, x, x, x, x, x) 1.179 + allX = _mm_shuffle_epi32(allX, 0); 1.180 + 1.181 + // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 1.182 + __m128i negX = _mm_sub_epi16(sixteen, allX); 1.183 + 1.184 + // Load 4 samples (pixels). 1.185 + __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 1.186 + __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 1.187 + __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 1.188 + __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 1.189 + 1.190 + // (0, 0, a00, a10) 1.191 + __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 1.192 + 1.193 + // Expand to 16 bits per component. 1.194 + a00a10 = _mm_unpacklo_epi8(a00a10, zero); 1.195 + 1.196 + // ((a00 * (16-y)), (a10 * y)). 1.197 + a00a10 = _mm_mullo_epi16(a00a10, allY); 1.198 + 1.199 + // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 1.200 + a00a10 = _mm_mullo_epi16(a00a10, negX); 1.201 + 1.202 + // (0, 0, a01, a10) 1.203 + __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 1.204 + 1.205 + // Expand to 16 bits per component. 1.206 + a01a11 = _mm_unpacklo_epi8(a01a11, zero); 1.207 + 1.208 + // (a01 * (16-y)), (a11 * y) 1.209 + a01a11 = _mm_mullo_epi16(a01a11, allY); 1.210 + 1.211 + // (a01 * (16-y) * x), (a11 * y * x) 1.212 + a01a11 = _mm_mullo_epi16(a01a11, allX); 1.213 + 1.214 + // (a00*w00 + a01*w01, a10*w10 + a11*w11) 1.215 + __m128i sum = _mm_add_epi16(a00a10, a01a11); 1.216 + 1.217 + // (DC, a00*w00 + a01*w01) 1.218 + __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 1.219 + 1.220 + // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 1.221 + sum = _mm_add_epi16(sum, shifted); 1.222 + 1.223 + // Divide each 16 bit component by 256. 1.224 + sum = _mm_srli_epi16(sum, 8); 1.225 + 1.226 + // Multiply by alpha. 1.227 + sum = _mm_mullo_epi16(sum, alpha); 1.228 + 1.229 + // Divide each 16 bit component by 256. 1.230 + sum = _mm_srli_epi16(sum, 8); 1.231 + 1.232 + // Pack lower 4 16 bit values of sum into lower 4 bytes. 1.233 + sum = _mm_packus_epi16(sum, zero); 1.234 + 1.235 + // Extract low int and store. 1.236 + *colors++ = _mm_cvtsi128_si32(sum); 1.237 + } while (--count > 0); 1.238 +} 1.239 + 1.240 +static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max, 1.241 + SkFixed one) { 1.242 + unsigned i = SkClampMax(f >> 16, max); 1.243 + i = (i << 4) | ((f >> 12) & 0xF); 1.244 + return (i << 14) | SkClampMax((f + one) >> 16, max); 1.245 +} 1.246 + 1.247 +/* SSE version of ClampX_ClampY_filter_scale() 1.248 + * portable version is in core/SkBitmapProcState_matrix.h 1.249 + */ 1.250 +void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[], 1.251 + int count, int x, int y) { 1.252 + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 1.253 + SkMatrix::kScale_Mask)) == 0); 1.254 + SkASSERT(s.fInvKy == 0); 1.255 + 1.256 + const unsigned maxX = s.fBitmap->width() - 1; 1.257 + const SkFixed one = s.fFilterOneX; 1.258 + const SkFixed dx = s.fInvSx; 1.259 + SkFixed fx; 1.260 + 1.261 + SkPoint pt; 1.262 + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 1.263 + SkIntToScalar(y) + SK_ScalarHalf, &pt); 1.264 + const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 1.265 + const unsigned maxY = s.fBitmap->height() - 1; 1.266 + // compute our two Y values up front 1.267 + *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY); 1.268 + // now initialize fx 1.269 + fx = SkScalarToFixed(pt.fX) - (one >> 1); 1.270 + 1.271 + // test if we don't need to apply the tile proc 1.272 + if (dx > 0 && (unsigned)(fx >> 16) <= maxX && 1.273 + (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 1.274 + if (count >= 4) { 1.275 + // SSE version of decal_filter_scale 1.276 + while ((size_t(xy) & 0x0F) != 0) { 1.277 + SkASSERT((fx >> (16 + 14)) == 0); 1.278 + *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 1.279 + fx += dx; 1.280 + count--; 1.281 + } 1.282 + 1.283 + __m128i wide_1 = _mm_set1_epi32(1); 1.284 + __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 1.285 + __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 1.286 + fx + dx, fx); 1.287 + 1.288 + while (count >= 4) { 1.289 + __m128i wide_out; 1.290 + 1.291 + wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14); 1.292 + wide_out = _mm_or_si128(wide_out, _mm_add_epi32( 1.293 + _mm_srai_epi32(wide_fx, 16), wide_1)); 1.294 + 1.295 + _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out); 1.296 + 1.297 + xy += 4; 1.298 + fx += dx * 4; 1.299 + wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 1.300 + count -= 4; 1.301 + } // while count >= 4 1.302 + } // if count >= 4 1.303 + 1.304 + while (count-- > 0) { 1.305 + SkASSERT((fx >> (16 + 14)) == 0); 1.306 + *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 1.307 + fx += dx; 1.308 + } 1.309 + } else { 1.310 + // SSE2 only support 16bit interger max & min, so only process the case 1.311 + // maxX less than the max 16bit interger. Actually maxX is the bitmap's 1.312 + // height, there should be rare bitmap whose height will be greater 1.313 + // than max 16bit interger in the real world. 1.314 + if ((count >= 4) && (maxX <= 0xFFFF)) { 1.315 + while (((size_t)xy & 0x0F) != 0) { 1.316 + *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 1.317 + fx += dx; 1.318 + count--; 1.319 + } 1.320 + 1.321 + __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 1.322 + fx + dx, fx); 1.323 + __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 1.324 + __m128i wide_one = _mm_set1_epi32(one); 1.325 + __m128i wide_maxX = _mm_set1_epi32(maxX); 1.326 + __m128i wide_mask = _mm_set1_epi32(0xF); 1.327 + 1.328 + while (count >= 4) { 1.329 + __m128i wide_i; 1.330 + __m128i wide_lo; 1.331 + __m128i wide_fx1; 1.332 + 1.333 + // i = SkClampMax(f>>16,maxX) 1.334 + wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 1.335 + _mm_setzero_si128()); 1.336 + wide_i = _mm_min_epi16(wide_i, wide_maxX); 1.337 + 1.338 + // i<<4 | TILEX_LOW_BITS(fx) 1.339 + wide_lo = _mm_srli_epi32(wide_fx, 12); 1.340 + wide_lo = _mm_and_si128(wide_lo, wide_mask); 1.341 + wide_i = _mm_slli_epi32(wide_i, 4); 1.342 + wide_i = _mm_or_si128(wide_i, wide_lo); 1.343 + 1.344 + // i<<14 1.345 + wide_i = _mm_slli_epi32(wide_i, 14); 1.346 + 1.347 + // SkClampMax(((f+one))>>16,max) 1.348 + wide_fx1 = _mm_add_epi32(wide_fx, wide_one); 1.349 + wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16), 1.350 + _mm_setzero_si128()); 1.351 + wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX); 1.352 + 1.353 + // final combination 1.354 + wide_i = _mm_or_si128(wide_i, wide_fx1); 1.355 + _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 1.356 + 1.357 + wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 1.358 + fx += dx * 4; 1.359 + xy += 4; 1.360 + count -= 4; 1.361 + } // while count >= 4 1.362 + } // if count >= 4 1.363 + 1.364 + while (count-- > 0) { 1.365 + *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 1.366 + fx += dx; 1.367 + } 1.368 + } 1.369 +} 1.370 + 1.371 +/* SSE version of ClampX_ClampY_nofilter_scale() 1.372 + * portable version is in core/SkBitmapProcState_matrix.h 1.373 + */ 1.374 +void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s, 1.375 + uint32_t xy[], int count, int x, int y) { 1.376 + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 1.377 + SkMatrix::kScale_Mask)) == 0); 1.378 + 1.379 + // we store y, x, x, x, x, x 1.380 + const unsigned maxX = s.fBitmap->width() - 1; 1.381 + SkFixed fx; 1.382 + SkPoint pt; 1.383 + s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 1.384 + SkIntToScalar(y) + SK_ScalarHalf, &pt); 1.385 + fx = SkScalarToFixed(pt.fY); 1.386 + const unsigned maxY = s.fBitmap->height() - 1; 1.387 + *xy++ = SkClampMax(fx >> 16, maxY); 1.388 + fx = SkScalarToFixed(pt.fX); 1.389 + 1.390 + if (0 == maxX) { 1.391 + // all of the following X values must be 0 1.392 + memset(xy, 0, count * sizeof(uint16_t)); 1.393 + return; 1.394 + } 1.395 + 1.396 + const SkFixed dx = s.fInvSx; 1.397 + 1.398 + // test if we don't need to apply the tile proc 1.399 + if ((unsigned)(fx >> 16) <= maxX && 1.400 + (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 1.401 + // SSE version of decal_nofilter_scale 1.402 + if (count >= 8) { 1.403 + while (((size_t)xy & 0x0F) != 0) { 1.404 + *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 1.405 + fx += 2 * dx; 1.406 + count -= 2; 1.407 + } 1.408 + 1.409 + __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 1.410 + __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 1.411 + 1.412 + __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 1.413 + fx + dx, fx); 1.414 + __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 1.415 + 1.416 + while (count >= 8) { 1.417 + __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 1.418 + __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 1.419 + 1.420 + __m128i wide_result = _mm_packs_epi32(wide_out_low, 1.421 + wide_out_high); 1.422 + _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 1.423 + 1.424 + wide_low = _mm_add_epi32(wide_low, wide_dx8); 1.425 + wide_high = _mm_add_epi32(wide_high, wide_dx8); 1.426 + 1.427 + xy += 4; 1.428 + fx += dx * 8; 1.429 + count -= 8; 1.430 + } 1.431 + } // if count >= 8 1.432 + 1.433 + uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 1.434 + while (count-- > 0) { 1.435 + *xx++ = SkToU16(fx >> 16); 1.436 + fx += dx; 1.437 + } 1.438 + } else { 1.439 + // SSE2 only support 16bit interger max & min, so only process the case 1.440 + // maxX less than the max 16bit interger. Actually maxX is the bitmap's 1.441 + // height, there should be rare bitmap whose height will be greater 1.442 + // than max 16bit interger in the real world. 1.443 + if ((count >= 8) && (maxX <= 0xFFFF)) { 1.444 + while (((size_t)xy & 0x0F) != 0) { 1.445 + *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX), 1.446 + SkClampMax(fx >> 16, maxX)); 1.447 + fx += 2 * dx; 1.448 + count -= 2; 1.449 + } 1.450 + 1.451 + __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 1.452 + __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 1.453 + 1.454 + __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 1.455 + fx + dx, fx); 1.456 + __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 1.457 + __m128i wide_maxX = _mm_set1_epi32(maxX); 1.458 + 1.459 + while (count >= 8) { 1.460 + __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 1.461 + __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 1.462 + 1.463 + wide_out_low = _mm_max_epi16(wide_out_low, 1.464 + _mm_setzero_si128()); 1.465 + wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX); 1.466 + wide_out_high = _mm_max_epi16(wide_out_high, 1.467 + _mm_setzero_si128()); 1.468 + wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX); 1.469 + 1.470 + __m128i wide_result = _mm_packs_epi32(wide_out_low, 1.471 + wide_out_high); 1.472 + _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 1.473 + 1.474 + wide_low = _mm_add_epi32(wide_low, wide_dx8); 1.475 + wide_high = _mm_add_epi32(wide_high, wide_dx8); 1.476 + 1.477 + xy += 4; 1.478 + fx += dx * 8; 1.479 + count -= 8; 1.480 + } 1.481 + } // if count >= 8 1.482 + 1.483 + uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 1.484 + while (count-- > 0) { 1.485 + *xx++ = SkClampMax(fx >> 16, maxX); 1.486 + fx += dx; 1.487 + } 1.488 + } 1.489 +} 1.490 + 1.491 +/* SSE version of ClampX_ClampY_filter_affine() 1.492 + * portable version is in core/SkBitmapProcState_matrix.h 1.493 + */ 1.494 +void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s, 1.495 + uint32_t xy[], int count, int x, int y) { 1.496 + SkPoint srcPt; 1.497 + s.fInvProc(s.fInvMatrix, 1.498 + SkIntToScalar(x) + SK_ScalarHalf, 1.499 + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 1.500 + 1.501 + SkFixed oneX = s.fFilterOneX; 1.502 + SkFixed oneY = s.fFilterOneY; 1.503 + SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 1.504 + SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 1.505 + SkFixed dx = s.fInvSx; 1.506 + SkFixed dy = s.fInvKy; 1.507 + unsigned maxX = s.fBitmap->width() - 1; 1.508 + unsigned maxY = s.fBitmap->height() - 1; 1.509 + 1.510 + if (count >= 2 && (maxX <= 0xFFFF)) { 1.511 + SkFixed dx2 = dx + dx; 1.512 + SkFixed dy2 = dy + dy; 1.513 + 1.514 + __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy); 1.515 + __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2); 1.516 + __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY); 1.517 + __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY); 1.518 + __m128i wide_mask = _mm_set1_epi32(0xF); 1.519 + 1.520 + while (count >= 2) { 1.521 + // i = SkClampMax(f>>16,maxX) 1.522 + __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16), 1.523 + _mm_setzero_si128()); 1.524 + wide_i = _mm_min_epi16(wide_i, wide_max); 1.525 + 1.526 + // i<<4 | TILEX_LOW_BITS(f) 1.527 + __m128i wide_lo = _mm_srli_epi32(wide_f, 12); 1.528 + wide_lo = _mm_and_si128(wide_lo, wide_mask); 1.529 + wide_i = _mm_slli_epi32(wide_i, 4); 1.530 + wide_i = _mm_or_si128(wide_i, wide_lo); 1.531 + 1.532 + // i<<14 1.533 + wide_i = _mm_slli_epi32(wide_i, 14); 1.534 + 1.535 + // SkClampMax(((f+one))>>16,max) 1.536 + __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one); 1.537 + wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16), 1.538 + _mm_setzero_si128()); 1.539 + wide_f1 = _mm_min_epi16(wide_f1, wide_max); 1.540 + 1.541 + // final combination 1.542 + wide_i = _mm_or_si128(wide_i, wide_f1); 1.543 + _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i); 1.544 + 1.545 + wide_f = _mm_add_epi32(wide_f, wide_d2); 1.546 + 1.547 + fx += dx2; 1.548 + fy += dy2; 1.549 + xy += 4; 1.550 + count -= 2; 1.551 + } // while count >= 2 1.552 + } // if count >= 2 1.553 + 1.554 + while (count-- > 0) { 1.555 + *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY); 1.556 + fy += dy; 1.557 + *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX); 1.558 + fx += dx; 1.559 + } 1.560 +} 1.561 + 1.562 +/* SSE version of ClampX_ClampY_nofilter_affine() 1.563 + * portable version is in core/SkBitmapProcState_matrix.h 1.564 + */ 1.565 +void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s, 1.566 + uint32_t xy[], int count, int x, int y) { 1.567 + SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 1.568 + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 1.569 + SkMatrix::kScale_Mask | 1.570 + SkMatrix::kAffine_Mask)) == 0); 1.571 + 1.572 + SkPoint srcPt; 1.573 + s.fInvProc(s.fInvMatrix, 1.574 + SkIntToScalar(x) + SK_ScalarHalf, 1.575 + SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 1.576 + 1.577 + SkFixed fx = SkScalarToFixed(srcPt.fX); 1.578 + SkFixed fy = SkScalarToFixed(srcPt.fY); 1.579 + SkFixed dx = s.fInvSx; 1.580 + SkFixed dy = s.fInvKy; 1.581 + int maxX = s.fBitmap->width() - 1; 1.582 + int maxY = s.fBitmap->height() - 1; 1.583 + 1.584 + if (count >= 4 && (maxX <= 0xFFFF)) { 1.585 + while (((size_t)xy & 0x0F) != 0) { 1.586 + *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 1.587 + SkClampMax(fx >> 16, maxX); 1.588 + fx += dx; 1.589 + fy += dy; 1.590 + count--; 1.591 + } 1.592 + 1.593 + SkFixed dx4 = dx * 4; 1.594 + SkFixed dy4 = dy * 4; 1.595 + 1.596 + __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 1.597 + fx + dx, fx); 1.598 + __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2, 1.599 + fy + dy, fy); 1.600 + __m128i wide_dx4 = _mm_set1_epi32(dx4); 1.601 + __m128i wide_dy4 = _mm_set1_epi32(dy4); 1.602 + 1.603 + __m128i wide_maxX = _mm_set1_epi32(maxX); 1.604 + __m128i wide_maxY = _mm_set1_epi32(maxY); 1.605 + 1.606 + while (count >= 4) { 1.607 + // SkClampMax(fx>>16,maxX) 1.608 + __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 1.609 + _mm_setzero_si128()); 1.610 + wide_lo = _mm_min_epi16(wide_lo, wide_maxX); 1.611 + 1.612 + // SkClampMax(fy>>16,maxY) 1.613 + __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16), 1.614 + _mm_setzero_si128()); 1.615 + wide_hi = _mm_min_epi16(wide_hi, wide_maxY); 1.616 + 1.617 + // final combination 1.618 + __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16), 1.619 + wide_lo); 1.620 + _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 1.621 + 1.622 + wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 1.623 + wide_fy = _mm_add_epi32(wide_fy, wide_dy4); 1.624 + 1.625 + fx += dx4; 1.626 + fy += dy4; 1.627 + xy += 4; 1.628 + count -= 4; 1.629 + } // while count >= 4 1.630 + } // if count >= 4 1.631 + 1.632 + while (count-- > 0) { 1.633 + *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 1.634 + SkClampMax(fx >> 16, maxX); 1.635 + fx += dx; 1.636 + fy += dy; 1.637 + } 1.638 +} 1.639 + 1.640 +/* SSE version of S32_D16_filter_DX_SSE2 1.641 + * Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp 1.642 + * It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16 1.643 + */ 1.644 +void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s, 1.645 + const uint32_t* xy, 1.646 + int count, uint16_t* colors) { 1.647 + SkASSERT(count > 0 && colors != NULL); 1.648 + SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 1.649 + SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 1.650 + SkASSERT(s.fBitmap->isOpaque()); 1.651 + 1.652 + SkPMColor dstColor; 1.653 + const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 1.654 + size_t rb = s.fBitmap->rowBytes(); 1.655 + uint32_t XY = *xy++; 1.656 + unsigned y0 = XY >> 14; 1.657 + const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 1.658 + const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 1.659 + unsigned subY = y0 & 0xF; 1.660 + 1.661 + // ( 0, 0, 0, 0, 0, 0, 0, 16) 1.662 + __m128i sixteen = _mm_cvtsi32_si128(16); 1.663 + 1.664 + // ( 0, 0, 0, 0, 16, 16, 16, 16) 1.665 + sixteen = _mm_shufflelo_epi16(sixteen, 0); 1.666 + 1.667 + // ( 0, 0, 0, 0, 0, 0, 0, y) 1.668 + __m128i allY = _mm_cvtsi32_si128(subY); 1.669 + 1.670 + // ( 0, 0, 0, 0, y, y, y, y) 1.671 + allY = _mm_shufflelo_epi16(allY, 0); 1.672 + 1.673 + // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 1.674 + __m128i negY = _mm_sub_epi16(sixteen, allY); 1.675 + 1.676 + // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 1.677 + allY = _mm_unpacklo_epi64(allY, negY); 1.678 + 1.679 + // (16, 16, 16, 16, 16, 16, 16, 16 ) 1.680 + sixteen = _mm_shuffle_epi32(sixteen, 0); 1.681 + 1.682 + // ( 0, 0, 0, 0, 0, 0, 0, 0) 1.683 + __m128i zero = _mm_setzero_si128(); 1.684 + 1.685 + do { 1.686 + uint32_t XX = *xy++; // x0:14 | 4 | x1:14 1.687 + unsigned x0 = XX >> 18; 1.688 + unsigned x1 = XX & 0x3FFF; 1.689 + 1.690 + // (0, 0, 0, 0, 0, 0, 0, x) 1.691 + __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 1.692 + 1.693 + // (0, 0, 0, 0, x, x, x, x) 1.694 + allX = _mm_shufflelo_epi16(allX, 0); 1.695 + 1.696 + // (x, x, x, x, x, x, x, x) 1.697 + allX = _mm_shuffle_epi32(allX, 0); 1.698 + 1.699 + // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 1.700 + __m128i negX = _mm_sub_epi16(sixteen, allX); 1.701 + 1.702 + // Load 4 samples (pixels). 1.703 + __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 1.704 + __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 1.705 + __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 1.706 + __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 1.707 + 1.708 + // (0, 0, a00, a10) 1.709 + __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 1.710 + 1.711 + // Expand to 16 bits per component. 1.712 + a00a10 = _mm_unpacklo_epi8(a00a10, zero); 1.713 + 1.714 + // ((a00 * (16-y)), (a10 * y)). 1.715 + a00a10 = _mm_mullo_epi16(a00a10, allY); 1.716 + 1.717 + // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 1.718 + a00a10 = _mm_mullo_epi16(a00a10, negX); 1.719 + 1.720 + // (0, 0, a01, a10) 1.721 + __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 1.722 + 1.723 + // Expand to 16 bits per component. 1.724 + a01a11 = _mm_unpacklo_epi8(a01a11, zero); 1.725 + 1.726 + // (a01 * (16-y)), (a11 * y) 1.727 + a01a11 = _mm_mullo_epi16(a01a11, allY); 1.728 + 1.729 + // (a01 * (16-y) * x), (a11 * y * x) 1.730 + a01a11 = _mm_mullo_epi16(a01a11, allX); 1.731 + 1.732 + // (a00*w00 + a01*w01, a10*w10 + a11*w11) 1.733 + __m128i sum = _mm_add_epi16(a00a10, a01a11); 1.734 + 1.735 + // (DC, a00*w00 + a01*w01) 1.736 + __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 1.737 + 1.738 + // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 1.739 + sum = _mm_add_epi16(sum, shifted); 1.740 + 1.741 + // Divide each 16 bit component by 256. 1.742 + sum = _mm_srli_epi16(sum, 8); 1.743 + 1.744 + // Pack lower 4 16 bit values of sum into lower 4 bytes. 1.745 + sum = _mm_packus_epi16(sum, zero); 1.746 + 1.747 + // Extract low int and store. 1.748 + dstColor = _mm_cvtsi128_si32(sum); 1.749 + 1.750 + //*colors++ = SkPixel32ToPixel16(dstColor); 1.751 + // below is much faster than the above. It's tested for Android benchmark--Softweg 1.752 + __m128i _m_temp1 = _mm_set1_epi32(dstColor); 1.753 + __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3); 1.754 + 1.755 + unsigned int r32 = _mm_cvtsi128_si32(_m_temp2); 1.756 + unsigned r = (r32 & ((1<<5) -1)) << 11; 1.757 + 1.758 + _m_temp2 = _mm_srli_epi32(_m_temp2, 7); 1.759 + unsigned int g32 = _mm_cvtsi128_si32(_m_temp2); 1.760 + unsigned g = (g32 & ((1<<6) -1)) << 5; 1.761 + 1.762 + _m_temp2 = _mm_srli_epi32(_m_temp2, 9); 1.763 + unsigned int b32 = _mm_cvtsi128_si32(_m_temp2); 1.764 + unsigned b = (b32 & ((1<<5) -1)); 1.765 + 1.766 + *colors++ = r | g | b; 1.767 + 1.768 + } while (--count > 0); 1.769 +}