gfx/skia/trunk/src/opts/SkBitmapProcState_opts_SSE2.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1
michael@0 2 /*
michael@0 3 * Copyright 2009 The Android Open Source Project
michael@0 4 *
michael@0 5 * Use of this source code is governed by a BSD-style license that can be
michael@0 6 * found in the LICENSE file.
michael@0 7 */
michael@0 8
michael@0 9
michael@0 10 #include <emmintrin.h>
michael@0 11 #include "SkBitmapProcState_opts_SSE2.h"
michael@0 12 #include "SkPaint.h"
michael@0 13 #include "SkUtils.h"
michael@0 14
michael@0 15 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
michael@0 16 const uint32_t* xy,
michael@0 17 int count, uint32_t* colors) {
michael@0 18 SkASSERT(count > 0 && colors != NULL);
michael@0 19 SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
michael@0 20 SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
michael@0 21 SkASSERT(s.fAlphaScale == 256);
michael@0 22
michael@0 23 const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
michael@0 24 size_t rb = s.fBitmap->rowBytes();
michael@0 25 uint32_t XY = *xy++;
michael@0 26 unsigned y0 = XY >> 14;
michael@0 27 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
michael@0 28 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
michael@0 29 unsigned subY = y0 & 0xF;
michael@0 30
michael@0 31 // ( 0, 0, 0, 0, 0, 0, 0, 16)
michael@0 32 __m128i sixteen = _mm_cvtsi32_si128(16);
michael@0 33
michael@0 34 // ( 0, 0, 0, 0, 16, 16, 16, 16)
michael@0 35 sixteen = _mm_shufflelo_epi16(sixteen, 0);
michael@0 36
michael@0 37 // ( 0, 0, 0, 0, 0, 0, 0, y)
michael@0 38 __m128i allY = _mm_cvtsi32_si128(subY);
michael@0 39
michael@0 40 // ( 0, 0, 0, 0, y, y, y, y)
michael@0 41 allY = _mm_shufflelo_epi16(allY, 0);
michael@0 42
michael@0 43 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
michael@0 44 __m128i negY = _mm_sub_epi16(sixteen, allY);
michael@0 45
michael@0 46 // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
michael@0 47 allY = _mm_unpacklo_epi64(allY, negY);
michael@0 48
michael@0 49 // (16, 16, 16, 16, 16, 16, 16, 16 )
michael@0 50 sixteen = _mm_shuffle_epi32(sixteen, 0);
michael@0 51
michael@0 52 // ( 0, 0, 0, 0, 0, 0, 0, 0)
michael@0 53 __m128i zero = _mm_setzero_si128();
michael@0 54 do {
michael@0 55 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
michael@0 56 unsigned x0 = XX >> 18;
michael@0 57 unsigned x1 = XX & 0x3FFF;
michael@0 58
michael@0 59 // (0, 0, 0, 0, 0, 0, 0, x)
michael@0 60 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
michael@0 61
michael@0 62 // (0, 0, 0, 0, x, x, x, x)
michael@0 63 allX = _mm_shufflelo_epi16(allX, 0);
michael@0 64
michael@0 65 // (x, x, x, x, x, x, x, x)
michael@0 66 allX = _mm_shuffle_epi32(allX, 0);
michael@0 67
michael@0 68 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
michael@0 69 __m128i negX = _mm_sub_epi16(sixteen, allX);
michael@0 70
michael@0 71 // Load 4 samples (pixels).
michael@0 72 __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
michael@0 73 __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
michael@0 74 __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
michael@0 75 __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
michael@0 76
michael@0 77 // (0, 0, a00, a10)
michael@0 78 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
michael@0 79
michael@0 80 // Expand to 16 bits per component.
michael@0 81 a00a10 = _mm_unpacklo_epi8(a00a10, zero);
michael@0 82
michael@0 83 // ((a00 * (16-y)), (a10 * y)).
michael@0 84 a00a10 = _mm_mullo_epi16(a00a10, allY);
michael@0 85
michael@0 86 // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
michael@0 87 a00a10 = _mm_mullo_epi16(a00a10, negX);
michael@0 88
michael@0 89 // (0, 0, a01, a10)
michael@0 90 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
michael@0 91
michael@0 92 // Expand to 16 bits per component.
michael@0 93 a01a11 = _mm_unpacklo_epi8(a01a11, zero);
michael@0 94
michael@0 95 // (a01 * (16-y)), (a11 * y)
michael@0 96 a01a11 = _mm_mullo_epi16(a01a11, allY);
michael@0 97
michael@0 98 // (a01 * (16-y) * x), (a11 * y * x)
michael@0 99 a01a11 = _mm_mullo_epi16(a01a11, allX);
michael@0 100
michael@0 101 // (a00*w00 + a01*w01, a10*w10 + a11*w11)
michael@0 102 __m128i sum = _mm_add_epi16(a00a10, a01a11);
michael@0 103
michael@0 104 // (DC, a00*w00 + a01*w01)
michael@0 105 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
michael@0 106
michael@0 107 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
michael@0 108 sum = _mm_add_epi16(sum, shifted);
michael@0 109
michael@0 110 // Divide each 16 bit component by 256.
michael@0 111 sum = _mm_srli_epi16(sum, 8);
michael@0 112
michael@0 113 // Pack lower 4 16 bit values of sum into lower 4 bytes.
michael@0 114 sum = _mm_packus_epi16(sum, zero);
michael@0 115
michael@0 116 // Extract low int and store.
michael@0 117 *colors++ = _mm_cvtsi128_si32(sum);
michael@0 118 } while (--count > 0);
michael@0 119 }
michael@0 120
michael@0 121 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
michael@0 122 const uint32_t* xy,
michael@0 123 int count, uint32_t* colors) {
michael@0 124 SkASSERT(count > 0 && colors != NULL);
michael@0 125 SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
michael@0 126 SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
michael@0 127 SkASSERT(s.fAlphaScale < 256);
michael@0 128
michael@0 129 const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
michael@0 130 size_t rb = s.fBitmap->rowBytes();
michael@0 131 uint32_t XY = *xy++;
michael@0 132 unsigned y0 = XY >> 14;
michael@0 133 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
michael@0 134 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
michael@0 135 unsigned subY = y0 & 0xF;
michael@0 136
michael@0 137 // ( 0, 0, 0, 0, 0, 0, 0, 16)
michael@0 138 __m128i sixteen = _mm_cvtsi32_si128(16);
michael@0 139
michael@0 140 // ( 0, 0, 0, 0, 16, 16, 16, 16)
michael@0 141 sixteen = _mm_shufflelo_epi16(sixteen, 0);
michael@0 142
michael@0 143 // ( 0, 0, 0, 0, 0, 0, 0, y)
michael@0 144 __m128i allY = _mm_cvtsi32_si128(subY);
michael@0 145
michael@0 146 // ( 0, 0, 0, 0, y, y, y, y)
michael@0 147 allY = _mm_shufflelo_epi16(allY, 0);
michael@0 148
michael@0 149 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
michael@0 150 __m128i negY = _mm_sub_epi16(sixteen, allY);
michael@0 151
michael@0 152 // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
michael@0 153 allY = _mm_unpacklo_epi64(allY, negY);
michael@0 154
michael@0 155 // (16, 16, 16, 16, 16, 16, 16, 16 )
michael@0 156 sixteen = _mm_shuffle_epi32(sixteen, 0);
michael@0 157
michael@0 158 // ( 0, 0, 0, 0, 0, 0, 0, 0)
michael@0 159 __m128i zero = _mm_setzero_si128();
michael@0 160
michael@0 161 // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
michael@0 162 __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
michael@0 163
michael@0 164 do {
michael@0 165 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
michael@0 166 unsigned x0 = XX >> 18;
michael@0 167 unsigned x1 = XX & 0x3FFF;
michael@0 168
michael@0 169 // (0, 0, 0, 0, 0, 0, 0, x)
michael@0 170 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
michael@0 171
michael@0 172 // (0, 0, 0, 0, x, x, x, x)
michael@0 173 allX = _mm_shufflelo_epi16(allX, 0);
michael@0 174
michael@0 175 // (x, x, x, x, x, x, x, x)
michael@0 176 allX = _mm_shuffle_epi32(allX, 0);
michael@0 177
michael@0 178 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
michael@0 179 __m128i negX = _mm_sub_epi16(sixteen, allX);
michael@0 180
michael@0 181 // Load 4 samples (pixels).
michael@0 182 __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
michael@0 183 __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
michael@0 184 __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
michael@0 185 __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
michael@0 186
michael@0 187 // (0, 0, a00, a10)
michael@0 188 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
michael@0 189
michael@0 190 // Expand to 16 bits per component.
michael@0 191 a00a10 = _mm_unpacklo_epi8(a00a10, zero);
michael@0 192
michael@0 193 // ((a00 * (16-y)), (a10 * y)).
michael@0 194 a00a10 = _mm_mullo_epi16(a00a10, allY);
michael@0 195
michael@0 196 // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
michael@0 197 a00a10 = _mm_mullo_epi16(a00a10, negX);
michael@0 198
michael@0 199 // (0, 0, a01, a10)
michael@0 200 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
michael@0 201
michael@0 202 // Expand to 16 bits per component.
michael@0 203 a01a11 = _mm_unpacklo_epi8(a01a11, zero);
michael@0 204
michael@0 205 // (a01 * (16-y)), (a11 * y)
michael@0 206 a01a11 = _mm_mullo_epi16(a01a11, allY);
michael@0 207
michael@0 208 // (a01 * (16-y) * x), (a11 * y * x)
michael@0 209 a01a11 = _mm_mullo_epi16(a01a11, allX);
michael@0 210
michael@0 211 // (a00*w00 + a01*w01, a10*w10 + a11*w11)
michael@0 212 __m128i sum = _mm_add_epi16(a00a10, a01a11);
michael@0 213
michael@0 214 // (DC, a00*w00 + a01*w01)
michael@0 215 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
michael@0 216
michael@0 217 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
michael@0 218 sum = _mm_add_epi16(sum, shifted);
michael@0 219
michael@0 220 // Divide each 16 bit component by 256.
michael@0 221 sum = _mm_srli_epi16(sum, 8);
michael@0 222
michael@0 223 // Multiply by alpha.
michael@0 224 sum = _mm_mullo_epi16(sum, alpha);
michael@0 225
michael@0 226 // Divide each 16 bit component by 256.
michael@0 227 sum = _mm_srli_epi16(sum, 8);
michael@0 228
michael@0 229 // Pack lower 4 16 bit values of sum into lower 4 bytes.
michael@0 230 sum = _mm_packus_epi16(sum, zero);
michael@0 231
michael@0 232 // Extract low int and store.
michael@0 233 *colors++ = _mm_cvtsi128_si32(sum);
michael@0 234 } while (--count > 0);
michael@0 235 }
michael@0 236
michael@0 237 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
michael@0 238 SkFixed one) {
michael@0 239 unsigned i = SkClampMax(f >> 16, max);
michael@0 240 i = (i << 4) | ((f >> 12) & 0xF);
michael@0 241 return (i << 14) | SkClampMax((f + one) >> 16, max);
michael@0 242 }
michael@0 243
michael@0 244 /* SSE version of ClampX_ClampY_filter_scale()
michael@0 245 * portable version is in core/SkBitmapProcState_matrix.h
michael@0 246 */
michael@0 247 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
michael@0 248 int count, int x, int y) {
michael@0 249 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
michael@0 250 SkMatrix::kScale_Mask)) == 0);
michael@0 251 SkASSERT(s.fInvKy == 0);
michael@0 252
michael@0 253 const unsigned maxX = s.fBitmap->width() - 1;
michael@0 254 const SkFixed one = s.fFilterOneX;
michael@0 255 const SkFixed dx = s.fInvSx;
michael@0 256 SkFixed fx;
michael@0 257
michael@0 258 SkPoint pt;
michael@0 259 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
michael@0 260 SkIntToScalar(y) + SK_ScalarHalf, &pt);
michael@0 261 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
michael@0 262 const unsigned maxY = s.fBitmap->height() - 1;
michael@0 263 // compute our two Y values up front
michael@0 264 *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
michael@0 265 // now initialize fx
michael@0 266 fx = SkScalarToFixed(pt.fX) - (one >> 1);
michael@0 267
michael@0 268 // test if we don't need to apply the tile proc
michael@0 269 if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
michael@0 270 (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
michael@0 271 if (count >= 4) {
michael@0 272 // SSE version of decal_filter_scale
michael@0 273 while ((size_t(xy) & 0x0F) != 0) {
michael@0 274 SkASSERT((fx >> (16 + 14)) == 0);
michael@0 275 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
michael@0 276 fx += dx;
michael@0 277 count--;
michael@0 278 }
michael@0 279
michael@0 280 __m128i wide_1 = _mm_set1_epi32(1);
michael@0 281 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
michael@0 282 __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
michael@0 283 fx + dx, fx);
michael@0 284
michael@0 285 while (count >= 4) {
michael@0 286 __m128i wide_out;
michael@0 287
michael@0 288 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
michael@0 289 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
michael@0 290 _mm_srai_epi32(wide_fx, 16), wide_1));
michael@0 291
michael@0 292 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
michael@0 293
michael@0 294 xy += 4;
michael@0 295 fx += dx * 4;
michael@0 296 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
michael@0 297 count -= 4;
michael@0 298 } // while count >= 4
michael@0 299 } // if count >= 4
michael@0 300
michael@0 301 while (count-- > 0) {
michael@0 302 SkASSERT((fx >> (16 + 14)) == 0);
michael@0 303 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
michael@0 304 fx += dx;
michael@0 305 }
michael@0 306 } else {
michael@0 307 // SSE2 only support 16bit interger max & min, so only process the case
michael@0 308 // maxX less than the max 16bit interger. Actually maxX is the bitmap's
michael@0 309 // height, there should be rare bitmap whose height will be greater
michael@0 310 // than max 16bit interger in the real world.
michael@0 311 if ((count >= 4) && (maxX <= 0xFFFF)) {
michael@0 312 while (((size_t)xy & 0x0F) != 0) {
michael@0 313 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
michael@0 314 fx += dx;
michael@0 315 count--;
michael@0 316 }
michael@0 317
michael@0 318 __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
michael@0 319 fx + dx, fx);
michael@0 320 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
michael@0 321 __m128i wide_one = _mm_set1_epi32(one);
michael@0 322 __m128i wide_maxX = _mm_set1_epi32(maxX);
michael@0 323 __m128i wide_mask = _mm_set1_epi32(0xF);
michael@0 324
michael@0 325 while (count >= 4) {
michael@0 326 __m128i wide_i;
michael@0 327 __m128i wide_lo;
michael@0 328 __m128i wide_fx1;
michael@0 329
michael@0 330 // i = SkClampMax(f>>16,maxX)
michael@0 331 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
michael@0 332 _mm_setzero_si128());
michael@0 333 wide_i = _mm_min_epi16(wide_i, wide_maxX);
michael@0 334
michael@0 335 // i<<4 | TILEX_LOW_BITS(fx)
michael@0 336 wide_lo = _mm_srli_epi32(wide_fx, 12);
michael@0 337 wide_lo = _mm_and_si128(wide_lo, wide_mask);
michael@0 338 wide_i = _mm_slli_epi32(wide_i, 4);
michael@0 339 wide_i = _mm_or_si128(wide_i, wide_lo);
michael@0 340
michael@0 341 // i<<14
michael@0 342 wide_i = _mm_slli_epi32(wide_i, 14);
michael@0 343
michael@0 344 // SkClampMax(((f+one))>>16,max)
michael@0 345 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
michael@0 346 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
michael@0 347 _mm_setzero_si128());
michael@0 348 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
michael@0 349
michael@0 350 // final combination
michael@0 351 wide_i = _mm_or_si128(wide_i, wide_fx1);
michael@0 352 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
michael@0 353
michael@0 354 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
michael@0 355 fx += dx * 4;
michael@0 356 xy += 4;
michael@0 357 count -= 4;
michael@0 358 } // while count >= 4
michael@0 359 } // if count >= 4
michael@0 360
michael@0 361 while (count-- > 0) {
michael@0 362 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
michael@0 363 fx += dx;
michael@0 364 }
michael@0 365 }
michael@0 366 }
michael@0 367
michael@0 368 /* SSE version of ClampX_ClampY_nofilter_scale()
michael@0 369 * portable version is in core/SkBitmapProcState_matrix.h
michael@0 370 */
michael@0 371 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
michael@0 372 uint32_t xy[], int count, int x, int y) {
michael@0 373 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
michael@0 374 SkMatrix::kScale_Mask)) == 0);
michael@0 375
michael@0 376 // we store y, x, x, x, x, x
michael@0 377 const unsigned maxX = s.fBitmap->width() - 1;
michael@0 378 SkFixed fx;
michael@0 379 SkPoint pt;
michael@0 380 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
michael@0 381 SkIntToScalar(y) + SK_ScalarHalf, &pt);
michael@0 382 fx = SkScalarToFixed(pt.fY);
michael@0 383 const unsigned maxY = s.fBitmap->height() - 1;
michael@0 384 *xy++ = SkClampMax(fx >> 16, maxY);
michael@0 385 fx = SkScalarToFixed(pt.fX);
michael@0 386
michael@0 387 if (0 == maxX) {
michael@0 388 // all of the following X values must be 0
michael@0 389 memset(xy, 0, count * sizeof(uint16_t));
michael@0 390 return;
michael@0 391 }
michael@0 392
michael@0 393 const SkFixed dx = s.fInvSx;
michael@0 394
michael@0 395 // test if we don't need to apply the tile proc
michael@0 396 if ((unsigned)(fx >> 16) <= maxX &&
michael@0 397 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
michael@0 398 // SSE version of decal_nofilter_scale
michael@0 399 if (count >= 8) {
michael@0 400 while (((size_t)xy & 0x0F) != 0) {
michael@0 401 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
michael@0 402 fx += 2 * dx;
michael@0 403 count -= 2;
michael@0 404 }
michael@0 405
michael@0 406 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
michael@0 407 __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
michael@0 408
michael@0 409 __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
michael@0 410 fx + dx, fx);
michael@0 411 __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
michael@0 412
michael@0 413 while (count >= 8) {
michael@0 414 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
michael@0 415 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
michael@0 416
michael@0 417 __m128i wide_result = _mm_packs_epi32(wide_out_low,
michael@0 418 wide_out_high);
michael@0 419 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
michael@0 420
michael@0 421 wide_low = _mm_add_epi32(wide_low, wide_dx8);
michael@0 422 wide_high = _mm_add_epi32(wide_high, wide_dx8);
michael@0 423
michael@0 424 xy += 4;
michael@0 425 fx += dx * 8;
michael@0 426 count -= 8;
michael@0 427 }
michael@0 428 } // if count >= 8
michael@0 429
michael@0 430 uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
michael@0 431 while (count-- > 0) {
michael@0 432 *xx++ = SkToU16(fx >> 16);
michael@0 433 fx += dx;
michael@0 434 }
michael@0 435 } else {
michael@0 436 // SSE2 only support 16bit interger max & min, so only process the case
michael@0 437 // maxX less than the max 16bit interger. Actually maxX is the bitmap's
michael@0 438 // height, there should be rare bitmap whose height will be greater
michael@0 439 // than max 16bit interger in the real world.
michael@0 440 if ((count >= 8) && (maxX <= 0xFFFF)) {
michael@0 441 while (((size_t)xy & 0x0F) != 0) {
michael@0 442 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
michael@0 443 SkClampMax(fx >> 16, maxX));
michael@0 444 fx += 2 * dx;
michael@0 445 count -= 2;
michael@0 446 }
michael@0 447
michael@0 448 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
michael@0 449 __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
michael@0 450
michael@0 451 __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
michael@0 452 fx + dx, fx);
michael@0 453 __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
michael@0 454 __m128i wide_maxX = _mm_set1_epi32(maxX);
michael@0 455
michael@0 456 while (count >= 8) {
michael@0 457 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
michael@0 458 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
michael@0 459
michael@0 460 wide_out_low = _mm_max_epi16(wide_out_low,
michael@0 461 _mm_setzero_si128());
michael@0 462 wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
michael@0 463 wide_out_high = _mm_max_epi16(wide_out_high,
michael@0 464 _mm_setzero_si128());
michael@0 465 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
michael@0 466
michael@0 467 __m128i wide_result = _mm_packs_epi32(wide_out_low,
michael@0 468 wide_out_high);
michael@0 469 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
michael@0 470
michael@0 471 wide_low = _mm_add_epi32(wide_low, wide_dx8);
michael@0 472 wide_high = _mm_add_epi32(wide_high, wide_dx8);
michael@0 473
michael@0 474 xy += 4;
michael@0 475 fx += dx * 8;
michael@0 476 count -= 8;
michael@0 477 }
michael@0 478 } // if count >= 8
michael@0 479
michael@0 480 uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
michael@0 481 while (count-- > 0) {
michael@0 482 *xx++ = SkClampMax(fx >> 16, maxX);
michael@0 483 fx += dx;
michael@0 484 }
michael@0 485 }
michael@0 486 }
michael@0 487
michael@0 488 /* SSE version of ClampX_ClampY_filter_affine()
michael@0 489 * portable version is in core/SkBitmapProcState_matrix.h
michael@0 490 */
michael@0 491 void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
michael@0 492 uint32_t xy[], int count, int x, int y) {
michael@0 493 SkPoint srcPt;
michael@0 494 s.fInvProc(s.fInvMatrix,
michael@0 495 SkIntToScalar(x) + SK_ScalarHalf,
michael@0 496 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
michael@0 497
michael@0 498 SkFixed oneX = s.fFilterOneX;
michael@0 499 SkFixed oneY = s.fFilterOneY;
michael@0 500 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
michael@0 501 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
michael@0 502 SkFixed dx = s.fInvSx;
michael@0 503 SkFixed dy = s.fInvKy;
michael@0 504 unsigned maxX = s.fBitmap->width() - 1;
michael@0 505 unsigned maxY = s.fBitmap->height() - 1;
michael@0 506
michael@0 507 if (count >= 2 && (maxX <= 0xFFFF)) {
michael@0 508 SkFixed dx2 = dx + dx;
michael@0 509 SkFixed dy2 = dy + dy;
michael@0 510
michael@0 511 __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
michael@0 512 __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2);
michael@0 513 __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY);
michael@0 514 __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
michael@0 515 __m128i wide_mask = _mm_set1_epi32(0xF);
michael@0 516
michael@0 517 while (count >= 2) {
michael@0 518 // i = SkClampMax(f>>16,maxX)
michael@0 519 __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
michael@0 520 _mm_setzero_si128());
michael@0 521 wide_i = _mm_min_epi16(wide_i, wide_max);
michael@0 522
michael@0 523 // i<<4 | TILEX_LOW_BITS(f)
michael@0 524 __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
michael@0 525 wide_lo = _mm_and_si128(wide_lo, wide_mask);
michael@0 526 wide_i = _mm_slli_epi32(wide_i, 4);
michael@0 527 wide_i = _mm_or_si128(wide_i, wide_lo);
michael@0 528
michael@0 529 // i<<14
michael@0 530 wide_i = _mm_slli_epi32(wide_i, 14);
michael@0 531
michael@0 532 // SkClampMax(((f+one))>>16,max)
michael@0 533 __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
michael@0 534 wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
michael@0 535 _mm_setzero_si128());
michael@0 536 wide_f1 = _mm_min_epi16(wide_f1, wide_max);
michael@0 537
michael@0 538 // final combination
michael@0 539 wide_i = _mm_or_si128(wide_i, wide_f1);
michael@0 540 _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
michael@0 541
michael@0 542 wide_f = _mm_add_epi32(wide_f, wide_d2);
michael@0 543
michael@0 544 fx += dx2;
michael@0 545 fy += dy2;
michael@0 546 xy += 4;
michael@0 547 count -= 2;
michael@0 548 } // while count >= 2
michael@0 549 } // if count >= 2
michael@0 550
michael@0 551 while (count-- > 0) {
michael@0 552 *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
michael@0 553 fy += dy;
michael@0 554 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
michael@0 555 fx += dx;
michael@0 556 }
michael@0 557 }
michael@0 558
michael@0 559 /* SSE version of ClampX_ClampY_nofilter_affine()
michael@0 560 * portable version is in core/SkBitmapProcState_matrix.h
michael@0 561 */
michael@0 562 void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
michael@0 563 uint32_t xy[], int count, int x, int y) {
michael@0 564 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
michael@0 565 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
michael@0 566 SkMatrix::kScale_Mask |
michael@0 567 SkMatrix::kAffine_Mask)) == 0);
michael@0 568
michael@0 569 SkPoint srcPt;
michael@0 570 s.fInvProc(s.fInvMatrix,
michael@0 571 SkIntToScalar(x) + SK_ScalarHalf,
michael@0 572 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
michael@0 573
michael@0 574 SkFixed fx = SkScalarToFixed(srcPt.fX);
michael@0 575 SkFixed fy = SkScalarToFixed(srcPt.fY);
michael@0 576 SkFixed dx = s.fInvSx;
michael@0 577 SkFixed dy = s.fInvKy;
michael@0 578 int maxX = s.fBitmap->width() - 1;
michael@0 579 int maxY = s.fBitmap->height() - 1;
michael@0 580
michael@0 581 if (count >= 4 && (maxX <= 0xFFFF)) {
michael@0 582 while (((size_t)xy & 0x0F) != 0) {
michael@0 583 *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
michael@0 584 SkClampMax(fx >> 16, maxX);
michael@0 585 fx += dx;
michael@0 586 fy += dy;
michael@0 587 count--;
michael@0 588 }
michael@0 589
michael@0 590 SkFixed dx4 = dx * 4;
michael@0 591 SkFixed dy4 = dy * 4;
michael@0 592
michael@0 593 __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
michael@0 594 fx + dx, fx);
michael@0 595 __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
michael@0 596 fy + dy, fy);
michael@0 597 __m128i wide_dx4 = _mm_set1_epi32(dx4);
michael@0 598 __m128i wide_dy4 = _mm_set1_epi32(dy4);
michael@0 599
michael@0 600 __m128i wide_maxX = _mm_set1_epi32(maxX);
michael@0 601 __m128i wide_maxY = _mm_set1_epi32(maxY);
michael@0 602
michael@0 603 while (count >= 4) {
michael@0 604 // SkClampMax(fx>>16,maxX)
michael@0 605 __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
michael@0 606 _mm_setzero_si128());
michael@0 607 wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
michael@0 608
michael@0 609 // SkClampMax(fy>>16,maxY)
michael@0 610 __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
michael@0 611 _mm_setzero_si128());
michael@0 612 wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
michael@0 613
michael@0 614 // final combination
michael@0 615 __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
michael@0 616 wide_lo);
michael@0 617 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
michael@0 618
michael@0 619 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
michael@0 620 wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
michael@0 621
michael@0 622 fx += dx4;
michael@0 623 fy += dy4;
michael@0 624 xy += 4;
michael@0 625 count -= 4;
michael@0 626 } // while count >= 4
michael@0 627 } // if count >= 4
michael@0 628
michael@0 629 while (count-- > 0) {
michael@0 630 *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
michael@0 631 SkClampMax(fx >> 16, maxX);
michael@0 632 fx += dx;
michael@0 633 fy += dy;
michael@0 634 }
michael@0 635 }
michael@0 636
michael@0 637 /* SSE version of S32_D16_filter_DX_SSE2
michael@0 638 * Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
michael@0 639 * It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
michael@0 640 */
michael@0 641 void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
michael@0 642 const uint32_t* xy,
michael@0 643 int count, uint16_t* colors) {
michael@0 644 SkASSERT(count > 0 && colors != NULL);
michael@0 645 SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
michael@0 646 SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
michael@0 647 SkASSERT(s.fBitmap->isOpaque());
michael@0 648
michael@0 649 SkPMColor dstColor;
michael@0 650 const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
michael@0 651 size_t rb = s.fBitmap->rowBytes();
michael@0 652 uint32_t XY = *xy++;
michael@0 653 unsigned y0 = XY >> 14;
michael@0 654 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
michael@0 655 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
michael@0 656 unsigned subY = y0 & 0xF;
michael@0 657
michael@0 658 // ( 0, 0, 0, 0, 0, 0, 0, 16)
michael@0 659 __m128i sixteen = _mm_cvtsi32_si128(16);
michael@0 660
michael@0 661 // ( 0, 0, 0, 0, 16, 16, 16, 16)
michael@0 662 sixteen = _mm_shufflelo_epi16(sixteen, 0);
michael@0 663
michael@0 664 // ( 0, 0, 0, 0, 0, 0, 0, y)
michael@0 665 __m128i allY = _mm_cvtsi32_si128(subY);
michael@0 666
michael@0 667 // ( 0, 0, 0, 0, y, y, y, y)
michael@0 668 allY = _mm_shufflelo_epi16(allY, 0);
michael@0 669
michael@0 670 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
michael@0 671 __m128i negY = _mm_sub_epi16(sixteen, allY);
michael@0 672
michael@0 673 // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
michael@0 674 allY = _mm_unpacklo_epi64(allY, negY);
michael@0 675
michael@0 676 // (16, 16, 16, 16, 16, 16, 16, 16 )
michael@0 677 sixteen = _mm_shuffle_epi32(sixteen, 0);
michael@0 678
michael@0 679 // ( 0, 0, 0, 0, 0, 0, 0, 0)
michael@0 680 __m128i zero = _mm_setzero_si128();
michael@0 681
michael@0 682 do {
michael@0 683 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
michael@0 684 unsigned x0 = XX >> 18;
michael@0 685 unsigned x1 = XX & 0x3FFF;
michael@0 686
michael@0 687 // (0, 0, 0, 0, 0, 0, 0, x)
michael@0 688 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
michael@0 689
michael@0 690 // (0, 0, 0, 0, x, x, x, x)
michael@0 691 allX = _mm_shufflelo_epi16(allX, 0);
michael@0 692
michael@0 693 // (x, x, x, x, x, x, x, x)
michael@0 694 allX = _mm_shuffle_epi32(allX, 0);
michael@0 695
michael@0 696 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
michael@0 697 __m128i negX = _mm_sub_epi16(sixteen, allX);
michael@0 698
michael@0 699 // Load 4 samples (pixels).
michael@0 700 __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
michael@0 701 __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
michael@0 702 __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
michael@0 703 __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
michael@0 704
michael@0 705 // (0, 0, a00, a10)
michael@0 706 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
michael@0 707
michael@0 708 // Expand to 16 bits per component.
michael@0 709 a00a10 = _mm_unpacklo_epi8(a00a10, zero);
michael@0 710
michael@0 711 // ((a00 * (16-y)), (a10 * y)).
michael@0 712 a00a10 = _mm_mullo_epi16(a00a10, allY);
michael@0 713
michael@0 714 // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
michael@0 715 a00a10 = _mm_mullo_epi16(a00a10, negX);
michael@0 716
michael@0 717 // (0, 0, a01, a10)
michael@0 718 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
michael@0 719
michael@0 720 // Expand to 16 bits per component.
michael@0 721 a01a11 = _mm_unpacklo_epi8(a01a11, zero);
michael@0 722
michael@0 723 // (a01 * (16-y)), (a11 * y)
michael@0 724 a01a11 = _mm_mullo_epi16(a01a11, allY);
michael@0 725
michael@0 726 // (a01 * (16-y) * x), (a11 * y * x)
michael@0 727 a01a11 = _mm_mullo_epi16(a01a11, allX);
michael@0 728
michael@0 729 // (a00*w00 + a01*w01, a10*w10 + a11*w11)
michael@0 730 __m128i sum = _mm_add_epi16(a00a10, a01a11);
michael@0 731
michael@0 732 // (DC, a00*w00 + a01*w01)
michael@0 733 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
michael@0 734
michael@0 735 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
michael@0 736 sum = _mm_add_epi16(sum, shifted);
michael@0 737
michael@0 738 // Divide each 16 bit component by 256.
michael@0 739 sum = _mm_srli_epi16(sum, 8);
michael@0 740
michael@0 741 // Pack lower 4 16 bit values of sum into lower 4 bytes.
michael@0 742 sum = _mm_packus_epi16(sum, zero);
michael@0 743
michael@0 744 // Extract low int and store.
michael@0 745 dstColor = _mm_cvtsi128_si32(sum);
michael@0 746
michael@0 747 //*colors++ = SkPixel32ToPixel16(dstColor);
michael@0 748 // below is much faster than the above. It's tested for Android benchmark--Softweg
michael@0 749 __m128i _m_temp1 = _mm_set1_epi32(dstColor);
michael@0 750 __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
michael@0 751
michael@0 752 unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
michael@0 753 unsigned r = (r32 & ((1<<5) -1)) << 11;
michael@0 754
michael@0 755 _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
michael@0 756 unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
michael@0 757 unsigned g = (g32 & ((1<<6) -1)) << 5;
michael@0 758
michael@0 759 _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
michael@0 760 unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
michael@0 761 unsigned b = (b32 & ((1<<5) -1));
michael@0 762
michael@0 763 *colors++ = r | g | b;
michael@0 764
michael@0 765 } while (--count > 0);
michael@0 766 }

mercurial