michael@0: /* michael@0: * Copyright 2012 The Android Open Source Project michael@0: * michael@0: * Use of this source code is governed by a BSD-style license that can be michael@0: * found in the LICENSE file. michael@0: */ michael@0: michael@0: #include "SkBitmapProcState_opts_SSSE3.h" michael@0: #include "SkPaint.h" michael@0: #include "SkUtils.h" michael@0: michael@0: /* With the exception of the Android framework we always build the SSSE3 functions michael@0: * and enable the caller to determine SSSE3 support. However for the Android framework michael@0: * if the device does not support SSSE3 then the compiler will not supply the required michael@0: * -mssse3 option needed to build this file, so instead we provide a stub implementation. michael@0: */ michael@0: #if !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 michael@0: michael@0: #include // SSSE3 michael@0: michael@0: // adding anonymous namespace seemed to force gcc to inline directly the michael@0: // instantiation, instead of creating the functions michael@0: // S32_generic_D32_filter_DX_SSSE3 and michael@0: // S32_generic_D32_filter_DX_SSSE3 which were then called by the michael@0: // external functions. michael@0: namespace { michael@0: // In this file, variations for alpha and non alpha versions are implemented michael@0: // with a template, as it makes the code more compact and a bit easier to michael@0: // maintain, while making the compiler generate the same exact code as with michael@0: // two functions that only differ by a few lines. michael@0: michael@0: michael@0: // Prepare all necessary constants for a round of processing for two pixel michael@0: // pairs. michael@0: // @param xy is the location where the xy parameters for four pixels should be michael@0: // read from. It is identical in concept with argument two of michael@0: // S32_{opaque}_D32_filter_DX methods. 
michael@0: // @param mask_3FFF vector of 32 bit constants containing 3FFF, michael@0: // suitable to mask the bottom 14 bits of a XY value. michael@0: // @param mask_000F vector of 32 bit constants containing 000F, michael@0: // suitable to mask the bottom 4 bits of a XY value. michael@0: // @param sixteen_8bit vector of 8 bit components containing the value 16. michael@0: // @param mask_dist_select vector of 8 bit components containing the shuffling michael@0: // parameters to reorder x[0-3] parameters. michael@0: // @param all_x_result vector of 8 bit components that will contain the michael@0: // (4x(x3), 4x(x2), 4x(x1), 4x(x0)) upon return. michael@0: // @param sixteen_minus_x vector of 8 bit components, containing michael@0: // (4x(16 - x3), 4x(16 - x2), 4x(16 - x1), 4x(16 - x0)) michael@0: inline void PrepareConstantsTwoPixelPairs(const uint32_t* xy, michael@0: const __m128i& mask_3FFF, michael@0: const __m128i& mask_000F, michael@0: const __m128i& sixteen_8bit, michael@0: const __m128i& mask_dist_select, michael@0: __m128i* all_x_result, michael@0: __m128i* sixteen_minus_x, michael@0: int* x0, michael@0: int* x1) { michael@0: const __m128i xx = _mm_loadu_si128(reinterpret_cast(xy)); michael@0: michael@0: // 4 delta X michael@0: // (x03, x02, x01, x00) michael@0: const __m128i x0_wide = _mm_srli_epi32(xx, 18); michael@0: // (x13, x12, x11, x10) michael@0: const __m128i x1_wide = _mm_and_si128(xx, mask_3FFF); michael@0: michael@0: _mm_storeu_si128(reinterpret_cast<__m128i *>(x0), x0_wide); michael@0: _mm_storeu_si128(reinterpret_cast<__m128i *>(x1), x1_wide); michael@0: michael@0: __m128i all_x = _mm_and_si128(_mm_srli_epi32(xx, 14), mask_000F); michael@0: michael@0: // (4x(x3), 4x(x2), 4x(x1), 4x(x0)) michael@0: all_x = _mm_shuffle_epi8(all_x, mask_dist_select); michael@0: michael@0: *all_x_result = all_x; michael@0: // (4x(16-x3), 4x(16-x2), 4x(16-x1), 4x(16-x0)) michael@0: *sixteen_minus_x = _mm_sub_epi8(sixteen_8bit, all_x); michael@0: } michael@0: 
// Prepare all necessary constants for a round of processing for two pixel
// pairs.
// @param xy is the location where the xy parameters for four pixels should be
//        read from. It is identical in concept with argument two of
//        S32_{opaque}_D32_filter_DXDY methods.
// @param mask_3FFF vector of 32 bit constants containing 3FFF,
//        suitable to mask the bottom 14 bits of a XY value.
// @param mask_000F vector of 32 bit constants containing 000F,
//        suitable to mask the bottom 4 bits of a XY value.
// @param sixteen_8bit vector of 8 bit components containing the value 16.
// @param mask_dist_select vector of 8 bit components containing the shuffling
//        parameters to reorder x[0-3] parameters.
// @param all_xy_result vector of 8 bit components that will contain the
//        (4x(y1), 4x(y0), 4x(x1), 4x(x0)) upon return.
// @param sixteen_minus_x vector of 8 bit components, containing
//        (4x(16-y1), 4x(16-y0), 4x(16-x1), 4x(16-x0)).
inline void PrepareConstantsTwoPixelPairsDXDY(const uint32_t* xy,
                                              const __m128i& mask_3FFF,
                                              const __m128i& mask_000F,
                                              const __m128i& sixteen_8bit,
                                              const __m128i& mask_dist_select,
                                              __m128i* all_xy_result,
                                              __m128i* sixteen_minus_xy,
                                              int* xy0, int* xy1) {
    const __m128i xy_wide =
                        _mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));

    // (x10, y10, x00, y00)
    __m128i xy0_wide = _mm_srli_epi32(xy_wide, 18);
    // (y10, y00, x10, x00)
    xy0_wide = _mm_shuffle_epi32(xy0_wide, _MM_SHUFFLE(2, 0, 3, 1));
    // (x11, y11, x01, y01)
    __m128i xy1_wide = _mm_and_si128(xy_wide, mask_3FFF);
    // (y11, y01, x11, x01)
    xy1_wide = _mm_shuffle_epi32(xy1_wide, _MM_SHUFFLE(2, 0, 3, 1));

    _mm_storeu_si128(reinterpret_cast<__m128i *>(xy0), xy0_wide);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(xy1), xy1_wide);

    // (x1, y1, x0, y0)
    __m128i all_xy = _mm_and_si128(_mm_srli_epi32(xy_wide, 14), mask_000F);
    // (y1, y0, x1, x0)
    all_xy = _mm_shuffle_epi32(all_xy, _MM_SHUFFLE(2, 0, 3, 1));
    // (4x(y1), 4x(y0), 4x(x1), 4x(x0))
    all_xy = _mm_shuffle_epi8(all_xy, mask_dist_select);

    *all_xy_result = all_xy;
    // (4x(16-y1), 4x(16-y0), 4x(16-x1), 4x(16-x0))
    *sixteen_minus_xy = _mm_sub_epi8(sixteen_8bit, all_xy);
}

// Helper function used when processing one pixel pair.
// @param pixel0..3 are the four input pixels
// @param scale_x vector of 8 bit components to multiply the pixel[0:3]. This
//        will contain (4x(x1, 16-x1), 4x(x0, 16-x0))
//        or (4x(x3, 16-x3), 4x(x2, 16-x2))
// @return a vector of 16 bit components containing:
//         (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
inline __m128i ProcessPixelPairHelper(uint32_t pixel0,
                                      uint32_t pixel1,
                                      uint32_t pixel2,
                                      uint32_t pixel3,
                                      const __m128i& scale_x) {
    __m128i a0, a1, a2, a3;
    // Load 2 pairs of pixels
    a0 = _mm_cvtsi32_si128(pixel0);
    a1 = _mm_cvtsi32_si128(pixel1);

    // Interleave pixels.
    // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
    a0 = _mm_unpacklo_epi8(a0, a1);

    a2 = _mm_cvtsi32_si128(pixel2);
    a3 = _mm_cvtsi32_si128(pixel3);
    // (0, 0, 0, 0, 0, 0, 0, 0, Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2)
    a2 = _mm_unpacklo_epi8(a2, a3);

    // two pairs of pixel pairs, interleaved.
    // (Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2,
    //  Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
    a0 = _mm_unpacklo_epi64(a0, a2);

    // multiply and sum to 16 bit components.
    // (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
    // At that point, we use up a bit less than 12 bits for each 16 bit
    // component:
    // All components are less than 255. So,
    // C0 * (16 - x) + C1 * x <= 255 * (16 - x) + 255 * x = 255 * 16.
    return _mm_maddubs_epi16(a0, scale_x);
}

// Scale back the results after multiplications to the [0:255] range, and scale
// by alpha when has_alpha is true.
// Depending on whether one set or two sets of multiplications had been applied,
// the results have to be shifted by four places (dividing by 16), or shifted
// by eight places (dividing by 256), since each multiplication is by a quantity
// in the range [0:16].
michael@0: template michael@0: inline __m128i ScaleFourPixels(__m128i* pixels, michael@0: const __m128i& alpha) { michael@0: // Divide each 16 bit component by 16 (or 256 depending on scale). michael@0: *pixels = _mm_srli_epi16(*pixels, scale); michael@0: michael@0: if (has_alpha) { michael@0: // Multiply by alpha. michael@0: *pixels = _mm_mullo_epi16(*pixels, alpha); michael@0: michael@0: // Divide each 16 bit component by 256. michael@0: *pixels = _mm_srli_epi16(*pixels, 8); michael@0: } michael@0: return *pixels; michael@0: } michael@0: michael@0: // Wrapper to calculate two output pixels from four input pixels. The michael@0: // arguments are the same as ProcessPixelPairHelper. Technically, there are michael@0: // eight input pixels, but since sub_y == 0, the factors applied to half of the michael@0: // pixels is zero (sub_y), and are therefore omitted here to save on some michael@0: // processing. michael@0: // @param alpha when has_alpha is true, scale all resulting components by this michael@0: // value. michael@0: // @return a vector of 16 bit components containing: michael@0: // ((Aa2 * (16 - x1) + Aa3 * x1) * alpha, ..., michael@0: // (Ra0 * (16 - x0) + Ra1 * x0) * alpha) (when has_alpha is true) michael@0: // otherwise michael@0: // (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0) michael@0: // In both cases, the results are renormalized (divided by 16) to match the michael@0: // expected formats when storing back the results into memory. 
michael@0: template michael@0: inline __m128i ProcessPixelPairZeroSubY(uint32_t pixel0, michael@0: uint32_t pixel1, michael@0: uint32_t pixel2, michael@0: uint32_t pixel3, michael@0: const __m128i& scale_x, michael@0: const __m128i& alpha) { michael@0: __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3, michael@0: scale_x); michael@0: return ScaleFourPixels(&sum, alpha); michael@0: } michael@0: michael@0: // Same as ProcessPixelPairZeroSubY, expect processing one output pixel at a michael@0: // time instead of two. As in the above function, only two pixels are needed michael@0: // to generate a single pixel since sub_y == 0. michael@0: // @return same as ProcessPixelPairZeroSubY, except that only the bottom 4 michael@0: // 16 bit components are set. michael@0: template michael@0: inline __m128i ProcessOnePixelZeroSubY(uint32_t pixel0, michael@0: uint32_t pixel1, michael@0: __m128i scale_x, michael@0: __m128i alpha) { michael@0: __m128i a0 = _mm_cvtsi32_si128(pixel0); michael@0: __m128i a1 = _mm_cvtsi32_si128(pixel1); michael@0: michael@0: // Interleave michael@0: a0 = _mm_unpacklo_epi8(a0, a1); michael@0: michael@0: // (a0 * (16-x) + a1 * x) michael@0: __m128i sum = _mm_maddubs_epi16(a0, scale_x); michael@0: michael@0: return ScaleFourPixels(&sum, alpha); michael@0: } michael@0: michael@0: // Methods when sub_y != 0 michael@0: michael@0: michael@0: // Same as ProcessPixelPairHelper, except that the values are scaled by y. michael@0: // @param y vector of 16 bit components containing 'y' values. There are two michael@0: // cases in practice, where y will contain the sub_y constant, or will michael@0: // contain the 16 - sub_y constant. michael@0: // @return vector of 16 bit components containing: michael@0: // (y * (Aa2 * (16 - x1) + Aa3 * x1), ... 
, y * (Ra0 * (16 - x0) + Ra1 * x0)) michael@0: inline __m128i ProcessPixelPair(uint32_t pixel0, michael@0: uint32_t pixel1, michael@0: uint32_t pixel2, michael@0: uint32_t pixel3, michael@0: const __m128i& scale_x, michael@0: const __m128i& y) { michael@0: __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3, michael@0: scale_x); michael@0: michael@0: // first row times 16-y or y depending on whether 'y' represents one or michael@0: // the other. michael@0: // Values will be up to 255 * 16 * 16 = 65280. michael@0: // (y * (Aa2 * (16 - x1) + Aa3 * x1), ... , michael@0: // y * (Ra0 * (16 - x0) + Ra1 * x0)) michael@0: sum = _mm_mullo_epi16(sum, y); michael@0: michael@0: return sum; michael@0: } michael@0: michael@0: // Process two pixel pairs out of eight input pixels. michael@0: // In other methods, the distinct pixels are passed one by one, but in this michael@0: // case, the rows, and index offsets to the pixels into the row are passed michael@0: // to generate the 8 pixels. michael@0: // @param row0..1 top and bottom row where to find input pixels. michael@0: // @param x0..1 offsets into the row for all eight input pixels. michael@0: // @param all_y vector of 16 bit components containing the constant sub_y michael@0: // @param neg_y vector of 16 bit components containing the constant 16 - sub_y michael@0: // @param alpha vector of 16 bit components containing the alpha value to scale michael@0: // the results by, when has_alpha is true. michael@0: // @return michael@0: // (alpha * ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) + michael@0: // y * (Aa2' * (16-x1) + Aa3' * x1)), michael@0: // ... michael@0: // alpha * ((16-y) * (Ra0 * (16-x0) + Ra1 * x0) + michael@0: // y * (Ra0' * (16-x0) + Ra1' * x0)) michael@0: // With the factor alpha removed when has_alpha is false. michael@0: // The values are scaled back to 16 bit components, but with only the bottom michael@0: // 8 bits being set. 
michael@0: template michael@0: inline __m128i ProcessTwoPixelPairs(const uint32_t* row0, michael@0: const uint32_t* row1, michael@0: const int* x0, michael@0: const int* x1, michael@0: const __m128i& scale_x, michael@0: const __m128i& all_y, michael@0: const __m128i& neg_y, michael@0: const __m128i& alpha) { michael@0: __m128i sum0 = ProcessPixelPair( michael@0: row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]], michael@0: scale_x, neg_y); michael@0: __m128i sum1 = ProcessPixelPair( michael@0: row1[x0[0]], row1[x1[0]], row1[x0[1]], row1[x1[1]], michael@0: scale_x, all_y); michael@0: michael@0: // 2 samples fully summed. michael@0: // ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) + michael@0: // y * (Aa2' * (16-x1) + Aa3' * x1), michael@0: // ... michael@0: // (16-y) * (Ra0 * (16 - x0) + Ra1 * x0)) + michael@0: // y * (Ra0' * (16-x0) + Ra1' * x0)) michael@0: // Each component, again can be at most 256 * 255 = 65280, so no overflow. michael@0: sum0 = _mm_add_epi16(sum0, sum1); michael@0: michael@0: return ScaleFourPixels(&sum0, alpha); michael@0: } michael@0: michael@0: // Similar to ProcessTwoPixelPairs except the pixel indexes. michael@0: template michael@0: inline __m128i ProcessTwoPixelPairsDXDY(const uint32_t* row00, michael@0: const uint32_t* row01, michael@0: const uint32_t* row10, michael@0: const uint32_t* row11, michael@0: const int* xy0, michael@0: const int* xy1, michael@0: const __m128i& scale_x, michael@0: const __m128i& all_y, michael@0: const __m128i& neg_y, michael@0: const __m128i& alpha) { michael@0: // first row michael@0: __m128i sum0 = ProcessPixelPair( michael@0: row00[xy0[0]], row00[xy1[0]], row10[xy0[1]], row10[xy1[1]], michael@0: scale_x, neg_y); michael@0: // second row michael@0: __m128i sum1 = ProcessPixelPair( michael@0: row01[xy0[0]], row01[xy1[0]], row11[xy0[1]], row11[xy1[1]], michael@0: scale_x, all_y); michael@0: michael@0: // 2 samples fully summed. 
michael@0: // ((16-y1) * (Aa2 * (16-x1) + Aa3 * x1) + michael@0: // y0 * (Aa2' * (16-x1) + Aa3' * x1), michael@0: // ... michael@0: // (16-y0) * (Ra0 * (16 - x0) + Ra1 * x0)) + michael@0: // y0 * (Ra0' * (16-x0) + Ra1' * x0)) michael@0: // Each component, again can be at most 256 * 255 = 65280, so no overflow. michael@0: sum0 = _mm_add_epi16(sum0, sum1); michael@0: michael@0: return ScaleFourPixels(&sum0, alpha); michael@0: } michael@0: michael@0: michael@0: // Same as ProcessPixelPair, except that performing the math one output pixel michael@0: // at a time. This means that only the bottom four 16 bit components are set. michael@0: inline __m128i ProcessOnePixel(uint32_t pixel0, uint32_t pixel1, michael@0: const __m128i& scale_x, const __m128i& y) { michael@0: __m128i a0 = _mm_cvtsi32_si128(pixel0); michael@0: __m128i a1 = _mm_cvtsi32_si128(pixel1); michael@0: michael@0: // Interleave michael@0: // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0) michael@0: a0 = _mm_unpacklo_epi8(a0, a1); michael@0: michael@0: // (a0 * (16-x) + a1 * x) michael@0: a0 = _mm_maddubs_epi16(a0, scale_x); michael@0: michael@0: // scale row by y michael@0: return _mm_mullo_epi16(a0, y); michael@0: } michael@0: michael@0: // Notes about the various tricks that are used in this implementation: michael@0: // - specialization for sub_y == 0. michael@0: // Statistically, 1/16th of the samples will have sub_y == 0. When this michael@0: // happens, the math goes from: michael@0: // (16 - x)*(16 - y)*a00 + x*(16 - y)*a01 + (16 - x)*y*a10 + x*y*a11 michael@0: // to: michael@0: // (16 - x)*a00 + 16*x*a01 michael@0: // much simpler. The simplification makes for an easy boost in performance. michael@0: // - calculating 4 output pixels at a time. michael@0: // This allows loading the coefficients x0 and x1 and shuffling them to the michael@0: // optimum location only once per loop, instead of twice per loop. 
michael@0: // This also allows us to store the four pixels with a single store. michael@0: // - Use of 2 special SSSE3 instructions (comparatively to the SSE2 instruction michael@0: // version): michael@0: // _mm_shuffle_epi8 : this allows us to spread the coefficients x[0-3] loaded michael@0: // in 32 bit values to 8 bit values repeated four times. michael@0: // _mm_maddubs_epi16 : this allows us to perform multiplications and additions michael@0: // in one swoop of 8bit values storing the results in 16 bit values. This michael@0: // instruction is actually crucial for the speed of the implementation since michael@0: // as one can see in the SSE2 implementation, all inputs have to be used as michael@0: // 16 bits because the results are 16 bits. This basically allows us to process michael@0: // twice as many pixel components per iteration. michael@0: // michael@0: // As a result, this method behaves faster than the traditional SSE2. The actual michael@0: // boost varies greatly on the underlying architecture. 
michael@0: template michael@0: void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: SkASSERT(count > 0 && colors != NULL); michael@0: SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); michael@0: SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); michael@0: if (has_alpha) { michael@0: SkASSERT(s.fAlphaScale < 256); michael@0: } else { michael@0: SkASSERT(s.fAlphaScale == 256); michael@0: } michael@0: michael@0: const uint8_t* src_addr = michael@0: static_cast(s.fBitmap->getPixels()); michael@0: const size_t rb = s.fBitmap->rowBytes(); michael@0: const uint32_t XY = *xy++; michael@0: const unsigned y0 = XY >> 14; michael@0: const uint32_t* row0 = michael@0: reinterpret_cast(src_addr + (y0 >> 4) * rb); michael@0: const uint32_t* row1 = michael@0: reinterpret_cast(src_addr + (XY & 0x3FFF) * rb); michael@0: const unsigned sub_y = y0 & 0xF; michael@0: michael@0: // vector constants michael@0: const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12, michael@0: 8, 8, 8, 8, michael@0: 4, 4, 4, 4, michael@0: 0, 0, 0, 0); michael@0: const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF); michael@0: const __m128i mask_000F = _mm_set1_epi32(0x000F); michael@0: const __m128i sixteen_8bit = _mm_set1_epi8(16); michael@0: // (0, 0, 0, 0, 0, 0, 0, 0) michael@0: const __m128i zero = _mm_setzero_si128(); michael@0: michael@0: __m128i alpha = _mm_setzero_si128(); michael@0: if (has_alpha) michael@0: // 8x(alpha) michael@0: alpha = _mm_set1_epi16(s.fAlphaScale); michael@0: michael@0: if (sub_y == 0) { michael@0: // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small) michael@0: while (count > 3) { michael@0: count -= 4; michael@0: michael@0: int x0[4]; michael@0: int x1[4]; michael@0: __m128i all_x, sixteen_minus_x; michael@0: PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F, michael@0: sixteen_8bit, mask_dist_select, michael@0: &all_x, &sixteen_minus_x, x0, x1); 
michael@0: xy += 4; michael@0: michael@0: // First pair of pixel pairs. michael@0: // (4x(x1, 16-x1), 4x(x0, 16-x0)) michael@0: __m128i scale_x; michael@0: scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x); michael@0: michael@0: __m128i sum0 = ProcessPixelPairZeroSubY( michael@0: row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]], michael@0: scale_x, alpha); michael@0: michael@0: // second pair of pixel pairs michael@0: // (4x (x3, 16-x3), 4x (16-x2, x2)) michael@0: scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x); michael@0: michael@0: __m128i sum1 = ProcessPixelPairZeroSubY( michael@0: row0[x0[2]], row0[x1[2]], row0[x0[3]], row0[x1[3]], michael@0: scale_x, alpha); michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum0 = _mm_packus_epi16(sum0, sum1); michael@0: michael@0: // Extract low int and store. michael@0: _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0); michael@0: michael@0: colors += 4; michael@0: } michael@0: michael@0: // handle remainder michael@0: while (count-- > 0) { michael@0: uint32_t xx = *xy++; // x0:14 | 4 | x1:14 michael@0: unsigned x0 = xx >> 18; michael@0: unsigned x1 = xx & 0x3FFF; michael@0: michael@0: // 16x(x) michael@0: const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F); michael@0: michael@0: // (16x(16-x)) michael@0: __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x); michael@0: michael@0: scale_x = _mm_unpacklo_epi8(scale_x, all_x); michael@0: michael@0: __m128i sum = ProcessOnePixelZeroSubY( michael@0: row0[x0], row0[x1], michael@0: scale_x, alpha); michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum = _mm_packus_epi16(sum, zero); michael@0: michael@0: // Extract low int and store. 
michael@0: *colors++ = _mm_cvtsi128_si32(sum); michael@0: } michael@0: } else { // more general case, y != 0 michael@0: // 8x(16) michael@0: const __m128i sixteen_16bit = _mm_set1_epi16(16); michael@0: michael@0: // 8x (y) michael@0: const __m128i all_y = _mm_set1_epi16(sub_y); michael@0: michael@0: // 8x (16-y) michael@0: const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y); michael@0: michael@0: // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small) michael@0: while (count > 3) { michael@0: count -= 4; michael@0: michael@0: int x0[4]; michael@0: int x1[4]; michael@0: __m128i all_x, sixteen_minus_x; michael@0: PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F, michael@0: sixteen_8bit, mask_dist_select, michael@0: &all_x, &sixteen_minus_x, x0, x1); michael@0: xy += 4; michael@0: michael@0: // First pair of pixel pairs michael@0: // (4x(x1, 16-x1), 4x(x0, 16-x0)) michael@0: __m128i scale_x; michael@0: scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x); michael@0: michael@0: __m128i sum0 = ProcessTwoPixelPairs( michael@0: row0, row1, x0, x1, michael@0: scale_x, all_y, neg_y, alpha); michael@0: michael@0: // second pair of pixel pairs michael@0: // (4x (x3, 16-x3), 4x (16-x2, x2)) michael@0: scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x); michael@0: michael@0: __m128i sum1 = ProcessTwoPixelPairs( michael@0: row0, row1, x0 + 2, x1 + 2, michael@0: scale_x, all_y, neg_y, alpha); michael@0: michael@0: // Do the final packing of the two results michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum0 = _mm_packus_epi16(sum0, sum1); michael@0: michael@0: // Extract low int and store. michael@0: _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0); michael@0: michael@0: colors += 4; michael@0: } michael@0: michael@0: // Left over. 
michael@0: while (count-- > 0) { michael@0: const uint32_t xx = *xy++; // x0:14 | 4 | x1:14 michael@0: const unsigned x0 = xx >> 18; michael@0: const unsigned x1 = xx & 0x3FFF; michael@0: michael@0: // 16x(x) michael@0: const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F); michael@0: michael@0: // 16x (16-x) michael@0: __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x); michael@0: michael@0: // (8x (x, 16-x)) michael@0: scale_x = _mm_unpacklo_epi8(scale_x, all_x); michael@0: michael@0: // first row. michael@0: __m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y); michael@0: // second row. michael@0: __m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y); michael@0: michael@0: // Add both rows for full sample michael@0: sum0 = _mm_add_epi16(sum0, sum1); michael@0: michael@0: sum0 = ScaleFourPixels(&sum0, alpha); michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum0 = _mm_packus_epi16(sum0, zero); michael@0: michael@0: // Extract low int and store. michael@0: *colors++ = _mm_cvtsi128_si32(sum0); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Similar to S32_generic_D32_filter_DX_SSSE3, we do not need to handle the michael@0: * special case suby == 0 as suby is changing in every loop. 
michael@0: */ michael@0: template michael@0: void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: SkASSERT(count > 0 && colors != NULL); michael@0: SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); michael@0: SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); michael@0: if (has_alpha) { michael@0: SkASSERT(s.fAlphaScale < 256); michael@0: } else { michael@0: SkASSERT(s.fAlphaScale == 256); michael@0: } michael@0: michael@0: const uint8_t* src_addr = michael@0: static_cast(s.fBitmap->getPixels()); michael@0: const size_t rb = s.fBitmap->rowBytes(); michael@0: michael@0: // vector constants michael@0: const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12, michael@0: 8, 8, 8, 8, michael@0: 4, 4, 4, 4, michael@0: 0, 0, 0, 0); michael@0: const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF); michael@0: const __m128i mask_000F = _mm_set1_epi32(0x000F); michael@0: const __m128i sixteen_8bit = _mm_set1_epi8(16); michael@0: michael@0: __m128i alpha; michael@0: if (has_alpha) { michael@0: // 8x(alpha) michael@0: alpha = _mm_set1_epi16(s.fAlphaScale); michael@0: } michael@0: michael@0: // Unroll 2x, interleave bytes, use pmaddubsw (all_x is small) michael@0: while (count >= 2) { michael@0: int xy0[4]; michael@0: int xy1[4]; michael@0: __m128i all_xy, sixteen_minus_xy; michael@0: PrepareConstantsTwoPixelPairsDXDY(xy, mask_3FFF, mask_000F, michael@0: sixteen_8bit, mask_dist_select, michael@0: &all_xy, &sixteen_minus_xy, xy0, xy1); michael@0: michael@0: // (4x(x1, 16-x1), 4x(x0, 16-x0)) michael@0: __m128i scale_x = _mm_unpacklo_epi8(sixteen_minus_xy, all_xy); michael@0: // (4x(0, y1), 4x(0, y0)) michael@0: __m128i all_y = _mm_unpackhi_epi8(all_xy, _mm_setzero_si128()); michael@0: __m128i neg_y = _mm_sub_epi16(_mm_set1_epi16(16), all_y); michael@0: michael@0: const uint32_t* row00 = michael@0: reinterpret_cast(src_addr + xy0[2] * rb); michael@0: const uint32_t* 
row01 = michael@0: reinterpret_cast(src_addr + xy1[2] * rb); michael@0: const uint32_t* row10 = michael@0: reinterpret_cast(src_addr + xy0[3] * rb); michael@0: const uint32_t* row11 = michael@0: reinterpret_cast(src_addr + xy1[3] * rb); michael@0: michael@0: __m128i sum0 = ProcessTwoPixelPairsDXDY( michael@0: row00, row01, row10, row11, xy0, xy1, michael@0: scale_x, all_y, neg_y, alpha); michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum0 = _mm_packus_epi16(sum0, _mm_setzero_si128()); michael@0: michael@0: // Extract low int and store. michael@0: _mm_storel_epi64(reinterpret_cast<__m128i *>(colors), sum0); michael@0: michael@0: xy += 4; michael@0: colors += 2; michael@0: count -= 2; michael@0: } michael@0: michael@0: // Handle the remainder michael@0: while (count-- > 0) { michael@0: uint32_t data = *xy++; michael@0: unsigned y0 = data >> 14; michael@0: unsigned y1 = data & 0x3FFF; michael@0: unsigned subY = y0 & 0xF; michael@0: y0 >>= 4; michael@0: michael@0: data = *xy++; michael@0: unsigned x0 = data >> 14; michael@0: unsigned x1 = data & 0x3FFF; michael@0: unsigned subX = x0 & 0xF; michael@0: x0 >>= 4; michael@0: michael@0: const uint32_t* row0 = michael@0: reinterpret_cast(src_addr + y0 * rb); michael@0: const uint32_t* row1 = michael@0: reinterpret_cast(src_addr + y1 * rb); michael@0: michael@0: // 16x(x) michael@0: const __m128i all_x = _mm_set1_epi8(subX); michael@0: michael@0: // 16x (16-x) michael@0: __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x); michael@0: michael@0: // (8x (x, 16-x)) michael@0: scale_x = _mm_unpacklo_epi8(scale_x, all_x); michael@0: michael@0: // 8x(16) michael@0: const __m128i sixteen_16bit = _mm_set1_epi16(16); michael@0: michael@0: // 8x (y) michael@0: const __m128i all_y = _mm_set1_epi16(subY); michael@0: michael@0: // 8x (16-y) michael@0: const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y); michael@0: michael@0: // first row. 
michael@0: __m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y); michael@0: // second row. michael@0: __m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y); michael@0: michael@0: // Add both rows for full sample michael@0: sum0 = _mm_add_epi16(sum0, sum1); michael@0: michael@0: sum0 = ScaleFourPixels(&sum0, alpha); michael@0: michael@0: // Pack lower 4 16 bit values of sum into lower 4 bytes. michael@0: sum0 = _mm_packus_epi16(sum0, _mm_setzero_si128()); michael@0: michael@0: // Extract low int and store. michael@0: *colors++ = _mm_cvtsi128_si32(sum0); michael@0: } michael@0: } michael@0: } // namepace michael@0: michael@0: void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: S32_generic_D32_filter_DX_SSSE3(s, xy, count, colors); michael@0: } michael@0: michael@0: void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: S32_generic_D32_filter_DX_SSSE3(s, xy, count, colors); michael@0: } michael@0: michael@0: void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: S32_generic_D32_filter_DXDY_SSSE3(s, xy, count, colors); michael@0: } michael@0: michael@0: void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: S32_generic_D32_filter_DXDY_SSSE3(s, xy, count, colors); michael@0: } michael@0: michael@0: #else // !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 michael@0: michael@0: void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: sk_throw(); michael@0: } michael@0: michael@0: void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s, 
michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: sk_throw(); michael@0: } michael@0: michael@0: void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: sk_throw(); michael@0: } michael@0: michael@0: void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, michael@0: const uint32_t* xy, michael@0: int count, uint32_t* colors) { michael@0: sk_throw(); michael@0: } michael@0: michael@0: #endif