gfx/2d/convolver.cpp

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/2d/convolver.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,909 @@
     1.4 +// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved.
     1.5 +//
     1.6 +// Redistribution and use in source and binary forms, with or without
     1.7 +// modification, are permitted provided that the following conditions
     1.8 +// are met:
     1.9 +//  * Redistributions of source code must retain the above copyright
    1.10 +//    notice, this list of conditions and the following disclaimer.
    1.11 +//  * Redistributions in binary form must reproduce the above copyright
    1.12 +//    notice, this list of conditions and the following disclaimer in
    1.13 +//    the documentation and/or other materials provided with the
    1.14 +//    distribution.
    1.15 +//  * Neither the name of Google, Inc. nor the names of its contributors
    1.16 +//    may be used to endorse or promote products derived from this
    1.17 +//    software without specific prior written permission.
    1.18 +//
    1.19 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.20 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.21 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    1.22 +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    1.23 +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    1.24 +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    1.25 +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
    1.26 +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
    1.27 +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    1.28 +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
    1.29 +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    1.30 +// SUCH DAMAGE.
    1.31 +
    1.32 +#include "convolver.h"
    1.33 +
    1.34 +#include <algorithm>
    1.35 +
    1.36 +#include "skia/SkTypes.h"
    1.37 +
    1.38 +// Note: SIMD_SSE2 is not enabled because of bugs, apparently.
    1.39 +
    1.40 +#if defined(SIMD_SSE2)
    1.41 +#include <emmintrin.h>  // SSE2 intrinsics; ARCH_CPU_X86_FAMILY comes from Chromium's build/config.h.
    1.42 +#endif
    1.43 +
    1.44 +#if defined(SK_CPU_LENDIAN)
    1.45 +#define R_OFFSET_IDX 0
    1.46 +#define G_OFFSET_IDX 1
    1.47 +#define B_OFFSET_IDX 2
    1.48 +#define A_OFFSET_IDX 3
    1.49 +#else
    1.50 +#define R_OFFSET_IDX 3
    1.51 +#define G_OFFSET_IDX 2
    1.52 +#define B_OFFSET_IDX 1
    1.53 +#define A_OFFSET_IDX 0
    1.54 +#endif
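         +
         +// Note that with either byte order these offsets address the same logical
         +// channels of a pixel read as a 32-bit 0xAABBGGRR value; only the in-memory
         +// byte order differs, so the per-channel code below is endian-agnostic.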
    1.55 +
    1.56 +namespace skia {
    1.57 +
    1.58 +namespace {
    1.59 +
    1.60 +// Converts the argument to an 8-bit unsigned value by clamping to the range
    1.61 +// 0-255.
    1.62 +inline unsigned char ClampTo8(int a) {
    1.63 +  if (static_cast<unsigned>(a) < 256)
    1.64 +    return a;  // Avoid the extra check in the common case.
    1.65 +  if (a < 0)
    1.66 +    return 0;
    1.67 +  return 255;
    1.68 +}
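         +
         +// Worked examples of the fast path above: ClampTo8(128) passes the unsigned
         +// test and returns 128 directly; ClampTo8(-1) wraps to a huge unsigned value,
         +// fails the test, and returns 0; ClampTo8(300) fails the test and returns 255.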
    1.69 +
    1.70 +// Stores a list of rows in a circular buffer. The usage is you write into it
    1.71 +// by calling AdvanceRow. It will keep track of which row in the buffer it
    1.72 +// should use next, and the total number of rows added.
    1.73 +class CircularRowBuffer {
    1.74 + public:
    1.75 +  // The number of pixels in each row is given in |dest_row_pixel_width|.
    1.76 +  // The maximum number of rows needed in the buffer is |max_y_filter_size|
    1.77 +  // (we only need to store enough rows for the biggest filter).
    1.78 +  //
    1.79 +  // We use the |first_input_row| to compute the coordinates of all of the
    1.80 +  // following rows returned by AdvanceRow().
    1.81 +  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
    1.82 +                    int first_input_row)
    1.83 +      : row_byte_width_(dest_row_pixel_width * 4),
    1.84 +        num_rows_(max_y_filter_size),
    1.85 +        next_row_(0),
    1.86 +        next_row_coordinate_(first_input_row) {
    1.87 +    buffer_.resize(row_byte_width_ * max_y_filter_size);
    1.88 +    row_addresses_.resize(num_rows_);
    1.89 +  }
    1.90 +
    1.91 +  // Moves to the next row in the buffer, returning a pointer to the beginning
    1.92 +  // of it.
    1.93 +  unsigned char* AdvanceRow() {
    1.94 +    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
    1.95 +    next_row_coordinate_++;
    1.96 +
    1.97 +    // Set the pointer to the next row to use, wrapping around if necessary.
    1.98 +    next_row_++;
    1.99 +    if (next_row_ == num_rows_)
   1.100 +      next_row_ = 0;
   1.101 +    return row;
   1.102 +  }
   1.103 +
   1.104 +  // Returns a pointer to an "unrolled" array of rows. These rows will start
   1.105 +  // at the y coordinate placed into |*first_row_index| and will continue in
   1.106 +  // order for the maximum number of rows in this circular buffer.
   1.107 +  //
    1.108 +  // The |first_row_index| may be negative. This means the circular buffer
   1.109 +  // starts before the top of the image (it hasn't been filled yet).
   1.110 +  unsigned char* const* GetRowAddresses(int* first_row_index) {
   1.111 +    // Example for a 4-element circular buffer holding coords 6-9.
   1.112 +    //   Row 0   Coord 8
   1.113 +    //   Row 1   Coord 9
   1.114 +    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
   1.115 +    //   Row 3   Coord 7
   1.116 +    //
    1.117 +    // The "next" row is also the first (lowest) coordinate. This computation
    1.118 +    // may yield a negative value, but that's OK: the math works out because
    1.119 +    // the user of this buffer computes offsets relative to first_row_index,
    1.120 +    // and the negative rows are never used.
   1.121 +    *first_row_index = next_row_coordinate_ - num_rows_;
   1.122 +
   1.123 +    int cur_row = next_row_;
   1.124 +    for (int i = 0; i < num_rows_; i++) {
   1.125 +      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
   1.126 +
   1.127 +      // Advance to the next row, wrapping if necessary.
   1.128 +      cur_row++;
   1.129 +      if (cur_row == num_rows_)
   1.130 +        cur_row = 0;
   1.131 +    }
   1.132 +    return &row_addresses_[0];
   1.133 +  }
   1.134 +
   1.135 + private:
    1.136 +  // The buffer storing the rows. They are packed, each row_byte_width_ bytes.
   1.137 +  std::vector<unsigned char> buffer_;
   1.138 +
   1.139 +  // Number of bytes per row in the |buffer_|.
   1.140 +  int row_byte_width_;
   1.141 +
   1.142 +  // The number of rows available in the buffer.
   1.143 +  int num_rows_;
   1.144 +
   1.145 +  // The next row index we should write into. This wraps around as the
   1.146 +  // circular buffer is used.
   1.147 +  int next_row_;
   1.148 +
   1.149 +  // The y coordinate of the |next_row_|. This is incremented each time a
   1.150 +  // new row is appended and does not wrap.
   1.151 +  int next_row_coordinate_;
   1.152 +
   1.153 +  // Buffer used by GetRowAddresses().
   1.154 +  std::vector<unsigned char*> row_addresses_;
   1.155 +};
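          +
          +// A minimal usage sketch of CircularRowBuffer (illustrative only; the width,
          +// filter size, and row names are made up):
          +//
          +//   CircularRowBuffer rows(/*dest_row_pixel_width=*/64,
          +//                          /*max_y_filter_size=*/3,
          +//                          /*first_input_row=*/0);
          +//   memcpy(rows.AdvanceRow(), convolved_row0, 64 * 4);
          +//   memcpy(rows.AdvanceRow(), convolved_row1, 64 * 4);
          +//   memcpy(rows.AdvanceRow(), convolved_row2, 64 * 4);
          +//   int first_row;
          +//   unsigned char* const* addrs = rows.GetRowAddresses(&first_row);
          +//   // first_row == 0; addrs[0..2] point at the rows in input order.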
   1.156 +
   1.157 +// Convolves horizontally along a single row. The row data is given in
   1.158 +// |src_data| and continues for the num_values() of the filter.
   1.159 +template<bool has_alpha>
   1.160 +// This function is miscompiled with gcc 4.5 with pgo. See bug 827946.
   1.161 +#if defined(__GNUC__) && defined(MOZ_GCC_VERSION_AT_LEAST)
   1.162 +#if MOZ_GCC_VERSION_AT_LEAST(4, 5, 0) && !MOZ_GCC_VERSION_AT_LEAST(4, 6, 0)
   1.163 +__attribute__((optimize("-O1")))
   1.164 +#endif
   1.165 +#endif
   1.166 +void ConvolveHorizontally(const unsigned char* src_data,
   1.167 +                          const ConvolutionFilter1D& filter,
   1.168 +                          unsigned char* out_row) {
   1.169 +  // Loop over each pixel on this row in the output image.
   1.170 +  int num_values = filter.num_values();
   1.171 +  for (int out_x = 0; out_x < num_values; out_x++) {
   1.172 +    // Get the filter that determines the current output pixel.
   1.173 +    int filter_offset, filter_length;
   1.174 +    const ConvolutionFilter1D::Fixed* filter_values =
   1.175 +        filter.FilterForValue(out_x, &filter_offset, &filter_length);
   1.176 +
   1.177 +    // Compute the first pixel in this row that the filter affects. It will
   1.178 +    // touch |filter_length| pixels (4 bytes each) after this.
   1.179 +    const unsigned char* row_to_filter = &src_data[filter_offset * 4];
   1.180 +
   1.181 +    // Apply the filter to the row to get the destination pixel in |accum|.
   1.182 +    int accum[4] = {0};
   1.183 +    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
   1.184 +      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
   1.185 +      accum[0] += cur_filter * row_to_filter[filter_x * 4 + R_OFFSET_IDX];
   1.186 +      accum[1] += cur_filter * row_to_filter[filter_x * 4 + G_OFFSET_IDX];
   1.187 +      accum[2] += cur_filter * row_to_filter[filter_x * 4 + B_OFFSET_IDX];
   1.188 +      if (has_alpha)
   1.189 +        accum[3] += cur_filter * row_to_filter[filter_x * 4 + A_OFFSET_IDX];
   1.190 +    }
   1.191 +
   1.192 +    // Bring this value back in range. All of the filter scaling factors
   1.193 +    // are in fixed point with kShiftBits bits of fractional part.
   1.194 +    accum[0] >>= ConvolutionFilter1D::kShiftBits;
   1.195 +    accum[1] >>= ConvolutionFilter1D::kShiftBits;
   1.196 +    accum[2] >>= ConvolutionFilter1D::kShiftBits;
   1.197 +    if (has_alpha)
   1.198 +      accum[3] >>= ConvolutionFilter1D::kShiftBits;
   1.199 +
   1.200 +    // Store the new pixel.
   1.201 +    out_row[out_x * 4 + R_OFFSET_IDX] = ClampTo8(accum[0]);
   1.202 +    out_row[out_x * 4 + G_OFFSET_IDX] = ClampTo8(accum[1]);
   1.203 +    out_row[out_x * 4 + B_OFFSET_IDX] = ClampTo8(accum[2]);
   1.204 +    if (has_alpha)
   1.205 +      out_row[out_x * 4 + A_OFFSET_IDX] = ClampTo8(accum[3]);
   1.206 +  }
   1.207 +}
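          +
          +// Fixed-point example for the shifts above, assuming kShiftBits == 14 (the
          +// value ConvolutionFilter1D uses upstream in Chromium's convolver.h): a
          +// filter weight of 0.5 is stored as 0.5 * (1 << 14) == 8192, so convolving
          +// a channel value of 200 accumulates 200 * 8192 == 1638400, and the final
          +// >> 14 recovers 100.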
   1.208 +
   1.209 +// Does vertical convolution to produce one output row. The filter values and
   1.210 +// length are given in the first two parameters. These are applied to each
   1.211 +// of the rows pointed to in the |source_data_rows| array, with each row
   1.212 +// being |pixel_width| wide.
   1.213 +//
   1.214 +// The output must have room for |pixel_width * 4| bytes.
   1.215 +template<bool has_alpha>
   1.216 +void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
   1.217 +                        int filter_length,
   1.218 +                        unsigned char* const* source_data_rows,
   1.219 +                        int pixel_width,
   1.220 +                        unsigned char* out_row) {
   1.221 +  // We go through each column in the output and do a vertical convolution,
   1.222 +  // generating one output pixel each time.
   1.223 +  for (int out_x = 0; out_x < pixel_width; out_x++) {
   1.224 +    // Compute the number of bytes over in each row that the current column
   1.225 +    // we're convolving starts at. The pixel will cover the next 4 bytes.
   1.226 +    int byte_offset = out_x * 4;
   1.227 +
   1.228 +    // Apply the filter to one column of pixels.
   1.229 +    int accum[4] = {0};
   1.230 +    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
   1.231 +      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
    1.232 +      accum[0] += cur_filter *
    1.233 +          source_data_rows[filter_y][byte_offset + R_OFFSET_IDX];
    1.234 +      accum[1] += cur_filter *
    1.235 +          source_data_rows[filter_y][byte_offset + G_OFFSET_IDX];
    1.236 +      accum[2] += cur_filter *
    1.237 +          source_data_rows[filter_y][byte_offset + B_OFFSET_IDX];
    1.238 +      if (has_alpha)
    1.239 +        accum[3] += cur_filter *
    1.240 +            source_data_rows[filter_y][byte_offset + A_OFFSET_IDX];
   1.241 +    }
   1.242 +
   1.243 +    // Bring this value back in range. All of the filter scaling factors
   1.244 +    // are in fixed point with kShiftBits bits of precision.
   1.245 +    accum[0] >>= ConvolutionFilter1D::kShiftBits;
   1.246 +    accum[1] >>= ConvolutionFilter1D::kShiftBits;
   1.247 +    accum[2] >>= ConvolutionFilter1D::kShiftBits;
   1.248 +    if (has_alpha)
   1.249 +      accum[3] >>= ConvolutionFilter1D::kShiftBits;
   1.250 +
   1.251 +    // Store the new pixel.
   1.252 +    out_row[byte_offset + R_OFFSET_IDX] = ClampTo8(accum[0]);
   1.253 +    out_row[byte_offset + G_OFFSET_IDX] = ClampTo8(accum[1]);
   1.254 +    out_row[byte_offset + B_OFFSET_IDX] = ClampTo8(accum[2]);
   1.255 +    if (has_alpha) {
   1.256 +      unsigned char alpha = ClampTo8(accum[3]);
   1.257 +
    1.258 +      // Make sure the alpha channel doesn't come out smaller than any of the
    1.259 +      // color channels. We use premultiplied alpha channels, so this should
    1.260 +      // never happen, but rounding errors can cause it from time to time.
    1.261 +      // These "impossible" colors would cause overflows (and hence random
    1.262 +      // pixel values) when the resulting bitmap is drawn to the screen.
   1.263 +      //
   1.264 +      // We only need to do this when generating the final output row (here).
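          +      //
          +      // For example, if rounding yields a red channel of 130 but an alpha of
          +      // 128, the pixel is not a valid premultiplied color; we bump the stored
          +      // alpha up to 130 below.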
    1.265 +      int max_color_channel = std::max(out_row[byte_offset + R_OFFSET_IDX],
    1.266 +          std::max(out_row[byte_offset + G_OFFSET_IDX],
          +                   out_row[byte_offset + B_OFFSET_IDX]));
   1.267 +      if (alpha < max_color_channel)
   1.268 +        out_row[byte_offset + A_OFFSET_IDX] = max_color_channel;
   1.269 +      else
   1.270 +        out_row[byte_offset + A_OFFSET_IDX] = alpha;
   1.271 +    } else {
   1.272 +      // No alpha channel, the image is opaque.
   1.273 +      out_row[byte_offset + A_OFFSET_IDX] = 0xff;
   1.274 +    }
   1.275 +  }
   1.276 +}
   1.277 +
   1.278 +
   1.279 +// Convolves horizontally along a single row. The row data is given in
   1.280 +// |src_data| and continues for the num_values() of the filter.
   1.281 +void ConvolveHorizontally_SSE2(const unsigned char* src_data,
   1.282 +                               const ConvolutionFilter1D& filter,
   1.283 +                               unsigned char* out_row) {
   1.284 +#if defined(SIMD_SSE2)
   1.285 +  int num_values = filter.num_values();
   1.286 +
   1.287 +  int filter_offset, filter_length;
   1.288 +  __m128i zero = _mm_setzero_si128();
   1.289 +  __m128i mask[4];
    1.290 +  // |mask| will be used to zero out the extra filter coefficients that get
    1.291 +  // loaded by SIMD when |filter_length| is not divisible by 4.
    1.292 +  // mask[0] is not used in the following algorithm.
   1.293 +  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
   1.294 +  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
   1.295 +  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
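          +  // E.g. with filter_length & 3 == 2, ANDing with mask[2] turns the loaded
          +  // coefficients (c0, c1, x, x) into (c0, c1, 0, 0).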
   1.296 +
   1.297 +  // Output one pixel each iteration, calculating all channels (RGBA) together.
   1.298 +  for (int out_x = 0; out_x < num_values; out_x++) {
   1.299 +    const ConvolutionFilter1D::Fixed* filter_values =
   1.300 +        filter.FilterForValue(out_x, &filter_offset, &filter_length);
   1.301 +
   1.302 +    __m128i accum = _mm_setzero_si128();
   1.303 +
   1.304 +    // Compute the first pixel in this row that the filter affects. It will
   1.305 +    // touch |filter_length| pixels (4 bytes each) after this.
   1.306 +    const __m128i* row_to_filter =
   1.307 +        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
   1.308 +
   1.309 +    // We will load and accumulate with four coefficients per iteration.
   1.310 +    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
   1.311 +
   1.312 +      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
   1.313 +      __m128i coeff, coeff16;
   1.314 +      // [16] xx xx xx xx c3 c2 c1 c0
   1.315 +      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
   1.316 +      // [16] xx xx xx xx c1 c1 c0 c0
   1.317 +      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
   1.318 +      // [16] c1 c1 c1 c1 c0 c0 c0 c0
   1.319 +      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
   1.320 +
   1.321 +      // Load four pixels => unpack the first two pixels to 16 bits =>
   1.322 +      // multiply with coefficients => accumulate the convolution result.
   1.323 +      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.324 +      __m128i src8 = _mm_loadu_si128(row_to_filter);
   1.325 +      // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.326 +      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
   1.327 +      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.328 +      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
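          +      // mulhi/mullo yield the high and low 16 bits of each signed 16x16
          +      // product; unpacking them together below rebuilds the 32-bit products.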
   1.329 +      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
   1.330 +      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.331 +      accum = _mm_add_epi32(accum, t);
   1.332 +      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
   1.333 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.334 +      accum = _mm_add_epi32(accum, t);
   1.335 +
   1.336 +      // Duplicate 3rd and 4th coefficients for all channels =>
   1.337 +      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
   1.338 +      // => accumulate the convolution results.
   1.339 +      // [16] xx xx xx xx c3 c3 c2 c2
   1.340 +      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
   1.341 +      // [16] c3 c3 c3 c3 c2 c2 c2 c2
   1.342 +      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    1.343 +      // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.344 +      src16 = _mm_unpackhi_epi8(src8, zero);
   1.345 +      mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.346 +      mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.347 +      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
   1.348 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.349 +      accum = _mm_add_epi32(accum, t);
   1.350 +      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
   1.351 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.352 +      accum = _mm_add_epi32(accum, t);
   1.353 +
   1.354 +      // Advance the pixel and coefficients pointers.
   1.355 +      row_to_filter += 1;
   1.356 +      filter_values += 4;
   1.357 +    }
   1.358 +
    1.359 +    // When |filter_length| is not divisible by 4, we need to zero out the
    1.360 +    // extra filter coefficients that were loaded past the end; other than
    1.361 +    // that the algorithm is the same as above, except that the 4th pixel is
    1.362 +    // always absent.
   1.363 +    int r = filter_length&3;
   1.364 +    if (r) {
   1.365 +      // Note: filter_values must be padded to align_up(filter_offset, 8).
   1.366 +      __m128i coeff, coeff16;
   1.367 +      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
   1.368 +      // Mask out extra filter taps.
   1.369 +      coeff = _mm_and_si128(coeff, mask[r]);
   1.370 +      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
   1.371 +      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
   1.372 +
    1.373 +      // Note: the line buffer must be padded to align_up(filter_offset, 16).
    1.374 +      // We resolve this by using the C version for the last horizontal line.
   1.375 +      __m128i src8 = _mm_loadu_si128(row_to_filter);
   1.376 +      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
   1.377 +      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.378 +      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.379 +      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.380 +      accum = _mm_add_epi32(accum, t);
   1.381 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.382 +      accum = _mm_add_epi32(accum, t);
   1.383 +
   1.384 +      src16 = _mm_unpackhi_epi8(src8, zero);
   1.385 +      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
   1.386 +      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
   1.387 +      mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.388 +      mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.389 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.390 +      accum = _mm_add_epi32(accum, t);
   1.391 +    }
   1.392 +
   1.393 +    // Shift right for fixed point implementation.
   1.394 +    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
   1.395 +
   1.396 +    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
   1.397 +    accum = _mm_packs_epi32(accum, zero);
   1.398 +    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
   1.399 +    accum = _mm_packus_epi16(accum, zero);
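          +    // packs_epi32 saturates each 32-bit lane to [-32768, 32767]; packus_epi16
          +    // then saturates to [0, 255], matching the scalar ClampTo8 behavior.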
   1.400 +
    1.401 +    // Store the 32-bit pixel value.
   1.402 +    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
   1.403 +    out_row += 4;
   1.404 +  }
   1.405 +#endif
   1.406 +}
   1.407 +
   1.408 +// Convolves horizontally along four rows. The row data is given in
   1.409 +// |src_data| and continues for the num_values() of the filter.
    1.410 +// The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
   1.411 +// refer to that function for detailed comments.
   1.412 +void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
   1.413 +                                const ConvolutionFilter1D& filter,
   1.414 +                                unsigned char* out_row[4]) {
   1.415 +#if defined(SIMD_SSE2)
   1.416 +  int num_values = filter.num_values();
   1.417 +
   1.418 +  int filter_offset, filter_length;
   1.419 +  __m128i zero = _mm_setzero_si128();
   1.420 +  __m128i mask[4];
    1.421 +  // |mask| will be used to zero out the extra filter coefficients that get
    1.422 +  // loaded by SIMD when |filter_length| is not divisible by 4.
    1.423 +  // mask[0] is not used in the following algorithm.
   1.424 +  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
   1.425 +  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
   1.426 +  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
   1.427 +
   1.428 +  // Output one pixel each iteration, calculating all channels (RGBA) together.
   1.429 +  for (int out_x = 0; out_x < num_values; out_x++) {
   1.430 +    const ConvolutionFilter1D::Fixed* filter_values =
   1.431 +        filter.FilterForValue(out_x, &filter_offset, &filter_length);
   1.432 +
    1.433 +    // Four accumulators, one per source row (a column of four pixels).
   1.434 +    __m128i accum0 = _mm_setzero_si128();
   1.435 +    __m128i accum1 = _mm_setzero_si128();
   1.436 +    __m128i accum2 = _mm_setzero_si128();
   1.437 +    __m128i accum3 = _mm_setzero_si128();
   1.438 +    int start = (filter_offset<<2);
   1.439 +    // We will load and accumulate with four coefficients per iteration.
   1.440 +    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
   1.441 +      __m128i coeff, coeff16lo, coeff16hi;
   1.442 +      // [16] xx xx xx xx c3 c2 c1 c0
   1.443 +      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
   1.444 +      // [16] xx xx xx xx c1 c1 c0 c0
   1.445 +      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
   1.446 +      // [16] c1 c1 c1 c1 c0 c0 c0 c0
   1.447 +      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
   1.448 +      // [16] xx xx xx xx c3 c3 c2 c2
   1.449 +      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
   1.450 +      // [16] c3 c3 c3 c3 c2 c2 c2 c2
   1.451 +      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
   1.452 +
   1.453 +      __m128i src8, src16, mul_hi, mul_lo, t;
   1.454 +
   1.455 +#define ITERATION(src, accum)                                          \
   1.456 +      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
   1.457 +      src16 = _mm_unpacklo_epi8(src8, zero);                           \
   1.458 +      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
   1.459 +      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
   1.460 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
   1.461 +      accum = _mm_add_epi32(accum, t);                                 \
   1.462 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
   1.463 +      accum = _mm_add_epi32(accum, t);                                 \
   1.464 +      src16 = _mm_unpackhi_epi8(src8, zero);                           \
   1.465 +      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
   1.466 +      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
   1.467 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
   1.468 +      accum = _mm_add_epi32(accum, t);                                 \
   1.469 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
   1.470 +      accum = _mm_add_epi32(accum, t)
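          +
          +      // ITERATION expands in place and relies on the locals declared just
          +      // above (src8, src16, mul_hi, mul_lo, t); its last statement has no
          +      // trailing semicolon, so each use supplies its own.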
   1.471 +
   1.472 +      ITERATION(src_data[0] + start, accum0);
   1.473 +      ITERATION(src_data[1] + start, accum1);
   1.474 +      ITERATION(src_data[2] + start, accum2);
   1.475 +      ITERATION(src_data[3] + start, accum3);
   1.476 +
   1.477 +      start += 16;
   1.478 +      filter_values += 4;
   1.479 +    }
   1.480 +
   1.481 +    int r = filter_length & 3;
   1.482 +    if (r) {
    1.483 +      // Note: filter_values must be padded to align_up(filter_offset, 8).
   1.484 +      __m128i coeff;
   1.485 +      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
   1.486 +      // Mask out extra filter taps.
   1.487 +      coeff = _mm_and_si128(coeff, mask[r]);
   1.488 +
   1.489 +      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    1.490 +      // [16] c1 c1 c1 c1 c0 c0 c0 c0
   1.491 +      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
   1.492 +      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
   1.493 +      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
   1.494 +
   1.495 +      __m128i src8, src16, mul_hi, mul_lo, t;
   1.496 +
   1.497 +      ITERATION(src_data[0] + start, accum0);
   1.498 +      ITERATION(src_data[1] + start, accum1);
   1.499 +      ITERATION(src_data[2] + start, accum2);
   1.500 +      ITERATION(src_data[3] + start, accum3);
   1.501 +    }
   1.502 +
   1.503 +    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
   1.504 +    accum0 = _mm_packs_epi32(accum0, zero);
   1.505 +    accum0 = _mm_packus_epi16(accum0, zero);
   1.506 +    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
   1.507 +    accum1 = _mm_packs_epi32(accum1, zero);
   1.508 +    accum1 = _mm_packus_epi16(accum1, zero);
   1.509 +    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
   1.510 +    accum2 = _mm_packs_epi32(accum2, zero);
   1.511 +    accum2 = _mm_packus_epi16(accum2, zero);
   1.512 +    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
   1.513 +    accum3 = _mm_packs_epi32(accum3, zero);
   1.514 +    accum3 = _mm_packus_epi16(accum3, zero);
   1.515 +
   1.516 +    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
   1.517 +    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
   1.518 +    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
   1.519 +    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
   1.520 +
   1.521 +    out_row[0] += 4;
   1.522 +    out_row[1] += 4;
   1.523 +    out_row[2] += 4;
   1.524 +    out_row[3] += 4;
   1.525 +  }
   1.526 +#endif
   1.527 +}
   1.528 +
   1.529 +// Does vertical convolution to produce one output row. The filter values and
   1.530 +// length are given in the first two parameters. These are applied to each
   1.531 +// of the rows pointed to in the |source_data_rows| array, with each row
   1.532 +// being |pixel_width| wide.
   1.533 +//
   1.534 +// The output must have room for |pixel_width * 4| bytes.
   1.535 +template<bool has_alpha>
   1.536 +void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
   1.537 +                             int filter_length,
   1.538 +                             unsigned char* const* source_data_rows,
   1.539 +                             int pixel_width,
   1.540 +                             unsigned char* out_row) {
   1.541 +#if defined(SIMD_SSE2)
   1.542 +  int width = pixel_width & ~3;
   1.543 +
   1.544 +  __m128i zero = _mm_setzero_si128();
   1.545 +  __m128i accum0, accum1, accum2, accum3, coeff16;
   1.546 +  const __m128i* src;
   1.547 +  // Output four pixels per iteration (16 bytes).
   1.548 +  for (int out_x = 0; out_x < width; out_x += 4) {
   1.549 +
   1.550 +    // Accumulated result for each pixel. 32 bits per RGBA channel.
   1.551 +    accum0 = _mm_setzero_si128();
   1.552 +    accum1 = _mm_setzero_si128();
   1.553 +    accum2 = _mm_setzero_si128();
   1.554 +    accum3 = _mm_setzero_si128();
   1.555 +
   1.556 +    // Convolve with one filter coefficient per iteration.
   1.557 +    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
   1.558 +
   1.559 +      // Duplicate the filter coefficient 8 times.
   1.560 +      // [16] cj cj cj cj cj cj cj cj
   1.561 +      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
   1.562 +
   1.563 +      // Load four pixels (16 bytes) together.
   1.564 +      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.565 +      src = reinterpret_cast<const __m128i*>(
   1.566 +          &source_data_rows[filter_y][out_x << 2]);
   1.567 +      __m128i src8 = _mm_loadu_si128(src);
   1.568 +
   1.569 +      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
   1.570 +      // multiply with current coefficient => accumulate the result.
   1.571 +      // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.572 +      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
   1.573 +      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.574 +      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.575 +      // [32] a0 b0 g0 r0
   1.576 +      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.577 +      accum0 = _mm_add_epi32(accum0, t);
   1.578 +      // [32] a1 b1 g1 r1
   1.579 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.580 +      accum1 = _mm_add_epi32(accum1, t);
   1.581 +
   1.582 +      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
   1.583 +      // multiply with current coefficient => accumulate the result.
   1.584 +      // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.585 +      src16 = _mm_unpackhi_epi8(src8, zero);
   1.586 +      mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.587 +      mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.588 +      // [32] a2 b2 g2 r2
   1.589 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.590 +      accum2 = _mm_add_epi32(accum2, t);
   1.591 +      // [32] a3 b3 g3 r3
   1.592 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.593 +      accum3 = _mm_add_epi32(accum3, t);
   1.594 +    }
   1.595 +
   1.596 +    // Shift right for fixed point implementation.
   1.597 +    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
   1.598 +    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
   1.599 +    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
   1.600 +    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
   1.601 +
   1.602 +    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
   1.603 +    // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.604 +    accum0 = _mm_packs_epi32(accum0, accum1);
   1.605 +    // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.606 +    accum2 = _mm_packs_epi32(accum2, accum3);
   1.607 +
   1.608 +    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
   1.609 +    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.610 +    accum0 = _mm_packus_epi16(accum0, accum2);
   1.611 +
   1.612 +    if (has_alpha) {
   1.613 +      // Compute the max(ri, gi, bi) for each pixel.
   1.614 +      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
   1.615 +      __m128i a = _mm_srli_epi32(accum0, 8);
   1.616 +      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.617 +      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
   1.618 +      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
   1.619 +      a = _mm_srli_epi32(accum0, 16);
   1.620 +      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.621 +      b = _mm_max_epu8(a, b);  // Max of r and g and b.
   1.622 +      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
   1.623 +      b = _mm_slli_epi32(b, 24);
   1.624 +
    1.625 +      // Make sure the alpha channel is never smaller than the maximum of the
    1.626 +      // color channels (see the premultiplied-alpha note in the C version).
   1.627 +      accum0 = _mm_max_epu8(b, accum0);
   1.628 +    } else {
    1.629 +      // Set the alpha channel of each pixel to 0xFF (opaque).
   1.630 +      __m128i mask = _mm_set1_epi32(0xff000000);
   1.631 +      accum0 = _mm_or_si128(accum0, mask);
   1.632 +    }
   1.633 +
   1.634 +    // Store the convolution result (16 bytes) and advance the pixel pointers.
   1.635 +    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
   1.636 +    out_row += 16;
   1.637 +  }
   1.638 +
    1.639 +  // When the width of the output is not divisible by 4, we need to store one
    1.640 +  // pixel (4 bytes) at a time, and the fourth pixel is always absent.
   1.641 +  if (pixel_width & 3) {
   1.642 +    accum0 = _mm_setzero_si128();
   1.643 +    accum1 = _mm_setzero_si128();
   1.644 +    accum2 = _mm_setzero_si128();
   1.645 +    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
   1.646 +      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
   1.647 +      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.648 +      src = reinterpret_cast<const __m128i*>(
   1.649 +          &source_data_rows[filter_y][width<<2]);
   1.650 +      __m128i src8 = _mm_loadu_si128(src);
   1.651 +      // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.652 +      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
   1.653 +      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.654 +      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.655 +      // [32] a0 b0 g0 r0
   1.656 +      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.657 +      accum0 = _mm_add_epi32(accum0, t);
   1.658 +      // [32] a1 b1 g1 r1
   1.659 +      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
   1.660 +      accum1 = _mm_add_epi32(accum1, t);
   1.661 +      // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.662 +      src16 = _mm_unpackhi_epi8(src8, zero);
   1.663 +      mul_hi = _mm_mulhi_epi16(src16, coeff16);
   1.664 +      mul_lo = _mm_mullo_epi16(src16, coeff16);
   1.665 +      // [32] a2 b2 g2 r2
   1.666 +      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
   1.667 +      accum2 = _mm_add_epi32(accum2, t);
   1.668 +    }
   1.669 +
   1.670 +    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
   1.671 +    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
   1.672 +    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
   1.673 +    // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.674 +    accum0 = _mm_packs_epi32(accum0, accum1);
   1.675 +    // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.676 +    accum2 = _mm_packs_epi32(accum2, zero);
   1.677 +    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.678 +    accum0 = _mm_packus_epi16(accum0, accum2);
   1.679 +    if (has_alpha) {
   1.680 +      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
   1.681 +      __m128i a = _mm_srli_epi32(accum0, 8);
   1.682 +      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.683 +      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
   1.684 +      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
   1.685 +      a = _mm_srli_epi32(accum0, 16);
   1.686 +      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.687 +      b = _mm_max_epu8(a, b);  // Max of r and g and b.
   1.688 +      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
   1.689 +      b = _mm_slli_epi32(b, 24);
   1.690 +      accum0 = _mm_max_epu8(b, accum0);
   1.691 +    } else {
   1.692 +      __m128i mask = _mm_set1_epi32(0xff000000);
   1.693 +      accum0 = _mm_or_si128(accum0, mask);
   1.694 +    }
   1.695 +
   1.696 +    for (int out_x = width; out_x < pixel_width; out_x++) {
   1.697 +      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
   1.698 +      accum0 = _mm_srli_si128(accum0, 4);
   1.699 +      out_row += 4;
   1.700 +    }
   1.701 +  }
   1.702 +#endif
   1.703 +}
   1.704 +
   1.705 +}  // namespace
   1.706 +
   1.707 +// ConvolutionFilter1D ---------------------------------------------------------
   1.708 +
   1.709 +ConvolutionFilter1D::ConvolutionFilter1D()
   1.710 +    : max_filter_(0) {
   1.711 +}
   1.712 +
   1.713 +ConvolutionFilter1D::~ConvolutionFilter1D() {
   1.714 +}
   1.715 +
   1.716 +void ConvolutionFilter1D::AddFilter(int filter_offset,
   1.717 +                                    const float* filter_values,
   1.718 +                                    int filter_length) {
   1.719 +  SkASSERT(filter_length > 0);
   1.720 +
   1.721 +  std::vector<Fixed> fixed_values;
   1.722 +  fixed_values.reserve(filter_length);
   1.723 +
   1.724 +  for (int i = 0; i < filter_length; ++i)
   1.725 +    fixed_values.push_back(FloatToFixed(filter_values[i]));
   1.726 +
   1.727 +  AddFilter(filter_offset, &fixed_values[0], filter_length);
   1.728 +}
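          +
          +// For example, a 3-tap box filter contributing to one output pixel from
          +// source pixels 9..11 could be added as (illustrative values only):
          +//
          +//   static const float kBox[3] = { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f };
          +//   filter.AddFilter(9, kBox, 3);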
   1.729 +
   1.730 +void ConvolutionFilter1D::AddFilter(int filter_offset,
   1.731 +                                    const Fixed* filter_values,
   1.732 +                                    int filter_length) {
   1.733 +  // It is common for leading/trailing filter values to be zeros. In such
   1.734 +  // cases it is beneficial to only store the central factors.
   1.735 +  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
   1.736 +  // a 1080p image this optimization gives a ~10% speed improvement.
   1.737 +  int first_non_zero = 0;
   1.738 +  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
   1.739 +    first_non_zero++;
   1.740 +
   1.741 +  if (first_non_zero < filter_length) {
   1.742 +    // Here we have at least one non-zero factor.
   1.743 +    int last_non_zero = filter_length - 1;
   1.744 +    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
   1.745 +      last_non_zero--;
   1.746 +
   1.747 +    filter_offset += first_non_zero;
   1.748 +    filter_length = last_non_zero + 1 - first_non_zero;
   1.749 +    SkASSERT(filter_length > 0);
   1.750 +
   1.751 +    for (int i = first_non_zero; i <= last_non_zero; i++)
   1.752 +      filter_values_.push_back(filter_values[i]);
   1.753 +  } else {
   1.754 +    // Here all the factors were zeroes.
   1.755 +    filter_length = 0;
   1.756 +  }
   1.757 +
   1.758 +  FilterInstance instance;
   1.759 +
    1.760 +  // We pushed |filter_length| elements onto |filter_values_|.
   1.761 +  instance.data_location = (static_cast<int>(filter_values_.size()) -
   1.762 +                            filter_length);
   1.763 +  instance.offset = filter_offset;
   1.764 +  instance.length = filter_length;
   1.765 +  filters_.push_back(instance);
   1.766 +
   1.767 +  max_filter_ = std::max(max_filter_, filter_length);
   1.768 +}
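          +
          +// Zero-trimming example for the code above: a filter whose values are
          +// {0, 0, w, 0} added at offset 5 is stored as just |w|, recording
          +// offset == 7 and length == 1 in the instance.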
   1.769 +
   1.770 +void BGRAConvolve2D(const unsigned char* source_data,
   1.771 +                    int source_byte_row_stride,
   1.772 +                    bool source_has_alpha,
   1.773 +                    const ConvolutionFilter1D& filter_x,
   1.774 +                    const ConvolutionFilter1D& filter_y,
   1.775 +                    int output_byte_row_stride,
   1.776 +                    unsigned char* output,
   1.777 +                    bool use_sse2) {
   1.778 +#if !defined(SIMD_SSE2)
    1.779 +  // Even if we have runtime support for SSE2 instructions, if the binary
    1.780 +  // was not built with SSE2 support we have to fall back to the C version.
   1.781 +  use_sse2 = false;
   1.782 +#endif
   1.783 +
   1.784 +  int max_y_filter_size = filter_y.max_filter();
   1.785 +
   1.786 +  // The next row in the input that we will generate a horizontally
   1.787 +  // convolved row for. If the filter doesn't start at the beginning of the
   1.788 +  // image (this is the case when we are only resizing a subset), then we
   1.789 +  // don't want to generate any output rows before that. Compute the starting
   1.790 +  // row for convolution as the first pixel for the first vertical filter.
   1.791 +  int filter_offset, filter_length;
   1.792 +  const ConvolutionFilter1D::Fixed* filter_values =
   1.793 +      filter_y.FilterForValue(0, &filter_offset, &filter_length);
   1.794 +  int next_x_row = filter_offset;
   1.795 +
   1.796 +  // We loop over each row in the input doing a horizontal convolution. This
   1.797 +  // will result in a horizontally convolved image. We write the results into
   1.798 +  // a circular buffer of convolved rows and do vertical convolution as rows
   1.799 +  // are available. This prevents us from having to store the entire
   1.800 +  // intermediate image and helps cache coherency.
    1.801 +  // We will need four extra rows so that four horizontal convolutions can be
    1.802 +  // done simultaneously. We also pad each row in the row buffer so it is
    1.803 +  // aligned up to 16 bytes.
    1.804 +  // TODO(jiesun): We do not use aligned loads from the row buffer in the
    1.805 +  // vertical convolution pass yet. Somehow Windows does not like it.
   1.806 +  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
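          +  // E.g. (100 + 15) & ~0xF == 112, so a 100-pixel row is padded to 112.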
   1.807 +  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
   1.808 +  CircularRowBuffer row_buffer(row_buffer_width,
   1.809 +                               row_buffer_height,
   1.810 +                               filter_offset);
   1.811 +
   1.812 +  // Loop over every possible output row, processing just enough horizontal
   1.813 +  // convolutions to run each subsequent vertical convolution.
   1.814 +  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
   1.815 +  int num_output_rows = filter_y.num_values();
   1.816 +
    1.817 +  // We need to know which line is the last to convolve before we advance
    1.818 +  // four lines in one iteration.
   1.819 +  int last_filter_offset, last_filter_length;
   1.820 +  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
   1.821 +                          &last_filter_length);
   1.822 +
   1.823 +  for (int out_y = 0; out_y < num_output_rows; out_y++) {
   1.824 +    filter_values = filter_y.FilterForValue(out_y,
   1.825 +                                            &filter_offset, &filter_length);
   1.826 +
   1.827 +    // Generate output rows until we have enough to run the current filter.
   1.828 +    if (use_sse2) {
   1.829 +      while (next_x_row < filter_offset + filter_length) {
   1.830 +        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
   1.831 +          const unsigned char* src[4];
   1.832 +          unsigned char* out_row[4];
   1.833 +          for (int i = 0; i < 4; ++i) {
   1.834 +            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
   1.835 +            out_row[i] = row_buffer.AdvanceRow();
   1.836 +          }
   1.837 +          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
   1.838 +          next_x_row += 4;
   1.839 +        } else {
    1.840 +          // For the last row, the SSE2 load may access data beyond the
    1.841 +          // image area, therefore we use the C version here.
   1.842 +          if (next_x_row == last_filter_offset + last_filter_length - 1) {
   1.843 +            if (source_has_alpha) {
   1.844 +              ConvolveHorizontally<true>(
   1.845 +                  &source_data[next_x_row * source_byte_row_stride],
   1.846 +                  filter_x, row_buffer.AdvanceRow());
   1.847 +            } else {
   1.848 +              ConvolveHorizontally<false>(
   1.849 +                  &source_data[next_x_row * source_byte_row_stride],
   1.850 +                  filter_x, row_buffer.AdvanceRow());
   1.851 +            }
   1.852 +          } else {
   1.853 +            ConvolveHorizontally_SSE2(
   1.854 +                &source_data[next_x_row * source_byte_row_stride],
   1.855 +                filter_x, row_buffer.AdvanceRow());
   1.856 +          }
   1.857 +          next_x_row++;
   1.858 +        }
   1.859 +      }
   1.860 +    } else {
   1.861 +      while (next_x_row < filter_offset + filter_length) {
   1.862 +        if (source_has_alpha) {
   1.863 +          ConvolveHorizontally<true>(
   1.864 +              &source_data[next_x_row * source_byte_row_stride],
   1.865 +              filter_x, row_buffer.AdvanceRow());
   1.866 +        } else {
   1.867 +          ConvolveHorizontally<false>(
   1.868 +              &source_data[next_x_row * source_byte_row_stride],
   1.869 +              filter_x, row_buffer.AdvanceRow());
   1.870 +        }
   1.871 +        next_x_row++;
   1.872 +      }
   1.873 +    }
   1.874 +
   1.875 +    // Compute where in the output image this row of final data will go.
   1.876 +    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
   1.877 +
   1.878 +    // Get the list of rows that the circular buffer has, in order.
   1.879 +    int first_row_in_circular_buffer;
   1.880 +    unsigned char* const* rows_to_convolve =
   1.881 +        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
   1.882 +
   1.883 +    // Now compute the start of the subset of those rows that the filter
   1.884 +    // needs.
   1.885 +    unsigned char* const* first_row_for_filter =
   1.886 +        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
   1.887 +
   1.888 +    if (source_has_alpha) {
   1.889 +      if (use_sse2) {
   1.890 +        ConvolveVertically_SSE2<true>(filter_values, filter_length,
   1.891 +                                      first_row_for_filter,
   1.892 +                                      filter_x.num_values(), cur_output_row);
   1.893 +      } else {
   1.894 +        ConvolveVertically<true>(filter_values, filter_length,
   1.895 +                                 first_row_for_filter,
   1.896 +                                 filter_x.num_values(), cur_output_row);
   1.897 +      }
   1.898 +    } else {
   1.899 +      if (use_sse2) {
   1.900 +        ConvolveVertically_SSE2<false>(filter_values, filter_length,
   1.901 +                                       first_row_for_filter,
   1.902 +                                       filter_x.num_values(), cur_output_row);
   1.903 +      } else {
    1.904 +        ConvolveVertically<false>(filter_values, filter_length,
    1.905 +                                  first_row_for_filter,
    1.906 +                                  filter_x.num_values(), cur_output_row);
   1.907 +      }
   1.908 +    }
   1.909 +  }
   1.910 +}
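          +
          +// A minimal usage sketch of BGRAConvolve2D (illustrative only; in practice
          +// the filters are built by a resampler that emits one AddFilter() call per
          +// output pixel, and src/dst names here are made up):
          +//
          +//   ConvolutionFilter1D filter_x, filter_y;
          +//   // ... populate filter_x with dst_width entries and filter_y with
          +//   // dst_height entries ...
          +//   BGRAConvolve2D(src_pixels, src_width * 4, /*source_has_alpha=*/true,
          +//                  filter_x, filter_y, dst_width * 4, dst_pixels,
          +//                  /*use_sse2=*/false);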
   1.911 +
   1.912 +}  // namespace skia
