--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/2d/convolver.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,909 @@
+// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//     * Neither the name of Google, Inc. nor the names of its contributors
+//       may be used to endorse or promote products derived from this
+//       software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+#include "convolver.h"
+
+#include <algorithm>
+
+#include "skia/SkTypes.h"
+
+// note: SIMD_SSE2 is not enabled because of bugs, apparently
+
+#if defined(SIMD_SSE2)
+#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
+#endif
+
+#if defined(SK_CPU_LENDIAN)
+#define R_OFFSET_IDX 0
+#define G_OFFSET_IDX 1
+#define B_OFFSET_IDX 2
+#define A_OFFSET_IDX 3
+#else
+#define R_OFFSET_IDX 3
+#define G_OFFSET_IDX 2
+#define B_OFFSET_IDX 1
+#define A_OFFSET_IDX 0
+#endif
+
+namespace skia {
+
+namespace {
+
+// Converts the argument to an 8-bit unsigned value by clamping to the range
+// 0-255.
+inline unsigned char ClampTo8(int a) {
+  if (static_cast<unsigned>(a) < 256)
+    return a;  // Avoid the extra check in the common case.
+  if (a < 0)
+    return 0;
+  return 255;
+}
+
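+// For example, the single unsigned comparison above folds both range checks
+// into one branch in the common case: a negative int casts to an unsigned
+// value far above 255, so only out-of-range inputs take the slower path.
+//   ClampTo8(-5)  -> 0
+//   ClampTo8(128) -> 128
+//   ClampTo8(300) -> 255
+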
+// Stores a list of rows in a circular buffer. You write into it by calling
+// AdvanceRow(). It will keep track of which row in the buffer it should use
+// next, and the total number of rows added.
+class CircularRowBuffer {
+ public:
+  // The number of pixels in each row is given in |dest_row_pixel_width|.
+  // The maximum number of rows needed in the buffer is |max_y_filter_size|
+  // (we only need to store enough rows for the biggest filter).
+  //
+  // We use the |first_input_row| to compute the coordinates of all of the
+  // following rows returned by Advance().
+  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
+                    int first_input_row)
+      : row_byte_width_(dest_row_pixel_width * 4),
+        num_rows_(max_y_filter_size),
+        next_row_(0),
+        next_row_coordinate_(first_input_row) {
+    buffer_.resize(row_byte_width_ * max_y_filter_size);
+    row_addresses_.resize(num_rows_);
+  }
+
+  // Moves to the next row in the buffer, returning a pointer to the beginning
+  // of it.
+  unsigned char* AdvanceRow() {
+    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
+    next_row_coordinate_++;
+
+    // Set the pointer to the next row to use, wrapping around if necessary.
+    next_row_++;
+    if (next_row_ == num_rows_)
+      next_row_ = 0;
+    return row;
+  }
+
+  // Returns a pointer to an "unrolled" array of rows. These rows will start
+  // at the y coordinate placed into |*first_row_index| and will continue in
+  // order for the maximum number of rows in this circular buffer.
+  //
+  // The |*first_row_index| may be negative. This means the circular buffer
+  // starts before the top of the image (it hasn't been filled yet).
+  unsigned char* const* GetRowAddresses(int* first_row_index) {
+    // Example for a 4-element circular buffer holding coords 6-9.
+    //   Row 0   Coord 8
+    //   Row 1   Coord 9
+    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
+    //   Row 3   Coord 7
+    //
+    // The "next" row is also the first (lowest) coordinate. This computation
+    // may yield a negative value, but that's OK, the math will work out
+    // since the user of this buffer will compute the offset relative
+    // to first_row_index and the negative rows will never be used.
+    *first_row_index = next_row_coordinate_ - num_rows_;
+
+    int cur_row = next_row_;
+    for (int i = 0; i < num_rows_; i++) {
+      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
+
+      // Advance to the next row, wrapping if necessary.
+      cur_row++;
+      if (cur_row == num_rows_)
+        cur_row = 0;
+    }
+    return &row_addresses_[0];
+  }
+
+ private:
+  // The buffer storing the rows. They are packed, each one
+  // |row_byte_width_| bytes.
+  std::vector<unsigned char> buffer_;
+
+  // Number of bytes per row in the |buffer_|.
+  int row_byte_width_;
+
+  // The number of rows available in the buffer.
+  int num_rows_;
+
+  // The next row index we should write into. This wraps around as the
+  // circular buffer is used.
+  int next_row_;
+
+  // The y coordinate of the |next_row_|. This is incremented each time a
+  // new row is appended and does not wrap.
+  int next_row_coordinate_;
+
+  // Buffer used by GetRowAddresses().
+  std::vector<unsigned char*> row_addresses_;
+};
+
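+// A minimal usage sketch (illustrative only; the names are hypothetical, but
+// it mirrors how BGRAConvolve2D below drives the buffer):
+//
+//   CircularRowBuffer rows(dest_width, max_y_filter_size, first_input_row);
+//   // Horizontally convolve directly into the buffer's next slot.
+//   ConvolveHorizontally<true>(src_row, filter_x, rows.AdvanceRow());
+//   int first_row_index;
+//   unsigned char* const* ordered = rows.GetRowAddresses(&first_row_index);
+//   // ordered[i] is now the row at input coordinate first_row_index + i.
+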
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+template<bool has_alpha>
+// This function is miscompiled with gcc 4.5 with pgo. See bug 827946.
+#if defined(__GNUC__) && defined(MOZ_GCC_VERSION_AT_LEAST)
+#if MOZ_GCC_VERSION_AT_LEAST(4, 5, 0) && !MOZ_GCC_VERSION_AT_LEAST(4, 6, 0)
+__attribute__((optimize("-O1")))
+#endif
+#endif
+void ConvolveHorizontally(const unsigned char* src_data,
+                          const ConvolutionFilter1D& filter,
+                          unsigned char* out_row) {
+  // Loop over each pixel on this row in the output image.
+  int num_values = filter.num_values();
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    // Get the filter that determines the current output pixel.
+    int filter_offset, filter_length;
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const unsigned char* row_to_filter = &src_data[filter_offset * 4];
+
+    // Apply the filter to the row to get the destination pixel in |accum|.
+    int accum[4] = {0};
+    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
+      accum[0] += cur_filter * row_to_filter[filter_x * 4 + R_OFFSET_IDX];
+      accum[1] += cur_filter * row_to_filter[filter_x * 4 + G_OFFSET_IDX];
+      accum[2] += cur_filter * row_to_filter[filter_x * 4 + B_OFFSET_IDX];
+      if (has_alpha)
+        accum[3] += cur_filter * row_to_filter[filter_x * 4 + A_OFFSET_IDX];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of fractional part.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[out_x * 4 + R_OFFSET_IDX] = ClampTo8(accum[0]);
+    out_row[out_x * 4 + G_OFFSET_IDX] = ClampTo8(accum[1]);
+    out_row[out_x * 4 + B_OFFSET_IDX] = ClampTo8(accum[2]);
+    if (has_alpha)
+      out_row[out_x * 4 + A_OFFSET_IDX] = ClampTo8(accum[3]);
+  }
+}
+
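+// A worked example of the fixed-point math above (values chosen for
+// illustration): a coefficient c is stored as c * (1 << kShiftBits).
+// Assuming kShiftBits == 14, a 2-tap box filter {0.5, 0.5} applied to the
+// red channel of two pixels with r0 = 100 and r1 = 200 gives:
+//   accum = 8192 * 100 + 8192 * 200 = 2457600
+//   accum >> 14 = 150, the exact average, which ClampTo8 leaves unchanged.
+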
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
+                        int filter_length,
+                        unsigned char* const* source_data_rows,
+                        int pixel_width,
+                        unsigned char* out_row) {
+  // We go through each column in the output and do a vertical convolution,
+  // generating one output pixel each time.
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
+    // Compute the number of bytes over in each row that the current column
+    // we're convolving starts at. The pixel will cover the next 4 bytes.
+    int byte_offset = out_x * 4;
+
+    // Apply the filter to one column of pixels.
+    int accum[4] = {0};
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
+      accum[0] += cur_filter
+          * source_data_rows[filter_y][byte_offset + R_OFFSET_IDX];
+      accum[1] += cur_filter
+          * source_data_rows[filter_y][byte_offset + G_OFFSET_IDX];
+      accum[2] += cur_filter
+          * source_data_rows[filter_y][byte_offset + B_OFFSET_IDX];
+      if (has_alpha)
+        accum[3] += cur_filter
+            * source_data_rows[filter_y][byte_offset + A_OFFSET_IDX];
+    }
+
+    // Bring this value back in range. All of the filter scaling factors
+    // are in fixed point with kShiftBits bits of precision.
+    accum[0] >>= ConvolutionFilter1D::kShiftBits;
+    accum[1] >>= ConvolutionFilter1D::kShiftBits;
+    accum[2] >>= ConvolutionFilter1D::kShiftBits;
+    if (has_alpha)
+      accum[3] >>= ConvolutionFilter1D::kShiftBits;
+
+    // Store the new pixel.
+    out_row[byte_offset + R_OFFSET_IDX] = ClampTo8(accum[0]);
+    out_row[byte_offset + G_OFFSET_IDX] = ClampTo8(accum[1]);
+    out_row[byte_offset + B_OFFSET_IDX] = ClampTo8(accum[2]);
+    if (has_alpha) {
+      unsigned char alpha = ClampTo8(accum[3]);
+
+      // Make sure the alpha channel doesn't come out smaller than any of the
+      // color channels. We use premultiplied alpha channels, so this should
+      // never happen, but rounding errors will cause this from time to time.
+      // These "impossible" colors will cause overflows (and hence random pixel
+      // values) when the resulting bitmap is drawn to the screen.
+      //
+      // We only need to do this when generating the final output row (here).
+      int max_color_channel =
+          std::max(out_row[byte_offset + R_OFFSET_IDX],
+                   std::max(out_row[byte_offset + G_OFFSET_IDX],
+                            out_row[byte_offset + B_OFFSET_IDX]));
+      if (alpha < max_color_channel)
+        out_row[byte_offset + A_OFFSET_IDX] = max_color_channel;
+      else
+        out_row[byte_offset + A_OFFSET_IDX] = alpha;
+    } else {
+      // No alpha channel, the image is opaque.
+      out_row[byte_offset + A_OFFSET_IDX] = 0xff;
+    }
+  }
+}
+
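+// Why the alpha fix-up above is needed (illustration): with premultiplied
+// alpha, each color channel is pre-scaled by the alpha value, e.g. 50%-opaque
+// pure red is stored as (r, g, b, a) = (128, 0, 0, 128), so a >= max(r, g, b)
+// holds for every valid input pixel. Each channel is convolved and rounded
+// independently, so the output can briefly violate that invariant by a small
+// amount; raising alpha to max(r, g, b) restores it before the row is
+// written out.
+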
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+void ConvolveHorizontally_SSE2(const unsigned char* src_data,
+                               const ConvolutionFilter1D& filter,
+                               unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in the following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    __m128i accum = _mm_setzero_si128();
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const __m128i* row_to_filter =
+        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+      __m128i coeff, coeff16;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Load four pixels => unpack the first two pixels to 16 bits =>
+      // multiply with coefficients => accumulate the convolution result.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0*c0 b0*c0 g0*c0 r0*c0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32] a1*c1 b1*c1 g1*c1 r1*c1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Duplicate 3rd and 4th coefficients for all channels =>
+      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+      // => accumulate the convolution results.
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      // [16] a3 g3 b3 r3 a2 g2 b2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2*c2 b2*c2 g2*c2 r2*c2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32] a3*c3 b3*c3 g3*c3 r3*c3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Advance the pixel and coefficients pointers.
+      row_to_filter += 1;
+      filter_values += 4;
+    }
+
+    // When |filter_length| is not divisible by 4, we need to zero out the
+    // extra filter coefficients that were loaded past the end of the filter.
+    // Other than that the algorithm is the same as above, except that the
+    // 4th pixel is always absent.
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff, coeff16;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Note: the line buffer must be padded to align_up(filter_offset, 16).
+      // We work around this by using the C version for the last horizontal
+      // line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
+
+    // Pack the 32-bit |accum| to 16 bits per channel (signed saturation).
+    accum = _mm_packs_epi32(accum, zero);
+    // Pack the 16-bit |accum| to 8 bits per channel (unsigned saturation).
+    accum = _mm_packus_epi16(accum, zero);
+
+    // Store the pixel value of 32 bits.
+    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+    out_row += 4;
+  }
+#endif
+}
+
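+// The mulhi/mullo/unpack idiom above is how SSE2 gets eight 16x16-bit
+// multiplies with full 32-bit results, since it has no eight-lane widening
+// multiply. A scalar sketch of one lane, with pixel value s (0..255) and
+// signed fixed-point coefficient c:
+//   uint16_t lo = (uint16_t)(s * c);          // _mm_mullo_epi16 (low half)
+//   uint16_t hi = (uint16_t)((s * c) >> 16);  // _mm_mulhi_epi16 (high half)
+//   int32_t product = (int32_t)(((uint32_t)hi << 16) | lo);  // unpack lo/hi
+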
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
+                                const ConvolutionFilter1D& filter,
+                                unsigned char* out_row[4]) {
+#if defined(SIMD_SSE2)
+  int num_values = filter.num_values();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to decimate all extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in the following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const ConvolutionFilter1D::Fixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Four pixels in a column per iteration.
+    __m128i accum0 = _mm_setzero_si128();
+    __m128i accum1 = _mm_setzero_si128();
+    __m128i accum2 = _mm_setzero_si128();
+    __m128i accum3 = _mm_setzero_si128();
+    int start = (filter_offset << 2);
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+      __m128i coeff, coeff16lo, coeff16hi;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                          \
+    src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));     \
+    src16 = _mm_unpacklo_epi8(src8, zero);                             \
+    mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                        \
+    mul_lo = _mm_mullo_epi16(src16, coeff16lo);                        \
+    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                            \
+    accum = _mm_add_epi32(accum, t);                                   \
+    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                            \
+    accum = _mm_add_epi32(accum, t);                                   \
+    src16 = _mm_unpackhi_epi8(src8, zero);                             \
+    mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                        \
+    mul_lo = _mm_mullo_epi16(src16, coeff16hi);                        \
+    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                            \
+    accum = _mm_add_epi32(accum, t);                                   \
+    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                            \
+    accum = _mm_add_epi32(accum, t)
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+
+      start += 16;
+      filter_values += 4;
+    }
+
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+
+      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum0 = _mm_packs_epi32(accum0, zero);
+    accum0 = _mm_packus_epi16(accum0, zero);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_packs_epi32(accum1, zero);
+    accum1 = _mm_packus_epi16(accum1, zero);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_packs_epi32(accum2, zero);
+    accum2 = _mm_packus_epi16(accum2, zero);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_packs_epi32(accum3, zero);
+    accum3 = _mm_packus_epi16(accum3, zero);
+
+    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+    out_row[0] += 4;
+    out_row[1] += 4;
+    out_row[2] += 4;
+    out_row[3] += 4;
+  }
+#endif
+}
+
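+// Design note: batching four rows through the ITERATION macro lets each
+// group of filter coefficients be loaded and shuffled once and then reused
+// for all four rows, whereas ConvolveHorizontally_SSE2 must redo that
+// coefficient setup for every row it processes.
+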
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row) {
+#if defined(SIMD_SSE2)
+  int width = pixel_width & ~3;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i accum0, accum1, accum2, accum3, coeff16;
+  const __m128i* src;
+  // Output four pixels per iteration (16 bytes).
+  for (int out_x = 0; out_x < width; out_x += 4) {
+    // Accumulated result for each pixel. 32 bits per RGBA channel.
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    accum3 = _mm_setzero_si128();
+
+    // Convolve with one filter coefficient per iteration.
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+      // Duplicate the filter coefficient 8 times.
+      // [16] cj cj cj cj cj cj cj cj
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+      // Load four pixels (16 bytes) together.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][out_x << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+
+      // Unpack the 1st and 2nd pixels from 8 bits to 16 bits for each channel
+      // => multiply with the current coefficient => accumulate the result.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+
+      // Unpack the 3rd and 4th pixels from 8 bits to 16 bits for each channel
+      // => multiply with the current coefficient => accumulate the result.
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+      // [32] a3 b3 g3 r3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum3 = _mm_add_epi32(accum3, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
+
+    // Pack the 32-bit |accum| to 16 bits per channel (signed saturation).
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, accum3);
+
+    // Pack the 16-bit |accum| to 8 bits per channel (unsigned saturation).
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+
+    if (has_alpha) {
+      // Compute the max(ri, gi, bi) for each pixel.
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r, g, and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+
+      // Make sure the value of the alpha channel is never smaller than the
+      // maximum value of the color channels.
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      // Set the value of the alpha channels to 0xFF.
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    // Store the convolution result (16 bytes) and advance the pixel pointers.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+    out_row += 16;
+  }
+
+  // When the width of the output is not divisible by 4, we need to store one
+  // pixel (4 bytes) at a time, and the fourth pixel is always absent.
+  if (pixel_width & 3) {
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][width << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, zero);
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+    if (has_alpha) {
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r, g, and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    for (int out_x = width; out_x < pixel_width; out_x++) {
+      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+      accum0 = _mm_srli_si128(accum0, 4);
+      out_row += 4;
+    }
+  }
+#endif
+}
+
+}  // namespace
+
+// ConvolutionFilter1D ---------------------------------------------------------
+
+ConvolutionFilter1D::ConvolutionFilter1D()
+    : max_filter_(0) {
+}
+
+ConvolutionFilter1D::~ConvolutionFilter1D() {
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const float* filter_values,
+                                    int filter_length) {
+  SkASSERT(filter_length > 0);
+
+  std::vector<Fixed> fixed_values;
+  fixed_values.reserve(filter_length);
+
+  for (int i = 0; i < filter_length; ++i)
+    fixed_values.push_back(FloatToFixed(filter_values[i]));
+
+  AddFilter(filter_offset, &fixed_values[0], filter_length);
+}
+
+void ConvolutionFilter1D::AddFilter(int filter_offset,
+                                    const Fixed* filter_values,
+                                    int filter_length) {
+  // It is common for leading/trailing filter values to be zeros. In such
+  // cases it is beneficial to only store the central factors.
+  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
+  // a 1080p image this optimization gives a ~10% speed improvement.
+  int first_non_zero = 0;
+  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
+    first_non_zero++;
+
+  if (first_non_zero < filter_length) {
+    // Here we have at least one non-zero factor.
+    int last_non_zero = filter_length - 1;
+    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
+      last_non_zero--;
+
+    filter_offset += first_non_zero;
+    filter_length = last_non_zero + 1 - first_non_zero;
+    SkASSERT(filter_length > 0);
+
+    for (int i = first_non_zero; i <= last_non_zero; i++)
+      filter_values_.push_back(filter_values[i]);
+  } else {
+    // Here all the factors were zeros.
+    filter_length = 0;
+  }
+
+  FilterInstance instance;
+
+  // We pushed filter_length elements onto filter_values_.
+  instance.data_location = (static_cast<int>(filter_values_.size()) -
+                            filter_length);
+  instance.offset = filter_offset;
+  instance.length = filter_length;
+  filters_.push_back(instance);
+
+  max_filter_ = std::max(max_filter_, filter_length);
+}
+
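+// Example of the trimming above: AddFilter(10, {0, 0, 0.2, 0.6, 0.2, 0}, 6)
+// stores only the three central factors and records offset = 12, length = 3
+// in the FilterInstance, so the convolution loops never multiply by the
+// leading and trailing zeros.
+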
+void BGRAConvolve2D(const unsigned char* source_data,
+                    int source_byte_row_stride,
+                    bool source_has_alpha,
+                    const ConvolutionFilter1D& filter_x,
+                    const ConvolutionFilter1D& filter_y,
+                    int output_byte_row_stride,
+                    unsigned char* output,
+                    bool use_sse2) {
+#if !defined(SIMD_SSE2)
+  // Even if we have runtime support for SSE2 instructions, the binary was
+  // not built with SSE2 support, so we have to fall back to the C version.
+  use_sse2 = false;
+#endif
+
+  int max_y_filter_size = filter_y.max_filter();
+
+  // The next row in the input that we will generate a horizontally
+  // convolved row for. If the filter doesn't start at the beginning of the
+  // image (this is the case when we are only resizing a subset), then we
+  // don't want to generate any output rows before that. Compute the starting
+  // row for convolution as the first pixel for the first vertical filter.
+  int filter_offset, filter_length;
+  const ConvolutionFilter1D::Fixed* filter_values =
+      filter_y.FilterForValue(0, &filter_offset, &filter_length);
+  int next_x_row = filter_offset;
+
+  // We loop over each row in the input doing a horizontal convolution. This
+  // will result in a horizontally convolved image. We write the results into
+  // a circular buffer of convolved rows and do vertical convolution as rows
+  // are available. This prevents us from having to store the entire
+  // intermediate image and helps cache coherency.
+  // We need four extra rows so that horizontal convolution can be done on
+  // four rows simultaneously. We also pad each row in the row buffer to be
+  // aligned up to 16 bytes.
+  // TODO(jiesun): We do not use aligned load from row buffer in vertical
+  // convolution pass yet. Somehow Windows does not like it.
+  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
+  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
+  CircularRowBuffer row_buffer(row_buffer_width,
+                               row_buffer_height,
+                               filter_offset);
+
+  // Loop over every possible output row, processing just enough horizontal
+  // convolutions to run each subsequent vertical convolution.
+  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
+  int num_output_rows = filter_y.num_values();
+
+  // We need to check which is the last line to convolve before we advance 4
+  // lines in one iteration.
+  int last_filter_offset, last_filter_length;
+  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
+                          &last_filter_length);
+
+  for (int out_y = 0; out_y < num_output_rows; out_y++) {
+    filter_values = filter_y.FilterForValue(out_y,
+                                            &filter_offset, &filter_length);
+
+    // Generate output rows until we have enough to run the current filter.
+    if (use_sse2) {
+      while (next_x_row < filter_offset + filter_length) {
+        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
+          const unsigned char* src[4];
+          unsigned char* out_row[4];
+          for (int i = 0; i < 4; ++i) {
+            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
+            out_row[i] = row_buffer.AdvanceRow();
+          }
+          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
+          next_x_row += 4;
+        } else {
+          // For the last row, the SSE2 load may access data beyond the image
+          // area, so we use the C version here.
+          if (next_x_row == last_filter_offset + last_filter_length - 1) {
+            if (source_has_alpha) {
+              ConvolveHorizontally<true>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            } else {
+              ConvolveHorizontally<false>(
+                  &source_data[next_x_row * source_byte_row_stride],
+                  filter_x, row_buffer.AdvanceRow());
+            }
+          } else {
+            ConvolveHorizontally_SSE2(
+                &source_data[next_x_row * source_byte_row_stride],
+                filter_x, row_buffer.AdvanceRow());
+          }
+          next_x_row++;
+        }
+      }
+    } else {
+      while (next_x_row < filter_offset + filter_length) {
+        if (source_has_alpha) {
+          ConvolveHorizontally<true>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        } else {
+          ConvolveHorizontally<false>(
+              &source_data[next_x_row * source_byte_row_stride],
+              filter_x, row_buffer.AdvanceRow());
+        }
+        next_x_row++;
+      }
+    }
+
+    // Compute where in the output image this row of final data will go.
+    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
+
+    // Get the list of rows that the circular buffer has, in order.
+    int first_row_in_circular_buffer;
+    unsigned char* const* rows_to_convolve =
+        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
+
+    // Now compute the start of the subset of those rows that the filter
+    // needs.
+    unsigned char* const* first_row_for_filter =
+        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
+
+    if (source_has_alpha) {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<true>(filter_values, filter_length,
+                                      first_row_for_filter,
+                                      filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<true>(filter_values, filter_length,
+                                 first_row_for_filter,
+                                 filter_x.num_values(), cur_output_row);
+      }
+    } else {
+      if (use_sse2) {
+        ConvolveVertically_SSE2<false>(filter_values, filter_length,
+                                       first_row_for_filter,
+                                       filter_x.num_values(), cur_output_row);
+      } else {
+        ConvolveVertically<false>(filter_values, filter_length,
+                                  first_row_for_filter,
+                                  filter_x.num_values(), cur_output_row);
+      }
+    }
+  }
+}
+
+}  // namespace skia
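+
+// Usage sketch (illustrative only; the names and sizes are hypothetical): to
+// resample a src_w x src_h premultiplied BGRA image into a dst_w x dst_h
+// buffer, build one filter per axis with AddFilter() (one entry per output
+// column/row), then call:
+//
+//   skia::ConvolutionFilter1D filter_x, filter_y;
+//   // ... filter_x.AddFilter(...) for each of the dst_w output columns,
+//   //     filter_y.AddFilter(...) for each of the dst_h output rows ...
+//   skia::BGRAConvolve2D(src_pixels, src_w * 4, /* source_has_alpha */ true,
+//                        filter_x, filter_y, dst_w * 4, dst_pixels,
+//                        /* use_sse2 */ false);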