gfx/2d/convolver.cpp

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Tue, 06 Jan 2015 21:39:09 +0100
branch       TOR_BUG_9701
changeset    8:97036ab72558
permissions  -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
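
The description implies a pref-gated choice between disk-backed and in-memory
storage. Purely as an illustration (this listing is convolver.cpp, which holds
no pref logic itself; the helper below is hypothetical and not part of the
changeset diff), such a gate in Gecko-style code might read the pref like
this, assuming a boolean reading of the pref (some revisions of the Tor patch
series treat it as an integer mode instead):

    #include "mozilla/Preferences.h"

    // Hypothetical helper: prefer memory-only storage when third-party
    // isolation is enabled via privacy.thirdparty.isolate.
    static bool ShouldForceMemoryStorage() {
      return mozilla::Preferences::GetBool("privacy.thirdparty.isolate", false);
    }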

// Copyright (c) 2006-2011 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google, Inc. nor the names of its contributors
// may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

#include "convolver.h"

#include <algorithm>

#include "skia/SkTypes.h"

// note: SIMD_SSE2 is not enabled because of bugs, apparently

#if defined(SIMD_SSE2)
#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
#endif

#if defined(SK_CPU_LENDIAN)
#define R_OFFSET_IDX 0
#define G_OFFSET_IDX 1
#define B_OFFSET_IDX 2
#define A_OFFSET_IDX 3
#else
#define R_OFFSET_IDX 3
#define G_OFFSET_IDX 2
#define B_OFFSET_IDX 1
#define A_OFFSET_IDX 0
#endif

namespace skia {

namespace {

// Converts the argument to an 8-bit unsigned value by clamping to the range
// 0-255.
inline unsigned char ClampTo8(int a) {
  if (static_cast<unsigned>(a) < 256)
    return a;  // Avoid the extra check in the common case.
  if (a < 0)
    return 0;
  return 255;
}
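
// Worked example (illustrative, not part of the original file): the single
// unsigned comparison covers the common case because a negative int wraps to
// a value far above 255 when cast to unsigned.
//
//   ClampTo8(-20);   // == 0    (cast makes it huge, so the a < 0 branch hits)
//   ClampTo8(128);   // == 128  (in range: returned unchanged)
//   ClampTo8(300);   // == 255  (too large: clamped)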

// Stores a list of rows in a circular buffer. The usage is you write into it
// by calling AdvanceRow. It will keep track of which row in the buffer it
// should use next, and the total number of rows added.
class CircularRowBuffer {
 public:
  // The number of pixels in each row is given in |dest_row_pixel_width|.
  // The maximum number of rows needed in the buffer is |max_y_filter_size|
  // (we only need to store enough rows for the biggest filter).
  //
  // We use the |first_input_row| to compute the coordinates of all of the
  // following rows returned by AdvanceRow().
  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
                    int first_input_row)
      : row_byte_width_(dest_row_pixel_width * 4),
        num_rows_(max_y_filter_size),
        next_row_(0),
        next_row_coordinate_(first_input_row) {
    buffer_.resize(row_byte_width_ * max_y_filter_size);
    row_addresses_.resize(num_rows_);
  }

  // Moves to the next row in the buffer, returning a pointer to the beginning
  // of it.
  unsigned char* AdvanceRow() {
    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
    next_row_coordinate_++;

    // Set the pointer to the next row to use, wrapping around if necessary.
    next_row_++;
    if (next_row_ == num_rows_)
      next_row_ = 0;
    return row;
  }

  // Returns a pointer to an "unrolled" array of rows. These rows will start
  // at the y coordinate placed into |*first_row_index| and will continue in
  // order for the maximum number of rows in this circular buffer.
  //
  // The |*first_row_index| may be negative. This means the circular buffer
  // starts before the top of the image (it hasn't been filled yet).
  unsigned char* const* GetRowAddresses(int* first_row_index) {
    // Example for a 4-element circular buffer holding coords 6-9.
    //   Row 0   Coord 8
    //   Row 1   Coord 9
    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
    //   Row 3   Coord 7
    //
    // The "next" row is also the first (lowest) coordinate. This computation
    // may yield a negative value, but that's OK, the math will work out
    // since the user of this buffer will compute the offset relative
    // to the first_row_index and the negative rows will never be used.
    *first_row_index = next_row_coordinate_ - num_rows_;

    int cur_row = next_row_;
    for (int i = 0; i < num_rows_; i++) {
      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];

      // Advance to the next row, wrapping if necessary.
      cur_row++;
      if (cur_row == num_rows_)
        cur_row = 0;
    }
    return &row_addresses_[0];
  }

 private:
  // The buffer storing the rows. They are packed, each one row_byte_width_
  // bytes long.
  std::vector<unsigned char> buffer_;

  // Number of bytes per row in the |buffer_|.
  int row_byte_width_;

  // The number of rows available in the buffer.
  int num_rows_;

  // The next row index we should write into. This wraps around as the
  // circular buffer is used.
  int next_row_;

  // The y coordinate of the |next_row_|. This is incremented each time a
  // new row is appended and does not wrap.
  int next_row_coordinate_;

  // Buffer used by GetRowAddresses().
  std::vector<unsigned char*> row_addresses_;
};
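
// Usage sketch (illustrative only, with made-up sizes): a filter with
// vertical support 3 starting at input row 5 needs a 3-row window that
// slides down the image without reallocating.
//
//   CircularRowBuffer rows(dest_width, 3, 5);
//   unsigned char* r0 = rows.AdvanceRow();  // holds input row 5
//   unsigned char* r1 = rows.AdvanceRow();  // holds input row 6
//   unsigned char* r2 = rows.AdvanceRow();  // holds input row 7
//   int first;
//   unsigned char* const* window = rows.GetRowAddresses(&first);
//   // first == 5; window[0..2] are rows 5..7 in order. The next
//   // AdvanceRow() reuses r0's storage for input row 8.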

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
template<bool has_alpha>
// This function is miscompiled with gcc 4.5 with pgo. See bug 827946.
#if defined(__GNUC__) && defined(MOZ_GCC_VERSION_AT_LEAST)
#if MOZ_GCC_VERSION_AT_LEAST(4, 5, 0) && !MOZ_GCC_VERSION_AT_LEAST(4, 6, 0)
__attribute__((optimize("-O1")))
#endif
#endif
void ConvolveHorizontally(const unsigned char* src_data,
                          const ConvolutionFilter1D& filter,
                          unsigned char* out_row) {
  // Loop over each pixel on this row in the output image.
  int num_values = filter.num_values();
  for (int out_x = 0; out_x < num_values; out_x++) {
    // Get the filter that determines the current output pixel.
    int filter_offset, filter_length;
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filter_length| pixels (4 bytes each) after this.
    const unsigned char* row_to_filter = &src_data[filter_offset * 4];

    // Apply the filter to the row to get the destination pixel in |accum|.
    int accum[4] = {0};
    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
      accum[0] += cur_filter * row_to_filter[filter_x * 4 + R_OFFSET_IDX];
      accum[1] += cur_filter * row_to_filter[filter_x * 4 + G_OFFSET_IDX];
      accum[2] += cur_filter * row_to_filter[filter_x * 4 + B_OFFSET_IDX];
      if (has_alpha)
        accum[3] += cur_filter * row_to_filter[filter_x * 4 + A_OFFSET_IDX];
    }

    // Bring this value back in range. All of the filter scaling factors
    // are in fixed point with kShiftBits bits of fractional part.
    accum[0] >>= ConvolutionFilter1D::kShiftBits;
    accum[1] >>= ConvolutionFilter1D::kShiftBits;
    accum[2] >>= ConvolutionFilter1D::kShiftBits;
    if (has_alpha)
      accum[3] >>= ConvolutionFilter1D::kShiftBits;

    // Store the new pixel.
    out_row[out_x * 4 + R_OFFSET_IDX] = ClampTo8(accum[0]);
    out_row[out_x * 4 + G_OFFSET_IDX] = ClampTo8(accum[1]);
    out_row[out_x * 4 + B_OFFSET_IDX] = ClampTo8(accum[2]);
    if (has_alpha)
      out_row[out_x * 4 + A_OFFSET_IDX] = ClampTo8(accum[3]);
  }
}
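
// The fixed-point arithmetic, worked through (illustrative; assumes
// kShiftBits is 14, its value in the convolver.h this file is built against,
// so a weight of 1.0 is stored as 1 << 14):
//
//   A 3-tap box filter {1/3, 1/3, 1/3} is stored as {5461, 5461, 5461}.
//   For three input red samples {90, 120, 150}:
//     accum = 5461*90 + 5461*120 + 5461*150 = 1965960
//     accum >> 14 == 119 (dropping the fractional bits); ClampTo8 -> 119.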

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                        int filter_length,
                        unsigned char* const* source_data_rows,
                        int pixel_width,
                        unsigned char* out_row) {
  // We go through each column in the output and do a vertical convolution,
  // generating one output pixel each time.
  for (int out_x = 0; out_x < pixel_width; out_x++) {
    // Compute the number of bytes over in each row that the current column
    // we're convolving starts at. The pixel will cover the next 4 bytes.
    int byte_offset = out_x * 4;

    // Apply the filter to one column of pixels.
    int accum[4] = {0};
    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
      accum[0] += cur_filter
          * source_data_rows[filter_y][byte_offset + R_OFFSET_IDX];
      accum[1] += cur_filter
          * source_data_rows[filter_y][byte_offset + G_OFFSET_IDX];
      accum[2] += cur_filter
          * source_data_rows[filter_y][byte_offset + B_OFFSET_IDX];
      if (has_alpha)
        accum[3] += cur_filter
            * source_data_rows[filter_y][byte_offset + A_OFFSET_IDX];
    }

    // Bring this value back in range. All of the filter scaling factors
    // are in fixed point with kShiftBits bits of precision.
    accum[0] >>= ConvolutionFilter1D::kShiftBits;
    accum[1] >>= ConvolutionFilter1D::kShiftBits;
    accum[2] >>= ConvolutionFilter1D::kShiftBits;
    if (has_alpha)
      accum[3] >>= ConvolutionFilter1D::kShiftBits;

    // Store the new pixel.
    out_row[byte_offset + R_OFFSET_IDX] = ClampTo8(accum[0]);
    out_row[byte_offset + G_OFFSET_IDX] = ClampTo8(accum[1]);
    out_row[byte_offset + B_OFFSET_IDX] = ClampTo8(accum[2]);
    if (has_alpha) {
      unsigned char alpha = ClampTo8(accum[3]);

      // Make sure the alpha channel doesn't come out smaller than any of the
      // color channels. We use premultiplied alpha channels, so this should
      // never happen, but rounding errors will cause it from time to time.
      // These "impossible" colors will cause overflows (and hence random
      // pixel values) when the resulting bitmap is drawn to the screen.
      //
      // We only need to do this when generating the final output row (here).
      int max_color_channel = std::max(out_row[byte_offset + R_OFFSET_IDX],
          std::max(out_row[byte_offset + G_OFFSET_IDX],
                   out_row[byte_offset + B_OFFSET_IDX]));
      if (alpha < max_color_channel)
        out_row[byte_offset + A_OFFSET_IDX] = max_color_channel;
      else
        out_row[byte_offset + A_OFFSET_IDX] = alpha;
    } else {
      // No alpha channel, the image is opaque.
      out_row[byte_offset + A_OFFSET_IDX] = 0xff;
    }
  }
}
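
// A concrete case of the clamp above (illustrative): with premultiplied
// alpha, each color channel stores color * alpha / 255, so r, g, b <= a must
// hold for every valid pixel. If rounding in the convolution yields r = 130
// while a = 128, the pixel is an "impossible" premultiplied color, and
// blending math that assumes r <= a can overflow. Raising a to
// max(r, g, b) = 130 restores the invariant with the smallest change.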

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
                               const ConvolutionFilter1D& filter,
                               unsigned char* out_row) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to zero out any extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in the following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    __m128i accum = _mm_setzero_si128();

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filter_length| pixels (4 bytes each) after this.
    const __m128i* row_to_filter =
        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
      __m128i coeff, coeff16;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Load four pixels => unpack the first two pixels to 16 bits =>
      // multiply with coefficients => accumulate the convolution result.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0*c0 b0*c0 g0*c0 r0*c0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a1*c1 b1*c1 g1*c1 r1*c1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Duplicate 3rd and 4th coefficients for all channels =>
      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
      // => accumulate the convolution results.
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      // [16] a3 b3 g3 r3 a2 b2 g2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2*c2 b2*c2 g2*c2 r2*c2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a3*c3 b3*c3 g3*c3 r3*c3
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Advance the pixel and coefficients pointers.
      row_to_filter += 1;
      filter_values += 4;
    }

    // When |filter_length| is not divisible by 4, we need to mask out to zero
    // the extra filter coefficients that were loaded. Other than that, the
    // algorithm is the same as above, except that the 4th pixel will always
    // be absent.
    int r = filter_length & 3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff, coeff16;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Note: the line buffer must be padded to align_up(filter_offset, 16).
      // We work around this by using the C version for the last horizontal
      // row; see BGRAConvolve2D below.
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      src16 = _mm_unpackhi_epi8(src8, zero);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
    }

    // Shift right for fixed point implementation.
    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    accum = _mm_packs_epi32(accum, zero);
    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    accum = _mm_packus_epi16(accum, zero);

    // Store the pixel value of 32 bits.
    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    out_row += 4;
  }
#endif
}
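
// A note on the multiply pattern used above (illustrative): SSE2 has no
// widening 16x16 -> 32-bit multiply that keeps lanes separate (pmaddwd sums
// adjacent pairs, which would merge two channels). The code therefore takes
// the low and high 16-bit halves of each product and re-joins them. Per lane,
// this is the scalar equivalent:
//
//   int16_t s = ..., c = ...;                 // pixel channel, coefficient
//   uint16_t lo = (uint16_t)(s * c);          // _mm_mullo_epi16 lane
//   int16_t  hi = (int16_t)((s * c) >> 16);   // _mm_mulhi_epi16 lane
//   int32_t  product = ((int32_t)hi << 16) | lo;
//
// _mm_unpacklo_epi16(mul_lo, mul_hi) interleaves the halves so each 32-bit
// lane holds one full signed product, ready for _mm_add_epi32 accumulation.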

// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row[4]) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to zero out any extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in the following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // Four pixels in a column per iteration.
    __m128i accum0 = _mm_setzero_si128();
    __m128i accum1 = _mm_setzero_si128();
    __m128i accum2 = _mm_setzero_si128();
    __m128i accum3 = _mm_setzero_si128();
    int start = (filter_offset << 2);
    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
      __m128i coeff, coeff16lo, coeff16hi;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;

#define ITERATION(src, accum)                                          \
      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
      src16 = _mm_unpacklo_epi8(src8, zero);                           \
      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
      accum = _mm_add_epi32(accum, t);                                 \
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
      accum = _mm_add_epi32(accum, t);                                 \
      src16 = _mm_unpackhi_epi8(src8, zero);                           \
      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
      accum = _mm_add_epi32(accum, t);                                 \
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
      accum = _mm_add_epi32(accum, t)

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);

      start += 16;
      filter_values += 4;
    }

    int r = filter_length & 3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);

      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      /* c1 c1 c1 c1 c0 c0 c0 c0 */
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);
    }

    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum0 = _mm_packs_epi32(accum0, zero);
    accum0 = _mm_packus_epi16(accum0, zero);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_packs_epi32(accum1, zero);
    accum1 = _mm_packus_epi16(accum1, zero);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_packs_epi32(accum2, zero);
    accum2 = _mm_packus_epi16(accum2, zero);
    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
    accum3 = _mm_packs_epi32(accum3, zero);
    accum3 = _mm_packus_epi16(accum3, zero);

    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

    out_row[0] += 4;
    out_row[1] += 4;
    out_row[2] += 4;
    out_row[3] += 4;
  }
#endif
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
#if defined(SIMD_SSE2)
  int width = pixel_width & ~3;

  __m128i zero = _mm_setzero_si128();
  __m128i accum0, accum1, accum2, accum3, coeff16;
  const __m128i* src;
  // Output four pixels per iteration (16 bytes).
  for (int out_x = 0; out_x < width; out_x += 4) {

    // Accumulated result for each pixel. 32 bits per RGBA channel.
    accum0 = _mm_setzero_si128();
    accum1 = _mm_setzero_si128();
    accum2 = _mm_setzero_si128();
    accum3 = _mm_setzero_si128();

    // Convolve with one filter coefficient per iteration.
    for (int filter_y = 0; filter_y < filter_length; filter_y++) {

      // Duplicate the filter coefficient 8 times.
      // [16] cj cj cj cj cj cj cj cj
      coeff16 = _mm_set1_epi16(filter_values[filter_y]);

      // Load four pixels (16 bytes) together.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      src = reinterpret_cast<const __m128i*>(
          &source_data_rows[filter_y][out_x << 2]);
      __m128i src8 = _mm_loadu_si128(src);

      // Unpack the 1st and 2nd pixels from 8 bits to 16 bits for each channel
      // => multiply with the current coefficient => accumulate the result.
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0 b0 g0 r0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum0 = _mm_add_epi32(accum0, t);
      // [32] a1 b1 g1 r1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum1 = _mm_add_epi32(accum1, t);

      // Unpack the 3rd and 4th pixels from 8 bits to 16 bits for each channel
      // => multiply with the current coefficient => accumulate the result.
      // [16] a3 b3 g3 r3 a2 b2 g2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2 b2 g2 r2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum2 = _mm_add_epi32(accum2, t);
      // [32] a3 b3 g3 r3
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum3 = _mm_add_epi32(accum3, t);
    }

    // Shift right for fixed point implementation.
    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packs_epi32(accum0, accum1);
    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    accum2 = _mm_packs_epi32(accum2, accum3);

    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packus_epi16(accum0, accum2);

    if (has_alpha) {
      // Compute the max(ri, gi, bi) for each pixel.
      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
      __m128i a = _mm_srli_epi32(accum0, 8);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
      a = _mm_srli_epi32(accum0, 16);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      b = _mm_max_epu8(a, b);  // Max of r and g and b.
      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
      b = _mm_slli_epi32(b, 24);

      // Make sure the value of the alpha channel is always at least the
      // maximum value of the color channels.
      accum0 = _mm_max_epu8(b, accum0);
    } else {
      // Set value of alpha channels to 0xFF.
      __m128i mask = _mm_set1_epi32(0xff000000);
      accum0 = _mm_or_si128(accum0, mask);
    }

    // Store the convolution result (16 bytes) and advance the pixel pointers.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    out_row += 16;
  }

  // When the width of the output is not divisible by 4, we need to store one
  // pixel (4 bytes) at a time, and the fourth pixel is always absent.
  if (pixel_width & 3) {
    accum0 = _mm_setzero_si128();
    accum1 = _mm_setzero_si128();
    accum2 = _mm_setzero_si128();
    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      src = reinterpret_cast<const __m128i*>(
          &source_data_rows[filter_y][width << 2]);
      __m128i src8 = _mm_loadu_si128(src);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0 b0 g0 r0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum0 = _mm_add_epi32(accum0, t);
      // [32] a1 b1 g1 r1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum1 = _mm_add_epi32(accum1, t);
      // [16] a3 b3 g3 r3 a2 b2 g2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2 b2 g2 r2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum2 = _mm_add_epi32(accum2, t);
    }

    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packs_epi32(accum0, accum1);
    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    accum2 = _mm_packs_epi32(accum2, zero);
    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packus_epi16(accum0, accum2);
    if (has_alpha) {
      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
      __m128i a = _mm_srli_epi32(accum0, 8);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
      a = _mm_srli_epi32(accum0, 16);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      b = _mm_max_epu8(a, b);  // Max of r and g and b.
      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
      b = _mm_slli_epi32(b, 24);
      accum0 = _mm_max_epu8(b, accum0);
    } else {
      __m128i mask = _mm_set1_epi32(0xff000000);
      accum0 = _mm_or_si128(accum0, mask);
    }

    for (int out_x = width; out_x < pixel_width; out_x++) {
      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
      accum0 = _mm_srli_si128(accum0, 4);
      out_row += 4;
    }
  }
#endif
}
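
// How the branch-free alpha fix above works (illustrative, per 32-bit pixel,
// alpha in the top byte): shifting the packed pixel right by 8 brings g into
// r's byte position, so the first _mm_max_epu8 leaves max(r, g) in the low
// byte; shifting right by 16 brings b there, and a second max yields
// max(r, g, b). _mm_slli_epi32(b, 24) moves that byte up under alpha, and the
// final _mm_max_epu8 applies alpha = max(alpha, r, g, b) -- the same
// invariant the C version enforces with ClampTo8 and std::max.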

}  // namespace

// ConvolutionFilter1D -------------------------------------------------------

ConvolutionFilter1D::ConvolutionFilter1D()
    : max_filter_(0) {
}

ConvolutionFilter1D::~ConvolutionFilter1D() {
}

void ConvolutionFilter1D::AddFilter(int filter_offset,
                                    const float* filter_values,
                                    int filter_length) {
  SkASSERT(filter_length > 0);

  std::vector<Fixed> fixed_values;
  fixed_values.reserve(filter_length);

  for (int i = 0; i < filter_length; ++i)
    fixed_values.push_back(FloatToFixed(filter_values[i]));

  AddFilter(filter_offset, &fixed_values[0], filter_length);
}

void ConvolutionFilter1D::AddFilter(int filter_offset,
                                    const Fixed* filter_values,
                                    int filter_length) {
  // It is common for leading/trailing filter values to be zeros. In such
  // cases it is beneficial to only store the central factors.
  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
  // a 1080p image this optimization gives a ~10% speed improvement.
  int first_non_zero = 0;
  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
    first_non_zero++;

  if (first_non_zero < filter_length) {
    // Here we have at least one non-zero factor.
    int last_non_zero = filter_length - 1;
    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
      last_non_zero--;

    filter_offset += first_non_zero;
    filter_length = last_non_zero + 1 - first_non_zero;
    SkASSERT(filter_length > 0);

    for (int i = first_non_zero; i <= last_non_zero; i++)
      filter_values_.push_back(filter_values[i]);
  } else {
    // Here all the factors were zeroes.
    filter_length = 0;
  }

  FilterInstance instance;

  // We pushed filter_length elements onto filter_values_
  instance.data_location = (static_cast<int>(filter_values_.size()) -
                            filter_length);
  instance.offset = filter_offset;
  instance.length = filter_length;
  filters_.push_back(instance);

  max_filter_ = std::max(max_filter_, filter_length);
}
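
// Worked example of the trimming above (illustrative):
//
//   ConvolutionFilter1D::Fixed taps[5] =
//       {0, 0, FloatToFixed(0.25f), FloatToFixed(0.5f), 0};
//   filter.AddFilter(10, taps, 5);
//
// stores only the two non-zero factors, records offset = 12 (10 plus the 2
// leading zeros skipped) and length = 2, and leaves max_filter_ at least 2.
// A filter of all zeros is recorded with length = 0 and contributes nothing.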

void BGRAConvolve2D(const unsigned char* source_data,
                    int source_byte_row_stride,
                    bool source_has_alpha,
                    const ConvolutionFilter1D& filter_x,
                    const ConvolutionFilter1D& filter_y,
                    int output_byte_row_stride,
                    unsigned char* output,
                    bool use_sse2) {
#if !defined(SIMD_SSE2)
  // Even if we have runtime support for SSE2 instructions, since the binary
  // was not built with SSE2 support, we have to fall back to the C version.
  use_sse2 = false;
#endif

  int max_y_filter_size = filter_y.max_filter();

  // The next row in the input that we will generate a horizontally
  // convolved row for. If the filter doesn't start at the beginning of the
  // image (this is the case when we are only resizing a subset), then we
  // don't want to generate any output rows before that. Compute the starting
  // row for convolution as the first pixel for the first vertical filter.
  int filter_offset, filter_length;
  const ConvolutionFilter1D::Fixed* filter_values =
      filter_y.FilterForValue(0, &filter_offset, &filter_length);
  int next_x_row = filter_offset;

  // We loop over each row in the input doing a horizontal convolution. This
  // will result in a horizontally convolved image. We write the results into
  // a circular buffer of convolved rows and do vertical convolution as rows
  // are available. This prevents us from having to store the entire
  // intermediate image and helps cache coherency.
  // We will need four extra rows so that four horizontal convolutions can be
  // done simultaneously. We also pad each row in the row buffer so that it is
  // aligned up to 16 bytes.
  // TODO(jiesun): We do not use aligned load from row buffer in vertical
  // convolution pass yet. Somehow Windows does not like it.
  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
  CircularRowBuffer row_buffer(row_buffer_width,
                               row_buffer_height,
                               filter_offset);

  // Loop over every possible output row, processing just enough horizontal
  // convolutions to run each subsequent vertical convolution.
  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
  int num_output_rows = filter_y.num_values();

  // We need to check which is the last line to convolve before we advance 4
  // lines in one iteration.
  int last_filter_offset, last_filter_length;
  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
                          &last_filter_length);

  for (int out_y = 0; out_y < num_output_rows; out_y++) {
    filter_values = filter_y.FilterForValue(out_y,
                                            &filter_offset, &filter_length);

    // Generate output rows until we have enough to run the current filter.
    if (use_sse2) {
      while (next_x_row < filter_offset + filter_length) {
        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
          const unsigned char* src[4];
          unsigned char* out_row[4];
          for (int i = 0; i < 4; ++i) {
            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
            out_row[i] = row_buffer.AdvanceRow();
          }
          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
          next_x_row += 4;
        } else {
          // For the last row, the SSE2 load may access data beyond the
          // image area, therefore we use the C version here.
          if (next_x_row == last_filter_offset + last_filter_length - 1) {
            if (source_has_alpha) {
              ConvolveHorizontally<true>(
                  &source_data[next_x_row * source_byte_row_stride],
                  filter_x, row_buffer.AdvanceRow());
            } else {
              ConvolveHorizontally<false>(
                  &source_data[next_x_row * source_byte_row_stride],
                  filter_x, row_buffer.AdvanceRow());
            }
          } else {
            ConvolveHorizontally_SSE2(
                &source_data[next_x_row * source_byte_row_stride],
                filter_x, row_buffer.AdvanceRow());
          }
          next_x_row++;
        }
      }
    } else {
      while (next_x_row < filter_offset + filter_length) {
        if (source_has_alpha) {
          ConvolveHorizontally<true>(
              &source_data[next_x_row * source_byte_row_stride],
              filter_x, row_buffer.AdvanceRow());
        } else {
          ConvolveHorizontally<false>(
              &source_data[next_x_row * source_byte_row_stride],
              filter_x, row_buffer.AdvanceRow());
        }
        next_x_row++;
      }
    }

    // Compute where in the output image this row of final data will go.
    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

    // Get the list of rows that the circular buffer has, in order.
    int first_row_in_circular_buffer;
    unsigned char* const* rows_to_convolve =
        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

    // Now compute the start of the subset of those rows that the filter
    // needs.
    unsigned char* const* first_row_for_filter =
        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

    if (source_has_alpha) {
      if (use_sse2) {
        ConvolveVertically_SSE2<true>(filter_values, filter_length,
                                      first_row_for_filter,
                                      filter_x.num_values(), cur_output_row);
      } else {
        ConvolveVertically<true>(filter_values, filter_length,
                                 first_row_for_filter,
                                 filter_x.num_values(), cur_output_row);
      }
    } else {
      if (use_sse2) {
        ConvolveVertically_SSE2<false>(filter_values, filter_length,
                                       first_row_for_filter,
                                       filter_x.num_values(), cur_output_row);
      } else {
        ConvolveVertically<false>(filter_values, filter_length,
                                  first_row_for_filter,
                                  filter_x.num_values(), cur_output_row);
      }
    }
  }
}
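
// Usage sketch (illustrative; assumes the caller has already built matching
// filters, e.g. via resize logic that calls AddFilter() once per output
// column and once per output row -- the setup shown is schematic):
//
//   skia::ConvolutionFilter1D filter_x, filter_y;
//   // ... fill filters so that filter_x.num_values() == dest_width and
//   //     filter_y.num_values() == dest_height ...
//   std::vector<unsigned char> dest(dest_width * dest_height * 4);
//   skia::BGRAConvolve2D(src_pixels, src_width * 4,
//                        /* source_has_alpha = */ true,
//                        filter_x, filter_y, dest_width * 4, &dest[0],
//                        /* use_sse2 = */ false);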

}  // namespace skia
