Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | // Copyright (c) 2006-2011 The Chromium Authors. All rights reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Redistribution and use in source and binary forms, with or without |
michael@0 | 4 | // modification, are permitted provided that the following conditions |
michael@0 | 5 | // are met: |
michael@0 | 6 | // * Redistributions of source code must retain the above copyright |
michael@0 | 7 | // notice, this list of conditions and the following disclaimer. |
michael@0 | 8 | // * Redistributions in binary form must reproduce the above copyright |
michael@0 | 9 | // notice, this list of conditions and the following disclaimer in |
michael@0 | 10 | // the documentation and/or other materials provided with the |
michael@0 | 11 | // distribution. |
michael@0 | 12 | // * Neither the name of Google, Inc. nor the names of its contributors |
michael@0 | 13 | // may be used to endorse or promote products derived from this |
michael@0 | 14 | // software without specific prior written permission. |
michael@0 | 15 | // |
michael@0 | 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
michael@0 | 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
michael@0 | 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
michael@0 | 19 | // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
michael@0 | 20 | // COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
michael@0 | 21 | // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
michael@0 | 22 | // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
michael@0 | 23 | // OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
michael@0 | 24 | // AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
michael@0 | 25 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
michael@0 | 26 | // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
michael@0 | 27 | // SUCH DAMAGE. |
michael@0 | 28 | |
#include "convolver.h"

#include <algorithm>
#include <vector>

#include "skia/SkTypes.h"
michael@0 | 34 | |
michael@0 | 35 | // note: SIMD_SSE2 is not enabled because of bugs, apparently |
michael@0 | 36 | |
michael@0 | 37 | #if defined(SIMD_SSE2) |
michael@0 | 38 | #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h |
michael@0 | 39 | #endif |
michael@0 | 40 | |
michael@0 | 41 | #if defined(SK_CPU_LENDIAN) |
michael@0 | 42 | #define R_OFFSET_IDX 0 |
michael@0 | 43 | #define G_OFFSET_IDX 1 |
michael@0 | 44 | #define B_OFFSET_IDX 2 |
michael@0 | 45 | #define A_OFFSET_IDX 3 |
michael@0 | 46 | #else |
michael@0 | 47 | #define R_OFFSET_IDX 3 |
michael@0 | 48 | #define G_OFFSET_IDX 2 |
michael@0 | 49 | #define B_OFFSET_IDX 1 |
michael@0 | 50 | #define A_OFFSET_IDX 0 |
michael@0 | 51 | #endif |
michael@0 | 52 | |
michael@0 | 53 | namespace skia { |
michael@0 | 54 | |
michael@0 | 55 | namespace { |
michael@0 | 56 | |
// Clamps |a| into the range [0, 255] and narrows it to an 8-bit
// unsigned value.
inline unsigned char ClampTo8(int a) {
  // Fast path: any value already representable in a byte also satisfies
  // this unsigned comparison, so most calls take a single branch.
  if (static_cast<unsigned>(a) < 256)
    return static_cast<unsigned char>(a);
  return (a < 0) ? 0 : 255;
}
michael@0 | 66 | |
// Stores a list of rows in a circular buffer. The usage is you write into it
// by calling AdvanceRow. It will keep track of which row in the buffer it
// should use next, and the total number of rows added.
class CircularRowBuffer {
 public:
  // The number of pixels in each row is given in |dest_row_pixel_width|;
  // each pixel occupies 4 bytes. The maximum number of rows needed in the
  // buffer is |max_y_filter_size| (we only need to store enough rows for
  // the biggest filter).
  //
  // We use the |first_input_row| to compute the coordinates of all of the
  // following rows returned by AdvanceRow().
  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
                    int first_input_row)
      : row_byte_width_(dest_row_pixel_width * 4),
        num_rows_(max_y_filter_size),
        next_row_(0),
        next_row_coordinate_(first_input_row) {
    // Multiply in size_t so a very wide row combined with a large filter
    // cannot overflow int before reaching vector::resize.
    buffer_.resize(static_cast<size_t>(row_byte_width_) * num_rows_);
    row_addresses_.resize(num_rows_);
  }

  // Moves to the next row in the buffer, returning a pointer to the beginning
  // of it.
  unsigned char* AdvanceRow() {
    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
    next_row_coordinate_++;

    // Set the pointer to the next row to use, wrapping around if necessary.
    next_row_++;
    if (next_row_ == num_rows_)
      next_row_ = 0;
    return row;
  }

  // Returns a pointer to an "unrolled" array of rows. These rows will start
  // at the y coordinate placed into |*first_row_index| and will continue in
  // order for the maximum number of rows in this circular buffer.
  //
  // The |*first_row_index| may be negative. This means the circular buffer
  // starts before the top of the image (it hasn't been filled yet).
  unsigned char* const* GetRowAddresses(int* first_row_index) {
    // Example for a 4-element circular buffer holding coords 6-9.
    //   Row 0   Coord 8
    //   Row 1   Coord 9
    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
    //   Row 3   Coord 7
    //
    // The "next" row is also the first (lowest) coordinate. This computation
    // may yield a negative value, but that's OK, the math will work out
    // since the user of this buffer will compute the offset relative
    // to the first_row_index and the negative rows will never be used.
    *first_row_index = next_row_coordinate_ - num_rows_;

    int cur_row = next_row_;
    for (int i = 0; i < num_rows_; i++) {
      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];

      // Advance to the next row, wrapping if necessary.
      cur_row++;
      if (cur_row == num_rows_)
        cur_row = 0;
    }
    return &row_addresses_[0];
  }

 private:
  // The buffer storing the rows. They are packed, each one row_byte_width_.
  std::vector<unsigned char> buffer_;

  // Number of bytes per row in the |buffer_|.
  int row_byte_width_;

  // The number of rows available in the buffer.
  int num_rows_;

  // The next row index we should write into. This wraps around as the
  // circular buffer is used.
  int next_row_;

  // The y coordinate of the |next_row_|. This is incremented each time a
  // new row is appended and does not wrap.
  int next_row_coordinate_;

  // Scratch array returned by GetRowAddresses().
  std::vector<unsigned char*> row_addresses_;
};
michael@0 | 153 | |
michael@0 | 154 | // Convolves horizontally along a single row. The row data is given in |
michael@0 | 155 | // |src_data| and continues for the num_values() of the filter. |
michael@0 | 156 | template<bool has_alpha> |
michael@0 | 157 | // This function is miscompiled with gcc 4.5 with pgo. See bug 827946. |
michael@0 | 158 | #if defined(__GNUC__) && defined(MOZ_GCC_VERSION_AT_LEAST) |
michael@0 | 159 | #if MOZ_GCC_VERSION_AT_LEAST(4, 5, 0) && !MOZ_GCC_VERSION_AT_LEAST(4, 6, 0) |
michael@0 | 160 | __attribute__((optimize("-O1"))) |
michael@0 | 161 | #endif |
michael@0 | 162 | #endif |
michael@0 | 163 | void ConvolveHorizontally(const unsigned char* src_data, |
michael@0 | 164 | const ConvolutionFilter1D& filter, |
michael@0 | 165 | unsigned char* out_row) { |
michael@0 | 166 | // Loop over each pixel on this row in the output image. |
michael@0 | 167 | int num_values = filter.num_values(); |
michael@0 | 168 | for (int out_x = 0; out_x < num_values; out_x++) { |
michael@0 | 169 | // Get the filter that determines the current output pixel. |
michael@0 | 170 | int filter_offset, filter_length; |
michael@0 | 171 | const ConvolutionFilter1D::Fixed* filter_values = |
michael@0 | 172 | filter.FilterForValue(out_x, &filter_offset, &filter_length); |
michael@0 | 173 | |
michael@0 | 174 | // Compute the first pixel in this row that the filter affects. It will |
michael@0 | 175 | // touch |filter_length| pixels (4 bytes each) after this. |
michael@0 | 176 | const unsigned char* row_to_filter = &src_data[filter_offset * 4]; |
michael@0 | 177 | |
michael@0 | 178 | // Apply the filter to the row to get the destination pixel in |accum|. |
michael@0 | 179 | int accum[4] = {0}; |
michael@0 | 180 | for (int filter_x = 0; filter_x < filter_length; filter_x++) { |
michael@0 | 181 | ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x]; |
michael@0 | 182 | accum[0] += cur_filter * row_to_filter[filter_x * 4 + R_OFFSET_IDX]; |
michael@0 | 183 | accum[1] += cur_filter * row_to_filter[filter_x * 4 + G_OFFSET_IDX]; |
michael@0 | 184 | accum[2] += cur_filter * row_to_filter[filter_x * 4 + B_OFFSET_IDX]; |
michael@0 | 185 | if (has_alpha) |
michael@0 | 186 | accum[3] += cur_filter * row_to_filter[filter_x * 4 + A_OFFSET_IDX]; |
michael@0 | 187 | } |
michael@0 | 188 | |
michael@0 | 189 | // Bring this value back in range. All of the filter scaling factors |
michael@0 | 190 | // are in fixed point with kShiftBits bits of fractional part. |
michael@0 | 191 | accum[0] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 192 | accum[1] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 193 | accum[2] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 194 | if (has_alpha) |
michael@0 | 195 | accum[3] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 196 | |
michael@0 | 197 | // Store the new pixel. |
michael@0 | 198 | out_row[out_x * 4 + R_OFFSET_IDX] = ClampTo8(accum[0]); |
michael@0 | 199 | out_row[out_x * 4 + G_OFFSET_IDX] = ClampTo8(accum[1]); |
michael@0 | 200 | out_row[out_x * 4 + B_OFFSET_IDX] = ClampTo8(accum[2]); |
michael@0 | 201 | if (has_alpha) |
michael@0 | 202 | out_row[out_x * 4 + A_OFFSET_IDX] = ClampTo8(accum[3]); |
michael@0 | 203 | } |
michael@0 | 204 | } |
michael@0 | 205 | |
michael@0 | 206 | // Does vertical convolution to produce one output row. The filter values and |
michael@0 | 207 | // length are given in the first two parameters. These are applied to each |
michael@0 | 208 | // of the rows pointed to in the |source_data_rows| array, with each row |
michael@0 | 209 | // being |pixel_width| wide. |
michael@0 | 210 | // |
michael@0 | 211 | // The output must have room for |pixel_width * 4| bytes. |
michael@0 | 212 | template<bool has_alpha> |
michael@0 | 213 | void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values, |
michael@0 | 214 | int filter_length, |
michael@0 | 215 | unsigned char* const* source_data_rows, |
michael@0 | 216 | int pixel_width, |
michael@0 | 217 | unsigned char* out_row) { |
michael@0 | 218 | // We go through each column in the output and do a vertical convolution, |
michael@0 | 219 | // generating one output pixel each time. |
michael@0 | 220 | for (int out_x = 0; out_x < pixel_width; out_x++) { |
michael@0 | 221 | // Compute the number of bytes over in each row that the current column |
michael@0 | 222 | // we're convolving starts at. The pixel will cover the next 4 bytes. |
michael@0 | 223 | int byte_offset = out_x * 4; |
michael@0 | 224 | |
michael@0 | 225 | // Apply the filter to one column of pixels. |
michael@0 | 226 | int accum[4] = {0}; |
michael@0 | 227 | for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
michael@0 | 228 | ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y]; |
michael@0 | 229 | accum[0] += cur_filter |
michael@0 | 230 | * source_data_rows[filter_y][byte_offset + R_OFFSET_IDX]; |
michael@0 | 231 | accum[1] += cur_filter |
michael@0 | 232 | * source_data_rows[filter_y][byte_offset + G_OFFSET_IDX]; |
michael@0 | 233 | accum[2] += cur_filter |
michael@0 | 234 | * source_data_rows[filter_y][byte_offset + B_OFFSET_IDX]; |
michael@0 | 235 | if (has_alpha) |
michael@0 | 236 | accum[3] += cur_filter |
michael@0 | 237 | * source_data_rows[filter_y][byte_offset + A_OFFSET_IDX]; |
michael@0 | 238 | } |
michael@0 | 239 | |
michael@0 | 240 | // Bring this value back in range. All of the filter scaling factors |
michael@0 | 241 | // are in fixed point with kShiftBits bits of precision. |
michael@0 | 242 | accum[0] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 243 | accum[1] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 244 | accum[2] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 245 | if (has_alpha) |
michael@0 | 246 | accum[3] >>= ConvolutionFilter1D::kShiftBits; |
michael@0 | 247 | |
michael@0 | 248 | // Store the new pixel. |
michael@0 | 249 | out_row[byte_offset + R_OFFSET_IDX] = ClampTo8(accum[0]); |
michael@0 | 250 | out_row[byte_offset + G_OFFSET_IDX] = ClampTo8(accum[1]); |
michael@0 | 251 | out_row[byte_offset + B_OFFSET_IDX] = ClampTo8(accum[2]); |
michael@0 | 252 | if (has_alpha) { |
michael@0 | 253 | unsigned char alpha = ClampTo8(accum[3]); |
michael@0 | 254 | |
michael@0 | 255 | // Make sure the alpha channel doesn't come out smaller than any of the |
michael@0 | 256 | // color channels. We use premultipled alpha channels, so this should |
michael@0 | 257 | // never happen, but rounding errors will cause this from time to time. |
michael@0 | 258 | // These "impossible" colors will cause overflows (and hence random pixel |
michael@0 | 259 | // values) when the resulting bitmap is drawn to the screen. |
michael@0 | 260 | // |
michael@0 | 261 | // We only need to do this when generating the final output row (here). |
michael@0 | 262 | int max_color_channel = std::max(out_row[byte_offset + R_OFFSET_IDX], |
michael@0 | 263 | std::max(out_row[byte_offset + G_OFFSET_IDX], out_row[byte_offset + B_OFFSET_IDX])); |
michael@0 | 264 | if (alpha < max_color_channel) |
michael@0 | 265 | out_row[byte_offset + A_OFFSET_IDX] = max_color_channel; |
michael@0 | 266 | else |
michael@0 | 267 | out_row[byte_offset + A_OFFSET_IDX] = alpha; |
michael@0 | 268 | } else { |
michael@0 | 269 | // No alpha channel, the image is opaque. |
michael@0 | 270 | out_row[byte_offset + A_OFFSET_IDX] = 0xff; |
michael@0 | 271 | } |
michael@0 | 272 | } |
michael@0 | 273 | } |
michael@0 | 274 | |
michael@0 | 275 | |
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// SSE2 version of |ConvolveHorizontally|; when SIMD_SSE2 is not defined
// the whole body is compiled out and this function is a no-op.
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
                               const ConvolutionFilter1D& filter,
                               unsigned char* out_row) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // 32-bit fixed-point accumulator, one lane per RGBA channel.
    __m128i accum = _mm_setzero_si128();

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filter_length| pixels (4 bytes each) after this.
    const __m128i* row_to_filter =
        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
      __m128i coeff, coeff16;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Load four pixels => unpack the first two pixels to 16 bits =>
      // multiply with coefficients => accumulate the convolution result.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      // mulhi/mullo give the high and low 16 bits of the 16x16 products;
      // interleaving them reconstructs the full 32-bit products.
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0*c0 b0*c0 g0*c0 r0*c0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a1*c1 b1*c1 g1*c1 r1*c1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Duplicate 3rd and 4th coefficients for all channels =>
      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
      // => accumulate the convolution results.
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      // [16] a3 g3 b3 r3 a2 g2 b2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2*c2 b2*c2 g2*c2 r2*c2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a3*c3 b3*c3 g3*c3 r3*c3
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Advance the pixel and coefficients pointers.
      row_to_filter += 1;
      filter_values += 4;
    }

    // When |filter_length| is not divisible by 4, we need to decimate some of
    // the filter coefficient that was loaded incorrectly to zero; Other than
    // that the algorithm is same with above, except that the 4th pixel will be
    // always absent.
    int r = filter_length&3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff, coeff16;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Note: line buffer must be padded to align_up(filter_offset, 16).
      // We resolve this by use C-version for the last horizontal line.
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Third (and, because of the mask, possibly zeroed) pixel pair.
      src16 = _mm_unpackhi_epi8(src8, zero);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
    }

    // Shift right for fixed point implementation.
    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    accum = _mm_packs_epi32(accum, zero);
    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    accum = _mm_packus_epi16(accum, zero);

    // Store the pixel value of 32 bits.
    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    out_row += 4;
  }
#endif
}
michael@0 | 404 | |
// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments. When SIMD_SSE2 is not
// defined the whole body is compiled out and this function is a no-op.
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row[4]) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // Four accumulators, one per source row (four pixels in a column
    // per iteration).
    __m128i accum0 = _mm_setzero_si128();
    __m128i accum1 = _mm_setzero_si128();
    __m128i accum2 = _mm_setzero_si128();
    __m128i accum3 = _mm_setzero_si128();
    // Byte offset of the first source pixel the filter touches.
    int start = (filter_offset<<2);
    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
      __m128i coeff, coeff16lo, coeff16hi;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;

// Accumulates four source pixels starting at |src| into |accum|, using the
// coefficient registers prepared above (same math as the single-row version).
#define ITERATION(src, accum)                                      \
      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
      src16 = _mm_unpacklo_epi8(src8, zero);                       \
      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                  \
      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                  \
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                      \
      accum = _mm_add_epi32(accum, t);                             \
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                      \
      accum = _mm_add_epi32(accum, t);                             \
      src16 = _mm_unpackhi_epi8(src8, zero);                       \
      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                  \
      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                  \
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                      \
      accum = _mm_add_epi32(accum, t);                             \
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                      \
      accum = _mm_add_epi32(accum, t)

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);

      start += 16;
      filter_values += 4;
    }

    // Handle the leftover 1-3 taps, masking the garbage coefficients to zero.
    int r = filter_length & 3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8);
      __m128i coeff;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);

      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      /* c1 c1 c1 c1 c0 c0 c0 c0 */
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);
    }

    // Drop the fixed-point fractional bits and saturate each accumulator
    // down to 8 bits per channel (signed then unsigned saturation).
    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum0 = _mm_packs_epi32(accum0, zero);
    accum0 = _mm_packus_epi16(accum0, zero);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_packs_epi32(accum1, zero);
    accum1 = _mm_packus_epi16(accum1, zero);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_packs_epi32(accum2, zero);
    accum2 = _mm_packus_epi16(accum2, zero);
    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
    accum3 = _mm_packs_epi32(accum3, zero);
    accum3 = _mm_packus_epi16(accum3, zero);

    // Store one finished 32-bit pixel into each of the four output rows.
    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

    out_row[0] += 4;
    out_row[1] += 4;
    out_row[2] += 4;
    out_row[3] += 4;
  }
#endif
}
michael@0 | 525 | |
michael@0 | 526 | // Does vertical convolution to produce one output row. The filter values and |
michael@0 | 527 | // length are given in the first two parameters. These are applied to each |
michael@0 | 528 | // of the rows pointed to in the |source_data_rows| array, with each row |
michael@0 | 529 | // being |pixel_width| wide. |
michael@0 | 530 | // |
michael@0 | 531 | // The output must have room for |pixel_width * 4| bytes. |
michael@0 | 532 | template<bool has_alpha> |
michael@0 | 533 | void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, |
michael@0 | 534 | int filter_length, |
michael@0 | 535 | unsigned char* const* source_data_rows, |
michael@0 | 536 | int pixel_width, |
michael@0 | 537 | unsigned char* out_row) { |
michael@0 | 538 | #if defined(SIMD_SSE2) |
michael@0 | 539 | int width = pixel_width & ~3; |
michael@0 | 540 | |
michael@0 | 541 | __m128i zero = _mm_setzero_si128(); |
michael@0 | 542 | __m128i accum0, accum1, accum2, accum3, coeff16; |
michael@0 | 543 | const __m128i* src; |
michael@0 | 544 | // Output four pixels per iteration (16 bytes). |
michael@0 | 545 | for (int out_x = 0; out_x < width; out_x += 4) { |
michael@0 | 546 | |
michael@0 | 547 | // Accumulated result for each pixel. 32 bits per RGBA channel. |
michael@0 | 548 | accum0 = _mm_setzero_si128(); |
michael@0 | 549 | accum1 = _mm_setzero_si128(); |
michael@0 | 550 | accum2 = _mm_setzero_si128(); |
michael@0 | 551 | accum3 = _mm_setzero_si128(); |
michael@0 | 552 | |
michael@0 | 553 | // Convolve with one filter coefficient per iteration. |
michael@0 | 554 | for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
michael@0 | 555 | |
michael@0 | 556 | // Duplicate the filter coefficient 8 times. |
michael@0 | 557 | // [16] cj cj cj cj cj cj cj cj |
michael@0 | 558 | coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
michael@0 | 559 | |
michael@0 | 560 | // Load four pixels (16 bytes) together. |
michael@0 | 561 | // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 562 | src = reinterpret_cast<const __m128i*>( |
michael@0 | 563 | &source_data_rows[filter_y][out_x << 2]); |
michael@0 | 564 | __m128i src8 = _mm_loadu_si128(src); |
michael@0 | 565 | |
michael@0 | 566 | // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => |
michael@0 | 567 | // multiply with current coefficient => accumulate the result. |
michael@0 | 568 | // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 569 | __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
michael@0 | 570 | __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
michael@0 | 571 | __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
michael@0 | 572 | // [32] a0 b0 g0 r0 |
michael@0 | 573 | __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
michael@0 | 574 | accum0 = _mm_add_epi32(accum0, t); |
michael@0 | 575 | // [32] a1 b1 g1 r1 |
michael@0 | 576 | t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
michael@0 | 577 | accum1 = _mm_add_epi32(accum1, t); |
michael@0 | 578 | |
michael@0 | 579 | // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => |
michael@0 | 580 | // multiply with current coefficient => accumulate the result. |
michael@0 | 581 | // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
michael@0 | 582 | src16 = _mm_unpackhi_epi8(src8, zero); |
michael@0 | 583 | mul_hi = _mm_mulhi_epi16(src16, coeff16); |
michael@0 | 584 | mul_lo = _mm_mullo_epi16(src16, coeff16); |
michael@0 | 585 | // [32] a2 b2 g2 r2 |
michael@0 | 586 | t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
michael@0 | 587 | accum2 = _mm_add_epi32(accum2, t); |
michael@0 | 588 | // [32] a3 b3 g3 r3 |
michael@0 | 589 | t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
michael@0 | 590 | accum3 = _mm_add_epi32(accum3, t); |
michael@0 | 591 | } |
michael@0 | 592 | |
michael@0 | 593 | // Shift right for fixed point implementation. |
michael@0 | 594 | accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); |
michael@0 | 595 | accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); |
michael@0 | 596 | accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); |
michael@0 | 597 | accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); |
michael@0 | 598 | |
michael@0 | 599 | // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
michael@0 | 600 | // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 601 | accum0 = _mm_packs_epi32(accum0, accum1); |
michael@0 | 602 | // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
michael@0 | 603 | accum2 = _mm_packs_epi32(accum2, accum3); |
michael@0 | 604 | |
michael@0 | 605 | // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
michael@0 | 606 | // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 607 | accum0 = _mm_packus_epi16(accum0, accum2); |
michael@0 | 608 | |
michael@0 | 609 | if (has_alpha) { |
michael@0 | 610 | // Compute the max(ri, gi, bi) for each pixel. |
michael@0 | 611 | // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
michael@0 | 612 | __m128i a = _mm_srli_epi32(accum0, 8); |
michael@0 | 613 | // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
michael@0 | 614 | __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
michael@0 | 615 | // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
michael@0 | 616 | a = _mm_srli_epi32(accum0, 16); |
michael@0 | 617 | // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
michael@0 | 618 | b = _mm_max_epu8(a, b); // Max of r and g and b. |
michael@0 | 619 | // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
michael@0 | 620 | b = _mm_slli_epi32(b, 24); |
michael@0 | 621 | |
michael@0 | 622 | // Make sure the value of alpha channel is always larger than maximum |
michael@0 | 623 | // value of color channels. |
michael@0 | 624 | accum0 = _mm_max_epu8(b, accum0); |
michael@0 | 625 | } else { |
michael@0 | 626 | // Set value of alpha channels to 0xFF. |
michael@0 | 627 | __m128i mask = _mm_set1_epi32(0xff000000); |
michael@0 | 628 | accum0 = _mm_or_si128(accum0, mask); |
michael@0 | 629 | } |
michael@0 | 630 | |
michael@0 | 631 | // Store the convolution result (16 bytes) and advance the pixel pointers. |
michael@0 | 632 | _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |
michael@0 | 633 | out_row += 16; |
michael@0 | 634 | } |
michael@0 | 635 | |
michael@0 | 636 | // When the width of the output is not divisible by 4, We need to save one |
michael@0 | 637 | // pixel (4 bytes) each time. And also the fourth pixel is always absent. |
michael@0 | 638 | if (pixel_width & 3) { |
michael@0 | 639 | accum0 = _mm_setzero_si128(); |
michael@0 | 640 | accum1 = _mm_setzero_si128(); |
michael@0 | 641 | accum2 = _mm_setzero_si128(); |
michael@0 | 642 | for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |
michael@0 | 643 | coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
michael@0 | 644 | // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 645 | src = reinterpret_cast<const __m128i*>( |
michael@0 | 646 | &source_data_rows[filter_y][width<<2]); |
michael@0 | 647 | __m128i src8 = _mm_loadu_si128(src); |
michael@0 | 648 | // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 649 | __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
michael@0 | 650 | __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
michael@0 | 651 | __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
michael@0 | 652 | // [32] a0 b0 g0 r0 |
michael@0 | 653 | __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
michael@0 | 654 | accum0 = _mm_add_epi32(accum0, t); |
michael@0 | 655 | // [32] a1 b1 g1 r1 |
michael@0 | 656 | t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
michael@0 | 657 | accum1 = _mm_add_epi32(accum1, t); |
michael@0 | 658 | // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
michael@0 | 659 | src16 = _mm_unpackhi_epi8(src8, zero); |
michael@0 | 660 | mul_hi = _mm_mulhi_epi16(src16, coeff16); |
michael@0 | 661 | mul_lo = _mm_mullo_epi16(src16, coeff16); |
michael@0 | 662 | // [32] a2 b2 g2 r2 |
michael@0 | 663 | t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
michael@0 | 664 | accum2 = _mm_add_epi32(accum2, t); |
michael@0 | 665 | } |
michael@0 | 666 | |
michael@0 | 667 | accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); |
michael@0 | 668 | accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); |
michael@0 | 669 | accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); |
michael@0 | 670 | // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 671 | accum0 = _mm_packs_epi32(accum0, accum1); |
michael@0 | 672 | // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
michael@0 | 673 | accum2 = _mm_packs_epi32(accum2, zero); |
michael@0 | 674 | // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
michael@0 | 675 | accum0 = _mm_packus_epi16(accum0, accum2); |
michael@0 | 676 | if (has_alpha) { |
michael@0 | 677 | // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
michael@0 | 678 | __m128i a = _mm_srli_epi32(accum0, 8); |
michael@0 | 679 | // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
michael@0 | 680 | __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
michael@0 | 681 | // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
michael@0 | 682 | a = _mm_srli_epi32(accum0, 16); |
michael@0 | 683 | // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
michael@0 | 684 | b = _mm_max_epu8(a, b); // Max of r and g and b. |
michael@0 | 685 | // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
michael@0 | 686 | b = _mm_slli_epi32(b, 24); |
michael@0 | 687 | accum0 = _mm_max_epu8(b, accum0); |
michael@0 | 688 | } else { |
michael@0 | 689 | __m128i mask = _mm_set1_epi32(0xff000000); |
michael@0 | 690 | accum0 = _mm_or_si128(accum0, mask); |
michael@0 | 691 | } |
michael@0 | 692 | |
michael@0 | 693 | for (int out_x = width; out_x < pixel_width; out_x++) { |
michael@0 | 694 | *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |
michael@0 | 695 | accum0 = _mm_srli_si128(accum0, 4); |
michael@0 | 696 | out_row += 4; |
michael@0 | 697 | } |
michael@0 | 698 | } |
michael@0 | 699 | #endif |
michael@0 | 700 | } |
michael@0 | 701 | |
michael@0 | 702 | } // namespace |
michael@0 | 703 | |
michael@0 | 704 | // ConvolutionFilter1D --------------------------------------------------------- |
michael@0 | 705 | |
michael@0 | 706 | ConvolutionFilter1D::ConvolutionFilter1D() |
michael@0 | 707 | : max_filter_(0) { |
michael@0 | 708 | } |
michael@0 | 709 | |
michael@0 | 710 | ConvolutionFilter1D::~ConvolutionFilter1D() { |
michael@0 | 711 | } |
michael@0 | 712 | |
michael@0 | 713 | void ConvolutionFilter1D::AddFilter(int filter_offset, |
michael@0 | 714 | const float* filter_values, |
michael@0 | 715 | int filter_length) { |
michael@0 | 716 | SkASSERT(filter_length > 0); |
michael@0 | 717 | |
michael@0 | 718 | std::vector<Fixed> fixed_values; |
michael@0 | 719 | fixed_values.reserve(filter_length); |
michael@0 | 720 | |
michael@0 | 721 | for (int i = 0; i < filter_length; ++i) |
michael@0 | 722 | fixed_values.push_back(FloatToFixed(filter_values[i])); |
michael@0 | 723 | |
michael@0 | 724 | AddFilter(filter_offset, &fixed_values[0], filter_length); |
michael@0 | 725 | } |
michael@0 | 726 | |
michael@0 | 727 | void ConvolutionFilter1D::AddFilter(int filter_offset, |
michael@0 | 728 | const Fixed* filter_values, |
michael@0 | 729 | int filter_length) { |
michael@0 | 730 | // It is common for leading/trailing filter values to be zeros. In such |
michael@0 | 731 | // cases it is beneficial to only store the central factors. |
michael@0 | 732 | // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on |
michael@0 | 733 | // a 1080p image this optimization gives a ~10% speed improvement. |
michael@0 | 734 | int first_non_zero = 0; |
michael@0 | 735 | while (first_non_zero < filter_length && filter_values[first_non_zero] == 0) |
michael@0 | 736 | first_non_zero++; |
michael@0 | 737 | |
michael@0 | 738 | if (first_non_zero < filter_length) { |
michael@0 | 739 | // Here we have at least one non-zero factor. |
michael@0 | 740 | int last_non_zero = filter_length - 1; |
michael@0 | 741 | while (last_non_zero >= 0 && filter_values[last_non_zero] == 0) |
michael@0 | 742 | last_non_zero--; |
michael@0 | 743 | |
michael@0 | 744 | filter_offset += first_non_zero; |
michael@0 | 745 | filter_length = last_non_zero + 1 - first_non_zero; |
michael@0 | 746 | SkASSERT(filter_length > 0); |
michael@0 | 747 | |
michael@0 | 748 | for (int i = first_non_zero; i <= last_non_zero; i++) |
michael@0 | 749 | filter_values_.push_back(filter_values[i]); |
michael@0 | 750 | } else { |
michael@0 | 751 | // Here all the factors were zeroes. |
michael@0 | 752 | filter_length = 0; |
michael@0 | 753 | } |
michael@0 | 754 | |
michael@0 | 755 | FilterInstance instance; |
michael@0 | 756 | |
michael@0 | 757 | // We pushed filter_length elements onto filter_values_ |
michael@0 | 758 | instance.data_location = (static_cast<int>(filter_values_.size()) - |
michael@0 | 759 | filter_length); |
michael@0 | 760 | instance.offset = filter_offset; |
michael@0 | 761 | instance.length = filter_length; |
michael@0 | 762 | filters_.push_back(instance); |
michael@0 | 763 | |
michael@0 | 764 | max_filter_ = std::max(max_filter_, filter_length); |
michael@0 | 765 | } |
michael@0 | 766 | |
michael@0 | 767 | void BGRAConvolve2D(const unsigned char* source_data, |
michael@0 | 768 | int source_byte_row_stride, |
michael@0 | 769 | bool source_has_alpha, |
michael@0 | 770 | const ConvolutionFilter1D& filter_x, |
michael@0 | 771 | const ConvolutionFilter1D& filter_y, |
michael@0 | 772 | int output_byte_row_stride, |
michael@0 | 773 | unsigned char* output, |
michael@0 | 774 | bool use_sse2) { |
michael@0 | 775 | #if !defined(SIMD_SSE2) |
michael@0 | 776 | // Even we have runtime support for SSE2 instructions, since the binary |
michael@0 | 777 | // was not built with SSE2 support, we had to fallback to C version. |
michael@0 | 778 | use_sse2 = false; |
michael@0 | 779 | #endif |
michael@0 | 780 | |
michael@0 | 781 | int max_y_filter_size = filter_y.max_filter(); |
michael@0 | 782 | |
michael@0 | 783 | // The next row in the input that we will generate a horizontally |
michael@0 | 784 | // convolved row for. If the filter doesn't start at the beginning of the |
michael@0 | 785 | // image (this is the case when we are only resizing a subset), then we |
michael@0 | 786 | // don't want to generate any output rows before that. Compute the starting |
michael@0 | 787 | // row for convolution as the first pixel for the first vertical filter. |
michael@0 | 788 | int filter_offset, filter_length; |
michael@0 | 789 | const ConvolutionFilter1D::Fixed* filter_values = |
michael@0 | 790 | filter_y.FilterForValue(0, &filter_offset, &filter_length); |
michael@0 | 791 | int next_x_row = filter_offset; |
michael@0 | 792 | |
michael@0 | 793 | // We loop over each row in the input doing a horizontal convolution. This |
michael@0 | 794 | // will result in a horizontally convolved image. We write the results into |
michael@0 | 795 | // a circular buffer of convolved rows and do vertical convolution as rows |
michael@0 | 796 | // are available. This prevents us from having to store the entire |
michael@0 | 797 | // intermediate image and helps cache coherency. |
michael@0 | 798 | // We will need four extra rows to allow horizontal convolution could be done |
michael@0 | 799 | // simultaneously. We also padding each row in row buffer to be aligned-up to |
michael@0 | 800 | // 16 bytes. |
michael@0 | 801 | // TODO(jiesun): We do not use aligned load from row buffer in vertical |
michael@0 | 802 | // convolution pass yet. Somehow Windows does not like it. |
michael@0 | 803 | int row_buffer_width = (filter_x.num_values() + 15) & ~0xF; |
michael@0 | 804 | int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0); |
michael@0 | 805 | CircularRowBuffer row_buffer(row_buffer_width, |
michael@0 | 806 | row_buffer_height, |
michael@0 | 807 | filter_offset); |
michael@0 | 808 | |
michael@0 | 809 | // Loop over every possible output row, processing just enough horizontal |
michael@0 | 810 | // convolutions to run each subsequent vertical convolution. |
michael@0 | 811 | SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4); |
michael@0 | 812 | int num_output_rows = filter_y.num_values(); |
michael@0 | 813 | |
michael@0 | 814 | // We need to check which is the last line to convolve before we advance 4 |
michael@0 | 815 | // lines in one iteration. |
michael@0 | 816 | int last_filter_offset, last_filter_length; |
michael@0 | 817 | filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset, |
michael@0 | 818 | &last_filter_length); |
michael@0 | 819 | |
michael@0 | 820 | for (int out_y = 0; out_y < num_output_rows; out_y++) { |
michael@0 | 821 | filter_values = filter_y.FilterForValue(out_y, |
michael@0 | 822 | &filter_offset, &filter_length); |
michael@0 | 823 | |
michael@0 | 824 | // Generate output rows until we have enough to run the current filter. |
michael@0 | 825 | if (use_sse2) { |
michael@0 | 826 | while (next_x_row < filter_offset + filter_length) { |
michael@0 | 827 | if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) { |
michael@0 | 828 | const unsigned char* src[4]; |
michael@0 | 829 | unsigned char* out_row[4]; |
michael@0 | 830 | for (int i = 0; i < 4; ++i) { |
michael@0 | 831 | src[i] = &source_data[(next_x_row + i) * source_byte_row_stride]; |
michael@0 | 832 | out_row[i] = row_buffer.AdvanceRow(); |
michael@0 | 833 | } |
michael@0 | 834 | ConvolveHorizontally4_SSE2(src, filter_x, out_row); |
michael@0 | 835 | next_x_row += 4; |
michael@0 | 836 | } else { |
michael@0 | 837 | // For the last row, SSE2 load possibly to access data beyond the |
michael@0 | 838 | // image area. therefore we use C version here. |
michael@0 | 839 | if (next_x_row == last_filter_offset + last_filter_length - 1) { |
michael@0 | 840 | if (source_has_alpha) { |
michael@0 | 841 | ConvolveHorizontally<true>( |
michael@0 | 842 | &source_data[next_x_row * source_byte_row_stride], |
michael@0 | 843 | filter_x, row_buffer.AdvanceRow()); |
michael@0 | 844 | } else { |
michael@0 | 845 | ConvolveHorizontally<false>( |
michael@0 | 846 | &source_data[next_x_row * source_byte_row_stride], |
michael@0 | 847 | filter_x, row_buffer.AdvanceRow()); |
michael@0 | 848 | } |
michael@0 | 849 | } else { |
michael@0 | 850 | ConvolveHorizontally_SSE2( |
michael@0 | 851 | &source_data[next_x_row * source_byte_row_stride], |
michael@0 | 852 | filter_x, row_buffer.AdvanceRow()); |
michael@0 | 853 | } |
michael@0 | 854 | next_x_row++; |
michael@0 | 855 | } |
michael@0 | 856 | } |
michael@0 | 857 | } else { |
michael@0 | 858 | while (next_x_row < filter_offset + filter_length) { |
michael@0 | 859 | if (source_has_alpha) { |
michael@0 | 860 | ConvolveHorizontally<true>( |
michael@0 | 861 | &source_data[next_x_row * source_byte_row_stride], |
michael@0 | 862 | filter_x, row_buffer.AdvanceRow()); |
michael@0 | 863 | } else { |
michael@0 | 864 | ConvolveHorizontally<false>( |
michael@0 | 865 | &source_data[next_x_row * source_byte_row_stride], |
michael@0 | 866 | filter_x, row_buffer.AdvanceRow()); |
michael@0 | 867 | } |
michael@0 | 868 | next_x_row++; |
michael@0 | 869 | } |
michael@0 | 870 | } |
michael@0 | 871 | |
michael@0 | 872 | // Compute where in the output image this row of final data will go. |
michael@0 | 873 | unsigned char* cur_output_row = &output[out_y * output_byte_row_stride]; |
michael@0 | 874 | |
michael@0 | 875 | // Get the list of rows that the circular buffer has, in order. |
michael@0 | 876 | int first_row_in_circular_buffer; |
michael@0 | 877 | unsigned char* const* rows_to_convolve = |
michael@0 | 878 | row_buffer.GetRowAddresses(&first_row_in_circular_buffer); |
michael@0 | 879 | |
michael@0 | 880 | // Now compute the start of the subset of those rows that the filter |
michael@0 | 881 | // needs. |
michael@0 | 882 | unsigned char* const* first_row_for_filter = |
michael@0 | 883 | &rows_to_convolve[filter_offset - first_row_in_circular_buffer]; |
michael@0 | 884 | |
michael@0 | 885 | if (source_has_alpha) { |
michael@0 | 886 | if (use_sse2) { |
michael@0 | 887 | ConvolveVertically_SSE2<true>(filter_values, filter_length, |
michael@0 | 888 | first_row_for_filter, |
michael@0 | 889 | filter_x.num_values(), cur_output_row); |
michael@0 | 890 | } else { |
michael@0 | 891 | ConvolveVertically<true>(filter_values, filter_length, |
michael@0 | 892 | first_row_for_filter, |
michael@0 | 893 | filter_x.num_values(), cur_output_row); |
michael@0 | 894 | } |
michael@0 | 895 | } else { |
michael@0 | 896 | if (use_sse2) { |
michael@0 | 897 | ConvolveVertically_SSE2<false>(filter_values, filter_length, |
michael@0 | 898 | first_row_for_filter, |
michael@0 | 899 | filter_x.num_values(), cur_output_row); |
michael@0 | 900 | } else { |
michael@0 | 901 | ConvolveVertically<false>(filter_values, filter_length, |
michael@0 | 902 | first_row_for_filter, |
michael@0 | 903 | filter_x.num_values(), cur_output_row); |
michael@0 | 904 | } |
michael@0 | 905 | } |
michael@0 | 906 | } |
michael@0 | 907 | } |
michael@0 | 908 | |
michael@0 | 909 | } // namespace skia |