The Tor Browser: gfx/2d/convolver.cpp@97036ab72558

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with the disk-avoidance requirements documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 // Copyright (c) 2006-2011 The Chromium Authors. All rights reserved.

     2 //

     3 // Redistribution and use in source and binary forms, with or without

     4 // modification, are permitted provided that the following conditions

     5 // are met:

     6 //  * Redistributions of source code must retain the above copyright

     7 //    notice, this list of conditions and the following disclaimer.

     8 //  * Redistributions in binary form must reproduce the above copyright

     9 //    notice, this list of conditions and the following disclaimer in

    10 //    the documentation and/or other materials provided with the

    11 //    distribution.

    12 //  * Neither the name of Google, Inc. nor the names of its contributors

    13 //    may be used to endorse or promote products derived from this

    14 //    software without specific prior written permission.

    15 //

    16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

    17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

    18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

    19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

    20 // COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

    21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

    22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

    23 // OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED

    24 // AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

    25 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

    26 // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

    27 // SUCH DAMAGE.

    29 #include "convolver.h"

    31 #include <algorithm>

    33 #include "skia/SkTypes.h"

    35 // note: SIMD_SSE2 is not enabled because of bugs, apparently

    37 #if defined(SIMD_SSE2)

    38 #include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h

    39 #endif

    // Byte position of each colour channel inside a 4-byte pixel. These values
    // are added to a pixel's byte offset wherever the convolution code reads or
    // writes a single channel. The order is reversed when SK_CPU_LENDIAN is not
    // defined.

    41 #if defined(SK_CPU_LENDIAN)

    42 #define R_OFFSET_IDX 0

    43 #define G_OFFSET_IDX 1

    44 #define B_OFFSET_IDX 2

    45 #define A_OFFSET_IDX 3

    46 #else

    47 #define R_OFFSET_IDX 3

    48 #define G_OFFSET_IDX 2

    49 #define B_OFFSET_IDX 1

    50 #define A_OFFSET_IDX 0

    51 #endif

    53 namespace skia {

    55 namespace {

    // Saturates |a| to the range of an 8-bit unsigned value: negative inputs
    // become 0, inputs above 255 become 255, and everything else passes through
    // unchanged.
    inline unsigned char ClampTo8(int a) {
      // A single unsigned comparison accepts exactly the values 0..255, which is
      // the common case; negative inputs wrap to large unsigned values and fail
      // it just like inputs that are too big.
      const bool in_range = static_cast<unsigned>(a) <= 255u;
      if (in_range)
        return static_cast<unsigned char>(a);
      return a < 0 ? 0 : 255;
    }

    // A fixed-capacity ring of pixel rows. A caller asks for the next slot with
    // AdvanceRow() and writes a row into it; the buffer remembers which slot
    // comes next and which image y coordinate that slot will hold.
    class CircularRowBuffer {
     public:
      // |dest_row_pixel_width| is the number of 4-byte pixels stored per row.
      // |max_y_filter_size| is the number of rows kept in the ring (only enough
      // rows for the biggest filter are needed).
      //
      // |first_input_row| is the y coordinate given to the first row handed out
      // by AdvanceRow(); every later row is one coordinate further down.
      CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
                        int first_input_row)
          : row_byte_width_(dest_row_pixel_width * 4),
            num_rows_(max_y_filter_size),
            next_row_(0),
            next_row_coordinate_(first_input_row) {
        row_addresses_.resize(num_rows_);
        buffer_.resize(row_byte_width_ * num_rows_);
      }

      // Hands out the slot for the next row and returns a pointer to its first
      // byte. The y coordinate counter moves on by one and the slot index wraps
      // back to 0 after the last slot.
      unsigned char* AdvanceRow() {
        unsigned char* const slot = &buffer_[next_row_ * row_byte_width_];
        ++next_row_coordinate_;
        if (++next_row_ == num_rows_)
          next_row_ = 0;
        return slot;
      }

      // Returns the rows "unrolled" into coordinate order: element 0 is the row
      // with the lowest y coordinate, which is stored in |*first_row_index|, and
      // the following elements continue in order for every slot in the ring.
      //
      // |*first_row_index| may come out negative while the ring has not been
      // filled yet, i.e. the ring notionally starts above the top of the image.
      unsigned char* const* GetRowAddresses(int* first_row_index) {
        // Example for a 4-slot ring holding coordinates 6-9:
        //   Slot 0   Coord 8
        //   Slot 1   Coord 9
        //   Slot 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
        //   Slot 3   Coord 7
        //
        // The slot that will be overwritten next is also the oldest one, so it
        // holds the lowest coordinate. A negative result is harmless: users
        // compute offsets relative to |*first_row_index| and never touch the
        // rows with negative coordinates.
        *first_row_index = next_row_coordinate_ - num_rows_;

        int slot = next_row_;
        for (int i = 0; i < num_rows_; ++i) {
          row_addresses_[i] = &buffer_[slot * row_byte_width_];
          // Step to the following slot, wrapping past the end of the ring.
          if (++slot == num_rows_)
            slot = 0;
        }
        return &row_addresses_[0];
      }

     private:
      // Backing storage for all rows, packed back to back, row_byte_width_ bytes
      // each.
      std::vector<unsigned char> buffer_;

      // Size in bytes of one row inside |buffer_|.
      int row_byte_width_;

      // Number of row slots in the ring.
      int num_rows_;

      // Slot index the next AdvanceRow() call will hand out; wraps around.
      int next_row_;

      // The y coordinate belonging to |next_row_|. Grows by one per appended row
      // and never wraps.
      int next_row_coordinate_;

      // Scratch array filled in and returned by GetRowAddresses().
      std::vector<unsigned char*> row_addresses_;
    };

   154 // Convolves horizontally along a single row. The row data is given in

   155 // |src_data| and continues for the num_values() of the filter.

   156 template<bool has_alpha>

   157 // This function is miscompiled with gcc 4.5 with pgo. See bug 827946.

   158 #if defined(__GNUC__) && defined(MOZ_GCC_VERSION_AT_LEAST)

   159 #if MOZ_GCC_VERSION_AT_LEAST(4, 5, 0) && !MOZ_GCC_VERSION_AT_LEAST(4, 6, 0)

   160 __attribute__((optimize("-O1")))

   161 #endif

   162 #endif

   163 void ConvolveHorizontally(const unsigned char* src_data,

   164                           const ConvolutionFilter1D& filter,

   165                           unsigned char* out_row) {

   166   // Loop over each pixel on this row in the output image.

   167   int num_values = filter.num_values();

   168   for (int out_x = 0; out_x < num_values; out_x++) {

   169     // Get the filter that determines the current output pixel.

   170     int filter_offset, filter_length;

   171     const ConvolutionFilter1D::Fixed* filter_values =

   172         filter.FilterForValue(out_x, &filter_offset, &filter_length);

   174     // Compute the first pixel in this row that the filter affects. It will

   175     // touch |filter_length| pixels (4 bytes each) after this.

   176     const unsigned char* row_to_filter = &src_data[filter_offset * 4];

   178     // Apply the filter to the row to get the destination pixel in |accum|.

   179     int accum[4] = {0};

   180     for (int filter_x = 0; filter_x < filter_length; filter_x++) {

   181       ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];

   182       accum[0] += cur_filter * row_to_filter[filter_x * 4 + R_OFFSET_IDX];

   183       accum[1] += cur_filter * row_to_filter[filter_x * 4 + G_OFFSET_IDX];

   184       accum[2] += cur_filter * row_to_filter[filter_x * 4 + B_OFFSET_IDX];

   185       if (has_alpha)

   186         accum[3] += cur_filter * row_to_filter[filter_x * 4 + A_OFFSET_IDX];

   187     }

   189     // Bring this value back in range. All of the filter scaling factors

   190     // are in fixed point with kShiftBits bits of fractional part.

   191     accum[0] >>= ConvolutionFilter1D::kShiftBits;

   192     accum[1] >>= ConvolutionFilter1D::kShiftBits;

   193     accum[2] >>= ConvolutionFilter1D::kShiftBits;

   194     if (has_alpha)

   195       accum[3] >>= ConvolutionFilter1D::kShiftBits;

   197     // Store the new pixel.

   198     out_row[out_x * 4 + R_OFFSET_IDX] = ClampTo8(accum[0]);

   199     out_row[out_x * 4 + G_OFFSET_IDX] = ClampTo8(accum[1]);

   200     out_row[out_x * 4 + B_OFFSET_IDX] = ClampTo8(accum[2]);

   201     if (has_alpha)

   202       out_row[out_x * 4 + A_OFFSET_IDX] = ClampTo8(accum[3]);

   203   }

   204 }

   206 // Does vertical convolution to produce one output row. The filter values and

   207 // length are given in the first two parameters. These are applied to each

   208 // of the rows pointed to in the |source_data_rows| array, with each row

   209 // being |pixel_width| wide.

   210 //

   211 // The output must have room for |pixel_width * 4| bytes.

   212 template<bool has_alpha>

   213 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,

   214                         int filter_length,

   215                         unsigned char* const* source_data_rows,

   216                         int pixel_width,

   217                         unsigned char* out_row) {

   218   // We go through each column in the output and do a vertical convolution,

   219   // generating one output pixel each time.

   220   for (int out_x = 0; out_x < pixel_width; out_x++) {

   221     // Compute the number of bytes over in each row that the current column

   222     // we're convolving starts at. The pixel will cover the next 4 bytes.

   223     int byte_offset = out_x * 4;

   225     // Apply the filter to one column of pixels.

   226     int accum[4] = {0};

   227     for (int filter_y = 0; filter_y < filter_length; filter_y++) {

   228       ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];

   229       accum[0] += cur_filter

   230 	* source_data_rows[filter_y][byte_offset + R_OFFSET_IDX];

   231       accum[1] += cur_filter

   232 	* source_data_rows[filter_y][byte_offset + G_OFFSET_IDX];

   233       accum[2] += cur_filter

   234 	* source_data_rows[filter_y][byte_offset + B_OFFSET_IDX];

   235       if (has_alpha)

   236         accum[3] += cur_filter

   237 	  * source_data_rows[filter_y][byte_offset + A_OFFSET_IDX];

   238     }

   240     // Bring this value back in range. All of the filter scaling factors

   241     // are in fixed point with kShiftBits bits of precision.

   242     accum[0] >>= ConvolutionFilter1D::kShiftBits;

   243     accum[1] >>= ConvolutionFilter1D::kShiftBits;

   244     accum[2] >>= ConvolutionFilter1D::kShiftBits;

   245     if (has_alpha)

   246       accum[3] >>= ConvolutionFilter1D::kShiftBits;

   248     // Store the new pixel.

   249     out_row[byte_offset + R_OFFSET_IDX] = ClampTo8(accum[0]);

   250     out_row[byte_offset + G_OFFSET_IDX] = ClampTo8(accum[1]);

   251     out_row[byte_offset + B_OFFSET_IDX] = ClampTo8(accum[2]);

   252     if (has_alpha) {

   253       unsigned char alpha = ClampTo8(accum[3]);

   255       // Make sure the alpha channel doesn't come out smaller than any of the

   256       // color channels. We use premultipled alpha channels, so this should

   257       // never happen, but rounding errors will cause this from time to time.

   258       // These "impossible" colors will cause overflows (and hence random pixel

   259       // values) when the resulting bitmap is drawn to the screen.

   260       //

   261       // We only need to do this when generating the final output row (here).

   262       int max_color_channel = std::max(out_row[byte_offset + R_OFFSET_IDX],

   263           std::max(out_row[byte_offset + G_OFFSET_IDX], out_row[byte_offset + B_OFFSET_IDX]));

   264       if (alpha < max_color_channel)

   265         out_row[byte_offset + A_OFFSET_IDX] = max_color_channel;

   266       else

   267         out_row[byte_offset + A_OFFSET_IDX] = alpha;

   268     } else {

   269       // No alpha channel, the image is opaque.

   270       out_row[byte_offset + A_OFFSET_IDX] = 0xff;

   271     }

   272   }

   273 }

   276 // Convolves horizontally along a single row. The row data is given in

   277 // |src_data| and continues for the num_values() of the filter.

       //
       // SSE2 variant: all four channels of one output pixel are accumulated
       // together in 32-bit lanes, consuming four filter coefficients (and four
       // source pixels) per inner iteration. One 4-byte pixel is written to
       // |out_row| per output value.
       //
       // When SIMD_SSE2 is not defined the whole body is compiled out and the
       // function does nothing.
       //
       // NOTE(review): the 16-byte pixel loads and 8-byte coefficient loads can
       // read past the last element a filter actually uses; the notes in the
       // body state the padding this requires - confirm it at the call site.

   278 void ConvolveHorizontally_SSE2(const unsigned char* src_data,

   279                                const ConvolutionFilter1D& filter,

   280                                unsigned char* out_row) {

   281 #if defined(SIMD_SSE2)

   282   int num_values = filter.num_values();

   284   int filter_offset, filter_length;

   285   __m128i zero = _mm_setzero_si128();

   286   __m128i mask[4];

   287   // |mask| will be used to decimate all extra filter coefficients that are

   288   // loaded by SIMD when |filter_length| is not divisible by 4.

   289   // mask[0] is not used in the following algorithm.

         // _mm_set_epi16 lists lanes from highest to lowest, so mask[r] keeps
         // only the r lowest 16-bit coefficients.

   290   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

   291   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

   292   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

   294   // Output one pixel each iteration, calculating all channels (RGBA) together.

   295   for (int out_x = 0; out_x < num_values; out_x++) {

   296     const ConvolutionFilter1D::Fixed* filter_values =

   297         filter.FilterForValue(out_x, &filter_offset, &filter_length);

   299     __m128i accum = _mm_setzero_si128();

   301     // Compute the first pixel in this row that the filter affects. It will

   302     // touch |filter_length| pixels (4 bytes each) after this.

   303     const __m128i* row_to_filter =

   304         reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

   306     // We will load and accumulate with four coefficients per iteration.

   307     for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

   309       // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.

   310       __m128i coeff, coeff16;

   311       // [16] xx xx xx xx c3 c2 c1 c0

   312       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

   313       // [16] xx xx xx xx c1 c1 c0 c0

   314       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

   315       // [16] c1 c1 c1 c1 c0 c0 c0 c0

   316       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

   318       // Load four pixels => unpack the first two pixels to 16 bits =>

   319       // multiply with coefficients => accumulate the convolution result.

   320       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   321       __m128i src8 = _mm_loadu_si128(row_to_filter);

   322       // [16] a1 b1 g1 r1 a0 b0 g0 r0

   323       __m128i src16 = _mm_unpacklo_epi8(src8, zero);

             // The high and low 16-bit halves of each signed product are computed
             // separately and interleaved below into full 32-bit products.

   324       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

   325       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

   326       // [32]  a0*c0 b0*c0 g0*c0 r0*c0

   327       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   328       accum = _mm_add_epi32(accum, t);

   329       // [32]  a1*c1 b1*c1 g1*c1 r1*c1

   330       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   331       accum = _mm_add_epi32(accum, t);

   333       // Duplicate 3rd and 4th coefficients for all channels =>

   334       // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients

   335       // => accumulate the convolution results.

   336       // [16] xx xx xx xx c3 c3 c2 c2

   337       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

   338       // [16] c3 c3 c3 c3 c2 c2 c2 c2

   339       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

   340       // [16] a3 b3 g3 r3 a2 b2 g2 r2

   341       src16 = _mm_unpackhi_epi8(src8, zero);

   342       mul_hi = _mm_mulhi_epi16(src16, coeff16);

   343       mul_lo = _mm_mullo_epi16(src16, coeff16);

   344       // [32]  a2*c2 b2*c2 g2*c2 r2*c2

   345       t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   346       accum = _mm_add_epi32(accum, t);

   347       // [32]  a3*c3 b3*c3 g3*c3 r3*c3

   348       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   349       accum = _mm_add_epi32(accum, t);

   351       // Advance the pixel and coefficients pointers.

   352       row_to_filter += 1;

   353       filter_values += 4;

   354     }

   356     // When |filter_length| is not divisible by 4, the coefficients loaded

   357     // beyond the end of the filter must be masked to zero. Other than that

   358     // the algorithm is the same as above, except that the 4th pixel is

   359     // always absent.

   360     int r = filter_length&3;

   361     if (r) {

   362       // Note: filter_values must be padded to align_up(filter_offset, 8).

   363       __m128i coeff, coeff16;

   364       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

   365       // Mask out extra filter taps.

   366       coeff = _mm_and_si128(coeff, mask[r]);

   367       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

   368       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

   370       // Note: line buffer must be padded to align_up(filter_offset, 16).

   371       // We resolve this by using the C version for the last horizontal line.

   372       __m128i src8 = _mm_loadu_si128(row_to_filter);

   373       __m128i src16 = _mm_unpacklo_epi8(src8, zero);

   374       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

   375       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

   376       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   377       accum = _mm_add_epi32(accum, t);

   378       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   379       accum = _mm_add_epi32(accum, t);

             // Third pixel only: r is at most 3, so a fourth tap never exists here
             // and the upper half of this product is not accumulated.

   381       src16 = _mm_unpackhi_epi8(src8, zero);

   382       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

   383       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

   384       mul_hi = _mm_mulhi_epi16(src16, coeff16);

   385       mul_lo = _mm_mullo_epi16(src16, coeff16);

   386       t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   387       accum = _mm_add_epi32(accum, t);

   388     }

   390     // Shift right for fixed point implementation.

   391     accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

   393     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).

   394     accum = _mm_packs_epi32(accum, zero);

   395     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).

   396     accum = _mm_packus_epi16(accum, zero);

   398     // Store the pixel value of 32 bits.

   399     *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);

   400     out_row += 4;

   401   }

   402 #endif

   403 }

   405 // Convolves horizontally along four rows. The row data is given in

   406 // |src_data| and continues for the num_values() of the filter.

   407 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please

   408 // refer to that function for detailed comments.

       //
       // The same filter is applied to each of the four rows |src_data[0..3]|
       // and the results go to |out_row[0..3]|. The pointers stored in
       // |out_row| are advanced in place by 4 bytes per output pixel, so the
       // caller's array is modified.
       //
       // When SIMD_SSE2 is not defined the whole body is compiled out and the
       // function does nothing.

   409 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],

   410                                 const ConvolutionFilter1D& filter,

   411                                 unsigned char* out_row[4]) {

   412 #if defined(SIMD_SSE2)

   413   int num_values = filter.num_values();

   415   int filter_offset, filter_length;

   416   __m128i zero = _mm_setzero_si128();

   417   __m128i mask[4];

   418   // |mask| will be used to decimate all extra filter coefficients that are

   419   // loaded by SIMD when |filter_length| is not divisible by 4.

   420   // mask[0] is not used in the following algorithm.

   421   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

   422   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

   423   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

   425   // Output one pixel each iteration, calculating all channels (RGBA) together.

   426   for (int out_x = 0; out_x < num_values; out_x++) {

   427     const ConvolutionFilter1D::Fixed* filter_values =

   428         filter.FilterForValue(out_x, &filter_offset, &filter_length);

   430     // four pixels in a column per iteration.

           // accumN holds the 32-bit per-channel sums for the pixel of row N.

   431     __m128i accum0 = _mm_setzero_si128();

   432     __m128i accum1 = _mm_setzero_si128();

   433     __m128i accum2 = _mm_setzero_si128();

   434     __m128i accum3 = _mm_setzero_si128();

           // Byte offset, shared by all four rows, of the first pixel the filter
           // touches.

   435     int start = (filter_offset<<2);

   436     // We will load and accumulate with four coefficients per iteration.

   437     for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {

   438       __m128i coeff, coeff16lo, coeff16hi;

   439       // [16] xx xx xx xx c3 c2 c1 c0

   440       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

   441       // [16] xx xx xx xx c1 c1 c0 c0

   442       coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

   443       // [16] c1 c1 c1 c1 c0 c0 c0 c0

   444       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

   445       // [16] xx xx xx xx c3 c3 c2 c2

   446       coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

   447       // [16] c3 c3 c3 c3 c2 c2 c2 c2

   448       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

   450       __m128i src8, src16, mul_hi, mul_lo, t;

             // ITERATION loads four pixels from |src|, multiplies pixels 0-1 by
             // coeff16lo and pixels 2-3 by coeff16hi, and adds all four 32-bit
             // products to |accum|. It relies on the locals src8, src16, mul_hi,
             // mul_lo, t, coeff16lo, coeff16hi and zero being in scope. The macro
             // is not #undef'd inside this function.

   452 #define ITERATION(src, accum)                                          \

   453       src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \

   454       src16 = _mm_unpacklo_epi8(src8, zero);                           \

   455       mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \

   456       mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \

   457       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \

   458       accum = _mm_add_epi32(accum, t);                                 \

   459       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \

   460       accum = _mm_add_epi32(accum, t);                                 \

   461       src16 = _mm_unpackhi_epi8(src8, zero);                           \

   462       mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \

   463       mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \

   464       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \

   465       accum = _mm_add_epi32(accum, t);                                 \

   466       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \

   467       accum = _mm_add_epi32(accum, t)

   469       ITERATION(src_data[0] + start, accum0);

   470       ITERATION(src_data[1] + start, accum1);

   471       ITERATION(src_data[2] + start, accum2);

   472       ITERATION(src_data[3] + start, accum3);

             // Move on by four pixels (16 bytes) and four coefficients.

   474       start += 16;

   475       filter_values += 4;

   476     }

           // Leftover 1-3 coefficients when |filter_length| is not a multiple of 4.

   478     int r = filter_length & 3;

   479     if (r) {

   480       // Note: filter_values must be padded to align_up(filter_offset, 8);

   481       __m128i coeff;

   482       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));

   483       // Mask out extra filter taps.

   484       coeff = _mm_and_si128(coeff, mask[r]);

   486       __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));

   487       /* c1 c1 c1 c1 c0 c0 c0 c0 */

   488       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);

   489       __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));

   490       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

   492       __m128i src8, src16, mul_hi, mul_lo, t;

             // ITERATION still processes all four pixels here; the taps beyond
             // |r| were zeroed by the mask above, so their products add nothing.

   494       ITERATION(src_data[0] + start, accum0);

   495       ITERATION(src_data[1] + start, accum1);

   496       ITERATION(src_data[2] + start, accum2);

   497       ITERATION(src_data[3] + start, accum3);

   498     }

           // For each row: drop the fixed-point fractional bits, then pack the
           // 32-bit sums to 16 bits (signed saturation) and to 8 bits (unsigned
           // saturation).

   500     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

   501     accum0 = _mm_packs_epi32(accum0, zero);

   502     accum0 = _mm_packus_epi16(accum0, zero);

   503     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

   504     accum1 = _mm_packs_epi32(accum1, zero);

   505     accum1 = _mm_packus_epi16(accum1, zero);

   506     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

   507     accum2 = _mm_packs_epi32(accum2, zero);

   508     accum2 = _mm_packus_epi16(accum2, zero);

   509     accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

   510     accum3 = _mm_packs_epi32(accum3, zero);

   511     accum3 = _mm_packus_epi16(accum3, zero);

           // Store one 32-bit pixel per row.

   513     *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);

   514     *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);

   515     *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);

   516     *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

           // Advance each row's output pointer to its next pixel.

   518     out_row[0] += 4;

   519     out_row[1] += 4;

   520     out_row[2] += 4;

   521     out_row[3] += 4;

   522   }

   523 #endif

   524 }

   526 // Does vertical convolution to produce one output row. The filter values and

   527 // length are given in the first two parameters. These are applied to each

   528 // of the rows pointed to in the |source_data_rows| array, with each row

   529 // being |pixel_width| wide.

   530 //

   531 // The output must have room for |pixel_width * 4| bytes.

   532 template<bool has_alpha>

   533 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

   534                              int filter_length,

   535                              unsigned char* const* source_data_rows,

   536                              int pixel_width,

   537                              unsigned char* out_row) {

   538 #if defined(SIMD_SSE2)

   539   int width = pixel_width & ~3;

   541   __m128i zero = _mm_setzero_si128();

   542   __m128i accum0, accum1, accum2, accum3, coeff16;

   543   const __m128i* src;

   544   // Output four pixels per iteration (16 bytes).

   545   for (int out_x = 0; out_x < width; out_x += 4) {

   547     // Accumulated result for each pixel. 32 bits per RGBA channel.

   548     accum0 = _mm_setzero_si128();

   549     accum1 = _mm_setzero_si128();

   550     accum2 = _mm_setzero_si128();

   551     accum3 = _mm_setzero_si128();

   553     // Convolve with one filter coefficient per iteration.

   554     for (int filter_y = 0; filter_y < filter_length; filter_y++) {

   556       // Duplicate the filter coefficient 8 times.

   557       // [16] cj cj cj cj cj cj cj cj

   558       coeff16 = _mm_set1_epi16(filter_values[filter_y]);

   560       // Load four pixels (16 bytes) together.

   561       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   562       src = reinterpret_cast<const __m128i*>(

   563           &source_data_rows[filter_y][out_x << 2]);

   564       __m128i src8 = _mm_loadu_si128(src);

   566       // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>

   567       // multiply with current coefficient => accumulate the result.

   568       // [16] a1 b1 g1 r1 a0 b0 g0 r0

   569       __m128i src16 = _mm_unpacklo_epi8(src8, zero);

   570       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

   571       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

   572       // [32] a0 b0 g0 r0

   573       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   574       accum0 = _mm_add_epi32(accum0, t);

   575       // [32] a1 b1 g1 r1

   576       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   577       accum1 = _mm_add_epi32(accum1, t);

   579       // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>

   580       // multiply with current coefficient => accumulate the result.

   581       // [16] a3 b3 g3 r3 a2 b2 g2 r2

   582       src16 = _mm_unpackhi_epi8(src8, zero);

   583       mul_hi = _mm_mulhi_epi16(src16, coeff16);

   584       mul_lo = _mm_mullo_epi16(src16, coeff16);

   585       // [32] a2 b2 g2 r2

   586       t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   587       accum2 = _mm_add_epi32(accum2, t);

   588       // [32] a3 b3 g3 r3

   589       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   590       accum3 = _mm_add_epi32(accum3, t);

   591     }

   593     // Shift right for fixed point implementation.

   594     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

   595     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

   596     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

   597     accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

   599     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).

   600     // [16] a1 b1 g1 r1 a0 b0 g0 r0

   601     accum0 = _mm_packs_epi32(accum0, accum1);

   602     // [16] a3 b3 g3 r3 a2 b2 g2 r2

   603     accum2 = _mm_packs_epi32(accum2, accum3);

   605     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).

   606     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   607     accum0 = _mm_packus_epi16(accum0, accum2);

   609     if (has_alpha) {

   610       // Compute the max(ri, gi, bi) for each pixel.

   611       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

   612       __m128i a = _mm_srli_epi32(accum0, 8);

   613       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   614       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.

   615       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

   616       a = _mm_srli_epi32(accum0, 16);

   617       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   618       b = _mm_max_epu8(a, b);  // Max of r and g and b.

   619       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

   620       b = _mm_slli_epi32(b, 24);

   622       // Make sure the value of alpha channel is always larger than maximum

   623       // value of color channels.

   624       accum0 = _mm_max_epu8(b, accum0);

   625     } else {

   626       // Set value of alpha channels to 0xFF.

   627       __m128i mask = _mm_set1_epi32(0xff000000);

   628       accum0 = _mm_or_si128(accum0, mask);

   629     }

   631     // Store the convolution result (16 bytes) and advance the pixel pointers.

   632     _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);

   633     out_row += 16;

   634   }

   636   // When the width of the output is not divisible by 4, We need to save one

   637   // pixel (4 bytes) each time. And also the fourth pixel is always absent.

   638   if (pixel_width & 3) {

   639     accum0 = _mm_setzero_si128();

   640     accum1 = _mm_setzero_si128();

   641     accum2 = _mm_setzero_si128();

   642     for (int filter_y = 0; filter_y < filter_length; ++filter_y) {

   643       coeff16 = _mm_set1_epi16(filter_values[filter_y]);

   644       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   645       src = reinterpret_cast<const __m128i*>(

   646           &source_data_rows[filter_y][width<<2]);

   647       __m128i src8 = _mm_loadu_si128(src);

   648       // [16] a1 b1 g1 r1 a0 b0 g0 r0

   649       __m128i src16 = _mm_unpacklo_epi8(src8, zero);

   650       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);

   651       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);

   652       // [32] a0 b0 g0 r0

   653       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   654       accum0 = _mm_add_epi32(accum0, t);

   655       // [32] a1 b1 g1 r1

   656       t = _mm_unpackhi_epi16(mul_lo, mul_hi);

   657       accum1 = _mm_add_epi32(accum1, t);

   658       // [16] a3 b3 g3 r3 a2 b2 g2 r2

   659       src16 = _mm_unpackhi_epi8(src8, zero);

   660       mul_hi = _mm_mulhi_epi16(src16, coeff16);

   661       mul_lo = _mm_mullo_epi16(src16, coeff16);

   662       // [32] a2 b2 g2 r2

   663       t = _mm_unpacklo_epi16(mul_lo, mul_hi);

   664       accum2 = _mm_add_epi32(accum2, t);

   665     }

   667     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);

   668     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);

   669     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);

   670     // [16] a1 b1 g1 r1 a0 b0 g0 r0

   671     accum0 = _mm_packs_epi32(accum0, accum1);

   672     // [16] a3 b3 g3 r3 a2 b2 g2 r2

   673     accum2 = _mm_packs_epi32(accum2, zero);

   674     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   675     accum0 = _mm_packus_epi16(accum0, accum2);

   676     if (has_alpha) {

   677       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

   678       __m128i a = _mm_srli_epi32(accum0, 8);

   679       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   680       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.

   681       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

   682       a = _mm_srli_epi32(accum0, 16);

   683       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   684       b = _mm_max_epu8(a, b);  // Max of r and g and b.

   685       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

   686       b = _mm_slli_epi32(b, 24);

   687       accum0 = _mm_max_epu8(b, accum0);

   688     } else {

   689       __m128i mask = _mm_set1_epi32(0xff000000);

   690       accum0 = _mm_or_si128(accum0, mask);

   691     }

   693     for (int out_x = width; out_x < pixel_width; out_x++) {

   694       *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);

   695       accum0 = _mm_srli_si128(accum0, 4);

   696       out_row += 4;

   697     }

   698   }

   699 #endif

   700 }

   702 }  // namespace

   704 // ConvolutionFilter1D ---------------------------------------------------------

   706 ConvolutionFilter1D::ConvolutionFilter1D()

   707     : max_filter_(0) {

   708 }

   710 ConvolutionFilter1D::~ConvolutionFilter1D() {

   711 }

   713 void ConvolutionFilter1D::AddFilter(int filter_offset,

   714                                     const float* filter_values,

   715                                     int filter_length) {

   716   SkASSERT(filter_length > 0);

   718   std::vector<Fixed> fixed_values;

   719   fixed_values.reserve(filter_length);

   721   for (int i = 0; i < filter_length; ++i)

   722     fixed_values.push_back(FloatToFixed(filter_values[i]));

   724   AddFilter(filter_offset, &fixed_values[0], filter_length);

   725 }

   727 void ConvolutionFilter1D::AddFilter(int filter_offset,

   728                                     const Fixed* filter_values,

   729                                     int filter_length) {

   730   // It is common for leading/trailing filter values to be zeros. In such

   731   // cases it is beneficial to only store the central factors.

   732   // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on

   733   // a 1080p image this optimization gives a ~10% speed improvement.

   734   int first_non_zero = 0;

   735   while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)

   736     first_non_zero++;

   738   if (first_non_zero < filter_length) {

   739     // Here we have at least one non-zero factor.

   740     int last_non_zero = filter_length - 1;

   741     while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)

   742       last_non_zero--;

   744     filter_offset += first_non_zero;

   745     filter_length = last_non_zero + 1 - first_non_zero;

   746     SkASSERT(filter_length > 0);

   748     for (int i = first_non_zero; i <= last_non_zero; i++)

   749       filter_values_.push_back(filter_values[i]);

   750   } else {

   751     // Here all the factors were zeroes.

   752     filter_length = 0;

   753   }

   755   FilterInstance instance;

   757   // We pushed filter_length elements onto filter_values_

   758   instance.data_location = (static_cast<int>(filter_values_.size()) -

   759                             filter_length);

   760   instance.offset = filter_offset;

   761   instance.length = filter_length;

   762   filters_.push_back(instance);

   764   max_filter_ = std::max(max_filter_, filter_length);

   765 }

   767 void BGRAConvolve2D(const unsigned char* source_data,

   768                     int source_byte_row_stride,

   769                     bool source_has_alpha,

   770                     const ConvolutionFilter1D& filter_x,

   771                     const ConvolutionFilter1D& filter_y,

   772                     int output_byte_row_stride,

   773                     unsigned char* output,

   774                     bool use_sse2) {

   775 #if !defined(SIMD_SSE2)

   776   // Even we have runtime support for SSE2 instructions, since the binary

   777   // was not built with SSE2 support, we had to fallback to C version.

   778   use_sse2 = false;

   779 #endif

   781   int max_y_filter_size = filter_y.max_filter();

   783   // The next row in the input that we will generate a horizontally

   784   // convolved row for. If the filter doesn't start at the beginning of the

   785   // image (this is the case when we are only resizing a subset), then we

   786   // don't want to generate any output rows before that. Compute the starting

   787   // row for convolution as the first pixel for the first vertical filter.

   788   int filter_offset, filter_length;

   789   const ConvolutionFilter1D::Fixed* filter_values =

   790       filter_y.FilterForValue(0, &filter_offset, &filter_length);

   791   int next_x_row = filter_offset;

   793   // We loop over each row in the input doing a horizontal convolution. This

   794   // will result in a horizontally convolved image. We write the results into

   795   // a circular buffer of convolved rows and do vertical convolution as rows

   796   // are available. This prevents us from having to store the entire

   797   // intermediate image and helps cache coherency.

   798   // We will need four extra rows to allow horizontal convolution could be done

   799   // simultaneously. We also padding each row in row buffer to be aligned-up to

   800   // 16 bytes.

   801   // TODO(jiesun): We do not use aligned load from row buffer in vertical

   802   // convolution pass yet. Somehow Windows does not like it.

   803   int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;

   804   int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);

   805   CircularRowBuffer row_buffer(row_buffer_width,

   806                                row_buffer_height,

   807                                filter_offset);

   809   // Loop over every possible output row, processing just enough horizontal

   810   // convolutions to run each subsequent vertical convolution.

   811   SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);

   812   int num_output_rows = filter_y.num_values();

   814   // We need to check which is the last line to convolve before we advance 4

   815   // lines in one iteration.

   816   int last_filter_offset, last_filter_length;

   817   filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,

   818                           &last_filter_length);

   820   for (int out_y = 0; out_y < num_output_rows; out_y++) {

   821     filter_values = filter_y.FilterForValue(out_y,

   822                                             &filter_offset, &filter_length);

   824     // Generate output rows until we have enough to run the current filter.

   825     if (use_sse2) {

   826       while (next_x_row < filter_offset + filter_length) {

   827         if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {

   828           const unsigned char* src[4];

   829           unsigned char* out_row[4];

   830           for (int i = 0; i < 4; ++i) {

   831             src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];

   832             out_row[i] = row_buffer.AdvanceRow();

   833           }

   834           ConvolveHorizontally4_SSE2(src, filter_x, out_row);

   835           next_x_row += 4;

   836         } else {

   837           // For the last row, SSE2 load possibly to access data beyond the

   838           // image area. therefore we use C version here.

   839           if (next_x_row == last_filter_offset + last_filter_length - 1) {

   840             if (source_has_alpha) {

   841               ConvolveHorizontally<true>(

   842                   &source_data[next_x_row * source_byte_row_stride],

   843                   filter_x, row_buffer.AdvanceRow());

   844             } else {

   845               ConvolveHorizontally<false>(

   846                   &source_data[next_x_row * source_byte_row_stride],

   847                   filter_x, row_buffer.AdvanceRow());

   848             }

   849           } else {

   850             ConvolveHorizontally_SSE2(

   851                 &source_data[next_x_row * source_byte_row_stride],

   852                 filter_x, row_buffer.AdvanceRow());

   853           }

   854           next_x_row++;

   855         }

   856       }

   857     } else {

   858       while (next_x_row < filter_offset + filter_length) {

   859         if (source_has_alpha) {

   860           ConvolveHorizontally<true>(

   861               &source_data[next_x_row * source_byte_row_stride],

   862               filter_x, row_buffer.AdvanceRow());

   863         } else {

   864           ConvolveHorizontally<false>(

   865               &source_data[next_x_row * source_byte_row_stride],

   866               filter_x, row_buffer.AdvanceRow());

   867         }

   868         next_x_row++;

   869       }

   870     }

   872     // Compute where in the output image this row of final data will go.

   873     unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

   875     // Get the list of rows that the circular buffer has, in order.

   876     int first_row_in_circular_buffer;

   877     unsigned char* const* rows_to_convolve =

   878         row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

   880     // Now compute the start of the subset of those rows that the filter

   881     // needs.

   882     unsigned char* const* first_row_for_filter =

   883         &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

   885     if (source_has_alpha) {

   886       if (use_sse2) {

   887         ConvolveVertically_SSE2<true>(filter_values, filter_length,

   888                                       first_row_for_filter,

   889                                       filter_x.num_values(), cur_output_row);

   890       } else {

   891         ConvolveVertically<true>(filter_values, filter_length,

   892                                  first_row_for_filter,

   893                                  filter_x.num_values(), cur_output_row);

   894       }

   895     } else {

   896       if (use_sse2) {

   897         ConvolveVertically_SSE2<false>(filter_values, filter_length,

   898                                        first_row_for_filter,

   899                                        filter_x.num_values(), cur_output_row);

   900       } else {

   901         ConvolveVertically<false>(filter_values, filter_length,

   902                                  first_row_for_filter,

   903                                  filter_x.num_values(), cur_output_row);

   904       }

   905     }

   906   }

   907 }

   909 }  // namespace skia

The Tor Browser / file revision

gfx/2d/convolver.cpp@97036ab72558

gfx/2d/convolver.cpp