The Tor Browser: gfx/skia/trunk/src/opts/SkBitmapProcState_arm

Conditionally enable double-key logic depending on private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts...
Some reservations remain about how to convince FindCookie users to test
the condition and pass a nullptr when double-key logic is disabled.

     2 /*

     3  * Copyright 2012 Google Inc.

     4  *

     5  * Use of this source code is governed by a BSD-style license that can be

     6  * found in the LICENSE file.

     7  */

     8 #include "SkBitmapProcState.h"

     9 #include "SkBitmapProcState_filter.h"

    10 #include "SkColorPriv.h"

    11 #include "SkFilterProc.h"

    12 #include "SkPaint.h"

    13 #include "SkShader.h"   // for tilemodes

    14 #include "SkUtilsArm.h"

    16 // Required to ensure the table is part of the final binary.

    17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];

    18 extern const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[];

    20 #define   NAME_WRAP(x)  x ## _neon

    21 #include "SkBitmapProcState_filter_neon.h"

    22 #include "SkBitmapProcState_procs.h"

    24 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {

    25     S32_opaque_D32_nofilter_DXDY_neon,

    26     S32_alpha_D32_nofilter_DXDY_neon,

    27     S32_opaque_D32_nofilter_DX_neon,

    28     S32_alpha_D32_nofilter_DX_neon,

    29     S32_opaque_D32_filter_DXDY_neon,

    30     S32_alpha_D32_filter_DXDY_neon,

    31     S32_opaque_D32_filter_DX_neon,

    32     S32_alpha_D32_filter_DX_neon,

    34     S16_opaque_D32_nofilter_DXDY_neon,

    35     S16_alpha_D32_nofilter_DXDY_neon,

    36     S16_opaque_D32_nofilter_DX_neon,

    37     S16_alpha_D32_nofilter_DX_neon,

    38     S16_opaque_D32_filter_DXDY_neon,

    39     S16_alpha_D32_filter_DXDY_neon,

    40     S16_opaque_D32_filter_DX_neon,

    41     S16_alpha_D32_filter_DX_neon,

    43     SI8_opaque_D32_nofilter_DXDY_neon,

    44     SI8_alpha_D32_nofilter_DXDY_neon,

    45     SI8_opaque_D32_nofilter_DX_neon,

    46     SI8_alpha_D32_nofilter_DX_neon,

    47     SI8_opaque_D32_filter_DXDY_neon,

    48     SI8_alpha_D32_filter_DXDY_neon,

    49     SI8_opaque_D32_filter_DX_neon,

    50     SI8_alpha_D32_filter_DX_neon,

    52     S4444_opaque_D32_nofilter_DXDY_neon,

    53     S4444_alpha_D32_nofilter_DXDY_neon,

    54     S4444_opaque_D32_nofilter_DX_neon,

    55     S4444_alpha_D32_nofilter_DX_neon,

    56     S4444_opaque_D32_filter_DXDY_neon,

    57     S4444_alpha_D32_filter_DXDY_neon,

    58     S4444_opaque_D32_filter_DX_neon,

    59     S4444_alpha_D32_filter_DX_neon,

    61     // A8 treats alpha/opauqe the same (equally efficient)

    62     SA8_alpha_D32_nofilter_DXDY_neon,

    63     SA8_alpha_D32_nofilter_DXDY_neon,

    64     SA8_alpha_D32_nofilter_DX_neon,

    65     SA8_alpha_D32_nofilter_DX_neon,

    66     SA8_alpha_D32_filter_DXDY_neon,

    67     SA8_alpha_D32_filter_DXDY_neon,

    68     SA8_alpha_D32_filter_DX_neon,

    69     SA8_alpha_D32_filter_DX_neon

    70 };

    72 const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = {

    73     S32_D16_nofilter_DXDY_neon,

    74     S32_D16_nofilter_DX_neon,

    75     S32_D16_filter_DXDY_neon,

    76     S32_D16_filter_DX_neon,

    78     S16_D16_nofilter_DXDY_neon,

    79     S16_D16_nofilter_DX_neon,

    80     S16_D16_filter_DXDY_neon,

    81     S16_D16_filter_DX_neon,

    83     SI8_D16_nofilter_DXDY_neon,

    84     SI8_D16_nofilter_DX_neon,

    85     SI8_D16_filter_DXDY_neon,

    86     SI8_D16_filter_DX_neon,

    88     // Don't support 4444 -> 565

    89     NULL, NULL, NULL, NULL,

    90     // Don't support A8 -> 565

    91     NULL, NULL, NULL, NULL

    92 };

    94 ///////////////////////////////////////////////////////////////////////////////

    96 #include <arm_neon.h>

    97 #include "SkConvolver.h"

    99 // Convolves horizontally along a single row. The row data is given in

   100 // |srcData| and continues for the numValues() of the filter.

   101 void convolveHorizontally_neon(const unsigned char* srcData,

   102                                const SkConvolutionFilter1D& filter,

   103                                unsigned char* outRow,

   104                                bool hasAlpha) {

   105     // Loop over each pixel on this row in the output image.

   106     int numValues = filter.numValues();

   107     for (int outX = 0; outX < numValues; outX++) {

   108         uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

   109         uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

   110         uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

   111         uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

   112         // Get the filter that determines the current output pixel.

   113         int filterOffset, filterLength;

   114         const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

   115             filter.FilterForValue(outX, &filterOffset, &filterLength);

   117         // Compute the first pixel in this row that the filter affects. It will

   118         // touch |filterLength| pixels (4 bytes each) after this.

   119         const unsigned char* rowToFilter = &srcData[filterOffset * 4];

   121         // Apply the filter to the row to get the destination pixel in |accum|.

   122         int32x4_t accum = vdupq_n_s32(0);

   123         for (int filterX = 0; filterX < filterLength >> 2; filterX++) {

   124             // Load 4 coefficients

   125             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

   126             coeffs = vld1_s16(filterValues);

   127             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));

   128             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));

   129             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));

   130             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

   132             // Load pixels and calc

   133             uint8x16_t pixels = vld1q_u8(rowToFilter);

   134             int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));

   135             int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

   137             int16x4_t p0_src = vget_low_s16(p01_16);

   138             int16x4_t p1_src = vget_high_s16(p01_16);

   139             int16x4_t p2_src = vget_low_s16(p23_16);

   140             int16x4_t p3_src = vget_high_s16(p23_16);

   142             int32x4_t p0 = vmull_s16(p0_src, coeff0);

   143             int32x4_t p1 = vmull_s16(p1_src, coeff1);

   144             int32x4_t p2 = vmull_s16(p2_src, coeff2);

   145             int32x4_t p3 = vmull_s16(p3_src, coeff3);

   147             accum += p0;

   148             accum += p1;

   149             accum += p2;

   150             accum += p3;

   152             // Advance the pointers

   153             rowToFilter += 16;

   154             filterValues += 4;

   155         }

   156         int r = filterLength & 3;

   157         if (r) {

   158             const uint16_t mask[4][4] = {

   159                 {0, 0, 0, 0},

   160                 {0xFFFF, 0, 0, 0},

   161                 {0xFFFF, 0xFFFF, 0, 0},

   162                 {0xFFFF, 0xFFFF, 0xFFFF, 0}

   163             };

   164             uint16x4_t coeffs;

   165             int16x4_t coeff0, coeff1, coeff2;

   166             coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));

   167             coeffs &= vld1_u16(&mask[r][0]);

   168             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));

   169             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));

   170             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));

   172             // Load pixels and calc

   173             uint8x16_t pixels = vld1q_u8(rowToFilter);

   174             int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));

   175             int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

   176             int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);

   177             int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);

   178             int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);

   180             accum += p0;

   181             accum += p1;

   182             accum += p2;

   183         }

   185         // Bring this value back in range. All of the filter scaling factors

   186         // are in fixed point with kShiftBits bits of fractional part.

   187         accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

   189         // Pack and store the new pixel.

   190         int16x4_t accum16 = vqmovn_s32(accum);

   191         uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));

   192         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);

   193         outRow += 4;

   194     }

   195 }

   197 // Does vertical convolution to produce one output row. The filter values and

   198 // length are given in the first two parameters. These are applied to each

   199 // of the rows pointed to in the |sourceDataRows| array, with each row

   200 // being |pixelWidth| wide.

   201 //

   202 // The output must have room for |pixelWidth * 4| bytes.

   203 template<bool hasAlpha>

   204 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,

   205                              int filterLength,

   206                              unsigned char* const* sourceDataRows,

   207                              int pixelWidth,

   208                              unsigned char* outRow) {

   209     int width = pixelWidth & ~3;

   211     int32x4_t accum0, accum1, accum2, accum3;

   212     int16x4_t coeff16;

   214     // Output four pixels per iteration (16 bytes).

   215     for (int outX = 0; outX < width; outX += 4) {

   217         // Accumulated result for each pixel. 32 bits per RGBA channel.

   218         accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);

   220         // Convolve with one filter coefficient per iteration.

   221         for (int filterY = 0; filterY < filterLength; filterY++) {

   223             // Duplicate the filter coefficient 4 times.

   224             // [16] cj cj cj cj

   225             coeff16 = vdup_n_s16(filterValues[filterY]);

   227             // Load four pixels (16 bytes) together.

   228             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   229             uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);

   231             int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));

   232             int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

   233             int16x4_t src16_0 = vget_low_s16(src16_01);

   234             int16x4_t src16_1 = vget_high_s16(src16_01);

   235             int16x4_t src16_2 = vget_low_s16(src16_23);

   236             int16x4_t src16_3 = vget_high_s16(src16_23);

   238             accum0 += vmull_s16(src16_0, coeff16);

   239             accum1 += vmull_s16(src16_1, coeff16);

   240             accum2 += vmull_s16(src16_2, coeff16);

   241             accum3 += vmull_s16(src16_3, coeff16);

   242         }

   244         // Shift right for fixed point implementation.

   245         accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);

   246         accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);

   247         accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

   248         accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

   250         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).

   251         // [16] a1 b1 g1 r1 a0 b0 g0 r0

   252         int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));

   253         // [16] a3 b3 g3 r3 a2 b2 g2 r2

   254         int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

   256         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).

   257         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   258         uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

   260         if (hasAlpha) {

   261             // Compute the max(ri, gi, bi) for each pixel.

   262             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

   263             uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));

   264             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   265             uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g

   266             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

   267             a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));

   268             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   269             b = vmaxq_u8(a, b); // Max of r and g and b.

   270             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

   271             b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

   273             // Make sure the value of alpha channel is always larger than maximum

   274             // value of color channels.

   275             accum8 = vmaxq_u8(b, accum8);

   276         } else {

   277             // Set value of alpha channels to 0xFF.

   278             accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));

   279         }

   281         // Store the convolution result (16 bytes) and advance the pixel pointers.

   282         vst1q_u8(outRow, accum8);

   283         outRow += 16;

   284     }

   286     // Process the leftovers when the width of the output is not divisible

   287     // by 4, that is at most 3 pixels.

   288     int r = pixelWidth & 3;

   289     if (r) {

   291         accum0 = accum1 = accum2 = vdupq_n_s32(0);

   293         for (int filterY = 0; filterY < filterLength; ++filterY) {

   294             coeff16 = vdup_n_s16(filterValues[filterY]);

   296             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

   297             uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);

   299             int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));

   300             int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

   301             int16x4_t src16_0 = vget_low_s16(src16_01);

   302             int16x4_t src16_1 = vget_high_s16(src16_01);

   303             int16x4_t src16_2 = vget_low_s16(src16_23);

   305             accum0 += vmull_s16(src16_0, coeff16);

   306             accum1 += vmull_s16(src16_1, coeff16);

   307             accum2 += vmull_s16(src16_2, coeff16);

   308         }

   310         accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);

   311         accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);

   312         accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);

   314         int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));

   315         int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));

   317         uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

   319         if (hasAlpha) {

   320             // Compute the max(ri, gi, bi) for each pixel.

   321             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

   322             uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));

   323             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   324             uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g

   325             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

   326             a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));

   327             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

   328             b = vmaxq_u8(a, b); // Max of r and g and b.

   329             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

   330             b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

   332             // Make sure the value of alpha channel is always larger than maximum

   333             // value of color channels.

   334             accum8 = vmaxq_u8(b, accum8);

   335         } else {

   336             // Set value of alpha channels to 0xFF.

   337             accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));

   338         }

   340         switch(r) {

   341         case 1:

   342             vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);

   343             break;

   344         case 2:

   345             vst1_u32(reinterpret_cast<uint32_t*>(outRow),

   346                      vreinterpret_u32_u8(vget_low_u8(accum8)));

   347             break;

   348         case 3:

   349             vst1_u32(reinterpret_cast<uint32_t*>(outRow),

   350                      vreinterpret_u32_u8(vget_low_u8(accum8)));

   351             vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);

   352             break;

   353         }

   354     }

   355 }

   357 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,

   358                              int filterLength,

   359                              unsigned char* const* sourceDataRows,

   360                              int pixelWidth,

   361                              unsigned char* outRow,

   362                              bool sourceHasAlpha) {

   363     if (sourceHasAlpha) {

   364         convolveVertically_neon<true>(filterValues, filterLength,

   365                                       sourceDataRows, pixelWidth,

   366                                       outRow);

   367     } else {

   368         convolveVertically_neon<false>(filterValues, filterLength,

   369                                        sourceDataRows, pixelWidth,

   370                                        outRow);

   371     }

   372 }

   374 // Convolves horizontally along four rows. The row data is given in

   375 // |src_data| and continues for the num_values() of the filter.

   376 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please

   377 // refer to that function for detailed comments.

   378 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],

   379                                     const SkConvolutionFilter1D& filter,

   380                                     unsigned char* outRow[4]) {

   382     uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);

   383     uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);

   384     uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);

   385     uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

   386     int num_values = filter.numValues();

   388     int filterOffset, filterLength;

   389     // |mask| will be used to decimate all extra filter coefficients that are

   390     // loaded by SIMD when |filter_length| is not divisible by 4.

   391     // mask[0] is not used in following algorithm.

   392     const uint16_t mask[4][4] = {

   393         {0, 0, 0, 0},

   394         {0xFFFF, 0, 0, 0},

   395         {0xFFFF, 0xFFFF, 0, 0},

   396         {0xFFFF, 0xFFFF, 0xFFFF, 0}

   397     };

   399     // Output one pixel each iteration, calculating all channels (RGBA) together.

   400     for (int outX = 0; outX < num_values; outX++) {

   402         const SkConvolutionFilter1D::ConvolutionFixed* filterValues =

   403         filter.FilterForValue(outX, &filterOffset, &filterLength);

   405         // four pixels in a column per iteration.

   406         int32x4_t accum0 = vdupq_n_s32(0);

   407         int32x4_t accum1 = vdupq_n_s32(0);

   408         int32x4_t accum2 = vdupq_n_s32(0);

   409         int32x4_t accum3 = vdupq_n_s32(0);

   411         int start = (filterOffset<<2);

   413         // We will load and accumulate with four coefficients per iteration.

   414         for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {

   415             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

   417             coeffs = vld1_s16(filterValues);

   418             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));

   419             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));

   420             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));

   421             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

   423             uint8x16_t pixels;

   424             int16x8_t p01_16, p23_16;

   425             int32x4_t p0, p1, p2, p3;

   428 #define ITERATION(src, accum)                                       \

   429     pixels = vld1q_u8(src);                                         \

   430     p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \

   431     p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \

   432     p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \

   433     p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \

   434     p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \

   435     p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \

   436     accum += p0;                                                    \

   437     accum += p1;                                                    \

   438     accum += p2;                                                    \

   439     accum += p3

   441             ITERATION(srcData[0] + start, accum0);

   442             ITERATION(srcData[1] + start, accum1);

   443             ITERATION(srcData[2] + start, accum2);

   444             ITERATION(srcData[3] + start, accum3);

   446             start += 16;

   447             filterValues += 4;

   448         }

   450         int r = filterLength & 3;

   451         if (r) {

   452             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;

   453             coeffs = vld1_s16(filterValues);

   454             coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));

   455             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));

   456             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));

   457             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));

   458             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

   460             uint8x16_t pixels;

   461             int16x8_t p01_16, p23_16;

   462             int32x4_t p0, p1, p2, p3;

   464             ITERATION(srcData[0] + start, accum0);

   465             ITERATION(srcData[1] + start, accum1);

   466             ITERATION(srcData[2] + start, accum2);

   467             ITERATION(srcData[3] + start, accum3);

   468         }

   470         int16x4_t accum16;

   471         uint8x8_t res0, res1, res2, res3;

   473 #define PACK_RESULT(accum, res)                                         \

   474         accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);  \

   475         accum16 = vqmovn_s32(accum);                                    \

   476         res = vqmovun_s16(vcombine_s16(accum16, accum16));

   478         PACK_RESULT(accum0, res0);

   479         PACK_RESULT(accum1, res1);

   480         PACK_RESULT(accum2, res2);

   481         PACK_RESULT(accum3, res3);

   483         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);

   484         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);

   485         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);

   486         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);

   487         outRow[0] += 4;

   488         outRow[1] += 4;

   489         outRow[2] += 4;

   490         outRow[3] += 4;

   491     }

   492 }

   494 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {

   495     // Padding |paddingCount| of more dummy coefficients after the coefficients

   496     // of last filter to prevent SIMD instructions which load 8 or 16 bytes

   497     // together to access invalid memory areas. We are not trying to align the

   498     // coefficients right now due to the opaqueness of <vector> implementation.

   499     // This has to be done after all |AddFilter| calls.

   500     for (int i = 0; i < 8; ++i) {

   501         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));

   502     }

   503 }

   505 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {

   506     procs->fExtraHorizontalReads = 3;

   507     procs->fConvolveVertically = &convolveVertically_neon;

   508     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;

   509     procs->fConvolveHorizontally = &convolveHorizontally_neon;

   510     procs->fApplySIMDPadding = &applySIMDPadding_neon;

   511 }

The Tor Browser / file revision

gfx/skia/trunk/src/opts/SkBitmapProcState_arm_neon.cpp@129ffea94266

gfx/skia/trunk/src/opts/SkBitmapProcState_arm_neon.cpp