gfx/skia/trunk/src/opts/SkBitmapProcState_arm_neon.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_arm_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,511 @@
     1.4 +
     1.5 +/*
     1.6 + * Copyright 2012 Google Inc.
     1.7 + *
     1.8 + * Use of this source code is governed by a BSD-style license that can be
     1.9 + * found in the LICENSE file.
    1.10 + */
    1.11 +#include "SkBitmapProcState.h"
    1.12 +#include "SkBitmapProcState_filter.h"
    1.13 +#include "SkColorPriv.h"
    1.14 +#include "SkFilterProc.h"
    1.15 +#include "SkPaint.h"
    1.16 +#include "SkShader.h"   // for tilemodes
    1.17 +#include "SkUtilsArm.h"
    1.18 +
    1.19 +// Required to ensure the table is part of the final binary.
    1.20 +extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
    1.21 +extern const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[];
    1.22 +
    1.23 +#define   NAME_WRAP(x)  x ## _neon
    1.24 +#include "SkBitmapProcState_filter_neon.h"
    1.25 +#include "SkBitmapProcState_procs.h"
    1.26 +
    1.27 +const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
    1.28 +    S32_opaque_D32_nofilter_DXDY_neon,
    1.29 +    S32_alpha_D32_nofilter_DXDY_neon,
    1.30 +    S32_opaque_D32_nofilter_DX_neon,
    1.31 +    S32_alpha_D32_nofilter_DX_neon,
    1.32 +    S32_opaque_D32_filter_DXDY_neon,
    1.33 +    S32_alpha_D32_filter_DXDY_neon,
    1.34 +    S32_opaque_D32_filter_DX_neon,
    1.35 +    S32_alpha_D32_filter_DX_neon,
    1.36 +
    1.37 +    S16_opaque_D32_nofilter_DXDY_neon,
    1.38 +    S16_alpha_D32_nofilter_DXDY_neon,
    1.39 +    S16_opaque_D32_nofilter_DX_neon,
    1.40 +    S16_alpha_D32_nofilter_DX_neon,
    1.41 +    S16_opaque_D32_filter_DXDY_neon,
    1.42 +    S16_alpha_D32_filter_DXDY_neon,
    1.43 +    S16_opaque_D32_filter_DX_neon,
    1.44 +    S16_alpha_D32_filter_DX_neon,
    1.45 +
    1.46 +    SI8_opaque_D32_nofilter_DXDY_neon,
    1.47 +    SI8_alpha_D32_nofilter_DXDY_neon,
    1.48 +    SI8_opaque_D32_nofilter_DX_neon,
    1.49 +    SI8_alpha_D32_nofilter_DX_neon,
    1.50 +    SI8_opaque_D32_filter_DXDY_neon,
    1.51 +    SI8_alpha_D32_filter_DXDY_neon,
    1.52 +    SI8_opaque_D32_filter_DX_neon,
    1.53 +    SI8_alpha_D32_filter_DX_neon,
    1.54 +
    1.55 +    S4444_opaque_D32_nofilter_DXDY_neon,
    1.56 +    S4444_alpha_D32_nofilter_DXDY_neon,
    1.57 +    S4444_opaque_D32_nofilter_DX_neon,
    1.58 +    S4444_alpha_D32_nofilter_DX_neon,
    1.59 +    S4444_opaque_D32_filter_DXDY_neon,
    1.60 +    S4444_alpha_D32_filter_DXDY_neon,
    1.61 +    S4444_opaque_D32_filter_DX_neon,
    1.62 +    S4444_alpha_D32_filter_DX_neon,
    1.63 +
    1.64 +    // A8 treats alpha/opauqe the same (equally efficient)
    1.65 +    SA8_alpha_D32_nofilter_DXDY_neon,
    1.66 +    SA8_alpha_D32_nofilter_DXDY_neon,
    1.67 +    SA8_alpha_D32_nofilter_DX_neon,
    1.68 +    SA8_alpha_D32_nofilter_DX_neon,
    1.69 +    SA8_alpha_D32_filter_DXDY_neon,
    1.70 +    SA8_alpha_D32_filter_DXDY_neon,
    1.71 +    SA8_alpha_D32_filter_DX_neon,
    1.72 +    SA8_alpha_D32_filter_DX_neon
    1.73 +};
    1.74 +
    1.75 +const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = {
    1.76 +    S32_D16_nofilter_DXDY_neon,
    1.77 +    S32_D16_nofilter_DX_neon,
    1.78 +    S32_D16_filter_DXDY_neon,
    1.79 +    S32_D16_filter_DX_neon,
    1.80 +
    1.81 +    S16_D16_nofilter_DXDY_neon,
    1.82 +    S16_D16_nofilter_DX_neon,
    1.83 +    S16_D16_filter_DXDY_neon,
    1.84 +    S16_D16_filter_DX_neon,
    1.85 +
    1.86 +    SI8_D16_nofilter_DXDY_neon,
    1.87 +    SI8_D16_nofilter_DX_neon,
    1.88 +    SI8_D16_filter_DXDY_neon,
    1.89 +    SI8_D16_filter_DX_neon,
    1.90 +
    1.91 +    // Don't support 4444 -> 565
    1.92 +    NULL, NULL, NULL, NULL,
    1.93 +    // Don't support A8 -> 565
    1.94 +    NULL, NULL, NULL, NULL
    1.95 +};
    1.96 +
    1.97 +///////////////////////////////////////////////////////////////////////////////
    1.98 +
    1.99 +#include <arm_neon.h>
   1.100 +#include "SkConvolver.h"
   1.101 +
   1.102 +// Convolves horizontally along a single row. The row data is given in
   1.103 +// |srcData| and continues for the numValues() of the filter.
   1.104 +void convolveHorizontally_neon(const unsigned char* srcData,
   1.105 +                               const SkConvolutionFilter1D& filter,
   1.106 +                               unsigned char* outRow,
   1.107 +                               bool hasAlpha) {
   1.108 +    // Loop over each pixel on this row in the output image.
   1.109 +    int numValues = filter.numValues();
   1.110 +    for (int outX = 0; outX < numValues; outX++) {
   1.111 +        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
   1.112 +        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
   1.113 +        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
   1.114 +        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
   1.115 +        // Get the filter that determines the current output pixel.
   1.116 +        int filterOffset, filterLength;
   1.117 +        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
   1.118 +            filter.FilterForValue(outX, &filterOffset, &filterLength);
   1.119 +
   1.120 +        // Compute the first pixel in this row that the filter affects. It will
   1.121 +        // touch |filterLength| pixels (4 bytes each) after this.
   1.122 +        const unsigned char* rowToFilter = &srcData[filterOffset * 4];
   1.123 +
   1.124 +        // Apply the filter to the row to get the destination pixel in |accum|.
   1.125 +        int32x4_t accum = vdupq_n_s32(0);
   1.126 +        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
   1.127 +            // Load 4 coefficients
   1.128 +            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
   1.129 +            coeffs = vld1_s16(filterValues);
   1.130 +            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
   1.131 +            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
   1.132 +            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
   1.133 +            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
   1.134 +
   1.135 +            // Load pixels and calc
   1.136 +            uint8x16_t pixels = vld1q_u8(rowToFilter);
   1.137 +            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
   1.138 +            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
   1.139 +
   1.140 +            int16x4_t p0_src = vget_low_s16(p01_16);
   1.141 +            int16x4_t p1_src = vget_high_s16(p01_16);
   1.142 +            int16x4_t p2_src = vget_low_s16(p23_16);
   1.143 +            int16x4_t p3_src = vget_high_s16(p23_16);
   1.144 +
   1.145 +            int32x4_t p0 = vmull_s16(p0_src, coeff0);
   1.146 +            int32x4_t p1 = vmull_s16(p1_src, coeff1);
   1.147 +            int32x4_t p2 = vmull_s16(p2_src, coeff2);
   1.148 +            int32x4_t p3 = vmull_s16(p3_src, coeff3);
   1.149 +
   1.150 +            accum += p0;
   1.151 +            accum += p1;
   1.152 +            accum += p2;
   1.153 +            accum += p3;
   1.154 +
   1.155 +            // Advance the pointers
   1.156 +            rowToFilter += 16;
   1.157 +            filterValues += 4;
   1.158 +        }
   1.159 +        int r = filterLength & 3;
   1.160 +        if (r) {
   1.161 +            const uint16_t mask[4][4] = {
   1.162 +                {0, 0, 0, 0},
   1.163 +                {0xFFFF, 0, 0, 0},
   1.164 +                {0xFFFF, 0xFFFF, 0, 0},
   1.165 +                {0xFFFF, 0xFFFF, 0xFFFF, 0}
   1.166 +            };
   1.167 +            uint16x4_t coeffs;
   1.168 +            int16x4_t coeff0, coeff1, coeff2;
   1.169 +            coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
   1.170 +            coeffs &= vld1_u16(&mask[r][0]);
   1.171 +            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
   1.172 +            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
   1.173 +            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));
   1.174 +
   1.175 +            // Load pixels and calc
   1.176 +            uint8x16_t pixels = vld1q_u8(rowToFilter);
   1.177 +            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
   1.178 +            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
   1.179 +            int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
   1.180 +            int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
   1.181 +            int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
   1.182 +
   1.183 +            accum += p0;
   1.184 +            accum += p1;
   1.185 +            accum += p2;
   1.186 +        }
   1.187 +
   1.188 +        // Bring this value back in range. All of the filter scaling factors
   1.189 +        // are in fixed point with kShiftBits bits of fractional part.
   1.190 +        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
   1.191 +
   1.192 +        // Pack and store the new pixel.
   1.193 +        int16x4_t accum16 = vqmovn_s32(accum);
   1.194 +        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
   1.195 +        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
   1.196 +        outRow += 4;
   1.197 +    }
   1.198 +}
   1.199 +
   1.200 +// Does vertical convolution to produce one output row. The filter values and
   1.201 +// length are given in the first two parameters. These are applied to each
   1.202 +// of the rows pointed to in the |sourceDataRows| array, with each row
   1.203 +// being |pixelWidth| wide.
   1.204 +//
   1.205 +// The output must have room for |pixelWidth * 4| bytes.
   1.206 +template<bool hasAlpha>
   1.207 +void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
   1.208 +                             int filterLength,
   1.209 +                             unsigned char* const* sourceDataRows,
   1.210 +                             int pixelWidth,
   1.211 +                             unsigned char* outRow) {
   1.212 +    int width = pixelWidth & ~3;
   1.213 +
   1.214 +    int32x4_t accum0, accum1, accum2, accum3;
   1.215 +    int16x4_t coeff16;
   1.216 +
   1.217 +    // Output four pixels per iteration (16 bytes).
   1.218 +    for (int outX = 0; outX < width; outX += 4) {
   1.219 +
   1.220 +        // Accumulated result for each pixel. 32 bits per RGBA channel.
   1.221 +        accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);
   1.222 +
   1.223 +        // Convolve with one filter coefficient per iteration.
   1.224 +        for (int filterY = 0; filterY < filterLength; filterY++) {
   1.225 +
   1.226 +            // Duplicate the filter coefficient 4 times.
   1.227 +            // [16] cj cj cj cj
   1.228 +            coeff16 = vdup_n_s16(filterValues[filterY]);
   1.229 +
   1.230 +            // Load four pixels (16 bytes) together.
   1.231 +            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.232 +            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
   1.233 +
   1.234 +            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
   1.235 +            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
   1.236 +            int16x4_t src16_0 = vget_low_s16(src16_01);
   1.237 +            int16x4_t src16_1 = vget_high_s16(src16_01);
   1.238 +            int16x4_t src16_2 = vget_low_s16(src16_23);
   1.239 +            int16x4_t src16_3 = vget_high_s16(src16_23);
   1.240 +
   1.241 +            accum0 += vmull_s16(src16_0, coeff16);
   1.242 +            accum1 += vmull_s16(src16_1, coeff16);
   1.243 +            accum2 += vmull_s16(src16_2, coeff16);
   1.244 +            accum3 += vmull_s16(src16_3, coeff16);
   1.245 +        }
   1.246 +
   1.247 +        // Shift right for fixed point implementation.
   1.248 +        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
   1.249 +        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
   1.250 +        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
   1.251 +        accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);
   1.252 +
   1.253 +        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
   1.254 +        // [16] a1 b1 g1 r1 a0 b0 g0 r0
   1.255 +        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
   1.256 +        // [16] a3 b3 g3 r3 a2 b2 g2 r2
   1.257 +        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));
   1.258 +
   1.259 +        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
   1.260 +        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.261 +        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
   1.262 +
   1.263 +        if (hasAlpha) {
   1.264 +            // Compute the max(ri, gi, bi) for each pixel.
   1.265 +            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
   1.266 +            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
   1.267 +            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.268 +            uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
   1.269 +            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
   1.270 +            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
   1.271 +            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.272 +            b = vmaxq_u8(a, b); // Max of r and g and b.
   1.273 +            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
   1.274 +            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
   1.275 +
   1.276 +            // Make sure the value of alpha channel is always larger than maximum
   1.277 +            // value of color channels.
   1.278 +            accum8 = vmaxq_u8(b, accum8);
   1.279 +        } else {
   1.280 +            // Set value of alpha channels to 0xFF.
   1.281 +            accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
   1.282 +        }
   1.283 +
   1.284 +        // Store the convolution result (16 bytes) and advance the pixel pointers.
   1.285 +        vst1q_u8(outRow, accum8);
   1.286 +        outRow += 16;
   1.287 +    }
   1.288 +
   1.289 +    // Process the leftovers when the width of the output is not divisible
   1.290 +    // by 4, that is at most 3 pixels.
   1.291 +    int r = pixelWidth & 3;
   1.292 +    if (r) {
   1.293 +
   1.294 +        accum0 = accum1 = accum2 = vdupq_n_s32(0);
   1.295 +
   1.296 +        for (int filterY = 0; filterY < filterLength; ++filterY) {
   1.297 +            coeff16 = vdup_n_s16(filterValues[filterY]);
   1.298 +
   1.299 +            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
   1.300 +            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
   1.301 +
   1.302 +            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
   1.303 +            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
   1.304 +            int16x4_t src16_0 = vget_low_s16(src16_01);
   1.305 +            int16x4_t src16_1 = vget_high_s16(src16_01);
   1.306 +            int16x4_t src16_2 = vget_low_s16(src16_23);
   1.307 +
   1.308 +            accum0 += vmull_s16(src16_0, coeff16);
   1.309 +            accum1 += vmull_s16(src16_1, coeff16);
   1.310 +            accum2 += vmull_s16(src16_2, coeff16);
   1.311 +        }
   1.312 +
   1.313 +        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
   1.314 +        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
   1.315 +        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
   1.316 +
   1.317 +        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
   1.318 +        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));
   1.319 +
   1.320 +        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
   1.321 +
   1.322 +        if (hasAlpha) {
   1.323 +            // Compute the max(ri, gi, bi) for each pixel.
   1.324 +            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
   1.325 +            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
   1.326 +            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.327 +            uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
   1.328 +            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
   1.329 +            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
   1.330 +            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
   1.331 +            b = vmaxq_u8(a, b); // Max of r and g and b.
   1.332 +            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
   1.333 +            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
   1.334 +
   1.335 +            // Make sure the value of alpha channel is always larger than maximum
   1.336 +            // value of color channels.
   1.337 +            accum8 = vmaxq_u8(b, accum8);
   1.338 +        } else {
   1.339 +            // Set value of alpha channels to 0xFF.
   1.340 +            accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
   1.341 +        }
   1.342 +
   1.343 +        switch(r) {
   1.344 +        case 1:
   1.345 +            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);
   1.346 +            break;
   1.347 +        case 2:
   1.348 +            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
   1.349 +                     vreinterpret_u32_u8(vget_low_u8(accum8)));
   1.350 +            break;
   1.351 +        case 3:
   1.352 +            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
   1.353 +                     vreinterpret_u32_u8(vget_low_u8(accum8)));
   1.354 +            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);
   1.355 +            break;
   1.356 +        }
   1.357 +    }
   1.358 +}
   1.359 +
   1.360 +void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
   1.361 +                             int filterLength,
   1.362 +                             unsigned char* const* sourceDataRows,
   1.363 +                             int pixelWidth,
   1.364 +                             unsigned char* outRow,
   1.365 +                             bool sourceHasAlpha) {
   1.366 +    if (sourceHasAlpha) {
   1.367 +        convolveVertically_neon<true>(filterValues, filterLength,
   1.368 +                                      sourceDataRows, pixelWidth,
   1.369 +                                      outRow);
   1.370 +    } else {
   1.371 +        convolveVertically_neon<false>(filterValues, filterLength,
   1.372 +                                       sourceDataRows, pixelWidth,
   1.373 +                                       outRow);
   1.374 +    }
   1.375 +}
   1.376 +
   1.377 +// Convolves horizontally along four rows. The row data is given in
   1.378 +// |src_data| and continues for the num_values() of the filter.
   1.379 +// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
   1.380 +// refer to that function for detailed comments.
   1.381 +void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
   1.382 +                                    const SkConvolutionFilter1D& filter,
   1.383 +                                    unsigned char* outRow[4]) {
   1.384 +
   1.385 +    uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
   1.386 +    uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
   1.387 +    uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
   1.388 +    uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
   1.389 +    int num_values = filter.numValues();
   1.390 +
   1.391 +    int filterOffset, filterLength;
   1.392 +    // |mask| will be used to decimate all extra filter coefficients that are
   1.393 +    // loaded by SIMD when |filter_length| is not divisible by 4.
   1.394 +    // mask[0] is not used in following algorithm.
   1.395 +    const uint16_t mask[4][4] = {
   1.396 +        {0, 0, 0, 0},
   1.397 +        {0xFFFF, 0, 0, 0},
   1.398 +        {0xFFFF, 0xFFFF, 0, 0},
   1.399 +        {0xFFFF, 0xFFFF, 0xFFFF, 0}
   1.400 +    };
   1.401 +
   1.402 +    // Output one pixel each iteration, calculating all channels (RGBA) together.
   1.403 +    for (int outX = 0; outX < num_values; outX++) {
   1.404 +
   1.405 +        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
   1.406 +        filter.FilterForValue(outX, &filterOffset, &filterLength);
   1.407 +
   1.408 +        // four pixels in a column per iteration.
   1.409 +        int32x4_t accum0 = vdupq_n_s32(0);
   1.410 +        int32x4_t accum1 = vdupq_n_s32(0);
   1.411 +        int32x4_t accum2 = vdupq_n_s32(0);
   1.412 +        int32x4_t accum3 = vdupq_n_s32(0);
   1.413 +
   1.414 +        int start = (filterOffset<<2);
   1.415 +
   1.416 +        // We will load and accumulate with four coefficients per iteration.
   1.417 +        for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
   1.418 +            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
   1.419 +
   1.420 +            coeffs = vld1_s16(filterValues);
   1.421 +            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
   1.422 +            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
   1.423 +            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
   1.424 +            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
   1.425 +
   1.426 +            uint8x16_t pixels;
   1.427 +            int16x8_t p01_16, p23_16;
   1.428 +            int32x4_t p0, p1, p2, p3;
   1.429 +
   1.430 +
   1.431 +#define ITERATION(src, accum)                                       \
   1.432 +    pixels = vld1q_u8(src);                                         \
   1.433 +    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
   1.434 +    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
   1.435 +    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
   1.436 +    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
   1.437 +    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
   1.438 +    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
   1.439 +    accum += p0;                                                    \
   1.440 +    accum += p1;                                                    \
   1.441 +    accum += p2;                                                    \
   1.442 +    accum += p3
   1.443 +
   1.444 +            ITERATION(srcData[0] + start, accum0);
   1.445 +            ITERATION(srcData[1] + start, accum1);
   1.446 +            ITERATION(srcData[2] + start, accum2);
   1.447 +            ITERATION(srcData[3] + start, accum3);
   1.448 +
   1.449 +            start += 16;
   1.450 +            filterValues += 4;
   1.451 +        }
   1.452 +
   1.453 +        int r = filterLength & 3;
   1.454 +        if (r) {
   1.455 +            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
   1.456 +            coeffs = vld1_s16(filterValues);
   1.457 +            coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));
   1.458 +            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
   1.459 +            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
   1.460 +            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
   1.461 +            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
   1.462 +
   1.463 +            uint8x16_t pixels;
   1.464 +            int16x8_t p01_16, p23_16;
   1.465 +            int32x4_t p0, p1, p2, p3;
   1.466 +
   1.467 +            ITERATION(srcData[0] + start, accum0);
   1.468 +            ITERATION(srcData[1] + start, accum1);
   1.469 +            ITERATION(srcData[2] + start, accum2);
   1.470 +            ITERATION(srcData[3] + start, accum3);
   1.471 +        }
   1.472 +
   1.473 +        int16x4_t accum16;
   1.474 +        uint8x8_t res0, res1, res2, res3;
   1.475 +
   1.476 +#define PACK_RESULT(accum, res)                                         \
   1.477 +        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);  \
   1.478 +        accum16 = vqmovn_s32(accum);                                    \
   1.479 +        res = vqmovun_s16(vcombine_s16(accum16, accum16));
   1.480 +
   1.481 +        PACK_RESULT(accum0, res0);
   1.482 +        PACK_RESULT(accum1, res1);
   1.483 +        PACK_RESULT(accum2, res2);
   1.484 +        PACK_RESULT(accum3, res3);
   1.485 +
   1.486 +        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
   1.487 +        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
   1.488 +        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
   1.489 +        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
   1.490 +        outRow[0] += 4;
   1.491 +        outRow[1] += 4;
   1.492 +        outRow[2] += 4;
   1.493 +        outRow[3] += 4;
   1.494 +    }
   1.495 +}
   1.496 +
   1.497 +void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
   1.498 +    // Padding |paddingCount| of more dummy coefficients after the coefficients
   1.499 +    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
   1.500 +    // together to access invalid memory areas. We are not trying to align the
   1.501 +    // coefficients right now due to the opaqueness of <vector> implementation.
   1.502 +    // This has to be done after all |AddFilter| calls.
   1.503 +    for (int i = 0; i < 8; ++i) {
   1.504 +        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
   1.505 +    }
   1.506 +}
   1.507 +
   1.508 +void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
   1.509 +    procs->fExtraHorizontalReads = 3;
   1.510 +    procs->fConvolveVertically = &convolveVertically_neon;
   1.511 +    procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
   1.512 +    procs->fConvolveHorizontally = &convolveHorizontally_neon;
   1.513 +    procs->fApplySIMDPadding = &applySIMDPadding_neon;
   1.514 +}

mercurial