gfx/skia/trunk/src/opts/SkBitmapProcState_arm_neon.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1
michael@0 2 /*
michael@0 3 * Copyright 2012 Google Inc.
michael@0 4 *
michael@0 5 * Use of this source code is governed by a BSD-style license that can be
michael@0 6 * found in the LICENSE file.
michael@0 7 */
michael@0 8 #include "SkBitmapProcState.h"
michael@0 9 #include "SkBitmapProcState_filter.h"
michael@0 10 #include "SkColorPriv.h"
michael@0 11 #include "SkFilterProc.h"
michael@0 12 #include "SkPaint.h"
michael@0 13 #include "SkShader.h" // for tilemodes
michael@0 14 #include "SkUtilsArm.h"
michael@0 15
michael@0 16 // Required to ensure the table is part of the final binary.
michael@0 17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
michael@0 18 extern const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[];
michael@0 19
michael@0 20 #define NAME_WRAP(x) x ## _neon
michael@0 21 #include "SkBitmapProcState_filter_neon.h"
michael@0 22 #include "SkBitmapProcState_procs.h"
michael@0 23
michael@0 24 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
michael@0 25 S32_opaque_D32_nofilter_DXDY_neon,
michael@0 26 S32_alpha_D32_nofilter_DXDY_neon,
michael@0 27 S32_opaque_D32_nofilter_DX_neon,
michael@0 28 S32_alpha_D32_nofilter_DX_neon,
michael@0 29 S32_opaque_D32_filter_DXDY_neon,
michael@0 30 S32_alpha_D32_filter_DXDY_neon,
michael@0 31 S32_opaque_D32_filter_DX_neon,
michael@0 32 S32_alpha_D32_filter_DX_neon,
michael@0 33
michael@0 34 S16_opaque_D32_nofilter_DXDY_neon,
michael@0 35 S16_alpha_D32_nofilter_DXDY_neon,
michael@0 36 S16_opaque_D32_nofilter_DX_neon,
michael@0 37 S16_alpha_D32_nofilter_DX_neon,
michael@0 38 S16_opaque_D32_filter_DXDY_neon,
michael@0 39 S16_alpha_D32_filter_DXDY_neon,
michael@0 40 S16_opaque_D32_filter_DX_neon,
michael@0 41 S16_alpha_D32_filter_DX_neon,
michael@0 42
michael@0 43 SI8_opaque_D32_nofilter_DXDY_neon,
michael@0 44 SI8_alpha_D32_nofilter_DXDY_neon,
michael@0 45 SI8_opaque_D32_nofilter_DX_neon,
michael@0 46 SI8_alpha_D32_nofilter_DX_neon,
michael@0 47 SI8_opaque_D32_filter_DXDY_neon,
michael@0 48 SI8_alpha_D32_filter_DXDY_neon,
michael@0 49 SI8_opaque_D32_filter_DX_neon,
michael@0 50 SI8_alpha_D32_filter_DX_neon,
michael@0 51
michael@0 52 S4444_opaque_D32_nofilter_DXDY_neon,
michael@0 53 S4444_alpha_D32_nofilter_DXDY_neon,
michael@0 54 S4444_opaque_D32_nofilter_DX_neon,
michael@0 55 S4444_alpha_D32_nofilter_DX_neon,
michael@0 56 S4444_opaque_D32_filter_DXDY_neon,
michael@0 57 S4444_alpha_D32_filter_DXDY_neon,
michael@0 58 S4444_opaque_D32_filter_DX_neon,
michael@0 59 S4444_alpha_D32_filter_DX_neon,
michael@0 60
michael@0 61 // A8 treats alpha/opauqe the same (equally efficient)
michael@0 62 SA8_alpha_D32_nofilter_DXDY_neon,
michael@0 63 SA8_alpha_D32_nofilter_DXDY_neon,
michael@0 64 SA8_alpha_D32_nofilter_DX_neon,
michael@0 65 SA8_alpha_D32_nofilter_DX_neon,
michael@0 66 SA8_alpha_D32_filter_DXDY_neon,
michael@0 67 SA8_alpha_D32_filter_DXDY_neon,
michael@0 68 SA8_alpha_D32_filter_DX_neon,
michael@0 69 SA8_alpha_D32_filter_DX_neon
michael@0 70 };
michael@0 71
michael@0 72 const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = {
michael@0 73 S32_D16_nofilter_DXDY_neon,
michael@0 74 S32_D16_nofilter_DX_neon,
michael@0 75 S32_D16_filter_DXDY_neon,
michael@0 76 S32_D16_filter_DX_neon,
michael@0 77
michael@0 78 S16_D16_nofilter_DXDY_neon,
michael@0 79 S16_D16_nofilter_DX_neon,
michael@0 80 S16_D16_filter_DXDY_neon,
michael@0 81 S16_D16_filter_DX_neon,
michael@0 82
michael@0 83 SI8_D16_nofilter_DXDY_neon,
michael@0 84 SI8_D16_nofilter_DX_neon,
michael@0 85 SI8_D16_filter_DXDY_neon,
michael@0 86 SI8_D16_filter_DX_neon,
michael@0 87
michael@0 88 // Don't support 4444 -> 565
michael@0 89 NULL, NULL, NULL, NULL,
michael@0 90 // Don't support A8 -> 565
michael@0 91 NULL, NULL, NULL, NULL
michael@0 92 };
michael@0 93
michael@0 94 ///////////////////////////////////////////////////////////////////////////////
michael@0 95
michael@0 96 #include <arm_neon.h>
michael@0 97 #include "SkConvolver.h"
michael@0 98
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
//
// One 4-byte output pixel is written to |outRow| per filter value. For each
// output pixel the filter returned by FilterForValue() is applied to
// |filterLength| consecutive source pixels starting at pixel |filterOffset|.
//
// |hasAlpha| is not read by this implementation.
//
// Memory requirements visible in this code:
//  - pixels are always loaded 16 bytes (four pixels) at a time, so when
//    |filterLength| is not a multiple of 4 the last load reads source bytes
//    past the pixels the filter actually covers;
//  - coefficients are always loaded four at a time, so the last load may read
//    past the coefficients of the current filter (the surplus ones are masked
//    to zero before use).
void convolveHorizontally_neon(const unsigned char* srcData,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* outRow,
                               bool hasAlpha) {
    // Byte-shuffle tables for vtbl1_u8. Table k copies bytes 2k and 2k+1 of the
    // coefficient vector (i.e. the k-th 16-bit coefficient) into all four
    // 16-bit lanes. They do not depend on the output pixel, so build them once.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

    // |mask[r]| keeps the first r coefficients of a 4-coefficient load and
    // zeroes the rest. mask[0] is never used.
    static const uint16_t mask[4][4] = {
        {0, 0, 0, 0},
        {0xFFFF, 0, 0, 0},
        {0xFFFF, 0xFFFF, 0, 0},
        {0xFFFF, 0xFFFF, 0xFFFF, 0}
    };

    // Loop over each pixel on this row in the output image.
    const int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // Get the filter that determines the current output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filterLength| pixels (4 bytes each) after this.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|
        // (one 32-bit lane per channel).
        int32x4_t accum = vdupq_n_s32(0);

        // Full groups of four coefficients / four pixels.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            // Load 4 coefficients and splat each one across its own vector.
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            // Load 4 pixels and widen every channel to 16 bits.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            // accum += pixel_i * coeff_i (widening multiply-accumulate).
            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
            accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

            // Advance the pointers
            rowToFilter += 16;
            filterValues += 4;
        }

        // Remaining 1..3 coefficients.
        const int r = filterLength & 3;
        if (r) {
            // Load four coefficients and zero the ones past the filter's end.
            const uint16x4_t masked = vand_u16(
                vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)),
                vld1_u16(mask[r]));
            const uint8x8_t coeffs = vreinterpret_u8_u16(masked);
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));

            // Load pixels and calc. The fourth pixel of the load is ignored.
            const uint8x16_t pixels = vld1q_u8(rowToFilter);
            const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
            accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
            accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Pack (32 -> 16 bits signed saturating, 16 -> 8 bits unsigned
        // saturating) and store the new pixel.
        const int16x4_t accum16 = vqmovn_s32(accum);
        const uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
michael@0 196
// Converts one 32-bit fixed-point accumulator vector back to pixel range:
// drops the kShiftBits fractional bits, then narrows to 16 bits per channel
// with signed saturation.
static inline int16x4_t convolveVertically_narrow_neon(int32x4_t accum) {
    return vqmovn_s32(vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits));
}

// Finalizes the alpha byte (top byte of every 32-bit pixel) of four packed
// 8-bit-per-channel pixels.
//  - hasAlpha == true : alpha is raised, if needed, to the maximum of the
//    pixel's three lower (colour) bytes.
//  - hasAlpha == false: alpha is forced to 0xFF.
template<bool hasAlpha>
static inline uint8x16_t convolveVertically_fixAlpha_neon(uint8x16_t accum8) {
    if (hasAlpha) {
        // Shift each pixel right by one byte so the second channel lines up
        // with the first; the low byte of |b| becomes max(ch0, ch1).
        uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
        uint8x16_t b = vmaxq_u8(a, accum8);
        // Shift by two bytes to line up the third channel; the low byte of
        // |b| becomes max(ch0, ch1, ch2).
        a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
        b = vmaxq_u8(a, b);
        // Move that maximum into the alpha position and clear the other bytes.
        b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

        // Make sure the value of alpha channel is always larger than maximum
        // value of color channels.
        return vmaxq_u8(b, accum8);
    }
    // Set value of alpha channels to 0xFF.
    return vreinterpretq_u8_u32(vorrq_u32(vreinterpretq_u32_u8(accum8),
                                          vdupq_n_u32(0xFF000000)));
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// Source rows are always read 16 bytes (four pixels) at a time, including for
// the final 1..3 leftover pixels, so up to 12 bytes past |pixelWidth * 4| are
// read from every source row when |pixelWidth| is not a multiple of 4.
// NOTE(review): callers presumably pad the source rows accordingly -- confirm.
template<bool hasAlpha>
void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                             int filterLength,
                             unsigned char* const* sourceDataRows,
                             int pixelWidth,
                             unsigned char* outRow) {
    // Number of pixels handled by the four-at-a-time loop.
    const int width = pixelWidth & ~3;

    // Output four pixels per iteration (16 bytes).
    for (int outX = 0; outX < width; outX += 4) {
        // Accumulated result for each pixel. 32 bits per channel.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Convolve with one filter coefficient per iteration.
        for (int filterY = 0; filterY < filterLength; filterY++) {
            // Duplicate the filter coefficient 4 times.
            const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            // Load four pixels (16 bytes) together and widen to 16 bits.
            const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
            const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            // accum_i += pixel_i * coefficient
            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
            accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
        }

        // Shift right for the fixed point implementation and pack the 32-bit
        // accumulators to 16 bits per channel (signed saturation).
        const int16x8_t accum16_0 = vcombine_s16(convolveVertically_narrow_neon(accum0),
                                                 convolveVertically_narrow_neon(accum1));
        const int16x8_t accum16_1 = vcombine_s16(convolveVertically_narrow_neon(accum2),
                                                 convolveVertically_narrow_neon(accum3));

        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
        accum8 = convolveVertically_fixAlpha_neon<hasAlpha>(accum8);

        // Store the convolution result (16 bytes) and advance the pixel pointers.
        vst1q_u8(outRow, accum8);
        outRow += 16;
    }

    // Process the leftovers when the width of the output is not divisible
    // by 4, that is at most 3 pixels.
    const int r = pixelWidth & 3;
    if (r) {
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);

        for (int filterY = 0; filterY < filterLength; ++filterY) {
            const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

            // Still a full 16-byte load; only the first three pixels are used.
            const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
            const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
            const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

            accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
            accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
            accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
        }

        // The fourth pixel slot just repeats the third; it is never stored.
        const int16x4_t narrow2 = convolveVertically_narrow_neon(accum2);
        const int16x8_t accum16_0 = vcombine_s16(convolveVertically_narrow_neon(accum0),
                                                 convolveVertically_narrow_neon(accum1));
        const int16x8_t accum16_1 = vcombine_s16(narrow2, narrow2);

        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
        accum8 = convolveVertically_fixAlpha_neon<hasAlpha>(accum8);

        // Store exactly |r| pixels so nothing is written past the output row.
        switch (r) {
        case 1:
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow),
                           vreinterpretq_u32_u8(accum8), 0);
            break;
        case 2:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            break;
        case 3:
            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                     vreinterpret_u32_u8(vget_low_u8(accum8)));
            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8),
                           vreinterpretq_u32_u8(accum8), 2);
            break;
        }
    }
}
michael@0 356
// Runtime entry point for vertical convolution: forwards to the
// convolveVertically_neon<true> or convolveVertically_neon<false>
// instantiation according to |sourceHasAlpha|. All other arguments are passed
// through unchanged.
void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                             int filterLength,
                             unsigned char* const* sourceDataRows,
                             int pixelWidth,
                             unsigned char* outRow,
                             bool sourceHasAlpha) {
    if (!sourceHasAlpha) {
        convolveVertically_neon<false>(filterValues, filterLength,
                                       sourceDataRows, pixelWidth,
                                       outRow);
        return;
    }

    convolveVertically_neon<true>(filterValues, filterLength,
                                  sourceDataRows, pixelWidth,
                                  outRow);
}
michael@0 373
// Loads four pixels (16 bytes) from |src|, multiplies pixel i by |coeffI|
// (each coefficient vector holds one coefficient splatted across its four
// lanes) and returns |accum| plus the four products.
static inline int32x4_t convolve4Rows_accumulate_neon(const unsigned char* src,
                                                      int16x4_t coeff0,
                                                      int16x4_t coeff1,
                                                      int16x4_t coeff2,
                                                      int16x4_t coeff3,
                                                      int32x4_t accum) {
    const uint8x16_t pixels = vld1q_u8(src);
    const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
    const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
    accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
    accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
    accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
    accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);
    return accum;
}

// Drops the kShiftBits fractional bits of |accum|, saturates every channel to
// 8 bits and writes the resulting 4-byte pixel to |out|.
static inline void convolve4Rows_storePixel_neon(int32x4_t accum, unsigned char* out) {
    const int16x4_t accum16 =
        vqmovn_s32(vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits));
    const uint8x8_t res = vqmovun_s16(vcombine_s16(accum16, accum16));
    vst1_lane_u32(reinterpret_cast<uint32_t*>(out), vreinterpret_u32_u8(res), 0);
}

// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The original comment refers the reader to |ConvolveHorizontally_SSE2| for
// detailed comments on the algorithm.
//
// One 4-byte pixel is written to each of the four output rows per filter
// value, and every pointer in |outRow| is advanced past the pixels written
// (the caller's array is modified).
//
// Pixels and coefficients are always loaded four at a time, so when a
// filter's length is not a multiple of 4 the last loads read past the pixels
// and coefficients that filter covers (surplus coefficients are masked to
// zero before use).
void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* outRow[4]) {
    // Byte-shuffle tables for vtbl1_u8. Table k copies the k-th 16-bit
    // coefficient of a 4-coefficient load into all four 16-bit lanes.
    const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
    const int num_values = filter.numValues();

    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filterLength| is not divisible by 4.
    // mask[0] is not used in following algorithm.
    static const uint16_t mask[4][4] = {
        {0, 0, 0, 0},
        {0xFFFF, 0, 0, 0},
        {0xFFFF, 0xFFFF, 0, 0},
        {0xFFFF, 0xFFFF, 0xFFFF, 0}
    };

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int outX = 0; outX < num_values; outX++) {
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One accumulator per row: four pixels in a column per iteration.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // Byte offset, within every row, of the first pixel the filter touches.
        int start = (filterOffset << 2);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
            const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            accum0 = convolve4Rows_accumulate_neon(srcData[0] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum0);
            accum1 = convolve4Rows_accumulate_neon(srcData[1] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum1);
            accum2 = convolve4Rows_accumulate_neon(srcData[2] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum2);
            accum3 = convolve4Rows_accumulate_neon(srcData[3] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum3);

            start += 16;
            filterValues += 4;
        }

        // Remaining 1..3 coefficients: load four, zero the surplus ones, and
        // run the same accumulation (masked coefficients contribute zero).
        const int r = filterLength & 3;
        if (r) {
            const int16x4_t masked = vand_s16(vld1_s16(filterValues),
                                              vreinterpret_s16_u16(vld1_u16(mask[r])));
            const uint8x8_t coeffs = vreinterpret_u8_s16(masked);
            const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
            const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
            const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
            const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

            accum0 = convolve4Rows_accumulate_neon(srcData[0] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum0);
            accum1 = convolve4Rows_accumulate_neon(srcData[1] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum1);
            accum2 = convolve4Rows_accumulate_neon(srcData[2] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum2);
            accum3 = convolve4Rows_accumulate_neon(srcData[3] + start,
                                                   coeff0, coeff1, coeff2, coeff3, accum3);
        }

        // Scale back from fixed point, pack and store one pixel per row.
        convolve4Rows_storePixel_neon(accum0, outRow[0]);
        convolve4Rows_storePixel_neon(accum1, outRow[1]);
        convolve4Rows_storePixel_neon(accum2, outRow[2]);
        convolve4Rows_storePixel_neon(accum3, outRow[3]);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
michael@0 493
michael@0 494 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
michael@0 495 // Padding |paddingCount| of more dummy coefficients after the coefficients
michael@0 496 // of last filter to prevent SIMD instructions which load 8 or 16 bytes
michael@0 497 // together to access invalid memory areas. We are not trying to align the
michael@0 498 // coefficients right now due to the opaqueness of <vector> implementation.
michael@0 499 // This has to be done after all |AddFilter| calls.
michael@0 500 for (int i = 0; i < 8; ++i) {
michael@0 501 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
michael@0 502 }
michael@0 503 }
michael@0 504
michael@0 505 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
michael@0 506 procs->fExtraHorizontalReads = 3;
michael@0 507 procs->fConvolveVertically = &convolveVertically_neon;
michael@0 508 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
michael@0 509 procs->fConvolveHorizontally = &convolveHorizontally_neon;
michael@0 510 procs->fApplySIMDPadding = &applySIMDPadding_neon;
michael@0 511 }

mercurial