gfx/skia/trunk/src/opts/SkBlurImage_opts_neon.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBlurImage_opts_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,188 @@
     1.4 +/*
     1.5 + * Copyright 2013 The Android Open Source Project
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + */
    1.10 +
    1.11 +
    1.12 +#include "SkBitmap.h"
    1.13 +#include "SkColorPriv.h"
    1.14 +#include "SkBlurImage_opts.h"
    1.15 +#include "SkRect.h"
    1.16 +
    1.17 +#include <arm_neon.h>
    1.18 +
    1.19 +namespace {
    1.20 +
    1.21 +enum BlurDirection {
    1.22 +    kX, kY
    1.23 +};
    1.24 +
    1.25 +/**
    1.26 + * Helper function to load 2 pixels from diffent rows to a 8x8 NEON register
    1.27 + * and also pre-load pixels for future read
    1.28 + */
    1.29 +template<BlurDirection srcDirection>
    1.30 +inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) {
    1.31 +    if (srcDirection == kX) {
    1.32 +        uint32x2_t temp = vdup_n_u32(0);
    1.33 +        // 10% faster by adding these 2 prefetches
    1.34 +        SK_PREFETCH(src + 16);
    1.35 +        SK_PREFETCH(src + srcStride + 16);
    1.36 +        return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1));
    1.37 +     } else {
    1.38 +         return vld1_u8((uint8_t*)src);
    1.39 +     }
    1.40 +}
    1.41 +
    1.42 +/**
    1.43 + * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows
    1.44 + */
    1.45 +template<BlurDirection dstDirection>
    1.46 +inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
    1.47 +    if (dstDirection == kX) {
    1.48 +        uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8));
    1.49 +        vst1_lane_u32(dst, temp, 0);
    1.50 +        vst1_lane_u32(dst + dstStride, temp, 1);
    1.51 +    } else {
    1.52 +        uint8x8_t temp = vmovn_u16(result16x8);
    1.53 +        vst1_u8((uint8_t*)dst, temp);
    1.54 +    }
    1.55 +}
    1.56 +
    1.57 +/**
    1.58 + * fast path for kernel size less than 128
    1.59 + */
    1.60 +template<BlurDirection srcDirection, BlurDirection dstDirection>
    1.61 +void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize,
    1.62 +                        int leftOffset, int rightOffset, int width, int* height)
    1.63 +{
    1.64 +    const int rightBorder = SkMin32(rightOffset + 1, width);
    1.65 +    const int srcStrideX = srcDirection == kX ? 1 : srcStride;
    1.66 +    const int dstStrideX = dstDirection == kX ? 1 : *height;
    1.67 +    const int srcStrideY = srcDirection == kX ? srcStride : 1;
    1.68 +    const int dstStrideY = dstDirection == kX ? width : 1;
    1.69 +    const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);
    1.70 +
    1.71 +    for (; *height >= 2; *height -= 2) {
    1.72 +        uint16x8_t sum = vdupq_n_u16(0);
    1.73 +        const SkPMColor* p = *src;
    1.74 +        for (int i = 0; i < rightBorder; i++) {
    1.75 +            sum = vaddw_u8(sum,
    1.76 +                load_2_pixels<srcDirection>(p, srcStride));
    1.77 +            p += srcStrideX;
    1.78 +        }
    1.79 +
    1.80 +        const SkPMColor* sptr = *src;
    1.81 +        SkPMColor* dptr = *dst;
    1.82 +        for (int x = 0; x < width; x++) {
    1.83 +            // val = (sum * scale * 2 + 0x8000) >> 16
    1.84 +            uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16(
    1.85 +                vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));
    1.86 +            store_2_pixels<dstDirection>(resultPixels, dptr, width);
    1.87 +
    1.88 +            if (x >= leftOffset) {
    1.89 +                sum = vsubw_u8(sum,
    1.90 +                    load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride));
    1.91 +            }
    1.92 +            if (x + rightOffset + 1 < width) {
    1.93 +                sum = vaddw_u8(sum,
    1.94 +                    load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride));
    1.95 +            }
    1.96 +            sptr += srcStrideX;
    1.97 +            dptr += dstStrideX;
    1.98 +        }
    1.99 +        *src += srcStrideY * 2;
   1.100 +        *dst += dstStrideY * 2;
   1.101 +    }
   1.102 +}
   1.103 +
   1.104 +
   1.105 +/**
   1.106 + * Helper function to spread the components of a 32-bit integer into the
   1.107 + * lower 8 bits of each 16-bit element of a NEON register.
   1.108 + */
   1.109 +
   1.110 +static inline uint16x4_t expand(uint32_t a) {
   1.111 +    // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B )
   1.112 +    uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a));
   1.113 +    // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B )
   1.114 +    return vget_low_u16(vmovl_u8(v8));
   1.115 +}
   1.116 +
   1.117 +template<BlurDirection srcDirection, BlurDirection dstDirection>
   1.118 +void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
   1.119 +                    int leftOffset, int rightOffset, int width, int height)
   1.120 +{
   1.121 +    const int rightBorder = SkMin32(rightOffset + 1, width);
   1.122 +    const int srcStrideX = srcDirection == kX ? 1 : srcStride;
   1.123 +    const int dstStrideX = dstDirection == kX ? 1 : height;
   1.124 +    const int srcStrideY = srcDirection == kX ? srcStride : 1;
   1.125 +    const int dstStrideY = dstDirection == kX ? width : 1;
   1.126 +    const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
   1.127 +    const uint32x4_t half = vdupq_n_u32(1 << 23);
   1.128 +
   1.129 +    if (kernelSize < 128)
   1.130 +    {
   1.131 +        SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize,
   1.132 +            leftOffset, rightOffset, width, &height);
   1.133 +    }
   1.134 +
   1.135 +    for (; height > 0; height--) {
   1.136 +        uint32x4_t sum = vdupq_n_u32(0);
   1.137 +        const SkPMColor* p = src;
   1.138 +        for (int i = 0; i < rightBorder; ++i) {
   1.139 +            sum = vaddw_u16(sum, expand(*p));
   1.140 +            p += srcStrideX;
   1.141 +        }
   1.142 +
   1.143 +        const SkPMColor* sptr = src;
   1.144 +        SkPMColor* dptr = dst;
   1.145 +        for (int x = 0; x < width; ++x) {
   1.146 +            // ( half+sumA*scale half+sumR*scale half+sumG*scale half+sumB*scale )
   1.147 +            uint32x4_t result = vmlaq_u32(half, sum, scale);
   1.148 +
   1.149 +            // Saturated conversion to 16-bit.
   1.150 +            // ( AAAA RRRR GGGG BBBB ) -> ( 0A 0R 0G 0B )
   1.151 +            uint16x4_t result16 = vqshrn_n_u32(result, 16);
   1.152 +
   1.153 +            // Saturated conversion to 8-bit.
   1.154 +            // ( 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( A R G B A R G B )
   1.155 +            uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8);
   1.156 +
   1.157 +            // ( A R G B A R G B ) -> ( ARGB ARGB ) -> ( ARGB )
   1.158 +            // Store low 32 bits to destination.
   1.159 +            vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);
   1.160 +
   1.161 +            if (x >= leftOffset) {
   1.162 +                const SkPMColor* l = sptr - leftOffset * srcStrideX;
   1.163 +                sum = vsubw_u16(sum, expand(*l));
   1.164 +            }
   1.165 +            if (x + rightOffset + 1 < width) {
   1.166 +                const SkPMColor* r = sptr + (rightOffset + 1) * srcStrideX;
   1.167 +                sum = vaddw_u16(sum, expand(*r));
   1.168 +            }
   1.169 +            sptr += srcStrideX;
   1.170 +            if (srcDirection == kX) {
   1.171 +                SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);
   1.172 +            }
   1.173 +            dptr += dstStrideX;
   1.174 +        }
   1.175 +        src += srcStrideY;
   1.176 +        dst += dstStrideY;
   1.177 +    }
   1.178 +}
   1.179 +
   1.180 +} // namespace
   1.181 +
   1.182 +bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX,
   1.183 +                                    SkBoxBlurProc* boxBlurY,
   1.184 +                                    SkBoxBlurProc* boxBlurXY,
   1.185 +                                    SkBoxBlurProc* boxBlurYX) {
   1.186 +    *boxBlurX = SkBoxBlur_NEON<kX, kX>;
   1.187 +    *boxBlurY = SkBoxBlur_NEON<kY, kY>;
   1.188 +    *boxBlurXY = SkBoxBlur_NEON<kX, kY>;
   1.189 +    *boxBlurYX = SkBoxBlur_NEON<kY, kX>;
   1.190 +    return true;
   1.191 +}

mercurial