gfx/skia/trunk/src/opts/SkBitmapProcState_matrixProcs_neon.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_matrixProcs_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,234 @@
     1.4 +/* NEON optimized code (C) COPYRIGHT 2009 Motorola
     1.5 + *
     1.6 + * Use of this source code is governed by a BSD-style license that can be
     1.7 + * found in the LICENSE file.
     1.8 + */
     1.9 +
    1.10 +#include "SkBitmapProcState.h"
    1.11 +#include "SkPerspIter.h"
    1.12 +#include "SkShader.h"
    1.13 +#include "SkUtilsArm.h"
    1.14 +#include "SkBitmapProcState_utils.h"
    1.15 +
    1.16 +#include <arm_neon.h>
    1.17 +
// Proc tables generated below by the two X-macro expansions of
// SkBitmapProcState_matrix_neon.h; exported for the dispatch code.
extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];

// Fast paths for the "decal" case (no tiling needed: the whole span is
// known to stay inside the bitmap).  Defined at the bottom of this file.
static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
    1.23 +
    1.24 +// TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
// TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
//
// Vector version of the clamp tile proc for 8 coordinates at once:
// 'low' and 'high' each hold four 16.16 fixed-point values; the result
// holds the eight integer parts, each clamped to [0, max].
// NOTE(review): 'max' must fit in an int16_t for the vdupq_n_s16 clamp
// below to be correct — holds for bitmap dimensions (max <= 0xFFFF/2),
// confirm against callers.
static inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) {
    int16x8_t res;

    // get the hi 16s of all those 32s
    // (vuzp .val[1] gathers the odd-indexed 16-bit halves, which on a
    // little-endian target are the top halves, i.e. fx >> 16)
    res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1];

    // clamp to [0, max]
    res = vmaxq_s16(res, vdupq_n_s16(0));
    res = vminq_s16(res, vdupq_n_s16(max));

    return res;
}
    1.37 +
    1.38 +// TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
    1.39 +static inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) {
    1.40 +    int32x4_t res;
    1.41 +
    1.42 +    // get the hi 16s of all those 32s
    1.43 +    res = vshrq_n_s32(f, 16);
    1.44 +
    1.45 +    // clamp
    1.46 +    res = vmaxq_s32(res, vdupq_n_s32(0));
    1.47 +    res = vminq_s32(res, vdupq_n_s32(max));
    1.48 +
    1.49 +    return res;
    1.50 +}
    1.51 +
    1.52 +// TILEY_LOW_BITS(fy, max)         (((fy) >> 12) & 0xF)
    1.53 +static inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) {
    1.54 +    int32x4_t ret;
    1.55 +
    1.56 +    ret = vshrq_n_s32(fx, 12);
    1.57 +
    1.58 +    /* We don't need the mask below because the caller will
    1.59 +     * overwrite the non-masked bits
    1.60 +     */
    1.61 +    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
    1.62 +
    1.63 +    return ret;
    1.64 +}
    1.65 +
    1.66 +// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
//
// Vector version of the repeat tile proc for 8 coordinates at once:
// maps the fractional part of each 16.16 value onto [0, max] by a
// widening multiply with (max + 1) and keeping the high 16 bits.
static inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) {
    uint16x8_t res;
    uint32x4_t tmpl, tmph;

    // get the lower 16 bits (fx & 0xFFFF); vuzp .val[0] gathers the
    // even-indexed 16-bit halves, the low halves on little-endian
    res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0];

    // bare multiplication, not SkFixedMul (widening u16 * u16 -> u32)
    tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1));
    tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1));

    // extraction of the 16 upper bits (the ">> 16" of the macro)
    res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1];

    return vreinterpretq_s16_u16(res);
}
    1.83 +
    1.84 +// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
    1.85 +static inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) {
    1.86 +    uint16x4_t res;
    1.87 +    uint32x4_t tmp;
    1.88 +
    1.89 +    // get the lower 16 bits
    1.90 +    res = vmovn_u32(vreinterpretq_u32_s32(f));
    1.91 +
    1.92 +    // bare multiplication, not SkFixedMul
    1.93 +    tmp = vmull_u16(res, vdup_n_u16(max+1));
    1.94 +
    1.95 +    // extraction of the 16 upper bits
    1.96 +    tmp = vshrq_n_u32(tmp, 16);
    1.97 +
    1.98 +    return vreinterpretq_s32_u32(tmp);
    1.99 +}
   1.100 +
   1.101 +// TILEX_LOW_BITS(fx, max)         ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
   1.102 +static inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) {
   1.103 +    uint16x4_t res;
   1.104 +    uint32x4_t tmp;
   1.105 +    int32x4_t ret;
   1.106 +
   1.107 +    // get the lower 16 bits
   1.108 +    res = vmovn_u32(vreinterpretq_u32_s32(fx));
   1.109 +
   1.110 +    // bare multiplication, not SkFixedMul
   1.111 +    tmp = vmull_u16(res, vdup_n_u16(max + 1));
   1.112 +
   1.113 +    // shift and mask
   1.114 +    ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12);
   1.115 +
   1.116 +    /* We don't need the mask below because the caller will
   1.117 +     * overwrite the non-masked bits
   1.118 +     */
   1.119 +    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
   1.120 +
   1.121 +    return ret;
   1.122 +}
   1.123 +
// Instantiate the ClampX_ClampY matrix procs.  The scalar TILE*_PROCF /
// TILE*_LOW_BITS macros and their *_NEON4 / *_NEON8 vector counterparts
// are consumed by the X-macro header included below.
// NOTE(review): the same macro names are redefined for the Repeat
// instantiation further down — presumably the header #undefs them after
// use; confirm in SkBitmapProcState_matrix_neon.h.
#define MAKENAME(suffix)                ClampX_ClampY ## suffix ## _neon
#define TILEX_PROCF(fx, max)            SkClampMax((fx) >> 16, max)
#define TILEY_PROCF(fy, max)            SkClampMax((fy) >> 16, max)
#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
#define TILEX_PROCF_NEON4(fx, max)      sbpsm_clamp_tile4(fx, max)
#define TILEY_PROCF_NEON4(fy, max)      sbpsm_clamp_tile4(fy, max)
#define TILEX_LOW_BITS(fx, max)         (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max)         (((fy) >> 12) & 0xF)
#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_clamp_tile4_low_bits(fx)
#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_clamp_tile4_low_bits(fy)
// Clamp mode can use the decal fast paths when the span never tiles.
#define CHECK_FOR_DECAL
#include "SkBitmapProcState_matrix_neon.h"
   1.137 +
// Instantiate the RepeatX_RepeatY matrix procs via the same X-macro
// header, this time with the repeat (wrap-around) tiling macros.  Note
// there is no CHECK_FOR_DECAL here: repeat tiling always wraps, so the
// decal fast paths do not apply.
#define MAKENAME(suffix)                RepeatX_RepeatY ## suffix ## _neon
#define TILEX_PROCF(fx, max)            SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
#define TILEY_PROCF(fy, max)            SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
#define TILEX_PROCF_NEON4(fx, max)      sbpsm_repeat_tile4(fx, max)
#define TILEY_PROCF_NEON4(fy, max)      sbpsm_repeat_tile4(fy, max)
#define TILEX_LOW_BITS(fx, max)         ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max)         ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_repeat_tile4_low_bits(fx, max)
#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_repeat_tile4_low_bits(fy, max)
#include "SkBitmapProcState_matrix_neon.h"
   1.150 +
   1.151 +
   1.152 +
   1.153 +void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
   1.154 +    if (count >= 8) {
   1.155 +        // SkFixed is 16.16 fixed point
   1.156 +        SkFixed dx8 = dx * 8;
   1.157 +        int32x4_t vdx8 = vdupq_n_s32(dx8);
   1.158 +
   1.159 +        // setup lbase and hbase
   1.160 +        int32x4_t lbase, hbase;
   1.161 +        lbase = vdupq_n_s32(fx);
   1.162 +        lbase = vsetq_lane_s32(fx + dx, lbase, 1);
   1.163 +        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
   1.164 +        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
   1.165 +        hbase = lbase + vdupq_n_s32(4 * dx);
   1.166 +
   1.167 +        do {
   1.168 +            // store the upper 16 bits
   1.169 +            vst1q_u32(dst, vreinterpretq_u32_s16(
   1.170 +                vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
   1.171 +            ));
   1.172 +
   1.173 +            // on to the next group of 8
   1.174 +            lbase += vdx8;
   1.175 +            hbase += vdx8;
   1.176 +            dst += 4; // we did 8 elements but the result is twice smaller
   1.177 +            count -= 8;
   1.178 +            fx += dx8;
   1.179 +        } while (count >= 8);
   1.180 +    }
   1.181 +
   1.182 +    uint16_t* xx = (uint16_t*)dst;
   1.183 +    for (int i = count; i > 0; --i) {
   1.184 +        *xx++ = SkToU16(fx >> 16); fx += dx;
   1.185 +    }
   1.186 +}
   1.187 +
   1.188 +void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
   1.189 +    if (count >= 8) {
   1.190 +        SkFixed dx8 = dx * 8;
   1.191 +        int32x4_t vdx8 = vdupq_n_s32(dx8);
   1.192 +
   1.193 +        int32x4_t wide_fx, wide_fx2;
   1.194 +        wide_fx = vdupq_n_s32(fx);
   1.195 +        wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
   1.196 +        wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
   1.197 +        wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
   1.198 +
   1.199 +        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
   1.200 +
   1.201 +        while (count >= 8) {
   1.202 +            int32x4_t wide_out;
   1.203 +            int32x4_t wide_out2;
   1.204 +
   1.205 +            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
   1.206 +            wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
   1.207 +
   1.208 +            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
   1.209 +            wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
   1.210 +
   1.211 +            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
   1.212 +            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
   1.213 +
   1.214 +            dst += 8;
   1.215 +            fx += dx8;
   1.216 +            wide_fx += vdx8;
   1.217 +            wide_fx2 += vdx8;
   1.218 +            count -= 8;
   1.219 +        }
   1.220 +    }
   1.221 +
   1.222 +    if (count & 1)
   1.223 +    {
   1.224 +        SkASSERT((fx >> (16 + 14)) == 0);
   1.225 +        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   1.226 +        fx += dx;
   1.227 +    }
   1.228 +    while ((count -= 2) >= 0)
   1.229 +    {
   1.230 +        SkASSERT((fx >> (16 + 14)) == 0);
   1.231 +        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   1.232 +        fx += dx;
   1.233 +
   1.234 +        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
   1.235 +        fx += dx;
   1.236 +    }
   1.237 +}

mercurial