--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_matrixProcs_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,234 @@
+/* NEON optimized code (C) COPYRIGHT 2009 Motorola
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBitmapProcState.h"
+#include "SkPerspIter.h"
+#include "SkShader.h"
+#include "SkUtilsArm.h"
+#include "SkBitmapProcState_utils.h"
+
+#include <arm_neon.h>
+
+extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
+extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
+
+static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
+static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
+
+// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+static inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+    int16x8_t res;
+
+    // get the hi 16s of all those 32s
+    res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1];
+
+    // clamp
+    res = vmaxq_s16(res, vdupq_n_s16(0));
+    res = vminq_s16(res, vdupq_n_s16(max));
+
+    return res;
+}
+
+// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+static inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) {
+    int32x4_t res;
+
+    // get the hi 16s of all those 32s
+    res = vshrq_n_s32(f, 16);
+
+    // clamp
+    res = vmaxq_s32(res, vdupq_n_s32(0));
+    res = vminq_s32(res, vdupq_n_s32(max));
+
+    return res;
+}
+
+// TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+static inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) {
+    int32x4_t ret;
+
+    ret = vshrq_n_s32(fx, 12);
+
+    /* We don't need the mask below because the caller will
+     * overwrite the non-masked bits
+     */
+    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+    return ret;
+}
+
+// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
+static inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) {
+    uint16x8_t res;
+    uint32x4_t tmpl, tmph;
+
+    // get the lower 16 bits
+    res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0];
+
+    // bare multiplication, not SkFixedMul
+    tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1));
+    tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1));
+
+    // extraction of the 16 upper bits
+    res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1];
+
+    return vreinterpretq_s16_u16(res);
+}
+
+// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16)
+static inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) {
+    uint16x4_t res;
+    uint32x4_t tmp;
+
+    // get the lower 16 bits
+    res = vmovn_u32(vreinterpretq_u32_s32(f));
+
+    // bare multiplication, not SkFixedMul
+    tmp = vmull_u16(res, vdup_n_u16(max+1));
+
+    // extraction of the 16 upper bits
+    tmp = vshrq_n_u32(tmp, 16);
+
+    return vreinterpretq_s32_u32(tmp);
+}
+
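+/* Worked example for the repeat mapping above: with max == 3 (a
+ * 4-pixel-wide bitmap) and fx == 0x18000 (x == 1.5 in 16.16 fixed
+ * point), the in-tile fraction is fx & 0xFFFF == 0x8000, and
+ * (0x8000 * (3 + 1)) >> 16 == 2, so x == 1.5 repeats onto pixel 2.
+ */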
+// TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+static inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) {
+    uint16x4_t res;
+    uint32x4_t tmp;
+    int32x4_t ret;
+
+    // get the lower 16 bits
+    res = vmovn_u32(vreinterpretq_u32_s32(fx));
+
+    // bare multiplication, not SkFixedMul
+    tmp = vmull_u16(res, vdup_n_u16(max + 1));
+
+    // shift and mask
+    ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12);
+
+    /* We don't need the mask below because the caller will
+     * overwrite the non-masked bits
+     */
+    //ret = vandq_s32(ret, vdupq_n_s32(0xF));
+
+    return ret;
+}
+
+#define MAKENAME(suffix)                ClampX_ClampY ## suffix ## _neon
+#define TILEX_PROCF(fx, max)            SkClampMax((fx) >> 16, max)
+#define TILEY_PROCF(fy, max)            SkClampMax((fy) >> 16, max)
+#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_clamp_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max)      sbpsm_clamp_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max)      sbpsm_clamp_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max)         (((fx) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max)         (((fy) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_clamp_tile4_low_bits(fx)
+#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_clamp_tile4_low_bits(fy)
+#define CHECK_FOR_DECAL
+#include "SkBitmapProcState_matrix_neon.h"
+
+#define MAKENAME(suffix)                RepeatX_RepeatY ## suffix ## _neon
+#define TILEX_PROCF(fx, max)            SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
+#define TILEY_PROCF(fy, max)            SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
+#define TILEX_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
+#define TILEY_PROCF_NEON8(l, h, max)    sbpsm_repeat_tile8(l, h, max)
+#define TILEX_PROCF_NEON4(fx, max)      sbpsm_repeat_tile4(fx, max)
+#define TILEY_PROCF_NEON4(fy, max)      sbpsm_repeat_tile4(fy, max)
+#define TILEX_LOW_BITS(fx, max)         ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max)         ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
+#define TILEX_LOW_BITS_NEON4(fx, max)   sbpsm_repeat_tile4_low_bits(fx, max)
+#define TILEY_LOW_BITS_NEON4(fy, max)   sbpsm_repeat_tile4_low_bits(fy, max)
+#include "SkBitmapProcState_matrix_neon.h"
+
+
+void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
+    if (count >= 8) {
+        // SkFixed is 16.16 fixed point
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
+
+        // setup lbase and hbase
+        int32x4_t lbase, hbase;
+        lbase = vdupq_n_s32(fx);
+        lbase = vsetq_lane_s32(fx + dx, lbase, 1);
+        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
+        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
+        hbase = lbase + vdupq_n_s32(4 * dx);
+
+        do {
+            // store the upper 16 bits
+            vst1q_u32(dst, vreinterpretq_u32_s16(
+                vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
+            ));
+
+            // on to the next group of 8
+            lbase += vdx8;
+            hbase += vdx8;
+            dst += 4; // we wrote 8 16-bit results, which is only 4 32-bit words
+            count -= 8;
+            fx += dx8;
+        } while (count >= 8);
+    }
+
+    uint16_t* xx = (uint16_t*)dst;
+    for (int i = count; i > 0; --i) {
+        *xx++ = SkToU16(fx >> 16); fx += dx;
+    }
+}
+
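+/* decal_filter_scale_neon below vectorizes the scalar packing
+ * (fx >> 12 << 14) | ((fx >> 16) + 1), which encodes, per 32-bit word:
+ *   bits 31..18: fx >> 16            (first sample index)
+ *   bits 17..14: (fx >> 12) & 0xF    (4-bit subpixel filter weight)
+ *   bits 13..0:  (fx >> 16) + 1      (second sample index)
+ * The SkASSERT in the scalar tail guarantees the index fits in 14 bits.
+ */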
+void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
+    if (count >= 8) {
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
+
+        int32x4_t wide_fx, wide_fx2;
+        wide_fx = vdupq_n_s32(fx);
+        wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
+        wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
+        wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
+
+        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
+
+        while (count >= 8) {
+            int32x4_t wide_out;
+            int32x4_t wide_out2;
+
+            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
+            wide_out = wide_out | (vshrq_n_s32(wide_fx, 16) + vdupq_n_s32(1));
+
+            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
+            wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2, 16) + vdupq_n_s32(1));
+
+            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
+            vst1q_u32(dst + 4, vreinterpretq_u32_s32(wide_out2));
+
+            dst += 8;
+            fx += dx8;
+            wide_fx += vdx8;
+            wide_fx2 += vdx8;
+            count -= 8;
+        }
+    }
+
+    if (count & 1)
+    {
+        SkASSERT((fx >> (16 + 14)) == 0);
+        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+        fx += dx;
+    }
+    while ((count -= 2) >= 0)
+    {
+        SkASSERT((fx >> (16 + 14)) == 0);
+        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+        fx += dx;
+
+        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+        fx += dx;
+    }
+}
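
As a reading aid (not part of the changeset), here is a minimal scalar sketch of the per-element packing that decal_filter_scale_neon computes eight at a time; the names pack_filter_x and decal_filter_scale_scalar are hypothetical helpers introduced for illustration only.

#include <stdint.h>
#include <assert.h>

typedef int32_t SkFixed;   // 16.16 fixed point, as in the patch

// Hypothetical scalar equivalent of one lane: packs floor(fx), the
// 4-bit subpixel weight, and floor(fx) + 1 into one 32-bit word.
static uint32_t pack_filter_x(SkFixed fx) {
    assert((fx >> (16 + 14)) == 0);   // index must fit in 14 bits
    return ((uint32_t)(fx >> 12) << 14) | ((uint32_t)(fx >> 16) + 1);
}

// Same loop shape as the scalar tail of the NEON routine.
static void decal_filter_scale_scalar(uint32_t dst[], SkFixed fx,
                                      SkFixed dx, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = pack_filter_x(fx);
        fx += dx;
    }
}

The NEON loop produces the same words four lanes at a time, using vshrq_n_s32/vshlq_n_s32 for the shifts and a vector OR to merge the two indices.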