1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitMask_opts_arm_neon.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,254 @@ 1.4 + 1.5 +#include "SkBlitMask.h" 1.6 +#include "SkColor_opts_neon.h" 1.7 + 1.8 +static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB, 1.9 + const void* SK_RESTRICT maskPtr, size_t maskRB, 1.10 + SkColor, int width, int height) { 1.11 + SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; 1.12 + const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; 1.13 + 1.14 + maskRB -= width; 1.15 + dstRB -= (width << 2); 1.16 + do { 1.17 + int w = width; 1.18 + while (w >= 8) { 1.19 + uint8x8_t vmask = vld1_u8(mask); 1.20 + uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); 1.21 + uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); 1.22 + 1.23 + vdevice = SkAlphaMulQ_neon8(vdevice, vscale); 1.24 + vdevice.val[NEON_A] += vmask; 1.25 + 1.26 + vst4_u8((uint8_t*)device, vdevice); 1.27 + 1.28 + mask += 8; 1.29 + device += 8; 1.30 + w -= 8; 1.31 + } 1.32 + while (w-- > 0) { 1.33 + unsigned aa = *mask++; 1.34 + *device = (aa << SK_A32_SHIFT) 1.35 + + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); 1.36 + device += 1; 1.37 + }; 1.38 + device = (uint32_t*)((char*)device + dstRB); 1.39 + mask += maskRB; 1.40 + } while (--height != 0); 1.41 +} 1.42 + 1.43 +template <bool isColor> 1.44 +static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, 1.45 + const void* SK_RESTRICT maskPtr, size_t maskRB, 1.46 + SkColor color, int width, int height) { 1.47 + SkPMColor pmc = SkPreMultiplyColor(color); 1.48 + SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; 1.49 + const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; 1.50 + uint8x8x4_t vpmc; 1.51 + 1.52 + maskRB -= width; 1.53 + dstRB -= (width << 2); 1.54 + 1.55 + if (width >= 8) { 1.56 + vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); 1.57 + vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); 1.58 + vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); 1.59 + vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); 1.60 + } 1.61 + do { 1.62 + int w = width; 1.63 + while (w >= 8) { 1.64 + uint8x8_t vmask = vld1_u8(mask); 1.65 + uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); 1.66 + if (isColor) { 1.67 + vscale = vsubw_u8(vdupq_n_u16(256), 1.68 + SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); 1.69 + } else { 1.70 + vscale = vsubw_u8(vdupq_n_u16(256), vmask); 1.71 + } 1.72 + uint8x8x4_t vdev = vld4_u8((uint8_t*)device); 1.73 + 1.74 + vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) 1.75 + + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); 1.76 + vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) 1.77 + + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); 1.78 + vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) 1.79 + + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); 1.80 + vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) 1.81 + + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); 1.82 + 1.83 + vst4_u8((uint8_t*)device, vdev); 1.84 + 1.85 + mask += 8; 1.86 + device += 8; 1.87 + w -= 8; 1.88 + } 1.89 + 1.90 + while (w--) { 1.91 + unsigned aa = *mask++; 1.92 + if (isColor) { 1.93 + *device = SkBlendARGB32(pmc, *device, aa); 1.94 + } else { 1.95 + *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) 1.96 + + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); 1.97 + } 1.98 + device += 1; 1.99 + }; 1.100 + 1.101 + device = (uint32_t*)((char*)device + dstRB); 1.102 + mask += maskRB; 1.103 + 1.104 + } while (--height != 0); 1.105 +} 1.106 + 1.107 +static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB, 1.108 + const void* SK_RESTRICT maskPtr, size_t maskRB, 1.109 + SkColor color, int width, int height) { 1.110 + D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height); 1.111 +} 1.112 + 1.113 +static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB, 1.114 + const void* SK_RESTRICT maskPtr, size_t maskRB, 1.115 + SkColor color, int width, int height) { 1.116 + D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height); 1.117 +} 1.118 + 1.119 +SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) { 1.120 + if (SK_ColorBLACK == color) { 1.121 + return D32_A8_Black_neon; 1.122 + } else if (0xFF == SkColorGetA(color)) { 1.123 + return D32_A8_Opaque_neon; 1.124 + } else { 1.125 + return D32_A8_Color_neon; 1.126 + } 1.127 +} 1.128 + 1.129 +//////////////////////////////////////////////////////////////////////////////// 1.130 + 1.131 +void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], 1.132 + SkColor color, int width, 1.133 + SkPMColor opaqueDst) { 1.134 + int colR = SkColorGetR(color); 1.135 + int colG = SkColorGetG(color); 1.136 + int colB = SkColorGetB(color); 1.137 + 1.138 + uint8x8_t vcolR, vcolG, vcolB; 1.139 + uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; 1.140 + 1.141 + if (width >= 8) { 1.142 + vcolR = vdup_n_u8(colR); 1.143 + vcolG = vdup_n_u8(colG); 1.144 + vcolB = vdup_n_u8(colB); 1.145 + vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); 1.146 + vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); 1.147 + vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); 1.148 + vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); 1.149 + } 1.150 + 1.151 + while (width >= 8) { 1.152 + uint8x8x4_t vdst; 1.153 + uint16x8_t vmask; 1.154 + uint16x8_t vmaskR, vmaskG, vmaskB; 1.155 + uint8x8_t vsel_trans, vsel_opq; 1.156 + 1.157 + vdst = vld4_u8((uint8_t*)dst); 1.158 + vmask = vld1q_u16(src); 1.159 + 1.160 + // Prepare compare masks 1.161 + vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); 1.162 + vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); 1.163 + 1.164 + // Get all the color masks on 5 bits 1.165 + vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); 1.166 + vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), 1.167 + SK_B16_BITS + SK_R16_BITS + 1); 1.168 + vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); 1.169 + 1.170 + // Upscale to 0..32 1.171 + vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); 1.172 + vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); 1.173 + vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); 1.174 + 1.175 + vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); 1.176 + vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); 1.177 + 1.178 + vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); 1.179 + vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); 1.180 + vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); 1.181 + 1.182 + vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); 1.183 + vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); 1.184 + vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); 1.185 + 1.186 + vst4_u8((uint8_t*)dst, vdst); 1.187 + 1.188 + dst += 8; 1.189 + src += 8; 1.190 + width -= 8; 1.191 + } 1.192 + 1.193 + // Leftovers 1.194 + for (int i = 0; i < width; i++) { 1.195 + dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], 1.196 + opaqueDst); 1.197 + } 1.198 +} 1.199 + 1.200 +void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], 1.201 + SkColor color, int width, SkPMColor) { 1.202 + int colA = SkColorGetA(color); 1.203 + int colR = SkColorGetR(color); 1.204 + int colG = SkColorGetG(color); 1.205 + int colB = SkColorGetB(color); 1.206 + 1.207 + colA = SkAlpha255To256(colA); 1.208 + 1.209 + uint8x8_t vcolR, vcolG, vcolB; 1.210 + uint16x8_t vcolA; 1.211 + 1.212 + if (width >= 8) { 1.213 + vcolA = vdupq_n_u16(colA); 1.214 + vcolR = vdup_n_u8(colR); 1.215 + vcolG = vdup_n_u8(colG); 1.216 + vcolB = vdup_n_u8(colB); 1.217 + } 1.218 + 1.219 + while (width >= 8) { 1.220 + uint8x8x4_t vdst; 1.221 + uint16x8_t vmask; 1.222 + uint16x8_t vmaskR, vmaskG, vmaskB; 1.223 + 1.224 + vdst = vld4_u8((uint8_t*)dst); 1.225 + vmask = vld1q_u16(src); 1.226 + 1.227 + // Get all the color masks on 5 bits 1.228 + vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); 1.229 + vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), 1.230 + SK_B16_BITS + SK_R16_BITS + 1); 1.231 + vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); 1.232 + 1.233 + // Upscale to 0..32 1.234 + vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); 1.235 + vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); 1.236 + vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); 1.237 + 1.238 + vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); 1.239 + vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); 1.240 + vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); 1.241 + 1.242 + vdst.val[NEON_A] = vdup_n_u8(0xFF); 1.243 + vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); 1.244 + vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); 1.245 + vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); 1.246 + 1.247 + vst4_u8((uint8_t*)dst, vdst); 1.248 + 1.249 + dst += 8; 1.250 + src += 8; 1.251 + width -= 8; 1.252 + } 1.253 + 1.254 + for (int i = 0; i < width; i++) { 1.255 + dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); 1.256 + } 1.257 +}