Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double-key logic according to
private browsing mode or the privacy.thirdparty.isolate preference, and
implement it in GetCookieStringCommon and FindCookie where it counts...
with some reservations about how to convince FindCookie callers to test the
condition and pass a nullptr when double-key logic is disabled.
#include "SkXfermode.h"
#include "SkXfermode_proccoeff.h"
#include "SkColorPriv.h"

#include <arm_neon.h>
#include "SkColor_opts_neon.h"
#include "SkXfermode_opts_arm_neon.h"

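// SkMulDiv255Round computes round((a * b) / 255) exactly for 8-bit inputs,
// using the identity applied throughout this file:
//   prod = a * b + 128;  result = (prod + (prod >> 8)) >> 8
// e.g. a = b = 255: prod = 65153, prod >> 8 = 254, (65153 + 254) >> 8 = 255.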
#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)


////////////////////////////////////////////////////////////////////////////////
// NEONized skia functions
////////////////////////////////////////////////////////////////////////////////

static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
    uint16x8_t tmp;
    uint8x8_t ret;

    tmp = vmull_u8(color, alpha);
    tmp = vaddq_u16(tmp, vdupq_n_u16(128));
    tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));

    ret = vshrn_n_u16(tmp, 8);

    return ret;
}

static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
    uint16x8_t ret;

    ret = vmull_u8(color, alpha);
    ret = vaddq_u16(ret, vdupq_n_u16(128));
    ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));

    ret = vshrq_n_u16(ret, 8);

    return ret;
}

static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
    uint16x8_t tmp;

    tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
                       vmovn_u32(vreinterpretq_u32_s32(p2)));

    tmp += vdupq_n_u16(128);
    tmp += vshrq_n_u16(tmp, 8);

    return vshrn_n_u16(tmp, 8);
}

static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);

    return vshrq_n_u16(prod, 8);
}

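// Clamp 8 lanes of 32-bit intermediates (split across two int32x4_t) to
// [0, 255*255], then divide by 255 with rounding and narrow to 8 bits.
// Used by the modes below whose terms can go negative or exceed 255*255.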
static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
    uint8x8_t ret;
    uint32x4_t cmp1, cmp2;
    uint16x8_t cmp16;
    uint8x8_t cmp8, cmp8_1;

    // Test if <= 0
    cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
    cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    cmp8_1 = vmovn_u16(cmp16);

    // Init to zero
    ret = vdup_n_u8(0);

    // Test if >= 255*255
    cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
    cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    cmp8 = vmovn_u16(cmp16);

    // Insert 255 where true
    ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);

    // Calc SkDiv255Round
    uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);

    // Insert where false and previous test false
    cmp8 = cmp8 | cmp8_1;
    ret = vbsl_u8(cmp8, ret, div);

    // Return the final combination
    return ret;
}

////////////////////////////////////////////////////////////////////////////////
// 1 pixel modeprocs
////////////////////////////////////////////////////////////////////////////////

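// The single-pixel procs below splat the 32-bit SkPMColor across a 64-bit
// NEON register (two copies of the pixel), do the math on all 8 byte lanes,
// and extract lane 0 of the result. vset_lane_u8(..., 3) writes the alpha,
// assuming the usual packing with alpha in the top byte (lane 3 on
// little-endian).
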
// kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc]
SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    unsigned isa = 255 - sa;

    uint8x8_t vda, visa, vsrc, vdst;

    vda = vdup_n_u8(da);
    visa = vdup_n_u8(isa);

    uint16x8_t vsrc_wide, vdst_wide;
    vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src)));
    vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst)));

    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    vsrc += vdst;
    vsrc = vset_lane_u8(da, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}

// kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)]
SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    unsigned ida = 255 - da;

    uint8x8_t vsa, vida, vsrc, vdst;

    vsa = vdup_n_u8(sa);
    vida = vdup_n_u8(ida);

    uint16x8_t vsrc_wide, vdst_wide;
    vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src)));
    vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst)));

    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    vsrc += vdst;
    vsrc = vset_lane_u8(sa, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}

// kXor_Mode   [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc]
SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) {
    unsigned sa = SkGetPackedA32(src);
    unsigned da = SkGetPackedA32(dst);
    unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1);
    unsigned isa = 255 - sa;
    unsigned ida = 255 - da;

    uint8x8_t vsrc, vdst, visa, vida;
    uint16x8_t vsrc_wide, vdst_wide;

    visa = vdup_n_u8(isa);
    vida = vdup_n_u8(ida);
    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));

    vsrc_wide = vmull_u8(vsrc, vida);
    vdst_wide = vmull_u8(vdst, visa);

    vsrc_wide += vdupq_n_u16(128);
    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);

    vdst_wide += vdupq_n_u16(128);
    vdst_wide += vshrq_n_u16(vdst_wide, 8);

    vsrc = vshrn_n_u16(vsrc_wide, 8);
    vdst = vshrn_n_u16(vdst_wide, 8);

    vsrc += vdst;

    vsrc = vset_lane_u8(ret_alpha, vsrc, 3);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}

// kPlus_Mode
SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) {
    uint8x8_t vsrc, vdst;
    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
    vsrc = vqadd_u8(vsrc, vdst);

    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
}

// kModulate_Mode
SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) {
    uint8x8_t vsrc, vdst, vres;
    uint16x8_t vres_wide;

    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));

    vres_wide = vmull_u8(vsrc, vdst);

    vres_wide += vdupq_n_u16(128);
    vres_wide += vshrq_n_u16(vres_wide, 8);

    vres = vshrn_n_u16(vres_wide, 8);

    return vget_lane_u32(vreinterpret_u32_u8(vres), 0);
}

////////////////////////////////////////////////////////////////////////////////
// 8 pixels modeprocs
////////////////////////////////////////////////////////////////////////////////

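// The 8-pixel procs work on planar data: a uint8x8x4_t holds 8 pixels
// de-interleaved by vld4 into separate planes indexed with NEON_A, NEON_R,
// NEON_G and NEON_B, so each channel is processed as one 8-lane vector.
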
uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t src_scale;

    src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);

    ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
    ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
    ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
    ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);

    return ret;
}

uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale;

    scale = SkAlpha255To256_neon8(dst.val[NEON_A]);

    ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
    ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
    ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
    ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);

    return ret;
}

uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale;

    scale = SkAlpha255To256_neon8(src.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(dst, scale);

    return ret;
}

uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(src, scale);

    return ret;
}

uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);

    ret = SkAlphaMulQ_neon8(dst, scale);

    return ret;
}

uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t isa;

    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);

    ret.val[NEON_A] = dst.val[NEON_A];
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);

    return ret;
}

uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t ida;

    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);

    ret.val[NEON_A] = src.val[NEON_A];
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);

    return ret;
}

uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;
    uint8x8_t isa, ida;
    uint16x8_t tmp_wide, tmp_wide2;

    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);

    // First calc alpha
    tmp_wide = vmovl_u8(src.val[NEON_A]);
    tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
    tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
    tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
    ret.val[NEON_A] = vmovn_u16(tmp_wide);

    // Then colors
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);

    return ret;
}

uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
    uint16x8_t tmp;

    tmp = vaddl_u8(a, b);
    tmp -= SkAlphaMulAlpha_neon8_16(a, b);

    return vmovn_u16(tmp);
}

uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
    ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
    ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);

    return ret;
}

template <bool overlay>
static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                                uint8x8_t sa, uint8x8_t da) {
    /*
     * In the end we compute (rc + tmp), with rc taking a different value in
     * each branch. The whole value (rc + tmp) can always be expressed as
     *   VAL = COM - SUB            in the if case
     *   VAL = COM + SUB - sa*da    in the else case
     *
     * with COM = 255 * (sc + dc)
     * and  SUB = sc*da + dc*sa - 2*dc*sc
     */

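    /*
     * For reference, the scalar overlay blend is
     *   rc  = 2*dc <= da ? 2*sc*dc : sa*da - 2*(da - dc)*(sa - sc)
     *   tmp = sc*(255 - da) + dc*(255 - sa)
     * (hardlight swaps the roles of source and destination in rc).
     * Expanding rc + tmp in the if case gives
     *   255*(sc + dc) - (sc*da + dc*sa - 2*dc*sc) = COM - SUB
     * and the else case expands to COM + SUB - sa*da.
     */
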
    // Prepare common subexpressions
    uint16x8_t const255 = vdupq_n_u16(255);
    uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
    uint16x8_t scda = vmull_u8(sc, da);
    uint16x8_t dcsa = vmull_u8(dc, sa);
    uint16x8_t sada = vmull_u8(sa, da);

    // Prepare non-common subexpressions
    uint16x8_t dc2, sc2;
    uint32x4_t scdc2_1, scdc2_2;
    if (overlay) {
        dc2 = vshll_n_u8(dc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
        scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
    } else {
        sc2 = vshll_n_u8(sc, 1);
        scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
        scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
    }

    // Calc COM
    int32x4_t com1, com2;
    com1 = vreinterpretq_s32_u32(
                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    com2 = vreinterpretq_s32_u32(
                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

    // Calc SUB
    int32x4_t sub1, sub2;
    sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
    sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
    sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
    sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));

    // Compare 2*dc <= da (overlay) or 2*sc <= sa (hardlight)
    uint16x8_t cmp;

    if (overlay) {
        cmp = vcleq_u16(dc2, vmovl_u8(da));
    } else {
        cmp = vcleq_u16(sc2, vmovl_u8(sa));
    }

    // Prepare variables
    int32x4_t val1_1, val1_2;
    int32x4_t val2_1, val2_2;
    uint32x4_t cmp1, cmp2;

    cmp1 = vmovl_u16(vget_low_u16(cmp));
    cmp1 |= vshlq_n_u32(cmp1, 16);
    cmp2 = vmovl_u16(vget_high_u16(cmp));
    cmp2 |= vshlq_n_u32(cmp2, 16);

    // Calc COM - SUB
    val1_1 = com1 - sub1;
    val1_2 = com2 - sub2;

    // Calc COM + SUB - sa*da
    val2_1 = com1 + sub1;
    val2_2 = com2 + sub2;

    val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));

    // Insert where needed
    val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
    val1_2 = vbslq_s32(cmp2, val1_2, val2_2);

    // Call the clamp_div255round function
    return clamp_div255round_simd8_32(val1_1, val1_2);
}

static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<true>(sc, dc, sa, da);
}

uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
                                    src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

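// Darken/lighten per channel: with sd = sc*da and ds = dc*sa,
//   darken  = sc + dc - max(sd, ds)/255
//   lighten = sc + dc - min(sd, ds)/255
// The comparison below picks which product gets subtracted.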
template <bool lighten>
static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
                                             uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, cmp, tmp, tmp2;

    // Prepare
    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    // Do test
    if (lighten) {
        cmp = vcgtq_u16(sd, ds);
    } else {
        cmp = vcltq_u16(sd, ds);
    }

    // Assign if
    tmp = vaddl_u8(sc, dc);
    tmp2 = tmp;
    tmp -= SkDiv255Round_neon8_16_16(ds);

    // Calc else
    tmp2 -= SkDiv255Round_neon8_16_16(sd);

    // Insert where needed
    tmp = vbslq_u16(cmp, tmp, tmp2);

    return vmovn_u16(tmp);
}

static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
                                     uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<false>(sc, dc, sa, da);
}

uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
                                   src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
                                   src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
                                   src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
                                      uint8x8_t sa, uint8x8_t da) {
    return lighten_darken_color<true>(sc, dc, sa, da);
}

uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
                                    src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
                                    src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    return overlay_hardlight_color<false>(sc, dc, sa, da);
}

uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
                                      src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
                                         uint8x8_t sa, uint8x8_t da) {
    uint16x8_t sd, ds, tmp;
    int16x8_t val;

    sd = vmull_u8(sc, da);
    ds = vmull_u8(dc, sa);

    tmp = vminq_u16(sd, ds);
    tmp = SkDiv255Round_neon8_16_16(tmp);
    tmp = vshlq_n_u16(tmp, 1);

    val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));

    val -= vreinterpretq_s16_u16(tmp);

    val = vmaxq_s16(val, vdupq_n_s16(0));
    val = vminq_s16(val, vdupq_n_s16(255));

    return vmovn_u16(vreinterpretq_u16_s16(val));
}

uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
                                       src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
                                       src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
                                       src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
                                        uint8x8_t sa, uint8x8_t da) {
    /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */
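    /* Full form: sc*da + dc*sa - 2*sc*dc + sc*(255 - da) + dc*(255 - sa);
       the sc*da and dc*sa terms cancel, leaving 255*(sc + dc) - 2*sc*dc. */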

    uint16x8_t sc_plus_dc, scdc, const255;
    int32x4_t term1_1, term1_2, term2_1, term2_2;

    /* Calc (sc + dc) and (sc * dc) */
    sc_plus_dc = vaddl_u8(sc, dc);
    scdc = vmull_u8(sc, dc);

    /* Prepare constants */
    const255 = vdupq_n_u16(255);

    /* Calc the first term */
    term1_1 = vreinterpretq_s32_u32(
                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
    term1_2 = vreinterpretq_s32_u32(
                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

    /* Calc the second term */
    term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
    term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));

    return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
}

uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
                                      src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
                                      src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

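// Multiply per channel: rc = (sc*(255 - da) + dc*(255 - sa) + sc*dc) / 255,
// accumulated in 32 bits before the clamped rounding divide.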
static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
                                                 uint8x8_t sa, uint8x8_t da) {
    uint32x4_t val1, val2;
    uint16x8_t scdc, t1, t2;

    t1 = vmull_u8(sc, vdup_n_u8(255) - da);
    t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
    scdc = vmull_u8(sc, dc);

    val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
    val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));

    val1 = vaddw_u16(val1, vget_low_u16(scdc));
    val2 = vaddw_u16(val2, vget_high_u16(scdc));

    return clamp_div255round_simd8_32(
                vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
}

uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
    uint8x8x4_t ret;

    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
                                               src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
                                               src.val[NEON_A], dst.val[NEON_A]);
    ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
                                               src.val[NEON_A], dst.val[NEON_A]);

    return ret;
}

////////////////////////////////////////////////////////////////////////////////

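// Table of 8-pixel NEON procs, indexed by SkXfermode::Mode. A NULL entry
// means no NEON implementation exists for that mode and the generic code
// path is used instead (see the factory at the bottom of this file).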
typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);

extern SkXfermodeProcSIMD gNEONXfermodeProcs[];

SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
    : INHERITED(buffer) {
    fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
}

void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
                                     int count, const SkAlpha aa[]) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

    if (NULL == aa) {
        // Unrolled NEON code
        while (count >= 8) {
            uint8x8x4_t vsrc, vdst, vres;

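            // Load 8 de-interleaved pixels from src (post-incrementing it)
            // and 8 from dst via vld4.u8. The #if picks between two spellings
            // of the same loads: newer GCC accepts the whole uint8x8x4_t as a
            // single asm operand (the %h modifier), while GCC <= 4.6 needs
            // the explicit d0-d7 register variables of the fallback branch.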
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm volatile (
                "vld4.u8 %h[vsrc], [%[src]]! \t\n"
                "vld4.u8 %h[vdst], [%[dst]] \t\n"
                : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");
            register uint8x8_t d4 asm("d4");
            register uint8x8_t d5 asm("d5");
            register uint8x8_t d6 asm("d6");
            register uint8x8_t d7 asm("d7");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                "vld4.u8 {d4-d7},[%[dst]];"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
                  [src] "+&r" (src)
                : [dst] "r" (dst)
                :
            );
            vsrc.val[0] = d0; vdst.val[0] = d4;
            vsrc.val[1] = d1; vdst.val[1] = d5;
            vsrc.val[2] = d2; vdst.val[2] = d6;
            vsrc.val[3] = d3; vdst.val[3] = d7;
#endif

            vres = procSIMD(vsrc, vdst);

            vst4_u8((uint8_t*)dst, vres);

            count -= 8;
            dst += 8;
        }
        // Leftovers
        for (int i = 0; i < count; i++) {
            dst[i] = proc(src[i], dst[i]);
        }
    } else {
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = dst[i];
                SkPMColor C = proc(src[i], dstC);
                if (a != 0xFF) {
                    C = SkFourByteInterp_neon(C, dstC, a);
                }
                dst[i] = C;
            }
        }
    }
}

void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src, int count,
                                     const SkAlpha* SK_RESTRICT aa) const {
    SkASSERT(dst && src && count >= 0);

    SkXfermodeProc proc = this->getProc();
    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
    SkASSERT(procSIMD != NULL);

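    // The 565 path widens each RGB565 destination pixel to 8888, reuses the
    // same 8-pixel NEON proc, then packs the result back to 565.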
    if (NULL == aa) {
        while (count >= 8) {
            uint16x8_t vdst, vres16;
            uint8x8x4_t vdst32, vsrc, vres;

            vdst = vld1q_u16(dst);

#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm volatile (
                "vld4.u8 %h[vsrc], [%[src]]! \t\n"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            vdst32 = SkPixel16ToPixel32_neon8(vdst);
            vres = procSIMD(vsrc, vdst32);
            vres16 = SkPixel32ToPixel16_neon8(vres);

            vst1q_u16(dst, vres16);

            count -= 8;
            dst += 8;
        }
        for (int i = 0; i < count; i++) {
            SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
            dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
        }
    } else {
        for (int i = count - 1; i >= 0; --i) {
            unsigned a = aa[i];
            if (0 != a) {
                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
                SkPMColor C = proc(src[i], dstC);
                if (0xFF != a) {
                    C = SkFourByteInterp_neon(C, dstC, a);
                }
                dst[i] = SkPixel32ToPixel16_ToU16(C);
            }
        }
    }
}

#ifndef SK_IGNORE_TO_STRING
void SkNEONProcCoeffXfermode::toString(SkString* str) const {
    this->INHERITED::toString(str);
}
#endif

////////////////////////////////////////////////////////////////////////////////

SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    dstover_modeproc_neon8,
    srcin_modeproc_neon8,
    dstin_modeproc_neon8,
    srcout_modeproc_neon8,
    dstout_modeproc_neon8,
    srcatop_modeproc_neon8,
    dstatop_modeproc_neon8,
    xor_modeproc_neon8,
    plus_modeproc_neon8,
    modulate_modeproc_neon8,
    screen_modeproc_neon8,

    overlay_modeproc_neon8,
    darken_modeproc_neon8,
    lighten_modeproc_neon8,
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    hardlight_modeproc_neon8,
    NULL, // kSoftLight_Mode
    difference_modeproc_neon8,
    exclusion_modeproc_neon8,
    multiply_modeproc_neon8,

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
    mode_count_arm
);

SkXfermodeProc gNEONXfermodeProcs1[] = {
    NULL, // kClear_Mode
    NULL, // kSrc_Mode
    NULL, // kDst_Mode
    NULL, // kSrcOver_Mode
    NULL, // kDstOver_Mode
    NULL, // kSrcIn_Mode
    NULL, // kDstIn_Mode
    NULL, // kSrcOut_Mode
    NULL, // kDstOut_Mode
    srcatop_modeproc_neon,
    dstatop_modeproc_neon,
    xor_modeproc_neon,
    plus_modeproc_neon,
    modulate_modeproc_neon,
    NULL, // kScreen_Mode

    NULL, // kOverlay_Mode
    NULL, // kDarken_Mode
    NULL, // kLighten_Mode
    NULL, // kColorDodge_Mode
    NULL, // kColorBurn_Mode
    NULL, // kHardLight_Mode
    NULL, // kSoftLight_Mode
    NULL, // kDifference_Mode
    NULL, // kExclusion_Mode
    NULL, // kMultiply_Mode

    NULL, // kHue_Mode
    NULL, // kSaturation_Mode
    NULL, // kColor_Mode
    NULL, // kLuminosity_Mode
};

SK_COMPILE_ASSERT(
    SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
    mode1_count_arm
);

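// Factory hooks: return a NEON-backed xfermode (or single-pixel proc) when
// one exists for the requested mode, otherwise NULL so callers fall back to
// the portable implementation.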
SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
                                                         SkXfermode::Mode mode) {

    void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);

    if (procSIMD != NULL) {
        return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
    }
    return NULL;
}

SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
    return gNEONXfermodeProcs1[mode];
}