michael@0: michael@0: #include "SkBlitMask.h" michael@0: #include "SkColor_opts_neon.h" michael@0: michael@0: static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB, michael@0: const void* SK_RESTRICT maskPtr, size_t maskRB, michael@0: SkColor, int width, int height) { michael@0: SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; michael@0: const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; michael@0: michael@0: maskRB -= width; michael@0: dstRB -= (width << 2); michael@0: do { michael@0: int w = width; michael@0: while (w >= 8) { michael@0: uint8x8_t vmask = vld1_u8(mask); michael@0: uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); michael@0: uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); michael@0: michael@0: vdevice = SkAlphaMulQ_neon8(vdevice, vscale); michael@0: vdevice.val[NEON_A] += vmask; michael@0: michael@0: vst4_u8((uint8_t*)device, vdevice); michael@0: michael@0: mask += 8; michael@0: device += 8; michael@0: w -= 8; michael@0: } michael@0: while (w-- > 0) { michael@0: unsigned aa = *mask++; michael@0: *device = (aa << SK_A32_SHIFT) michael@0: + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); michael@0: device += 1; michael@0: }; michael@0: device = (uint32_t*)((char*)device + dstRB); michael@0: mask += maskRB; michael@0: } while (--height != 0); michael@0: } michael@0: michael@0: template michael@0: static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, michael@0: const void* SK_RESTRICT maskPtr, size_t maskRB, michael@0: SkColor color, int width, int height) { michael@0: SkPMColor pmc = SkPreMultiplyColor(color); michael@0: SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; michael@0: const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; michael@0: uint8x8x4_t vpmc; michael@0: michael@0: maskRB -= width; michael@0: dstRB -= (width << 2); michael@0: michael@0: if (width >= 8) { michael@0: vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); michael@0: vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); michael@0: vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); michael@0: vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); michael@0: } michael@0: do { michael@0: int w = width; michael@0: while (w >= 8) { michael@0: uint8x8_t vmask = vld1_u8(mask); michael@0: uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); michael@0: if (isColor) { michael@0: vscale = vsubw_u8(vdupq_n_u16(256), michael@0: SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); michael@0: } else { michael@0: vscale = vsubw_u8(vdupq_n_u16(256), vmask); michael@0: } michael@0: uint8x8x4_t vdev = vld4_u8((uint8_t*)device); michael@0: michael@0: vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) michael@0: + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); michael@0: vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) michael@0: + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); michael@0: vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) michael@0: + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); michael@0: vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) michael@0: + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); michael@0: michael@0: vst4_u8((uint8_t*)device, vdev); michael@0: michael@0: mask += 8; michael@0: device += 8; michael@0: w -= 8; michael@0: } michael@0: michael@0: while (w--) { michael@0: unsigned aa = *mask++; michael@0: if (isColor) { michael@0: *device = SkBlendARGB32(pmc, *device, aa); michael@0: } else { michael@0: *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) michael@0: + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); michael@0: } michael@0: device += 1; michael@0: }; michael@0: michael@0: device = (uint32_t*)((char*)device + dstRB); michael@0: mask += maskRB; michael@0: michael@0: } while (--height != 0); michael@0: } michael@0: michael@0: static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB, michael@0: const void* SK_RESTRICT maskPtr, size_t maskRB, michael@0: SkColor color, int width, int height) { michael@0: D32_A8_Opaque_Color_neon(dst, dstRB, maskPtr, maskRB, color, width, height); michael@0: } michael@0: michael@0: static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB, michael@0: const void* SK_RESTRICT maskPtr, size_t maskRB, michael@0: SkColor color, int width, int height) { michael@0: D32_A8_Opaque_Color_neon(dst, dstRB, maskPtr, maskRB, color, width, height); michael@0: } michael@0: michael@0: SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) { michael@0: if (SK_ColorBLACK == color) { michael@0: return D32_A8_Black_neon; michael@0: } else if (0xFF == SkColorGetA(color)) { michael@0: return D32_A8_Opaque_neon; michael@0: } else { michael@0: return D32_A8_Color_neon; michael@0: } michael@0: } michael@0: michael@0: //////////////////////////////////////////////////////////////////////////////// michael@0: michael@0: void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], michael@0: SkColor color, int width, michael@0: SkPMColor opaqueDst) { michael@0: int colR = SkColorGetR(color); michael@0: int colG = SkColorGetG(color); michael@0: int colB = SkColorGetB(color); michael@0: michael@0: uint8x8_t vcolR, vcolG, vcolB; michael@0: uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; michael@0: michael@0: if (width >= 8) { michael@0: vcolR = vdup_n_u8(colR); michael@0: vcolG = vdup_n_u8(colG); michael@0: vcolB = vdup_n_u8(colB); michael@0: vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); michael@0: vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); michael@0: vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); michael@0: vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); michael@0: } michael@0: michael@0: while (width >= 8) { michael@0: uint8x8x4_t vdst; michael@0: uint16x8_t vmask; michael@0: uint16x8_t vmaskR, vmaskG, vmaskB; michael@0: uint8x8_t vsel_trans, vsel_opq; michael@0: michael@0: vdst = vld4_u8((uint8_t*)dst); michael@0: vmask = vld1q_u16(src); michael@0: michael@0: // Prepare compare masks michael@0: vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); michael@0: vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); michael@0: michael@0: // Get all the color masks on 5 bits michael@0: vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); michael@0: vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), michael@0: SK_B16_BITS + SK_R16_BITS + 1); michael@0: vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); michael@0: michael@0: // Upscale to 0..32 michael@0: vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); michael@0: vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); michael@0: vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); michael@0: michael@0: vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); michael@0: vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); michael@0: michael@0: vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); michael@0: vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); michael@0: vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); michael@0: michael@0: vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); michael@0: vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); michael@0: vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); michael@0: michael@0: vst4_u8((uint8_t*)dst, vdst); michael@0: michael@0: dst += 8; michael@0: src += 8; michael@0: width -= 8; michael@0: } michael@0: michael@0: // Leftovers michael@0: for (int i = 0; i < width; i++) { michael@0: dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], michael@0: opaqueDst); michael@0: } michael@0: } michael@0: michael@0: void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], michael@0: SkColor color, int width, SkPMColor) { michael@0: int colA = SkColorGetA(color); michael@0: int colR = SkColorGetR(color); michael@0: int colG = SkColorGetG(color); michael@0: int colB = SkColorGetB(color); michael@0: michael@0: colA = SkAlpha255To256(colA); michael@0: michael@0: uint8x8_t vcolR, vcolG, vcolB; michael@0: uint16x8_t vcolA; michael@0: michael@0: if (width >= 8) { michael@0: vcolA = vdupq_n_u16(colA); michael@0: vcolR = vdup_n_u8(colR); michael@0: vcolG = vdup_n_u8(colG); michael@0: vcolB = vdup_n_u8(colB); michael@0: } michael@0: michael@0: while (width >= 8) { michael@0: uint8x8x4_t vdst; michael@0: uint16x8_t vmask; michael@0: uint16x8_t vmaskR, vmaskG, vmaskB; michael@0: michael@0: vdst = vld4_u8((uint8_t*)dst); michael@0: vmask = vld1q_u16(src); michael@0: michael@0: // Get all the color masks on 5 bits michael@0: vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); michael@0: vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), michael@0: SK_B16_BITS + SK_R16_BITS + 1); michael@0: vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); michael@0: michael@0: // Upscale to 0..32 michael@0: vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); michael@0: vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); michael@0: vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); michael@0: michael@0: vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); michael@0: vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); michael@0: vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); michael@0: michael@0: vdst.val[NEON_A] = vdup_n_u8(0xFF); michael@0: vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); michael@0: vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); michael@0: vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); michael@0: michael@0: vst4_u8((uint8_t*)dst, vdst); michael@0: michael@0: dst += 8; michael@0: src += 8; michael@0: width -= 8; michael@0: } michael@0: michael@0: for (int i = 0; i < width; i++) { michael@0: dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); michael@0: } michael@0: }