michael@0: /* michael@0: * Copyright 2012 The Android Open Source Project michael@0: * michael@0: * Use of this source code is governed by a BSD-style license that can be michael@0: * found in the LICENSE file. michael@0: */ michael@0: michael@0: michael@0: #include "SkBlitRow_opts_SSE2.h" michael@0: #include "SkBitmapProcState_opts_SSE2.h" michael@0: #include "SkColorPriv.h" michael@0: #include "SkColor_opts_SSE2.h" michael@0: #include "SkDither.h" michael@0: #include "SkUtils.h" michael@0: michael@0: #include michael@0: michael@0: /* SSE2 version of S32_Blend_BlitRow32() michael@0: * portable version is in core/SkBlitRow_D32.cpp michael@0: */ michael@0: void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha) { michael@0: SkASSERT(alpha <= 255); michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: uint32_t src_scale = SkAlpha255To256(alpha); michael@0: uint32_t dst_scale = 256 - src_scale; michael@0: michael@0: if (count >= 4) { michael@0: SkASSERT(((size_t)dst & 0x03) == 0); michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: michael@0: const __m128i *s = reinterpret_cast(src); michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); michael@0: michael@0: // Move scale factors to upper byte of word michael@0: __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); michael@0: __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); michael@0: while (count >= 4) { michael@0: // Load 4 pixels each of src and dest. michael@0: __m128i src_pixel = _mm_loadu_si128(s); michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: // Interleave Atom port 0/1 operations based on the execution port michael@0: // constraints that multiply can only be executed on port 0 (while michael@0: // boolean operations can be executed on either port 0 or port 1) michael@0: // because GCC currently doesn't do a good job scheduling michael@0: // instructions based on these constraints. michael@0: michael@0: // Get red and blue pixels into lower byte of each word. michael@0: // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) michael@0: __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); michael@0: michael@0: // Multiply by scale. michael@0: // (4 x (0, rs.h, 0, bs.h)) michael@0: // where rs.h stands for the higher byte of r * scale, and michael@0: // bs.h the higher byte of b * scale. michael@0: src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); michael@0: michael@0: // Get alpha and green pixels into higher byte of each word. michael@0: // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) michael@0: __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); michael@0: michael@0: // Multiply by scale. michael@0: // (4 x (as.h, as.l, gs.h, gs.l)) michael@0: src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); michael@0: michael@0: // Clear the lower byte of the a*scale and g*scale results michael@0: // (4 x (as.h, 0, gs.h, 0)) michael@0: src_ag = _mm_and_si128(src_ag, ag_mask); michael@0: michael@0: // Operations the destination pixels are the same as on the michael@0: // source pixels. See the comments above. michael@0: __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); michael@0: dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); michael@0: __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); michael@0: dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); michael@0: dst_ag = _mm_and_si128(dst_ag, ag_mask); michael@0: michael@0: // Combine back into RGBA. michael@0: // (4 x (as.h, rs.h, gs.h, bs.h)) michael@0: src_pixel = _mm_or_si128(src_rb, src_ag); michael@0: dst_pixel = _mm_or_si128(dst_rb, dst_ag); michael@0: michael@0: // Add result michael@0: __m128i result = _mm_add_epi8(src_pixel, dst_pixel); michael@0: _mm_store_si128(d, result); michael@0: s++; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (count > 0) { michael@0: *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: } michael@0: michael@0: void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha) { michael@0: SkASSERT(alpha == 255); michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 4) { michael@0: SkASSERT(((size_t)dst & 0x03) == 0); michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = SkPMSrcOver(*src, *dst); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: michael@0: const __m128i *s = reinterpret_cast(src); michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: #ifdef SK_USE_ACCURATE_BLENDING michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) michael@0: __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) michael@0: while (count >= 4) { michael@0: // Load 4 pixels michael@0: __m128i src_pixel = _mm_loadu_si128(s); michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); michael@0: __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); michael@0: // Shift alphas down to lower 8 bits of each quad. michael@0: __m128i alpha = _mm_srli_epi32(src_pixel, 24); michael@0: michael@0: // Copy alpha to upper 3rd byte of each quad michael@0: alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); michael@0: michael@0: // Subtract alphas from 255, to get 0..255 michael@0: alpha = _mm_sub_epi16(c_255, alpha); michael@0: michael@0: // Multiply by red and blue by src alpha. michael@0: dst_rb = _mm_mullo_epi16(dst_rb, alpha); michael@0: // Multiply by alpha and green by src alpha. michael@0: dst_ag = _mm_mullo_epi16(dst_ag, alpha); michael@0: michael@0: // dst_rb_low = (dst_rb >> 8) michael@0: __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); michael@0: __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); michael@0: michael@0: // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 michael@0: dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); michael@0: dst_rb = _mm_add_epi16(dst_rb, c_128); michael@0: dst_rb = _mm_srli_epi16(dst_rb, 8); michael@0: michael@0: // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask michael@0: dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); michael@0: dst_ag = _mm_add_epi16(dst_ag, c_128); michael@0: dst_ag = _mm_andnot_si128(rb_mask, dst_ag); michael@0: michael@0: // Combine back into RGBA. michael@0: dst_pixel = _mm_or_si128(dst_rb, dst_ag); michael@0: michael@0: // Add result michael@0: __m128i result = _mm_add_epi8(src_pixel, dst_pixel); michael@0: _mm_store_si128(d, result); michael@0: s++; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: #else michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) michael@0: while (count >= 4) { michael@0: // Load 4 pixels michael@0: __m128i src_pixel = _mm_loadu_si128(s); michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); michael@0: __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); michael@0: michael@0: // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) michael@0: __m128i alpha = _mm_srli_epi16(src_pixel, 8); michael@0: michael@0: // (a0, a0, a1, a1, a2, g2, a3, g3) michael@0: alpha = _mm_shufflehi_epi16(alpha, 0xF5); michael@0: michael@0: // (a0, a0, a1, a1, a2, a2, a3, a3) michael@0: alpha = _mm_shufflelo_epi16(alpha, 0xF5); michael@0: michael@0: // Subtract alphas from 256, to get 1..256 michael@0: alpha = _mm_sub_epi16(c_256, alpha); michael@0: michael@0: // Multiply by red and blue by src alpha. michael@0: dst_rb = _mm_mullo_epi16(dst_rb, alpha); michael@0: // Multiply by alpha and green by src alpha. michael@0: dst_ag = _mm_mullo_epi16(dst_ag, alpha); michael@0: michael@0: // Divide by 256. michael@0: dst_rb = _mm_srli_epi16(dst_rb, 8); michael@0: michael@0: // Mask out high bits (already in the right place) michael@0: dst_ag = _mm_andnot_si128(rb_mask, dst_ag); michael@0: michael@0: // Combine back into RGBA. michael@0: dst_pixel = _mm_or_si128(dst_rb, dst_ag); michael@0: michael@0: // Add result michael@0: __m128i result = _mm_add_epi8(src_pixel, dst_pixel); michael@0: _mm_store_si128(d, result); michael@0: s++; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: #endif michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (count > 0) { michael@0: *dst = SkPMSrcOver(*src, *dst); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: } michael@0: michael@0: void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha) { michael@0: SkASSERT(alpha <= 255); michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 4) { michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = SkBlendARGB32(*src, *dst, alpha); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: michael@0: uint32_t src_scale = SkAlpha255To256(alpha); michael@0: michael@0: const __m128i *s = reinterpret_cast(src); michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) michael@0: while (count >= 4) { michael@0: // Load 4 pixels each of src and dest. michael@0: __m128i src_pixel = _mm_loadu_si128(s); michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: // Get red and blue pixels into lower byte of each word. michael@0: __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); michael@0: __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); michael@0: michael@0: // Get alpha and green into lower byte of each word. michael@0: __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); michael@0: __m128i src_ag = _mm_srli_epi16(src_pixel, 8); michael@0: michael@0: // Put per-pixel alpha in low byte of each word. michael@0: // After the following two statements, the dst_alpha looks like michael@0: // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) michael@0: __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); michael@0: dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); michael@0: michael@0: // dst_alpha = dst_alpha * src_scale michael@0: // Because src_scales are in the higher byte of each word and michael@0: // we use mulhi here, the resulting alpha values are already michael@0: // in the right place and don't need to be divided by 256. michael@0: // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) michael@0: dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); michael@0: michael@0: // Subtract alphas from 256, to get 1..256 michael@0: dst_alpha = _mm_sub_epi16(c_256, dst_alpha); michael@0: michael@0: // Multiply red and blue by dst pixel alpha. michael@0: dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); michael@0: // Multiply alpha and green by dst pixel alpha. michael@0: dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); michael@0: michael@0: // Multiply red and blue by global alpha. michael@0: // (4 x (0, rs.h, 0, bs.h)) michael@0: // where rs.h stands for the higher byte of r * src_scale, michael@0: // and bs.h the higher byte of b * src_scale. michael@0: // Again, because we use mulhi, the resuling red and blue michael@0: // values are already in the right place and don't need to michael@0: // be divided by 256. michael@0: src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); michael@0: // Multiply alpha and green by global alpha. michael@0: // (4 x (0, as.h, 0, gs.h)) michael@0: src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); michael@0: michael@0: // Divide by 256. michael@0: dst_rb = _mm_srli_epi16(dst_rb, 8); michael@0: michael@0: // Mask out low bits (goodies already in the right place; no need to divide) michael@0: dst_ag = _mm_andnot_si128(rb_mask, dst_ag); michael@0: // Shift alpha and green to higher byte of each word. michael@0: // (4 x (as.h, 0, gs.h, 0)) michael@0: src_ag = _mm_slli_epi16(src_ag, 8); michael@0: michael@0: // Combine back into RGBA. michael@0: dst_pixel = _mm_or_si128(dst_rb, dst_ag); michael@0: src_pixel = _mm_or_si128(src_rb, src_ag); michael@0: michael@0: // Add two pixels into result. michael@0: __m128i result = _mm_add_epi8(src_pixel, dst_pixel); michael@0: _mm_store_si128(d, result); michael@0: s++; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (count > 0) { michael@0: *dst = SkBlendARGB32(*src, *dst, alpha); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: } michael@0: michael@0: /* SSE2 version of Color32() michael@0: * portable version is in core/SkBlitRow_D32.cpp michael@0: */ michael@0: void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, michael@0: SkPMColor color) { michael@0: michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (0 == color) { michael@0: if (src != dst) { michael@0: memcpy(dst, src, count * sizeof(SkPMColor)); michael@0: } michael@0: return; michael@0: } michael@0: michael@0: unsigned colorA = SkGetPackedA32(color); michael@0: if (255 == colorA) { michael@0: sk_memset32(dst, color, count); michael@0: } else { michael@0: unsigned scale = 256 - SkAlpha255To256(colorA); michael@0: michael@0: if (count >= 4) { michael@0: SkASSERT(((size_t)dst & 0x03) == 0); michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = color + SkAlphaMulQ(*src, scale); michael@0: src++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: michael@0: const __m128i *s = reinterpret_cast(src); michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i src_scale_wide = _mm_set1_epi16(scale); michael@0: __m128i color_wide = _mm_set1_epi32(color); michael@0: while (count >= 4) { michael@0: // Load 4 pixels each of src and dest. michael@0: __m128i src_pixel = _mm_loadu_si128(s); michael@0: michael@0: // Get red and blue pixels into lower byte of each word. michael@0: __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); michael@0: michael@0: // Get alpha and green into lower byte of each word. michael@0: __m128i src_ag = _mm_srli_epi16(src_pixel, 8); michael@0: michael@0: // Multiply by scale. michael@0: src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); michael@0: src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); michael@0: michael@0: // Divide by 256. michael@0: src_rb = _mm_srli_epi16(src_rb, 8); michael@0: src_ag = _mm_andnot_si128(rb_mask, src_ag); michael@0: michael@0: // Combine back into RGBA. michael@0: src_pixel = _mm_or_si128(src_rb, src_ag); michael@0: michael@0: // Add color to result. michael@0: __m128i result = _mm_add_epi8(color_wide, src_pixel); michael@0: michael@0: // Store result. michael@0: _mm_store_si128(d, result); michael@0: s++; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (count > 0) { michael@0: *dst = color + SkAlphaMulQ(*src, scale); michael@0: src += 1; michael@0: dst += 1; michael@0: count--; michael@0: } michael@0: } michael@0: } michael@0: michael@0: void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, michael@0: size_t maskRB, SkColor origColor, michael@0: int width, int height) { michael@0: SkPMColor color = SkPreMultiplyColor(origColor); michael@0: size_t dstOffset = dstRB - (width << 2); michael@0: size_t maskOffset = maskRB - width; michael@0: SkPMColor* dst = (SkPMColor *)device; michael@0: const uint8_t* mask = (const uint8_t*)maskPtr; michael@0: do { michael@0: int count = width; michael@0: if (count >= 4) { michael@0: while (((size_t)dst & 0x0F) != 0 && (count > 0)) { michael@0: *dst = SkBlendARGB32(color, *dst, *mask); michael@0: mask++; michael@0: dst++; michael@0: count--; michael@0: } michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); michael@0: __m128i c_256 = _mm_set1_epi16(256); michael@0: __m128i c_1 = _mm_set1_epi16(1); michael@0: __m128i src_pixel = _mm_set1_epi32(color); michael@0: while (count >= 4) { michael@0: // Load 4 pixels each of src and dest. michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: //set the aphla value michael@0: __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ michael@0: 0, *(mask+3),0, \ michael@0: *(mask+2),0, *(mask+2),\ michael@0: 0,*(mask+1), 0,*(mask+1),\ michael@0: 0, *mask,0,*mask); michael@0: michael@0: //call SkAlpha255To256() michael@0: src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); michael@0: michael@0: // Get red and blue pixels into lower byte of each word. michael@0: __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); michael@0: __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); michael@0: michael@0: // Get alpha and green into lower byte of each word. michael@0: __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); michael@0: __m128i src_ag = _mm_srli_epi16(src_pixel, 8); michael@0: michael@0: // Put per-pixel alpha in low byte of each word. michael@0: __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); michael@0: dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); michael@0: michael@0: // dst_alpha = dst_alpha * src_scale michael@0: dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); michael@0: michael@0: // Divide by 256. michael@0: dst_alpha = _mm_srli_epi16(dst_alpha, 8); michael@0: michael@0: // Subtract alphas from 256, to get 1..256 michael@0: dst_alpha = _mm_sub_epi16(c_256, dst_alpha); michael@0: // Multiply red and blue by dst pixel alpha. michael@0: dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); michael@0: // Multiply alpha and green by dst pixel alpha. michael@0: dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); michael@0: michael@0: // Multiply red and blue by global alpha. michael@0: src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); michael@0: // Multiply alpha and green by global alpha. michael@0: src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); michael@0: // Divide by 256. michael@0: dst_rb = _mm_srli_epi16(dst_rb, 8); michael@0: src_rb = _mm_srli_epi16(src_rb, 8); michael@0: michael@0: // Mask out low bits (goodies already in the right place; no need to divide) michael@0: dst_ag = _mm_andnot_si128(rb_mask, dst_ag); michael@0: src_ag = _mm_andnot_si128(rb_mask, src_ag); michael@0: michael@0: // Combine back into RGBA. michael@0: dst_pixel = _mm_or_si128(dst_rb, dst_ag); michael@0: __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); michael@0: michael@0: // Add two pixels into result. michael@0: __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); michael@0: _mm_store_si128(d, result); michael@0: // load the next 4 pixel michael@0: mask = mask + 4; michael@0: d++; michael@0: count -= 4; michael@0: } michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: while(count > 0) { michael@0: *dst= SkBlendARGB32(color, *dst, *mask); michael@0: dst += 1; michael@0: mask++; michael@0: count --; michael@0: } michael@0: dst = (SkPMColor *)((char*)dst + dstOffset); michael@0: mask += maskOffset; michael@0: } while (--height != 0); michael@0: } michael@0: michael@0: // The following (left) shifts cause the top 5 bits of the mask components to michael@0: // line up with the corresponding components in an SkPMColor. michael@0: // Note that the mask's RGB16 order may differ from the SkPMColor order. michael@0: #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) michael@0: #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) michael@0: #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) michael@0: michael@0: #if SK_R16x5_R32x5_SHIFT == 0 michael@0: #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) michael@0: #elif SK_R16x5_R32x5_SHIFT > 0 michael@0: #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) michael@0: #else michael@0: #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) michael@0: #endif michael@0: michael@0: #if SK_G16x5_G32x5_SHIFT == 0 michael@0: #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) michael@0: #elif SK_G16x5_G32x5_SHIFT > 0 michael@0: #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) michael@0: #else michael@0: #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) michael@0: #endif michael@0: michael@0: #if SK_B16x5_B32x5_SHIFT == 0 michael@0: #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) michael@0: #elif SK_B16x5_B32x5_SHIFT > 0 michael@0: #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) michael@0: #else michael@0: #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) michael@0: #endif michael@0: michael@0: static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, michael@0: __m128i &mask, __m128i &srcA) { michael@0: // In the following comments, the components of src, dst and mask are michael@0: // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked michael@0: // by an R, G, B, or A suffix. Components of one of the four pixels that michael@0: // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for michael@0: // example is the blue channel of the second destination pixel. Memory michael@0: // layout is shown for an ARGB byte order in a color value. michael@0: michael@0: // src and srcA store 8-bit values interleaved with zeros. michael@0: // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) michael@0: // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, michael@0: // srcA, 0, srcA, 0, srcA, 0, srcA, 0) michael@0: // mask stores 16-bit values (compressed three channels) interleaved with zeros. michael@0: // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. michael@0: // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, michael@0: // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) michael@0: michael@0: // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. michael@0: // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) michael@0: __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_R32_SHIFT)); michael@0: michael@0: // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) michael@0: __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_G32_SHIFT)); michael@0: michael@0: // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) michael@0: __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_B32_SHIFT)); michael@0: michael@0: // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) michael@0: // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an michael@0: // 8-bit position michael@0: // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, michael@0: // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) michael@0: mask = _mm_or_si128(_mm_or_si128(r, g), b); michael@0: michael@0: // Interleave R,G,B into the lower byte of word. michael@0: // i.e. split the sixteen 8-bit values from mask into two sets of eight michael@0: // 16-bit values, padded by zero. michael@0: __m128i maskLo, maskHi; michael@0: // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) michael@0: maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); michael@0: // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) michael@0: maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); michael@0: michael@0: // Upscale from 0..31 to 0..32 michael@0: // (allows to replace division by left-shift further down) michael@0: // Left-shift each component by 4 and add the result back to that component, michael@0: // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 michael@0: maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); michael@0: maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); michael@0: michael@0: // Multiply each component of maskLo and maskHi by srcA michael@0: maskLo = _mm_mullo_epi16(maskLo, srcA); michael@0: maskHi = _mm_mullo_epi16(maskHi, srcA); michael@0: michael@0: // Left shift mask components by 8 (divide by 256) michael@0: maskLo = _mm_srli_epi16(maskLo, 8); michael@0: maskHi = _mm_srli_epi16(maskHi, 8); michael@0: michael@0: // Interleave R,G,B into the lower byte of the word michael@0: // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) michael@0: __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); michael@0: // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) michael@0: __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); michael@0: michael@0: // mask = (src - dst) * mask michael@0: maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); michael@0: maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); michael@0: michael@0: // mask = (src - dst) * mask >> 5 michael@0: maskLo = _mm_srai_epi16(maskLo, 5); michael@0: maskHi = _mm_srai_epi16(maskHi, 5); michael@0: michael@0: // Add two pixels into result. michael@0: // result = dst + ((src - dst) * mask >> 5) michael@0: __m128i resultLo = _mm_add_epi16(dstLo, maskLo); michael@0: __m128i resultHi = _mm_add_epi16(dstHi, maskHi); michael@0: michael@0: // Pack into 4 32bit dst pixels. michael@0: // resultLo and resultHi contain eight 16-bit components (two pixels) each. michael@0: // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), michael@0: // clamping to 255 if necessary. michael@0: return _mm_packus_epi16(resultLo, resultHi); michael@0: } michael@0: michael@0: static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, michael@0: __m128i &mask) { michael@0: // In the following comments, the components of src, dst and mask are michael@0: // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked michael@0: // by an R, G, B, or A suffix. Components of one of the four pixels that michael@0: // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for michael@0: // example is the blue channel of the second destination pixel. Memory michael@0: // layout is shown for an ARGB byte order in a color value. michael@0: michael@0: // src and srcA store 8-bit values interleaved with zeros. michael@0: // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) michael@0: // mask stores 16-bit values (shown as high and low bytes) interleaved with michael@0: // zeros michael@0: // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, michael@0: // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) michael@0: michael@0: // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. michael@0: // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) michael@0: __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_R32_SHIFT)); michael@0: michael@0: // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) michael@0: __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_G32_SHIFT)); michael@0: michael@0: // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) michael@0: __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), michael@0: _mm_set1_epi32(0x1F << SK_B32_SHIFT)); michael@0: michael@0: // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) michael@0: // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an michael@0: // 8-bit position michael@0: // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, michael@0: // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) michael@0: mask = _mm_or_si128(_mm_or_si128(r, g), b); michael@0: michael@0: // Interleave R,G,B into the lower byte of word. michael@0: // i.e. split the sixteen 8-bit values from mask into two sets of eight michael@0: // 16-bit values, padded by zero. michael@0: __m128i maskLo, maskHi; michael@0: // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) michael@0: maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); michael@0: // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) michael@0: maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); michael@0: michael@0: // Upscale from 0..31 to 0..32 michael@0: // (allows to replace division by left-shift further down) michael@0: // Left-shift each component by 4 and add the result back to that component, michael@0: // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 michael@0: maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); michael@0: maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); michael@0: michael@0: // Interleave R,G,B into the lower byte of the word michael@0: // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) michael@0: __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); michael@0: // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) michael@0: __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); michael@0: michael@0: // mask = (src - dst) * mask michael@0: maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); michael@0: maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); michael@0: michael@0: // mask = (src - dst) * mask >> 5 michael@0: maskLo = _mm_srai_epi16(maskLo, 5); michael@0: maskHi = _mm_srai_epi16(maskHi, 5); michael@0: michael@0: // Add two pixels into result. michael@0: // result = dst + ((src - dst) * mask >> 5) michael@0: __m128i resultLo = _mm_add_epi16(dstLo, maskLo); michael@0: __m128i resultHi = _mm_add_epi16(dstHi, maskHi); michael@0: michael@0: // Pack into 4 32bit dst pixels and force opaque. michael@0: // resultLo and resultHi contain eight 16-bit components (two pixels) each. michael@0: // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), michael@0: // clamping to 255 if necessary. Set alpha components to 0xFF. michael@0: return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), michael@0: _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); michael@0: } michael@0: michael@0: void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], michael@0: SkColor src, int width, SkPMColor) { michael@0: if (width <= 0) { michael@0: return; michael@0: } michael@0: michael@0: int srcA = SkColorGetA(src); michael@0: int srcR = SkColorGetR(src); michael@0: int srcG = SkColorGetG(src); michael@0: int srcB = SkColorGetB(src); michael@0: michael@0: srcA = SkAlpha255To256(srcA); michael@0: michael@0: if (width >= 4) { michael@0: SkASSERT(((size_t)dst & 0x03) == 0); michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); michael@0: mask++; michael@0: dst++; michael@0: width--; michael@0: } michael@0: michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: // Set alpha to 0xFF and replicate source four times in SSE register. michael@0: __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); michael@0: // Interleave with zeros to get two sets of four 16-bit values. michael@0: src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); michael@0: // Set srcA_sse to contain eight copies of srcA, padded with zero. michael@0: // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) michael@0: __m128i srcA_sse = _mm_set1_epi16(srcA); michael@0: while (width >= 4) { michael@0: // Load four destination pixels into dst_sse. michael@0: __m128i dst_sse = _mm_load_si128(d); michael@0: // Load four 16-bit masks into lower half of mask_sse. michael@0: __m128i mask_sse = _mm_loadl_epi64( michael@0: reinterpret_cast(mask)); michael@0: michael@0: // Check whether masks are equal to 0 and get the highest bit michael@0: // of each byte of result, if masks are all zero, we will get michael@0: // pack_cmp to 0xFFFF michael@0: int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, michael@0: _mm_setzero_si128())); michael@0: michael@0: // if mask pixels are not all zero, we will blend the dst pixels michael@0: if (pack_cmp != 0xFFFF) { michael@0: // Unpack 4 16bit mask pixels to michael@0: // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, michael@0: // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) michael@0: mask_sse = _mm_unpacklo_epi16(mask_sse, michael@0: _mm_setzero_si128()); michael@0: michael@0: // Process 4 32bit dst pixels michael@0: __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, michael@0: mask_sse, srcA_sse); michael@0: _mm_store_si128(d, result); michael@0: } michael@0: michael@0: d++; michael@0: mask += 4; michael@0: width -= 4; michael@0: } michael@0: michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (width > 0) { michael@0: *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); michael@0: mask++; michael@0: dst++; michael@0: width--; michael@0: } michael@0: } michael@0: michael@0: void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], michael@0: SkColor src, int width, SkPMColor opaqueDst) { michael@0: if (width <= 0) { michael@0: return; michael@0: } michael@0: michael@0: int srcR = SkColorGetR(src); michael@0: int srcG = SkColorGetG(src); michael@0: int srcB = SkColorGetB(src); michael@0: michael@0: if (width >= 4) { michael@0: SkASSERT(((size_t)dst & 0x03) == 0); michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); michael@0: mask++; michael@0: dst++; michael@0: width--; michael@0: } michael@0: michael@0: __m128i *d = reinterpret_cast<__m128i*>(dst); michael@0: // Set alpha to 0xFF and replicate source four times in SSE register. michael@0: __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); michael@0: // Set srcA_sse to contain eight copies of srcA, padded with zero. michael@0: // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) michael@0: src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); michael@0: while (width >= 4) { michael@0: // Load four destination pixels into dst_sse. michael@0: __m128i dst_sse = _mm_load_si128(d); michael@0: // Load four 16-bit masks into lower half of mask_sse. michael@0: __m128i mask_sse = _mm_loadl_epi64( michael@0: reinterpret_cast(mask)); michael@0: michael@0: // Check whether masks are equal to 0 and get the highest bit michael@0: // of each byte of result, if masks are all zero, we will get michael@0: // pack_cmp to 0xFFFF michael@0: int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, michael@0: _mm_setzero_si128())); michael@0: michael@0: // if mask pixels are not all zero, we will blend the dst pixels michael@0: if (pack_cmp != 0xFFFF) { michael@0: // Unpack 4 16bit mask pixels to michael@0: // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, michael@0: // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) michael@0: mask_sse = _mm_unpacklo_epi16(mask_sse, michael@0: _mm_setzero_si128()); michael@0: michael@0: // Process 4 32bit dst pixels michael@0: __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, michael@0: mask_sse); michael@0: _mm_store_si128(d, result); michael@0: } michael@0: michael@0: d++; michael@0: mask += 4; michael@0: width -= 4; michael@0: } michael@0: michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: while (width > 0) { michael@0: *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); michael@0: mask++; michael@0: dst++; michael@0: width--; michael@0: } michael@0: } michael@0: michael@0: /* SSE2 version of S32_D565_Opaque() michael@0: * portable version is in core/SkBlitRow_D16.cpp michael@0: */ michael@0: void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, int count, michael@0: U8CPU alpha, int /*x*/, int /*y*/) { michael@0: SkASSERT(255 == alpha); michael@0: michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 8) { michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: michael@0: *dst++ = SkPixel32ToPixel16_ToU16(c); michael@0: count--; michael@0: } michael@0: michael@0: const __m128i* s = reinterpret_cast(src); michael@0: __m128i* d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); michael@0: __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); michael@0: __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); michael@0: michael@0: while (count >= 8) { michael@0: // Load 8 pixels of src. michael@0: __m128i src_pixel1 = _mm_loadu_si128(s++); michael@0: __m128i src_pixel2 = _mm_loadu_si128(s++); michael@0: michael@0: // Calculate result r. michael@0: __m128i r1 = _mm_srli_epi32(src_pixel1, michael@0: SK_R32_SHIFT + (8 - SK_R16_BITS)); michael@0: r1 = _mm_and_si128(r1, r16_mask); michael@0: __m128i r2 = _mm_srli_epi32(src_pixel2, michael@0: SK_R32_SHIFT + (8 - SK_R16_BITS)); michael@0: r2 = _mm_and_si128(r2, r16_mask); michael@0: __m128i r = _mm_packs_epi32(r1, r2); michael@0: michael@0: // Calculate result g. michael@0: __m128i g1 = _mm_srli_epi32(src_pixel1, michael@0: SK_G32_SHIFT + (8 - SK_G16_BITS)); michael@0: g1 = _mm_and_si128(g1, g16_mask); michael@0: __m128i g2 = _mm_srli_epi32(src_pixel2, michael@0: SK_G32_SHIFT + (8 - SK_G16_BITS)); michael@0: g2 = _mm_and_si128(g2, g16_mask); michael@0: __m128i g = _mm_packs_epi32(g1, g2); michael@0: michael@0: // Calculate result b. michael@0: __m128i b1 = _mm_srli_epi32(src_pixel1, michael@0: SK_B32_SHIFT + (8 - SK_B16_BITS)); michael@0: b1 = _mm_and_si128(b1, b16_mask); michael@0: __m128i b2 = _mm_srli_epi32(src_pixel2, michael@0: SK_B32_SHIFT + (8 - SK_B16_BITS)); michael@0: b2 = _mm_and_si128(b2, b16_mask); michael@0: __m128i b = _mm_packs_epi32(b1, b2); michael@0: michael@0: // Store 8 16-bit colors in dst. michael@0: __m128i d_pixel = SkPackRGB16_SSE(r, g, b); michael@0: _mm_store_si128(d++, d_pixel); michael@0: count -= 8; michael@0: } michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: if (count > 0) { michael@0: do { michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: *dst++ = SkPixel32ToPixel16_ToU16(c); michael@0: } while (--count != 0); michael@0: } michael@0: } michael@0: michael@0: /* SSE2 version of S32A_D565_Opaque() michael@0: * portable version is in core/SkBlitRow_D16.cpp michael@0: */ michael@0: void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha, int /*x*/, int /*y*/) { michael@0: SkASSERT(255 == alpha); michael@0: michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 8) { michael@0: // Make dst 16 bytes alignment michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: SkPMColor c = *src++; michael@0: if (c) { michael@0: *dst = SkSrcOver32To16(c, *dst); michael@0: } michael@0: dst += 1; michael@0: count--; michael@0: } michael@0: michael@0: const __m128i* s = reinterpret_cast(src); michael@0: __m128i* d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i var255 = _mm_set1_epi16(255); michael@0: __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); michael@0: __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); michael@0: __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); michael@0: michael@0: while (count >= 8) { michael@0: // Load 8 pixels of src. michael@0: __m128i src_pixel1 = _mm_loadu_si128(s++); michael@0: __m128i src_pixel2 = _mm_loadu_si128(s++); michael@0: michael@0: // Check whether src pixels are equal to 0 and get the highest bit michael@0: // of each byte of result, if src pixels are all zero, src_cmp1 and michael@0: // src_cmp2 will be 0xFFFF. michael@0: int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, michael@0: _mm_setzero_si128())); michael@0: int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, michael@0: _mm_setzero_si128())); michael@0: if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { michael@0: d++; michael@0: count -= 8; michael@0: continue; michael@0: } michael@0: michael@0: // Load 8 pixels of dst. michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: // Extract A from src. michael@0: __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); michael@0: sa1 = _mm_srli_epi32(sa1, 24); michael@0: __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); michael@0: sa2 = _mm_srli_epi32(sa2, 24); michael@0: __m128i sa = _mm_packs_epi32(sa1, sa2); michael@0: michael@0: // Extract R from src. michael@0: __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); michael@0: sr1 = _mm_srli_epi32(sr1, 24); michael@0: __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); michael@0: sr2 = _mm_srli_epi32(sr2, 24); michael@0: __m128i sr = _mm_packs_epi32(sr1, sr2); michael@0: michael@0: // Extract G from src. michael@0: __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); michael@0: sg1 = _mm_srli_epi32(sg1, 24); michael@0: __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); michael@0: sg2 = _mm_srli_epi32(sg2, 24); michael@0: __m128i sg = _mm_packs_epi32(sg1, sg2); michael@0: michael@0: // Extract B from src. michael@0: __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); michael@0: sb1 = _mm_srli_epi32(sb1, 24); michael@0: __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); michael@0: sb2 = _mm_srli_epi32(sb2, 24); michael@0: __m128i sb = _mm_packs_epi32(sb1, sb2); michael@0: michael@0: // Extract R G B from dst. michael@0: __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); michael@0: dr = _mm_and_si128(dr, r16_mask); michael@0: __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); michael@0: dg = _mm_and_si128(dg, g16_mask); michael@0: __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); michael@0: db = _mm_and_si128(db, b16_mask); michael@0: michael@0: __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa michael@0: michael@0: // Calculate R G B of result. michael@0: // Original algorithm is in SkSrcOver32To16(). michael@0: dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); michael@0: dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); michael@0: dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); michael@0: dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); michael@0: db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); michael@0: db = _mm_srli_epi16(db, 8 - SK_B16_BITS); michael@0: michael@0: // Pack R G B into 16-bit color. michael@0: __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); michael@0: michael@0: // Store 8 16-bit colors in dst. michael@0: _mm_store_si128(d++, d_pixel); michael@0: count -= 8; michael@0: } michael@0: michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: if (count > 0) { michael@0: do { michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: if (c) { michael@0: *dst = SkSrcOver32To16(c, *dst); michael@0: } michael@0: dst += 1; michael@0: } while (--count != 0); michael@0: } michael@0: } michael@0: michael@0: void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha, int x, int y) { michael@0: SkASSERT(255 == alpha); michael@0: michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 8) { michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: DITHER_565_SCAN(y); michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: michael@0: unsigned dither = DITHER_VALUE(x); michael@0: *dst++ = SkDitherRGB32To565(c, dither); michael@0: DITHER_INC_X(x); michael@0: count--; michael@0: } michael@0: michael@0: unsigned short dither_value[8]; michael@0: __m128i dither; michael@0: #ifdef ENABLE_DITHER_MATRIX_4X4 michael@0: const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; michael@0: dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; michael@0: dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; michael@0: dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; michael@0: dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; michael@0: #else michael@0: const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; michael@0: dither_value[0] = dither_value[4] = (dither_scan michael@0: >> (((x) & 3) << 2)) & 0xF; michael@0: dither_value[1] = dither_value[5] = (dither_scan michael@0: >> (((x + 1) & 3) << 2)) & 0xF; michael@0: dither_value[2] = dither_value[6] = (dither_scan michael@0: >> (((x + 2) & 3) << 2)) & 0xF; michael@0: dither_value[3] = dither_value[7] = (dither_scan michael@0: >> (((x + 3) & 3) << 2)) & 0xF; michael@0: #endif michael@0: dither = _mm_loadu_si128((__m128i*) dither_value); michael@0: michael@0: const __m128i* s = reinterpret_cast(src); michael@0: __m128i* d = reinterpret_cast<__m128i*>(dst); michael@0: michael@0: while (count >= 8) { michael@0: // Load 8 pixels of src. michael@0: __m128i src_pixel1 = _mm_loadu_si128(s++); michael@0: __m128i src_pixel2 = _mm_loadu_si128(s++); michael@0: michael@0: // Extract R from src. michael@0: __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); michael@0: sr1 = _mm_srli_epi32(sr1, 24); michael@0: __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); michael@0: sr2 = _mm_srli_epi32(sr2, 24); michael@0: __m128i sr = _mm_packs_epi32(sr1, sr2); michael@0: michael@0: // SkDITHER_R32To565(sr, dither) michael@0: __m128i sr_offset = _mm_srli_epi16(sr, 5); michael@0: sr = _mm_add_epi16(sr, dither); michael@0: sr = _mm_sub_epi16(sr, sr_offset); michael@0: sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); michael@0: michael@0: // Extract G from src. michael@0: __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); michael@0: sg1 = _mm_srli_epi32(sg1, 24); michael@0: __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); michael@0: sg2 = _mm_srli_epi32(sg2, 24); michael@0: __m128i sg = _mm_packs_epi32(sg1, sg2); michael@0: michael@0: // SkDITHER_R32To565(sg, dither) michael@0: __m128i sg_offset = _mm_srli_epi16(sg, 6); michael@0: sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); michael@0: sg = _mm_sub_epi16(sg, sg_offset); michael@0: sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); michael@0: michael@0: // Extract B from src. michael@0: __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); michael@0: sb1 = _mm_srli_epi32(sb1, 24); michael@0: __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); michael@0: sb2 = _mm_srli_epi32(sb2, 24); michael@0: __m128i sb = _mm_packs_epi32(sb1, sb2); michael@0: michael@0: // SkDITHER_R32To565(sb, dither) michael@0: __m128i sb_offset = _mm_srli_epi16(sb, 5); michael@0: sb = _mm_add_epi16(sb, dither); michael@0: sb = _mm_sub_epi16(sb, sb_offset); michael@0: sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); michael@0: michael@0: // Pack and store 16-bit dst pixel. michael@0: __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb); michael@0: _mm_store_si128(d++, d_pixel); michael@0: michael@0: count -= 8; michael@0: x += 8; michael@0: } michael@0: michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: if (count > 0) { michael@0: DITHER_565_SCAN(y); michael@0: do { michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: michael@0: unsigned dither = DITHER_VALUE(x); michael@0: *dst++ = SkDitherRGB32To565(c, dither); michael@0: DITHER_INC_X(x); michael@0: } while (--count != 0); michael@0: } michael@0: } michael@0: michael@0: /* SSE2 version of S32A_D565_Opaque_Dither() michael@0: * portable version is in core/SkBlitRow_D16.cpp michael@0: */ michael@0: void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, michael@0: const SkPMColor* SK_RESTRICT src, michael@0: int count, U8CPU alpha, int x, int y) { michael@0: SkASSERT(255 == alpha); michael@0: michael@0: if (count <= 0) { michael@0: return; michael@0: } michael@0: michael@0: if (count >= 8) { michael@0: while (((size_t)dst & 0x0F) != 0) { michael@0: DITHER_565_SCAN(y); michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: if (c) { michael@0: unsigned a = SkGetPackedA32(c); michael@0: michael@0: int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); michael@0: michael@0: unsigned sr = SkGetPackedR32(c); michael@0: unsigned sg = SkGetPackedG32(c); michael@0: unsigned sb = SkGetPackedB32(c); michael@0: sr = SkDITHER_R32_FOR_565(sr, d); michael@0: sg = SkDITHER_G32_FOR_565(sg, d); michael@0: sb = SkDITHER_B32_FOR_565(sb, d); michael@0: michael@0: uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); michael@0: uint32_t dst_expanded = SkExpand_rgb_16(*dst); michael@0: dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); michael@0: // now src and dst expanded are in g:11 r:10 x:1 b:10 michael@0: *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); michael@0: } michael@0: dst += 1; michael@0: DITHER_INC_X(x); michael@0: count--; michael@0: } michael@0: michael@0: unsigned short dither_value[8]; michael@0: __m128i dither, dither_cur; michael@0: #ifdef ENABLE_DITHER_MATRIX_4X4 michael@0: const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; michael@0: dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; michael@0: dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; michael@0: dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; michael@0: dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; michael@0: #else michael@0: const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; michael@0: dither_value[0] = dither_value[4] = (dither_scan michael@0: >> (((x) & 3) << 2)) & 0xF; michael@0: dither_value[1] = dither_value[5] = (dither_scan michael@0: >> (((x + 1) & 3) << 2)) & 0xF; michael@0: dither_value[2] = dither_value[6] = (dither_scan michael@0: >> (((x + 2) & 3) << 2)) & 0xF; michael@0: dither_value[3] = dither_value[7] = (dither_scan michael@0: >> (((x + 3) & 3) << 2)) & 0xF; michael@0: #endif michael@0: dither = _mm_loadu_si128((__m128i*) dither_value); michael@0: michael@0: const __m128i* s = reinterpret_cast(src); michael@0: __m128i* d = reinterpret_cast<__m128i*>(dst); michael@0: __m128i var256 = _mm_set1_epi16(256); michael@0: __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); michael@0: __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); michael@0: __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); michael@0: michael@0: while (count >= 8) { michael@0: // Load 8 pixels of src and dst. michael@0: __m128i src_pixel1 = _mm_loadu_si128(s++); michael@0: __m128i src_pixel2 = _mm_loadu_si128(s++); michael@0: __m128i dst_pixel = _mm_load_si128(d); michael@0: michael@0: // Extract A from src. michael@0: __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); michael@0: sa1 = _mm_srli_epi32(sa1, 24); michael@0: __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); michael@0: sa2 = _mm_srli_epi32(sa2, 24); michael@0: __m128i sa = _mm_packs_epi32(sa1, sa2); michael@0: michael@0: // Calculate current dither value. michael@0: dither_cur = _mm_mullo_epi16(dither, michael@0: _mm_add_epi16(sa, _mm_set1_epi16(1))); michael@0: dither_cur = _mm_srli_epi16(dither_cur, 8); michael@0: michael@0: // Extract R from src. michael@0: __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); michael@0: sr1 = _mm_srli_epi32(sr1, 24); michael@0: __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); michael@0: sr2 = _mm_srli_epi32(sr2, 24); michael@0: __m128i sr = _mm_packs_epi32(sr1, sr2); michael@0: michael@0: // SkDITHER_R32_FOR_565(sr, d) michael@0: __m128i sr_offset = _mm_srli_epi16(sr, 5); michael@0: sr = _mm_add_epi16(sr, dither_cur); michael@0: sr = _mm_sub_epi16(sr, sr_offset); michael@0: michael@0: // Expand sr. michael@0: sr = _mm_slli_epi16(sr, 2); michael@0: michael@0: // Extract G from src. michael@0: __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); michael@0: sg1 = _mm_srli_epi32(sg1, 24); michael@0: __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); michael@0: sg2 = _mm_srli_epi32(sg2, 24); michael@0: __m128i sg = _mm_packs_epi32(sg1, sg2); michael@0: michael@0: // sg = SkDITHER_G32_FOR_565(sg, d). michael@0: __m128i sg_offset = _mm_srli_epi16(sg, 6); michael@0: sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); michael@0: sg = _mm_sub_epi16(sg, sg_offset); michael@0: michael@0: // Expand sg. michael@0: sg = _mm_slli_epi16(sg, 3); michael@0: michael@0: // Extract B from src. michael@0: __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); michael@0: sb1 = _mm_srli_epi32(sb1, 24); michael@0: __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); michael@0: sb2 = _mm_srli_epi32(sb2, 24); michael@0: __m128i sb = _mm_packs_epi32(sb1, sb2); michael@0: michael@0: // sb = SkDITHER_B32_FOR_565(sb, d). michael@0: __m128i sb_offset = _mm_srli_epi16(sb, 5); michael@0: sb = _mm_add_epi16(sb, dither_cur); michael@0: sb = _mm_sub_epi16(sb, sb_offset); michael@0: michael@0: // Expand sb. michael@0: sb = _mm_slli_epi16(sb, 2); michael@0: michael@0: // Extract R G B from dst. michael@0: __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); michael@0: dr = _mm_and_si128(dr, r16_mask); michael@0: __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); michael@0: dg = _mm_and_si128(dg, g16_mask); michael@0: __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); michael@0: db = _mm_and_si128(db, b16_mask); michael@0: michael@0: // SkAlpha255To256(255 - a) >> 3 michael@0: __m128i isa = _mm_sub_epi16(var256, sa); michael@0: isa = _mm_srli_epi16(isa, 3); michael@0: michael@0: dr = _mm_mullo_epi16(dr, isa); michael@0: dr = _mm_add_epi16(dr, sr); michael@0: dr = _mm_srli_epi16(dr, 5); michael@0: michael@0: dg = _mm_mullo_epi16(dg, isa); michael@0: dg = _mm_add_epi16(dg, sg); michael@0: dg = _mm_srli_epi16(dg, 5); michael@0: michael@0: db = _mm_mullo_epi16(db, isa); michael@0: db = _mm_add_epi16(db, sb); michael@0: db = _mm_srli_epi16(db, 5); michael@0: michael@0: // Package and store dst pixel. michael@0: __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); michael@0: _mm_store_si128(d++, d_pixel); michael@0: michael@0: count -= 8; michael@0: x += 8; michael@0: } michael@0: michael@0: src = reinterpret_cast(s); michael@0: dst = reinterpret_cast(d); michael@0: } michael@0: michael@0: if (count > 0) { michael@0: DITHER_565_SCAN(y); michael@0: do { michael@0: SkPMColor c = *src++; michael@0: SkPMColorAssert(c); michael@0: if (c) { michael@0: unsigned a = SkGetPackedA32(c); michael@0: michael@0: int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); michael@0: michael@0: unsigned sr = SkGetPackedR32(c); michael@0: unsigned sg = SkGetPackedG32(c); michael@0: unsigned sb = SkGetPackedB32(c); michael@0: sr = SkDITHER_R32_FOR_565(sr, d); michael@0: sg = SkDITHER_G32_FOR_565(sg, d); michael@0: sb = SkDITHER_B32_FOR_565(sb, d); michael@0: michael@0: uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); michael@0: uint32_t dst_expanded = SkExpand_rgb_16(*dst); michael@0: dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); michael@0: // now src and dst expanded are in g:11 r:10 x:1 b:10 michael@0: *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); michael@0: } michael@0: dst += 1; michael@0: DITHER_INC_X(x); michael@0: } while (--count != 0); michael@0: } michael@0: }