|
1 /* |
|
2 * Copyright 2012 The Android Open Source Project |
|
3 * |
|
4 * Use of this source code is governed by a BSD-style license that can be |
|
5 * found in the LICENSE file. |
|
6 */ |
|
7 |
|
8 |
|
9 #include "SkBlitRow_opts_SSE2.h" |
|
10 #include "SkBitmapProcState_opts_SSE2.h" |
|
11 #include "SkColorPriv.h" |
|
12 #include "SkColor_opts_SSE2.h" |
|
13 #include "SkDither.h" |
|
14 #include "SkUtils.h" |
|
15 |
|
16 #include <emmintrin.h> |
|
17 |
|
18 /* SSE2 version of S32_Blend_BlitRow32() |
|
19 * portable version is in core/SkBlitRow_D32.cpp |
|
20 */ |
|
21 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
|
22 const SkPMColor* SK_RESTRICT src, |
|
23 int count, U8CPU alpha) { |
|
24 SkASSERT(alpha <= 255); |
|
25 if (count <= 0) { |
|
26 return; |
|
27 } |
|
28 |
|
29 uint32_t src_scale = SkAlpha255To256(alpha); |
|
30 uint32_t dst_scale = 256 - src_scale; |
|
31 |
|
32 if (count >= 4) { |
|
33 SkASSERT(((size_t)dst & 0x03) == 0); |
|
34 while (((size_t)dst & 0x0F) != 0) { |
|
35 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
|
36 src++; |
|
37 dst++; |
|
38 count--; |
|
39 } |
|
40 |
|
41 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
|
42 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
43 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
44 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); |
|
45 |
|
46 // Move scale factors to upper byte of word |
|
47 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); |
|
48 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); |
|
49 while (count >= 4) { |
|
50 // Load 4 pixels each of src and dest. |
|
51 __m128i src_pixel = _mm_loadu_si128(s); |
|
52 __m128i dst_pixel = _mm_load_si128(d); |
|
53 |
|
54 // Interleave Atom port 0/1 operations based on the execution port |
|
55 // constraints that multiply can only be executed on port 0 (while |
|
56 // boolean operations can be executed on either port 0 or port 1) |
|
57 // because GCC currently doesn't do a good job scheduling |
|
58 // instructions based on these constraints. |
|
59 |
|
60 // Get red and blue pixels into lower byte of each word. |
|
61 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) |
|
62 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
|
63 |
|
64 // Multiply by scale. |
|
65 // (4 x (0, rs.h, 0, bs.h)) |
|
66 // where rs.h stands for the higher byte of r * scale, and |
|
67 // bs.h the higher byte of b * scale. |
|
68 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); |
|
69 |
|
70 // Get alpha and green pixels into higher byte of each word. |
|
71 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) |
|
72 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); |
|
73 |
|
74 // Multiply by scale. |
|
75 // (4 x (as.h, as.l, gs.h, gs.l)) |
|
76 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); |
|
77 |
|
78 // Clear the lower byte of the a*scale and g*scale results |
|
79 // (4 x (as.h, 0, gs.h, 0)) |
|
80 src_ag = _mm_and_si128(src_ag, ag_mask); |
|
81 |
|
82 // Operations the destination pixels are the same as on the |
|
83 // source pixels. See the comments above. |
|
84 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
|
85 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); |
|
86 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); |
|
87 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); |
|
88 dst_ag = _mm_and_si128(dst_ag, ag_mask); |
|
89 |
|
90 // Combine back into RGBA. |
|
91 // (4 x (as.h, rs.h, gs.h, bs.h)) |
|
92 src_pixel = _mm_or_si128(src_rb, src_ag); |
|
93 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
|
94 |
|
95 // Add result |
|
96 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
|
97 _mm_store_si128(d, result); |
|
98 s++; |
|
99 d++; |
|
100 count -= 4; |
|
101 } |
|
102 src = reinterpret_cast<const SkPMColor*>(s); |
|
103 dst = reinterpret_cast<SkPMColor*>(d); |
|
104 } |
|
105 |
|
106 while (count > 0) { |
|
107 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
|
108 src++; |
|
109 dst++; |
|
110 count--; |
|
111 } |
|
112 } |
|
113 |
|
114 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
|
115 const SkPMColor* SK_RESTRICT src, |
|
116 int count, U8CPU alpha) { |
|
117 SkASSERT(alpha == 255); |
|
118 if (count <= 0) { |
|
119 return; |
|
120 } |
|
121 |
|
122 if (count >= 4) { |
|
123 SkASSERT(((size_t)dst & 0x03) == 0); |
|
124 while (((size_t)dst & 0x0F) != 0) { |
|
125 *dst = SkPMSrcOver(*src, *dst); |
|
126 src++; |
|
127 dst++; |
|
128 count--; |
|
129 } |
|
130 |
|
131 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
|
132 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
133 #ifdef SK_USE_ACCURATE_BLENDING |
|
134 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
135 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) |
|
136 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) |
|
137 while (count >= 4) { |
|
138 // Load 4 pixels |
|
139 __m128i src_pixel = _mm_loadu_si128(s); |
|
140 __m128i dst_pixel = _mm_load_si128(d); |
|
141 |
|
142 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
|
143 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
|
144 // Shift alphas down to lower 8 bits of each quad. |
|
145 __m128i alpha = _mm_srli_epi32(src_pixel, 24); |
|
146 |
|
147 // Copy alpha to upper 3rd byte of each quad |
|
148 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); |
|
149 |
|
150 // Subtract alphas from 255, to get 0..255 |
|
151 alpha = _mm_sub_epi16(c_255, alpha); |
|
152 |
|
153 // Multiply by red and blue by src alpha. |
|
154 dst_rb = _mm_mullo_epi16(dst_rb, alpha); |
|
155 // Multiply by alpha and green by src alpha. |
|
156 dst_ag = _mm_mullo_epi16(dst_ag, alpha); |
|
157 |
|
158 // dst_rb_low = (dst_rb >> 8) |
|
159 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); |
|
160 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); |
|
161 |
|
162 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 |
|
163 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); |
|
164 dst_rb = _mm_add_epi16(dst_rb, c_128); |
|
165 dst_rb = _mm_srli_epi16(dst_rb, 8); |
|
166 |
|
167 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask |
|
168 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); |
|
169 dst_ag = _mm_add_epi16(dst_ag, c_128); |
|
170 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
|
171 |
|
172 // Combine back into RGBA. |
|
173 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
|
174 |
|
175 // Add result |
|
176 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
|
177 _mm_store_si128(d, result); |
|
178 s++; |
|
179 d++; |
|
180 count -= 4; |
|
181 } |
|
182 #else |
|
183 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
184 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) |
|
185 while (count >= 4) { |
|
186 // Load 4 pixels |
|
187 __m128i src_pixel = _mm_loadu_si128(s); |
|
188 __m128i dst_pixel = _mm_load_si128(d); |
|
189 |
|
190 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
|
191 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
|
192 |
|
193 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) |
|
194 __m128i alpha = _mm_srli_epi16(src_pixel, 8); |
|
195 |
|
196 // (a0, a0, a1, a1, a2, g2, a3, g3) |
|
197 alpha = _mm_shufflehi_epi16(alpha, 0xF5); |
|
198 |
|
199 // (a0, a0, a1, a1, a2, a2, a3, a3) |
|
200 alpha = _mm_shufflelo_epi16(alpha, 0xF5); |
|
201 |
|
202 // Subtract alphas from 256, to get 1..256 |
|
203 alpha = _mm_sub_epi16(c_256, alpha); |
|
204 |
|
205 // Multiply by red and blue by src alpha. |
|
206 dst_rb = _mm_mullo_epi16(dst_rb, alpha); |
|
207 // Multiply by alpha and green by src alpha. |
|
208 dst_ag = _mm_mullo_epi16(dst_ag, alpha); |
|
209 |
|
210 // Divide by 256. |
|
211 dst_rb = _mm_srli_epi16(dst_rb, 8); |
|
212 |
|
213 // Mask out high bits (already in the right place) |
|
214 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
|
215 |
|
216 // Combine back into RGBA. |
|
217 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
|
218 |
|
219 // Add result |
|
220 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
|
221 _mm_store_si128(d, result); |
|
222 s++; |
|
223 d++; |
|
224 count -= 4; |
|
225 } |
|
226 #endif |
|
227 src = reinterpret_cast<const SkPMColor*>(s); |
|
228 dst = reinterpret_cast<SkPMColor*>(d); |
|
229 } |
|
230 |
|
231 while (count > 0) { |
|
232 *dst = SkPMSrcOver(*src, *dst); |
|
233 src++; |
|
234 dst++; |
|
235 count--; |
|
236 } |
|
237 } |
|
238 |
|
239 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
|
240 const SkPMColor* SK_RESTRICT src, |
|
241 int count, U8CPU alpha) { |
|
242 SkASSERT(alpha <= 255); |
|
243 if (count <= 0) { |
|
244 return; |
|
245 } |
|
246 |
|
247 if (count >= 4) { |
|
248 while (((size_t)dst & 0x0F) != 0) { |
|
249 *dst = SkBlendARGB32(*src, *dst, alpha); |
|
250 src++; |
|
251 dst++; |
|
252 count--; |
|
253 } |
|
254 |
|
255 uint32_t src_scale = SkAlpha255To256(alpha); |
|
256 |
|
257 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
|
258 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
259 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); |
|
260 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
261 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) |
|
262 while (count >= 4) { |
|
263 // Load 4 pixels each of src and dest. |
|
264 __m128i src_pixel = _mm_loadu_si128(s); |
|
265 __m128i dst_pixel = _mm_load_si128(d); |
|
266 |
|
267 // Get red and blue pixels into lower byte of each word. |
|
268 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
|
269 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
|
270 |
|
271 // Get alpha and green into lower byte of each word. |
|
272 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
|
273 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
|
274 |
|
275 // Put per-pixel alpha in low byte of each word. |
|
276 // After the following two statements, the dst_alpha looks like |
|
277 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) |
|
278 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); |
|
279 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); |
|
280 |
|
281 // dst_alpha = dst_alpha * src_scale |
|
282 // Because src_scales are in the higher byte of each word and |
|
283 // we use mulhi here, the resulting alpha values are already |
|
284 // in the right place and don't need to be divided by 256. |
|
285 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) |
|
286 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); |
|
287 |
|
288 // Subtract alphas from 256, to get 1..256 |
|
289 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); |
|
290 |
|
291 // Multiply red and blue by dst pixel alpha. |
|
292 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); |
|
293 // Multiply alpha and green by dst pixel alpha. |
|
294 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); |
|
295 |
|
296 // Multiply red and blue by global alpha. |
|
297 // (4 x (0, rs.h, 0, bs.h)) |
|
298 // where rs.h stands for the higher byte of r * src_scale, |
|
299 // and bs.h the higher byte of b * src_scale. |
|
300 // Again, because we use mulhi, the resuling red and blue |
|
301 // values are already in the right place and don't need to |
|
302 // be divided by 256. |
|
303 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); |
|
304 // Multiply alpha and green by global alpha. |
|
305 // (4 x (0, as.h, 0, gs.h)) |
|
306 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); |
|
307 |
|
308 // Divide by 256. |
|
309 dst_rb = _mm_srli_epi16(dst_rb, 8); |
|
310 |
|
311 // Mask out low bits (goodies already in the right place; no need to divide) |
|
312 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
|
313 // Shift alpha and green to higher byte of each word. |
|
314 // (4 x (as.h, 0, gs.h, 0)) |
|
315 src_ag = _mm_slli_epi16(src_ag, 8); |
|
316 |
|
317 // Combine back into RGBA. |
|
318 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
|
319 src_pixel = _mm_or_si128(src_rb, src_ag); |
|
320 |
|
321 // Add two pixels into result. |
|
322 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
|
323 _mm_store_si128(d, result); |
|
324 s++; |
|
325 d++; |
|
326 count -= 4; |
|
327 } |
|
328 src = reinterpret_cast<const SkPMColor*>(s); |
|
329 dst = reinterpret_cast<SkPMColor*>(d); |
|
330 } |
|
331 |
|
332 while (count > 0) { |
|
333 *dst = SkBlendARGB32(*src, *dst, alpha); |
|
334 src++; |
|
335 dst++; |
|
336 count--; |
|
337 } |
|
338 } |
|
339 |
|
340 /* SSE2 version of Color32() |
|
341 * portable version is in core/SkBlitRow_D32.cpp |
|
342 */ |
|
343 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, |
|
344 SkPMColor color) { |
|
345 |
|
346 if (count <= 0) { |
|
347 return; |
|
348 } |
|
349 |
|
350 if (0 == color) { |
|
351 if (src != dst) { |
|
352 memcpy(dst, src, count * sizeof(SkPMColor)); |
|
353 } |
|
354 return; |
|
355 } |
|
356 |
|
357 unsigned colorA = SkGetPackedA32(color); |
|
358 if (255 == colorA) { |
|
359 sk_memset32(dst, color, count); |
|
360 } else { |
|
361 unsigned scale = 256 - SkAlpha255To256(colorA); |
|
362 |
|
363 if (count >= 4) { |
|
364 SkASSERT(((size_t)dst & 0x03) == 0); |
|
365 while (((size_t)dst & 0x0F) != 0) { |
|
366 *dst = color + SkAlphaMulQ(*src, scale); |
|
367 src++; |
|
368 dst++; |
|
369 count--; |
|
370 } |
|
371 |
|
372 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
|
373 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
374 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
375 __m128i src_scale_wide = _mm_set1_epi16(scale); |
|
376 __m128i color_wide = _mm_set1_epi32(color); |
|
377 while (count >= 4) { |
|
378 // Load 4 pixels each of src and dest. |
|
379 __m128i src_pixel = _mm_loadu_si128(s); |
|
380 |
|
381 // Get red and blue pixels into lower byte of each word. |
|
382 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
|
383 |
|
384 // Get alpha and green into lower byte of each word. |
|
385 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
|
386 |
|
387 // Multiply by scale. |
|
388 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); |
|
389 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); |
|
390 |
|
391 // Divide by 256. |
|
392 src_rb = _mm_srli_epi16(src_rb, 8); |
|
393 src_ag = _mm_andnot_si128(rb_mask, src_ag); |
|
394 |
|
395 // Combine back into RGBA. |
|
396 src_pixel = _mm_or_si128(src_rb, src_ag); |
|
397 |
|
398 // Add color to result. |
|
399 __m128i result = _mm_add_epi8(color_wide, src_pixel); |
|
400 |
|
401 // Store result. |
|
402 _mm_store_si128(d, result); |
|
403 s++; |
|
404 d++; |
|
405 count -= 4; |
|
406 } |
|
407 src = reinterpret_cast<const SkPMColor*>(s); |
|
408 dst = reinterpret_cast<SkPMColor*>(d); |
|
409 } |
|
410 |
|
411 while (count > 0) { |
|
412 *dst = color + SkAlphaMulQ(*src, scale); |
|
413 src += 1; |
|
414 dst += 1; |
|
415 count--; |
|
416 } |
|
417 } |
|
418 } |
|
419 |
|
420 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, |
|
421 size_t maskRB, SkColor origColor, |
|
422 int width, int height) { |
|
423 SkPMColor color = SkPreMultiplyColor(origColor); |
|
424 size_t dstOffset = dstRB - (width << 2); |
|
425 size_t maskOffset = maskRB - width; |
|
426 SkPMColor* dst = (SkPMColor *)device; |
|
427 const uint8_t* mask = (const uint8_t*)maskPtr; |
|
428 do { |
|
429 int count = width; |
|
430 if (count >= 4) { |
|
431 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { |
|
432 *dst = SkBlendARGB32(color, *dst, *mask); |
|
433 mask++; |
|
434 dst++; |
|
435 count--; |
|
436 } |
|
437 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
438 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
|
439 __m128i c_256 = _mm_set1_epi16(256); |
|
440 __m128i c_1 = _mm_set1_epi16(1); |
|
441 __m128i src_pixel = _mm_set1_epi32(color); |
|
442 while (count >= 4) { |
|
443 // Load 4 pixels each of src and dest. |
|
444 __m128i dst_pixel = _mm_load_si128(d); |
|
445 |
|
446 //set the aphla value |
|
447 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ |
|
448 0, *(mask+3),0, \ |
|
449 *(mask+2),0, *(mask+2),\ |
|
450 0,*(mask+1), 0,*(mask+1),\ |
|
451 0, *mask,0,*mask); |
|
452 |
|
453 //call SkAlpha255To256() |
|
454 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); |
|
455 |
|
456 // Get red and blue pixels into lower byte of each word. |
|
457 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
|
458 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
|
459 |
|
460 // Get alpha and green into lower byte of each word. |
|
461 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
|
462 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
|
463 |
|
464 // Put per-pixel alpha in low byte of each word. |
|
465 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); |
|
466 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); |
|
467 |
|
468 // dst_alpha = dst_alpha * src_scale |
|
469 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); |
|
470 |
|
471 // Divide by 256. |
|
472 dst_alpha = _mm_srli_epi16(dst_alpha, 8); |
|
473 |
|
474 // Subtract alphas from 256, to get 1..256 |
|
475 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); |
|
476 // Multiply red and blue by dst pixel alpha. |
|
477 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); |
|
478 // Multiply alpha and green by dst pixel alpha. |
|
479 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); |
|
480 |
|
481 // Multiply red and blue by global alpha. |
|
482 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); |
|
483 // Multiply alpha and green by global alpha. |
|
484 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); |
|
485 // Divide by 256. |
|
486 dst_rb = _mm_srli_epi16(dst_rb, 8); |
|
487 src_rb = _mm_srli_epi16(src_rb, 8); |
|
488 |
|
489 // Mask out low bits (goodies already in the right place; no need to divide) |
|
490 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
|
491 src_ag = _mm_andnot_si128(rb_mask, src_ag); |
|
492 |
|
493 // Combine back into RGBA. |
|
494 dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
|
495 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); |
|
496 |
|
497 // Add two pixels into result. |
|
498 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); |
|
499 _mm_store_si128(d, result); |
|
500 // load the next 4 pixel |
|
501 mask = mask + 4; |
|
502 d++; |
|
503 count -= 4; |
|
504 } |
|
505 dst = reinterpret_cast<SkPMColor *>(d); |
|
506 } |
|
507 while(count > 0) { |
|
508 *dst= SkBlendARGB32(color, *dst, *mask); |
|
509 dst += 1; |
|
510 mask++; |
|
511 count --; |
|
512 } |
|
513 dst = (SkPMColor *)((char*)dst + dstOffset); |
|
514 mask += maskOffset; |
|
515 } while (--height != 0); |
|
516 } |
|
517 |
|
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the position of the matching component in an SkPMColor. A positive value
// means shift left, a negative value means shift right by its magnitude.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: apply the shift to all four 32-bit lanes of x. The result is not
// masked; callers AND it with the component mask themselves.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
|
548 |
|
549 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, |
|
550 __m128i &mask, __m128i &srcA) { |
|
551 // In the following comments, the components of src, dst and mask are |
|
552 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
|
553 // by an R, G, B, or A suffix. Components of one of the four pixels that |
|
554 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
|
555 // example is the blue channel of the second destination pixel. Memory |
|
556 // layout is shown for an ARGB byte order in a color value. |
|
557 |
|
558 // src and srcA store 8-bit values interleaved with zeros. |
|
559 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
|
560 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, |
|
561 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) |
|
562 // mask stores 16-bit values (compressed three channels) interleaved with zeros. |
|
563 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. |
|
564 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
|
565 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
|
566 |
|
567 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
|
568 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
|
569 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
|
570 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
|
571 |
|
572 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
|
573 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
|
574 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
|
575 |
|
576 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
|
577 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
|
578 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
|
579 |
|
580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
|
581 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
|
582 // 8-bit position |
|
583 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
|
584 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
|
585 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
|
586 |
|
587 // Interleave R,G,B into the lower byte of word. |
|
588 // i.e. split the sixteen 8-bit values from mask into two sets of eight |
|
589 // 16-bit values, padded by zero. |
|
590 __m128i maskLo, maskHi; |
|
591 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
|
592 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
|
593 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
|
594 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
|
595 |
|
596 // Upscale from 0..31 to 0..32 |
|
597 // (allows to replace division by left-shift further down) |
|
598 // Left-shift each component by 4 and add the result back to that component, |
|
599 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
|
600 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
|
601 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
|
602 |
|
603 // Multiply each component of maskLo and maskHi by srcA |
|
604 maskLo = _mm_mullo_epi16(maskLo, srcA); |
|
605 maskHi = _mm_mullo_epi16(maskHi, srcA); |
|
606 |
|
607 // Left shift mask components by 8 (divide by 256) |
|
608 maskLo = _mm_srli_epi16(maskLo, 8); |
|
609 maskHi = _mm_srli_epi16(maskHi, 8); |
|
610 |
|
611 // Interleave R,G,B into the lower byte of the word |
|
612 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
|
613 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
|
614 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
|
615 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
|
616 |
|
617 // mask = (src - dst) * mask |
|
618 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
|
619 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
|
620 |
|
621 // mask = (src - dst) * mask >> 5 |
|
622 maskLo = _mm_srai_epi16(maskLo, 5); |
|
623 maskHi = _mm_srai_epi16(maskHi, 5); |
|
624 |
|
625 // Add two pixels into result. |
|
626 // result = dst + ((src - dst) * mask >> 5) |
|
627 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
|
628 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
|
629 |
|
630 // Pack into 4 32bit dst pixels. |
|
631 // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
|
632 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
|
633 // clamping to 255 if necessary. |
|
634 return _mm_packus_epi16(resultLo, resultHi); |
|
635 } |
|
636 |
|
637 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, |
|
638 __m128i &mask) { |
|
639 // In the following comments, the components of src, dst and mask are |
|
640 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
|
641 // by an R, G, B, or A suffix. Components of one of the four pixels that |
|
642 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
|
643 // example is the blue channel of the second destination pixel. Memory |
|
644 // layout is shown for an ARGB byte order in a color value. |
|
645 |
|
646 // src and srcA store 8-bit values interleaved with zeros. |
|
647 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
|
648 // mask stores 16-bit values (shown as high and low bytes) interleaved with |
|
649 // zeros |
|
650 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
|
651 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
|
652 |
|
653 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
|
654 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
|
655 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
|
656 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
|
657 |
|
658 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
|
659 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
|
660 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
|
661 |
|
662 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
|
663 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
|
664 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
|
665 |
|
666 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
|
667 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
|
668 // 8-bit position |
|
669 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
|
670 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
|
671 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
|
672 |
|
673 // Interleave R,G,B into the lower byte of word. |
|
674 // i.e. split the sixteen 8-bit values from mask into two sets of eight |
|
675 // 16-bit values, padded by zero. |
|
676 __m128i maskLo, maskHi; |
|
677 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
|
678 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
|
679 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
|
680 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
|
681 |
|
682 // Upscale from 0..31 to 0..32 |
|
683 // (allows to replace division by left-shift further down) |
|
684 // Left-shift each component by 4 and add the result back to that component, |
|
685 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
|
686 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
|
687 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
|
688 |
|
689 // Interleave R,G,B into the lower byte of the word |
|
690 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
|
691 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
|
692 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
|
693 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
|
694 |
|
695 // mask = (src - dst) * mask |
|
696 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
|
697 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
|
698 |
|
699 // mask = (src - dst) * mask >> 5 |
|
700 maskLo = _mm_srai_epi16(maskLo, 5); |
|
701 maskHi = _mm_srai_epi16(maskHi, 5); |
|
702 |
|
703 // Add two pixels into result. |
|
704 // result = dst + ((src - dst) * mask >> 5) |
|
705 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
|
706 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
|
707 |
|
708 // Pack into 4 32bit dst pixels and force opaque. |
|
709 // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
|
710 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
|
711 // clamping to 255 if necessary. Set alpha components to 0xFF. |
|
712 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
|
713 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
|
714 } |
|
715 |
|
716 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], |
|
717 SkColor src, int width, SkPMColor) { |
|
718 if (width <= 0) { |
|
719 return; |
|
720 } |
|
721 |
|
722 int srcA = SkColorGetA(src); |
|
723 int srcR = SkColorGetR(src); |
|
724 int srcG = SkColorGetG(src); |
|
725 int srcB = SkColorGetB(src); |
|
726 |
|
727 srcA = SkAlpha255To256(srcA); |
|
728 |
|
729 if (width >= 4) { |
|
730 SkASSERT(((size_t)dst & 0x03) == 0); |
|
731 while (((size_t)dst & 0x0F) != 0) { |
|
732 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
|
733 mask++; |
|
734 dst++; |
|
735 width--; |
|
736 } |
|
737 |
|
738 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
739 // Set alpha to 0xFF and replicate source four times in SSE register. |
|
740 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
|
741 // Interleave with zeros to get two sets of four 16-bit values. |
|
742 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
|
743 // Set srcA_sse to contain eight copies of srcA, padded with zero. |
|
744 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
|
745 __m128i srcA_sse = _mm_set1_epi16(srcA); |
|
746 while (width >= 4) { |
|
747 // Load four destination pixels into dst_sse. |
|
748 __m128i dst_sse = _mm_load_si128(d); |
|
749 // Load four 16-bit masks into lower half of mask_sse. |
|
750 __m128i mask_sse = _mm_loadl_epi64( |
|
751 reinterpret_cast<const __m128i*>(mask)); |
|
752 |
|
753 // Check whether masks are equal to 0 and get the highest bit |
|
754 // of each byte of result, if masks are all zero, we will get |
|
755 // pack_cmp to 0xFFFF |
|
756 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
|
757 _mm_setzero_si128())); |
|
758 |
|
759 // if mask pixels are not all zero, we will blend the dst pixels |
|
760 if (pack_cmp != 0xFFFF) { |
|
761 // Unpack 4 16bit mask pixels to |
|
762 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
|
763 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
|
764 mask_sse = _mm_unpacklo_epi16(mask_sse, |
|
765 _mm_setzero_si128()); |
|
766 |
|
767 // Process 4 32bit dst pixels |
|
768 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, |
|
769 mask_sse, srcA_sse); |
|
770 _mm_store_si128(d, result); |
|
771 } |
|
772 |
|
773 d++; |
|
774 mask += 4; |
|
775 width -= 4; |
|
776 } |
|
777 |
|
778 dst = reinterpret_cast<SkPMColor*>(d); |
|
779 } |
|
780 |
|
781 while (width > 0) { |
|
782 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
|
783 mask++; |
|
784 dst++; |
|
785 width--; |
|
786 } |
|
787 } |
|
788 |
|
789 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], |
|
790 SkColor src, int width, SkPMColor opaqueDst) { |
|
791 if (width <= 0) { |
|
792 return; |
|
793 } |
|
794 |
|
795 int srcR = SkColorGetR(src); |
|
796 int srcG = SkColorGetG(src); |
|
797 int srcB = SkColorGetB(src); |
|
798 |
|
799 if (width >= 4) { |
|
800 SkASSERT(((size_t)dst & 0x03) == 0); |
|
801 while (((size_t)dst & 0x0F) != 0) { |
|
802 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
|
803 mask++; |
|
804 dst++; |
|
805 width--; |
|
806 } |
|
807 |
|
808 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
809 // Set alpha to 0xFF and replicate source four times in SSE register. |
|
810 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
|
811 // Set srcA_sse to contain eight copies of srcA, padded with zero. |
|
812 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
|
813 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
|
814 while (width >= 4) { |
|
815 // Load four destination pixels into dst_sse. |
|
816 __m128i dst_sse = _mm_load_si128(d); |
|
817 // Load four 16-bit masks into lower half of mask_sse. |
|
818 __m128i mask_sse = _mm_loadl_epi64( |
|
819 reinterpret_cast<const __m128i*>(mask)); |
|
820 |
|
821 // Check whether masks are equal to 0 and get the highest bit |
|
822 // of each byte of result, if masks are all zero, we will get |
|
823 // pack_cmp to 0xFFFF |
|
824 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
|
825 _mm_setzero_si128())); |
|
826 |
|
827 // if mask pixels are not all zero, we will blend the dst pixels |
|
828 if (pack_cmp != 0xFFFF) { |
|
829 // Unpack 4 16bit mask pixels to |
|
830 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
|
831 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
|
832 mask_sse = _mm_unpacklo_epi16(mask_sse, |
|
833 _mm_setzero_si128()); |
|
834 |
|
835 // Process 4 32bit dst pixels |
|
836 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, |
|
837 mask_sse); |
|
838 _mm_store_si128(d, result); |
|
839 } |
|
840 |
|
841 d++; |
|
842 mask += 4; |
|
843 width -= 4; |
|
844 } |
|
845 |
|
846 dst = reinterpret_cast<SkPMColor*>(d); |
|
847 } |
|
848 |
|
849 while (width > 0) { |
|
850 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
|
851 mask++; |
|
852 dst++; |
|
853 width--; |
|
854 } |
|
855 } |
|
856 |
|
857 /* SSE2 version of S32_D565_Opaque() |
|
858 * portable version is in core/SkBlitRow_D16.cpp |
|
859 */ |
|
860 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, |
|
861 const SkPMColor* SK_RESTRICT src, int count, |
|
862 U8CPU alpha, int /*x*/, int /*y*/) { |
|
863 SkASSERT(255 == alpha); |
|
864 |
|
865 if (count <= 0) { |
|
866 return; |
|
867 } |
|
868 |
|
869 if (count >= 8) { |
|
870 while (((size_t)dst & 0x0F) != 0) { |
|
871 SkPMColor c = *src++; |
|
872 SkPMColorAssert(c); |
|
873 |
|
874 *dst++ = SkPixel32ToPixel16_ToU16(c); |
|
875 count--; |
|
876 } |
|
877 |
|
878 const __m128i* s = reinterpret_cast<const __m128i*>(src); |
|
879 __m128i* d = reinterpret_cast<__m128i*>(dst); |
|
880 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); |
|
881 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); |
|
882 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); |
|
883 |
|
884 while (count >= 8) { |
|
885 // Load 8 pixels of src. |
|
886 __m128i src_pixel1 = _mm_loadu_si128(s++); |
|
887 __m128i src_pixel2 = _mm_loadu_si128(s++); |
|
888 |
|
889 // Calculate result r. |
|
890 __m128i r1 = _mm_srli_epi32(src_pixel1, |
|
891 SK_R32_SHIFT + (8 - SK_R16_BITS)); |
|
892 r1 = _mm_and_si128(r1, r16_mask); |
|
893 __m128i r2 = _mm_srli_epi32(src_pixel2, |
|
894 SK_R32_SHIFT + (8 - SK_R16_BITS)); |
|
895 r2 = _mm_and_si128(r2, r16_mask); |
|
896 __m128i r = _mm_packs_epi32(r1, r2); |
|
897 |
|
898 // Calculate result g. |
|
899 __m128i g1 = _mm_srli_epi32(src_pixel1, |
|
900 SK_G32_SHIFT + (8 - SK_G16_BITS)); |
|
901 g1 = _mm_and_si128(g1, g16_mask); |
|
902 __m128i g2 = _mm_srli_epi32(src_pixel2, |
|
903 SK_G32_SHIFT + (8 - SK_G16_BITS)); |
|
904 g2 = _mm_and_si128(g2, g16_mask); |
|
905 __m128i g = _mm_packs_epi32(g1, g2); |
|
906 |
|
907 // Calculate result b. |
|
908 __m128i b1 = _mm_srli_epi32(src_pixel1, |
|
909 SK_B32_SHIFT + (8 - SK_B16_BITS)); |
|
910 b1 = _mm_and_si128(b1, b16_mask); |
|
911 __m128i b2 = _mm_srli_epi32(src_pixel2, |
|
912 SK_B32_SHIFT + (8 - SK_B16_BITS)); |
|
913 b2 = _mm_and_si128(b2, b16_mask); |
|
914 __m128i b = _mm_packs_epi32(b1, b2); |
|
915 |
|
916 // Store 8 16-bit colors in dst. |
|
917 __m128i d_pixel = SkPackRGB16_SSE(r, g, b); |
|
918 _mm_store_si128(d++, d_pixel); |
|
919 count -= 8; |
|
920 } |
|
921 src = reinterpret_cast<const SkPMColor*>(s); |
|
922 dst = reinterpret_cast<uint16_t*>(d); |
|
923 } |
|
924 |
|
925 if (count > 0) { |
|
926 do { |
|
927 SkPMColor c = *src++; |
|
928 SkPMColorAssert(c); |
|
929 *dst++ = SkPixel32ToPixel16_ToU16(c); |
|
930 } while (--count != 0); |
|
931 } |
|
932 } |
|
933 |
|
934 /* SSE2 version of S32A_D565_Opaque() |
|
935 * portable version is in core/SkBlitRow_D16.cpp |
|
936 */ |
|
937 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, |
|
938 const SkPMColor* SK_RESTRICT src, |
|
939 int count, U8CPU alpha, int /*x*/, int /*y*/) { |
|
940 SkASSERT(255 == alpha); |
|
941 |
|
942 if (count <= 0) { |
|
943 return; |
|
944 } |
|
945 |
|
946 if (count >= 8) { |
|
947 // Make dst 16 bytes alignment |
|
948 while (((size_t)dst & 0x0F) != 0) { |
|
949 SkPMColor c = *src++; |
|
950 if (c) { |
|
951 *dst = SkSrcOver32To16(c, *dst); |
|
952 } |
|
953 dst += 1; |
|
954 count--; |
|
955 } |
|
956 |
|
957 const __m128i* s = reinterpret_cast<const __m128i*>(src); |
|
958 __m128i* d = reinterpret_cast<__m128i*>(dst); |
|
959 __m128i var255 = _mm_set1_epi16(255); |
|
960 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); |
|
961 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); |
|
962 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); |
|
963 |
|
964 while (count >= 8) { |
|
965 // Load 8 pixels of src. |
|
966 __m128i src_pixel1 = _mm_loadu_si128(s++); |
|
967 __m128i src_pixel2 = _mm_loadu_si128(s++); |
|
968 |
|
969 // Check whether src pixels are equal to 0 and get the highest bit |
|
970 // of each byte of result, if src pixels are all zero, src_cmp1 and |
|
971 // src_cmp2 will be 0xFFFF. |
|
972 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, |
|
973 _mm_setzero_si128())); |
|
974 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, |
|
975 _mm_setzero_si128())); |
|
976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { |
|
977 d++; |
|
978 count -= 8; |
|
979 continue; |
|
980 } |
|
981 |
|
982 // Load 8 pixels of dst. |
|
983 __m128i dst_pixel = _mm_load_si128(d); |
|
984 |
|
985 // Extract A from src. |
|
986 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); |
|
987 sa1 = _mm_srli_epi32(sa1, 24); |
|
988 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); |
|
989 sa2 = _mm_srli_epi32(sa2, 24); |
|
990 __m128i sa = _mm_packs_epi32(sa1, sa2); |
|
991 |
|
992 // Extract R from src. |
|
993 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); |
|
994 sr1 = _mm_srli_epi32(sr1, 24); |
|
995 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); |
|
996 sr2 = _mm_srli_epi32(sr2, 24); |
|
997 __m128i sr = _mm_packs_epi32(sr1, sr2); |
|
998 |
|
999 // Extract G from src. |
|
1000 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); |
|
1001 sg1 = _mm_srli_epi32(sg1, 24); |
|
1002 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); |
|
1003 sg2 = _mm_srli_epi32(sg2, 24); |
|
1004 __m128i sg = _mm_packs_epi32(sg1, sg2); |
|
1005 |
|
1006 // Extract B from src. |
|
1007 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); |
|
1008 sb1 = _mm_srli_epi32(sb1, 24); |
|
1009 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); |
|
1010 sb2 = _mm_srli_epi32(sb2, 24); |
|
1011 __m128i sb = _mm_packs_epi32(sb1, sb2); |
|
1012 |
|
1013 // Extract R G B from dst. |
|
1014 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); |
|
1015 dr = _mm_and_si128(dr, r16_mask); |
|
1016 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); |
|
1017 dg = _mm_and_si128(dg, g16_mask); |
|
1018 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); |
|
1019 db = _mm_and_si128(db, b16_mask); |
|
1020 |
|
1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa |
|
1022 |
|
1023 // Calculate R G B of result. |
|
1024 // Original algorithm is in SkSrcOver32To16(). |
|
1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); |
|
1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); |
|
1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); |
|
1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); |
|
1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); |
|
1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); |
|
1031 |
|
1032 // Pack R G B into 16-bit color. |
|
1033 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); |
|
1034 |
|
1035 // Store 8 16-bit colors in dst. |
|
1036 _mm_store_si128(d++, d_pixel); |
|
1037 count -= 8; |
|
1038 } |
|
1039 |
|
1040 src = reinterpret_cast<const SkPMColor*>(s); |
|
1041 dst = reinterpret_cast<uint16_t*>(d); |
|
1042 } |
|
1043 |
|
1044 if (count > 0) { |
|
1045 do { |
|
1046 SkPMColor c = *src++; |
|
1047 SkPMColorAssert(c); |
|
1048 if (c) { |
|
1049 *dst = SkSrcOver32To16(c, *dst); |
|
1050 } |
|
1051 dst += 1; |
|
1052 } while (--count != 0); |
|
1053 } |
|
1054 } |
|
1055 |
|
1056 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, |
|
1057 const SkPMColor* SK_RESTRICT src, |
|
1058 int count, U8CPU alpha, int x, int y) { |
|
1059 SkASSERT(255 == alpha); |
|
1060 |
|
1061 if (count <= 0) { |
|
1062 return; |
|
1063 } |
|
1064 |
|
1065 if (count >= 8) { |
|
1066 while (((size_t)dst & 0x0F) != 0) { |
|
1067 DITHER_565_SCAN(y); |
|
1068 SkPMColor c = *src++; |
|
1069 SkPMColorAssert(c); |
|
1070 |
|
1071 unsigned dither = DITHER_VALUE(x); |
|
1072 *dst++ = SkDitherRGB32To565(c, dither); |
|
1073 DITHER_INC_X(x); |
|
1074 count--; |
|
1075 } |
|
1076 |
|
1077 unsigned short dither_value[8]; |
|
1078 __m128i dither; |
|
1079 #ifdef ENABLE_DITHER_MATRIX_4X4 |
|
1080 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; |
|
1081 dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; |
|
1082 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; |
|
1083 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; |
|
1084 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; |
|
1085 #else |
|
1086 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; |
|
1087 dither_value[0] = dither_value[4] = (dither_scan |
|
1088 >> (((x) & 3) << 2)) & 0xF; |
|
1089 dither_value[1] = dither_value[5] = (dither_scan |
|
1090 >> (((x + 1) & 3) << 2)) & 0xF; |
|
1091 dither_value[2] = dither_value[6] = (dither_scan |
|
1092 >> (((x + 2) & 3) << 2)) & 0xF; |
|
1093 dither_value[3] = dither_value[7] = (dither_scan |
|
1094 >> (((x + 3) & 3) << 2)) & 0xF; |
|
1095 #endif |
|
1096 dither = _mm_loadu_si128((__m128i*) dither_value); |
|
1097 |
|
1098 const __m128i* s = reinterpret_cast<const __m128i*>(src); |
|
1099 __m128i* d = reinterpret_cast<__m128i*>(dst); |
|
1100 |
|
1101 while (count >= 8) { |
|
1102 // Load 8 pixels of src. |
|
1103 __m128i src_pixel1 = _mm_loadu_si128(s++); |
|
1104 __m128i src_pixel2 = _mm_loadu_si128(s++); |
|
1105 |
|
1106 // Extract R from src. |
|
1107 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); |
|
1108 sr1 = _mm_srli_epi32(sr1, 24); |
|
1109 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); |
|
1110 sr2 = _mm_srli_epi32(sr2, 24); |
|
1111 __m128i sr = _mm_packs_epi32(sr1, sr2); |
|
1112 |
|
1113 // SkDITHER_R32To565(sr, dither) |
|
1114 __m128i sr_offset = _mm_srli_epi16(sr, 5); |
|
1115 sr = _mm_add_epi16(sr, dither); |
|
1116 sr = _mm_sub_epi16(sr, sr_offset); |
|
1117 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); |
|
1118 |
|
1119 // Extract G from src. |
|
1120 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); |
|
1121 sg1 = _mm_srli_epi32(sg1, 24); |
|
1122 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); |
|
1123 sg2 = _mm_srli_epi32(sg2, 24); |
|
1124 __m128i sg = _mm_packs_epi32(sg1, sg2); |
|
1125 |
|
1126 // SkDITHER_R32To565(sg, dither) |
|
1127 __m128i sg_offset = _mm_srli_epi16(sg, 6); |
|
1128 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); |
|
1129 sg = _mm_sub_epi16(sg, sg_offset); |
|
1130 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); |
|
1131 |
|
1132 // Extract B from src. |
|
1133 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); |
|
1134 sb1 = _mm_srli_epi32(sb1, 24); |
|
1135 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); |
|
1136 sb2 = _mm_srli_epi32(sb2, 24); |
|
1137 __m128i sb = _mm_packs_epi32(sb1, sb2); |
|
1138 |
|
1139 // SkDITHER_R32To565(sb, dither) |
|
1140 __m128i sb_offset = _mm_srli_epi16(sb, 5); |
|
1141 sb = _mm_add_epi16(sb, dither); |
|
1142 sb = _mm_sub_epi16(sb, sb_offset); |
|
1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); |
|
1144 |
|
1145 // Pack and store 16-bit dst pixel. |
|
1146 __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb); |
|
1147 _mm_store_si128(d++, d_pixel); |
|
1148 |
|
1149 count -= 8; |
|
1150 x += 8; |
|
1151 } |
|
1152 |
|
1153 src = reinterpret_cast<const SkPMColor*>(s); |
|
1154 dst = reinterpret_cast<uint16_t*>(d); |
|
1155 } |
|
1156 |
|
1157 if (count > 0) { |
|
1158 DITHER_565_SCAN(y); |
|
1159 do { |
|
1160 SkPMColor c = *src++; |
|
1161 SkPMColorAssert(c); |
|
1162 |
|
1163 unsigned dither = DITHER_VALUE(x); |
|
1164 *dst++ = SkDitherRGB32To565(c, dither); |
|
1165 DITHER_INC_X(x); |
|
1166 } while (--count != 0); |
|
1167 } |
|
1168 } |
|
1169 |
|
1170 /* SSE2 version of S32A_D565_Opaque_Dither() |
|
1171 * portable version is in core/SkBlitRow_D16.cpp |
|
1172 */ |
|
1173 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, |
|
1174 const SkPMColor* SK_RESTRICT src, |
|
1175 int count, U8CPU alpha, int x, int y) { |
|
1176 SkASSERT(255 == alpha); |
|
1177 |
|
1178 if (count <= 0) { |
|
1179 return; |
|
1180 } |
|
1181 |
|
1182 if (count >= 8) { |
|
1183 while (((size_t)dst & 0x0F) != 0) { |
|
1184 DITHER_565_SCAN(y); |
|
1185 SkPMColor c = *src++; |
|
1186 SkPMColorAssert(c); |
|
1187 if (c) { |
|
1188 unsigned a = SkGetPackedA32(c); |
|
1189 |
|
1190 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); |
|
1191 |
|
1192 unsigned sr = SkGetPackedR32(c); |
|
1193 unsigned sg = SkGetPackedG32(c); |
|
1194 unsigned sb = SkGetPackedB32(c); |
|
1195 sr = SkDITHER_R32_FOR_565(sr, d); |
|
1196 sg = SkDITHER_G32_FOR_565(sg, d); |
|
1197 sb = SkDITHER_B32_FOR_565(sb, d); |
|
1198 |
|
1199 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); |
|
1200 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
|
1201 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
|
1202 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
|
1203 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
|
1204 } |
|
1205 dst += 1; |
|
1206 DITHER_INC_X(x); |
|
1207 count--; |
|
1208 } |
|
1209 |
|
1210 unsigned short dither_value[8]; |
|
1211 __m128i dither, dither_cur; |
|
1212 #ifdef ENABLE_DITHER_MATRIX_4X4 |
|
1213 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; |
|
1214 dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; |
|
1215 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; |
|
1216 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; |
|
1217 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; |
|
1218 #else |
|
1219 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; |
|
1220 dither_value[0] = dither_value[4] = (dither_scan |
|
1221 >> (((x) & 3) << 2)) & 0xF; |
|
1222 dither_value[1] = dither_value[5] = (dither_scan |
|
1223 >> (((x + 1) & 3) << 2)) & 0xF; |
|
1224 dither_value[2] = dither_value[6] = (dither_scan |
|
1225 >> (((x + 2) & 3) << 2)) & 0xF; |
|
1226 dither_value[3] = dither_value[7] = (dither_scan |
|
1227 >> (((x + 3) & 3) << 2)) & 0xF; |
|
1228 #endif |
|
1229 dither = _mm_loadu_si128((__m128i*) dither_value); |
|
1230 |
|
1231 const __m128i* s = reinterpret_cast<const __m128i*>(src); |
|
1232 __m128i* d = reinterpret_cast<__m128i*>(dst); |
|
1233 __m128i var256 = _mm_set1_epi16(256); |
|
1234 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); |
|
1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); |
|
1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); |
|
1237 |
|
1238 while (count >= 8) { |
|
1239 // Load 8 pixels of src and dst. |
|
1240 __m128i src_pixel1 = _mm_loadu_si128(s++); |
|
1241 __m128i src_pixel2 = _mm_loadu_si128(s++); |
|
1242 __m128i dst_pixel = _mm_load_si128(d); |
|
1243 |
|
1244 // Extract A from src. |
|
1245 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); |
|
1246 sa1 = _mm_srli_epi32(sa1, 24); |
|
1247 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); |
|
1248 sa2 = _mm_srli_epi32(sa2, 24); |
|
1249 __m128i sa = _mm_packs_epi32(sa1, sa2); |
|
1250 |
|
1251 // Calculate current dither value. |
|
1252 dither_cur = _mm_mullo_epi16(dither, |
|
1253 _mm_add_epi16(sa, _mm_set1_epi16(1))); |
|
1254 dither_cur = _mm_srli_epi16(dither_cur, 8); |
|
1255 |
|
1256 // Extract R from src. |
|
1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); |
|
1258 sr1 = _mm_srli_epi32(sr1, 24); |
|
1259 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); |
|
1260 sr2 = _mm_srli_epi32(sr2, 24); |
|
1261 __m128i sr = _mm_packs_epi32(sr1, sr2); |
|
1262 |
|
1263 // SkDITHER_R32_FOR_565(sr, d) |
|
1264 __m128i sr_offset = _mm_srli_epi16(sr, 5); |
|
1265 sr = _mm_add_epi16(sr, dither_cur); |
|
1266 sr = _mm_sub_epi16(sr, sr_offset); |
|
1267 |
|
1268 // Expand sr. |
|
1269 sr = _mm_slli_epi16(sr, 2); |
|
1270 |
|
1271 // Extract G from src. |
|
1272 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); |
|
1273 sg1 = _mm_srli_epi32(sg1, 24); |
|
1274 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); |
|
1275 sg2 = _mm_srli_epi32(sg2, 24); |
|
1276 __m128i sg = _mm_packs_epi32(sg1, sg2); |
|
1277 |
|
1278 // sg = SkDITHER_G32_FOR_565(sg, d). |
|
1279 __m128i sg_offset = _mm_srli_epi16(sg, 6); |
|
1280 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); |
|
1281 sg = _mm_sub_epi16(sg, sg_offset); |
|
1282 |
|
1283 // Expand sg. |
|
1284 sg = _mm_slli_epi16(sg, 3); |
|
1285 |
|
1286 // Extract B from src. |
|
1287 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); |
|
1288 sb1 = _mm_srli_epi32(sb1, 24); |
|
1289 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); |
|
1290 sb2 = _mm_srli_epi32(sb2, 24); |
|
1291 __m128i sb = _mm_packs_epi32(sb1, sb2); |
|
1292 |
|
1293 // sb = SkDITHER_B32_FOR_565(sb, d). |
|
1294 __m128i sb_offset = _mm_srli_epi16(sb, 5); |
|
1295 sb = _mm_add_epi16(sb, dither_cur); |
|
1296 sb = _mm_sub_epi16(sb, sb_offset); |
|
1297 |
|
1298 // Expand sb. |
|
1299 sb = _mm_slli_epi16(sb, 2); |
|
1300 |
|
1301 // Extract R G B from dst. |
|
1302 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); |
|
1303 dr = _mm_and_si128(dr, r16_mask); |
|
1304 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); |
|
1305 dg = _mm_and_si128(dg, g16_mask); |
|
1306 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); |
|
1307 db = _mm_and_si128(db, b16_mask); |
|
1308 |
|
1309 // SkAlpha255To256(255 - a) >> 3 |
|
1310 __m128i isa = _mm_sub_epi16(var256, sa); |
|
1311 isa = _mm_srli_epi16(isa, 3); |
|
1312 |
|
1313 dr = _mm_mullo_epi16(dr, isa); |
|
1314 dr = _mm_add_epi16(dr, sr); |
|
1315 dr = _mm_srli_epi16(dr, 5); |
|
1316 |
|
1317 dg = _mm_mullo_epi16(dg, isa); |
|
1318 dg = _mm_add_epi16(dg, sg); |
|
1319 dg = _mm_srli_epi16(dg, 5); |
|
1320 |
|
1321 db = _mm_mullo_epi16(db, isa); |
|
1322 db = _mm_add_epi16(db, sb); |
|
1323 db = _mm_srli_epi16(db, 5); |
|
1324 |
|
1325 // Package and store dst pixel. |
|
1326 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); |
|
1327 _mm_store_si128(d++, d_pixel); |
|
1328 |
|
1329 count -= 8; |
|
1330 x += 8; |
|
1331 } |
|
1332 |
|
1333 src = reinterpret_cast<const SkPMColor*>(s); |
|
1334 dst = reinterpret_cast<uint16_t*>(d); |
|
1335 } |
|
1336 |
|
1337 if (count > 0) { |
|
1338 DITHER_565_SCAN(y); |
|
1339 do { |
|
1340 SkPMColor c = *src++; |
|
1341 SkPMColorAssert(c); |
|
1342 if (c) { |
|
1343 unsigned a = SkGetPackedA32(c); |
|
1344 |
|
1345 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); |
|
1346 |
|
1347 unsigned sr = SkGetPackedR32(c); |
|
1348 unsigned sg = SkGetPackedG32(c); |
|
1349 unsigned sb = SkGetPackedB32(c); |
|
1350 sr = SkDITHER_R32_FOR_565(sr, d); |
|
1351 sg = SkDITHER_G32_FOR_565(sg, d); |
|
1352 sb = SkDITHER_B32_FOR_565(sb, d); |
|
1353 |
|
1354 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); |
|
1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
|
1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
|
1357 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
|
1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
|
1359 } |
|
1360 dst += 1; |
|
1361 DITHER_INC_X(x); |
|
1362 } while (--count != 0); |
|
1363 } |
|
1364 } |