|
1 /* |
|
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
|
3 * |
|
4 * Use of this source code is governed by a BSD-style license |
|
5 * that can be found in the LICENSE file in the root of the source |
|
6 * tree. An additional intellectual property rights grant can be found |
|
7 * in the file PATENTS. All contributing project authors may |
|
8 * be found in the AUTHORS file in the root of the source tree. |
|
9 */ |
|
10 |
|
11 #include "libyuv/row.h" |
|
12 |
|
13 #ifdef __cplusplus |
|
14 namespace libyuv { |
|
15 extern "C" { |
|
16 #endif |
|
17 |
|
18 // This module is for Visual C x86. |
|
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
|
20 |
|
21 #ifdef HAS_ARGBTOYROW_SSSE3 |
|
22 |
|
23 // Constants for ARGB. |
|
24 static const vec8 kARGBToY = { |
|
25 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
|
26 }; |
|
27 |
|
28 // JPeg full range. |
|
29 static const vec8 kARGBToYJ = { |
|
30 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
|
31 }; |
|
32 |
|
33 static const vec8 kARGBToU = { |
|
34 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 |
|
35 }; |
|
36 |
|
37 static const vec8 kARGBToUJ = { |
|
38 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 |
|
39 }; |
|
40 |
|
41 static const vec8 kARGBToV = { |
|
42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
|
43 }; |
|
44 |
|
45 static const vec8 kARGBToVJ = { |
|
46 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 |
|
47 }; |
|
48 |
|
49 // vpermd for vphaddw + vpackuswb vpermd. |
|
50 static const lvec32 kPermdARGBToY_AVX = { |
|
51 0, 4, 1, 5, 2, 6, 3, 7 |
|
52 }; |
|
53 |
|
54 // vpshufb for vphaddw + vpackuswb packed to shorts. |
|
55 static const lvec8 kShufARGBToUV_AVX = { |
|
56 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
|
57 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
|
58 }; |
|
59 |
|
60 // Constants for BGRA. |
|
61 static const vec8 kBGRAToY = { |
|
62 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 |
|
63 }; |
|
64 |
|
65 static const vec8 kBGRAToU = { |
|
66 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 |
|
67 }; |
|
68 |
|
69 static const vec8 kBGRAToV = { |
|
70 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 |
|
71 }; |
|
72 |
|
73 // Constants for ABGR. |
|
74 static const vec8 kABGRToY = { |
|
75 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 |
|
76 }; |
|
77 |
|
78 static const vec8 kABGRToU = { |
|
79 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 |
|
80 }; |
|
81 |
|
82 static const vec8 kABGRToV = { |
|
83 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 |
|
84 }; |
|
85 |
|
86 // Constants for RGBA. |
|
87 static const vec8 kRGBAToY = { |
|
88 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 |
|
89 }; |
|
90 |
|
91 static const vec8 kRGBAToU = { |
|
92 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 |
|
93 }; |
|
94 |
|
95 static const vec8 kRGBAToV = { |
|
96 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 |
|
97 }; |
|
98 |
|
99 static const uvec8 kAddY16 = { |
|
100 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u |
|
101 }; |
|
102 |
|
103 static const vec16 kAddYJ64 = { |
|
104 64, 64, 64, 64, 64, 64, 64, 64 |
|
105 }; |
|
106 |
|
107 static const uvec8 kAddUV128 = { |
|
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
|
109 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u |
|
110 }; |
|
111 |
|
112 static const uvec16 kAddUVJ128 = { |
|
113 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u |
|
114 }; |
|
115 |
|
116 // Shuffle table for converting RGB24 to ARGB. |
|
117 static const uvec8 kShuffleMaskRGB24ToARGB = { |
|
118 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u |
|
119 }; |
|
120 |
|
121 // Shuffle table for converting RAW to ARGB. |
|
122 static const uvec8 kShuffleMaskRAWToARGB = { |
|
123 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u |
|
124 }; |
|
125 |
|
126 // Shuffle table for converting ARGB to RGB24. |
|
127 static const uvec8 kShuffleMaskARGBToRGB24 = { |
|
128 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u |
|
129 }; |
|
130 |
|
131 // Shuffle table for converting ARGB to RAW. |
|
132 static const uvec8 kShuffleMaskARGBToRAW = { |
|
133 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u |
|
134 }; |
|
135 |
|
136 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
|
137 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
|
138 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
|
139 }; |
|
140 |
|
141 // Shuffle table for converting ARGB to RAW. |
|
142 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
|
143 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
|
144 }; |
|
145 |
|
146 // Duplicates gray value 3 times and fills in alpha opaque. |
|
147 __declspec(naked) __declspec(align(16)) |
|
148 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
|
149 __asm { |
|
150 mov eax, [esp + 4] // src_y |
|
151 mov edx, [esp + 8] // dst_argb |
|
152 mov ecx, [esp + 12] // pix |
|
153 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
|
154 pslld xmm5, 24 |
|
155 |
|
156 align 4 |
|
157 convertloop: |
|
158 movq xmm0, qword ptr [eax] |
|
159 lea eax, [eax + 8] |
|
160 punpcklbw xmm0, xmm0 |
|
161 movdqa xmm1, xmm0 |
|
162 punpcklwd xmm0, xmm0 |
|
163 punpckhwd xmm1, xmm1 |
|
164 por xmm0, xmm5 |
|
165 por xmm1, xmm5 |
|
166 movdqa [edx], xmm0 |
|
167 movdqa [edx + 16], xmm1 |
|
168 lea edx, [edx + 32] |
|
169 sub ecx, 8 |
|
170 jg convertloop |
|
171 ret |
|
172 } |
|
173 } |
|
174 |
|
175 __declspec(naked) __declspec(align(16)) |
|
176 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, |
|
177 int pix) { |
|
178 __asm { |
|
179 mov eax, [esp + 4] // src_y |
|
180 mov edx, [esp + 8] // dst_argb |
|
181 mov ecx, [esp + 12] // pix |
|
182 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
|
183 pslld xmm5, 24 |
|
184 |
|
185 align 4 |
|
186 convertloop: |
|
187 movq xmm0, qword ptr [eax] |
|
188 lea eax, [eax + 8] |
|
189 punpcklbw xmm0, xmm0 |
|
190 movdqa xmm1, xmm0 |
|
191 punpcklwd xmm0, xmm0 |
|
192 punpckhwd xmm1, xmm1 |
|
193 por xmm0, xmm5 |
|
194 por xmm1, xmm5 |
|
195 movdqu [edx], xmm0 |
|
196 movdqu [edx + 16], xmm1 |
|
197 lea edx, [edx + 32] |
|
198 sub ecx, 8 |
|
199 jg convertloop |
|
200 ret |
|
201 } |
|
202 } |
|
203 |
|
204 __declspec(naked) __declspec(align(16)) |
|
205 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
|
206 __asm { |
|
207 mov eax, [esp + 4] // src_rgb24 |
|
208 mov edx, [esp + 8] // dst_argb |
|
209 mov ecx, [esp + 12] // pix |
|
210 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
|
211 pslld xmm5, 24 |
|
212 movdqa xmm4, kShuffleMaskRGB24ToARGB |
|
213 |
|
214 align 4 |
|
215 convertloop: |
|
216 movdqu xmm0, [eax] |
|
217 movdqu xmm1, [eax + 16] |
|
218 movdqu xmm3, [eax + 32] |
|
219 lea eax, [eax + 48] |
|
220 movdqa xmm2, xmm3 |
|
221 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} |
|
222 pshufb xmm2, xmm4 |
|
223 por xmm2, xmm5 |
|
224 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} |
|
225 pshufb xmm0, xmm4 |
|
226 movdqa [edx + 32], xmm2 |
|
227 por xmm0, xmm5 |
|
228 pshufb xmm1, xmm4 |
|
229 movdqa [edx], xmm0 |
|
230 por xmm1, xmm5 |
|
231 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
|
232 pshufb xmm3, xmm4 |
|
233 movdqa [edx + 16], xmm1 |
|
234 por xmm3, xmm5 |
|
235 sub ecx, 16 |
|
236 movdqa [edx + 48], xmm3 |
|
237 lea edx, [edx + 64] |
|
238 jg convertloop |
|
239 ret |
|
240 } |
|
241 } |
|
242 |
|
243 __declspec(naked) __declspec(align(16)) |
|
244 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
|
245 int pix) { |
|
246 __asm { |
|
247 mov eax, [esp + 4] // src_raw |
|
248 mov edx, [esp + 8] // dst_argb |
|
249 mov ecx, [esp + 12] // pix |
|
250 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
|
251 pslld xmm5, 24 |
|
252 movdqa xmm4, kShuffleMaskRAWToARGB |
|
253 |
|
254 align 4 |
|
255 convertloop: |
|
256 movdqu xmm0, [eax] |
|
257 movdqu xmm1, [eax + 16] |
|
258 movdqu xmm3, [eax + 32] |
|
259 lea eax, [eax + 48] |
|
260 movdqa xmm2, xmm3 |
|
261 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} |
|
262 pshufb xmm2, xmm4 |
|
263 por xmm2, xmm5 |
|
264 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} |
|
265 pshufb xmm0, xmm4 |
|
266 movdqa [edx + 32], xmm2 |
|
267 por xmm0, xmm5 |
|
268 pshufb xmm1, xmm4 |
|
269 movdqa [edx], xmm0 |
|
270 por xmm1, xmm5 |
|
271 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} |
|
272 pshufb xmm3, xmm4 |
|
273 movdqa [edx + 16], xmm1 |
|
274 por xmm3, xmm5 |
|
275 sub ecx, 16 |
|
276 movdqa [edx + 48], xmm3 |
|
277 lea edx, [edx + 64] |
|
278 jg convertloop |
|
279 ret |
|
280 } |
|
281 } |
|
282 |
|
283 // pmul method to replicate bits. |
|
284 // Math to replicate bits: |
|
285 // (v << 8) | (v << 3) |
|
286 // v * 256 + v * 8 |
|
287 // v * (256 + 8) |
|
288 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
|
289 // 20 instructions. |
|
290 __declspec(naked) __declspec(align(16)) |
|
291 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
|
292 int pix) { |
|
293 __asm { |
|
294 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
|
295 movd xmm5, eax |
|
296 pshufd xmm5, xmm5, 0 |
|
297 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
|
298 movd xmm6, eax |
|
299 pshufd xmm6, xmm6, 0 |
|
300 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
|
301 psllw xmm3, 11 |
|
302 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green |
|
303 psllw xmm4, 10 |
|
304 psrlw xmm4, 5 |
|
305 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
|
306 psllw xmm7, 8 |
|
307 |
|
308 mov eax, [esp + 4] // src_rgb565 |
|
309 mov edx, [esp + 8] // dst_argb |
|
310 mov ecx, [esp + 12] // pix |
|
311 sub edx, eax |
|
312 sub edx, eax |
|
313 |
|
314 align 4 |
|
315 convertloop: |
|
316 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 |
|
317 movdqa xmm1, xmm0 |
|
318 movdqa xmm2, xmm0 |
|
319 pand xmm1, xmm3 // R in upper 5 bits |
|
320 psllw xmm2, 11 // B in upper 5 bits |
|
321 pmulhuw xmm1, xmm5 // * (256 + 8) |
|
322 pmulhuw xmm2, xmm5 // * (256 + 8) |
|
323 psllw xmm1, 8 |
|
324 por xmm1, xmm2 // RB |
|
325 pand xmm0, xmm4 // G in middle 6 bits |
|
326 pmulhuw xmm0, xmm6 // << 5 * (256 + 4) |
|
327 por xmm0, xmm7 // AG |
|
328 movdqa xmm2, xmm1 |
|
329 punpcklbw xmm1, xmm0 |
|
330 punpckhbw xmm2, xmm0 |
|
331 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
|
332 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
|
333 lea eax, [eax + 16] |
|
334 sub ecx, 8 |
|
335 jg convertloop |
|
336 ret |
|
337 } |
|
338 } |
|
339 |
|
340 // 24 instructions |
|
341 __declspec(naked) __declspec(align(16)) |
|
342 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
|
343 int pix) { |
|
344 __asm { |
|
345 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
|
346 movd xmm5, eax |
|
347 pshufd xmm5, xmm5, 0 |
|
348 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
|
349 movd xmm6, eax |
|
350 pshufd xmm6, xmm6, 0 |
|
351 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
|
352 psllw xmm3, 11 |
|
353 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green |
|
354 psrlw xmm4, 6 |
|
355 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
|
356 psllw xmm7, 8 |
|
357 |
|
358 mov eax, [esp + 4] // src_argb1555 |
|
359 mov edx, [esp + 8] // dst_argb |
|
360 mov ecx, [esp + 12] // pix |
|
361 sub edx, eax |
|
362 sub edx, eax |
|
363 |
|
364 align 4 |
|
365 convertloop: |
|
366 movdqu xmm0, [eax] // fetch 8 pixels of 1555 |
|
367 movdqa xmm1, xmm0 |
|
368 movdqa xmm2, xmm0 |
|
369 psllw xmm1, 1 // R in upper 5 bits |
|
370 psllw xmm2, 11 // B in upper 5 bits |
|
371 pand xmm1, xmm3 |
|
372 pmulhuw xmm2, xmm5 // * (256 + 8) |
|
373 pmulhuw xmm1, xmm5 // * (256 + 8) |
|
374 psllw xmm1, 8 |
|
375 por xmm1, xmm2 // RB |
|
376 movdqa xmm2, xmm0 |
|
377 pand xmm0, xmm4 // G in middle 5 bits |
|
378 psraw xmm2, 8 // A |
|
379 pmulhuw xmm0, xmm6 // << 6 * (256 + 8) |
|
380 pand xmm2, xmm7 |
|
381 por xmm0, xmm2 // AG |
|
382 movdqa xmm2, xmm1 |
|
383 punpcklbw xmm1, xmm0 |
|
384 punpckhbw xmm2, xmm0 |
|
385 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB |
|
386 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB |
|
387 lea eax, [eax + 16] |
|
388 sub ecx, 8 |
|
389 jg convertloop |
|
390 ret |
|
391 } |
|
392 } |
|
393 |
|
394 // 18 instructions. |
|
395 __declspec(naked) __declspec(align(16)) |
|
396 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
|
397 int pix) { |
|
398 __asm { |
|
399 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
|
400 movd xmm4, eax |
|
401 pshufd xmm4, xmm4, 0 |
|
402 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
|
403 pslld xmm5, 4 |
|
404 mov eax, [esp + 4] // src_argb4444 |
|
405 mov edx, [esp + 8] // dst_argb |
|
406 mov ecx, [esp + 12] // pix |
|
407 sub edx, eax |
|
408 sub edx, eax |
|
409 |
|
410 align 4 |
|
411 convertloop: |
|
412 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 |
|
413 movdqa xmm2, xmm0 |
|
414 pand xmm0, xmm4 // mask low nibbles |
|
415 pand xmm2, xmm5 // mask high nibbles |
|
416 movdqa xmm1, xmm0 |
|
417 movdqa xmm3, xmm2 |
|
418 psllw xmm1, 4 |
|
419 psrlw xmm3, 4 |
|
420 por xmm0, xmm1 |
|
421 por xmm2, xmm3 |
|
422 movdqa xmm1, xmm0 |
|
423 punpcklbw xmm0, xmm2 |
|
424 punpckhbw xmm1, xmm2 |
|
425 movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
|
426 movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
|
427 lea eax, [eax + 16] |
|
428 sub ecx, 8 |
|
429 jg convertloop |
|
430 ret |
|
431 } |
|
432 } |
|
433 |
|
434 __declspec(naked) __declspec(align(16)) |
|
435 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
|
436 __asm { |
|
437 mov eax, [esp + 4] // src_argb |
|
438 mov edx, [esp + 8] // dst_rgb |
|
439 mov ecx, [esp + 12] // pix |
|
440 movdqa xmm6, kShuffleMaskARGBToRGB24 |
|
441 |
|
442 align 4 |
|
443 convertloop: |
|
444 movdqu xmm0, [eax] // fetch 16 pixels of argb |
|
445 movdqu xmm1, [eax + 16] |
|
446 movdqu xmm2, [eax + 32] |
|
447 movdqu xmm3, [eax + 48] |
|
448 lea eax, [eax + 64] |
|
449 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
|
450 pshufb xmm1, xmm6 |
|
451 pshufb xmm2, xmm6 |
|
452 pshufb xmm3, xmm6 |
|
453 movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
|
454 psrldq xmm1, 4 // 8 bytes from 1 |
|
455 pslldq xmm4, 12 // 4 bytes from 1 for 0 |
|
456 movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
|
457 por xmm0, xmm4 // 4 bytes from 1 for 0 |
|
458 pslldq xmm5, 8 // 8 bytes from 2 for 1 |
|
459 movdqu [edx], xmm0 // store 0 |
|
460 por xmm1, xmm5 // 8 bytes from 2 for 1 |
|
461 psrldq xmm2, 8 // 4 bytes from 2 |
|
462 pslldq xmm3, 4 // 12 bytes from 3 for 2 |
|
463 por xmm2, xmm3 // 12 bytes from 3 for 2 |
|
464 movdqu [edx + 16], xmm1 // store 1 |
|
465 movdqu [edx + 32], xmm2 // store 2 |
|
466 lea edx, [edx + 48] |
|
467 sub ecx, 16 |
|
468 jg convertloop |
|
469 ret |
|
470 } |
|
471 } |
|
472 |
|
473 __declspec(naked) __declspec(align(16)) |
|
474 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { |
|
475 __asm { |
|
476 mov eax, [esp + 4] // src_argb |
|
477 mov edx, [esp + 8] // dst_rgb |
|
478 mov ecx, [esp + 12] // pix |
|
479 movdqa xmm6, kShuffleMaskARGBToRAW |
|
480 |
|
481 align 4 |
|
482 convertloop: |
|
483 movdqu xmm0, [eax] // fetch 16 pixels of argb |
|
484 movdqu xmm1, [eax + 16] |
|
485 movdqu xmm2, [eax + 32] |
|
486 movdqu xmm3, [eax + 48] |
|
487 lea eax, [eax + 64] |
|
488 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
|
489 pshufb xmm1, xmm6 |
|
490 pshufb xmm2, xmm6 |
|
491 pshufb xmm3, xmm6 |
|
492 movdqa xmm4, xmm1 // 4 bytes from 1 for 0 |
|
493 psrldq xmm1, 4 // 8 bytes from 1 |
|
494 pslldq xmm4, 12 // 4 bytes from 1 for 0 |
|
495 movdqa xmm5, xmm2 // 8 bytes from 2 for 1 |
|
496 por xmm0, xmm4 // 4 bytes from 1 for 0 |
|
497 pslldq xmm5, 8 // 8 bytes from 2 for 1 |
|
498 movdqu [edx], xmm0 // store 0 |
|
499 por xmm1, xmm5 // 8 bytes from 2 for 1 |
|
500 psrldq xmm2, 8 // 4 bytes from 2 |
|
501 pslldq xmm3, 4 // 12 bytes from 3 for 2 |
|
502 por xmm2, xmm3 // 12 bytes from 3 for 2 |
|
503 movdqu [edx + 16], xmm1 // store 1 |
|
504 movdqu [edx + 32], xmm2 // store 2 |
|
505 lea edx, [edx + 48] |
|
506 sub ecx, 16 |
|
507 jg convertloop |
|
508 ret |
|
509 } |
|
510 } |
|
511 |
|
512 __declspec(naked) __declspec(align(16)) |
|
513 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
|
514 __asm { |
|
515 mov eax, [esp + 4] // src_argb |
|
516 mov edx, [esp + 8] // dst_rgb |
|
517 mov ecx, [esp + 12] // pix |
|
518 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
|
519 psrld xmm3, 27 |
|
520 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
|
521 psrld xmm4, 26 |
|
522 pslld xmm4, 5 |
|
523 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
|
524 pslld xmm5, 11 |
|
525 |
|
526 align 4 |
|
527 convertloop: |
|
528 movdqa xmm0, [eax] // fetch 4 pixels of argb |
|
529 movdqa xmm1, xmm0 // B |
|
530 movdqa xmm2, xmm0 // G |
|
531 pslld xmm0, 8 // R |
|
532 psrld xmm1, 3 // B |
|
533 psrld xmm2, 5 // G |
|
534 psrad xmm0, 16 // R |
|
535 pand xmm1, xmm3 // B |
|
536 pand xmm2, xmm4 // G |
|
537 pand xmm0, xmm5 // R |
|
538 por xmm1, xmm2 // BG |
|
539 por xmm0, xmm1 // BGR |
|
540 packssdw xmm0, xmm0 |
|
541 lea eax, [eax + 16] |
|
542 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
|
543 lea edx, [edx + 8] |
|
544 sub ecx, 4 |
|
545 jg convertloop |
|
546 ret |
|
547 } |
|
548 } |
|
549 |
|
550 // TODO(fbarchard): Improve sign extension/packing. |
|
551 __declspec(naked) __declspec(align(16)) |
|
552 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
|
553 __asm { |
|
554 mov eax, [esp + 4] // src_argb |
|
555 mov edx, [esp + 8] // dst_rgb |
|
556 mov ecx, [esp + 12] // pix |
|
557 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
|
558 psrld xmm4, 27 |
|
559 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
|
560 pslld xmm5, 5 |
|
561 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
|
562 pslld xmm6, 10 |
|
563 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 |
|
564 pslld xmm7, 15 |
|
565 |
|
566 align 4 |
|
567 convertloop: |
|
568 movdqa xmm0, [eax] // fetch 4 pixels of argb |
|
569 movdqa xmm1, xmm0 // B |
|
570 movdqa xmm2, xmm0 // G |
|
571 movdqa xmm3, xmm0 // R |
|
572 psrad xmm0, 16 // A |
|
573 psrld xmm1, 3 // B |
|
574 psrld xmm2, 6 // G |
|
575 psrld xmm3, 9 // R |
|
576 pand xmm0, xmm7 // A |
|
577 pand xmm1, xmm4 // B |
|
578 pand xmm2, xmm5 // G |
|
579 pand xmm3, xmm6 // R |
|
580 por xmm0, xmm1 // BA |
|
581 por xmm2, xmm3 // GR |
|
582 por xmm0, xmm2 // BGRA |
|
583 packssdw xmm0, xmm0 |
|
584 lea eax, [eax + 16] |
|
585 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
|
586 lea edx, [edx + 8] |
|
587 sub ecx, 4 |
|
588 jg convertloop |
|
589 ret |
|
590 } |
|
591 } |
|
592 |
|
593 __declspec(naked) __declspec(align(16)) |
|
594 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
|
595 __asm { |
|
596 mov eax, [esp + 4] // src_argb |
|
597 mov edx, [esp + 8] // dst_rgb |
|
598 mov ecx, [esp + 12] // pix |
|
599 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
|
600 psllw xmm4, 12 |
|
601 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
|
602 psrlw xmm3, 8 |
|
603 |
|
604 align 4 |
|
605 convertloop: |
|
606 movdqa xmm0, [eax] // fetch 4 pixels of argb |
|
607 movdqa xmm1, xmm0 |
|
608 pand xmm0, xmm3 // low nibble |
|
609 pand xmm1, xmm4 // high nibble |
|
610 psrl xmm0, 4 |
|
611 psrl xmm1, 8 |
|
612 por xmm0, xmm1 |
|
613 packuswb xmm0, xmm0 |
|
614 lea eax, [eax + 16] |
|
615 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
|
616 lea edx, [edx + 8] |
|
617 sub ecx, 4 |
|
618 jg convertloop |
|
619 ret |
|
620 } |
|
621 } |
|
622 |
|
623 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
|
624 __declspec(naked) __declspec(align(16)) |
|
625 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
626 __asm { |
|
627 mov eax, [esp + 4] /* src_argb */ |
|
628 mov edx, [esp + 8] /* dst_y */ |
|
629 mov ecx, [esp + 12] /* pix */ |
|
630 movdqa xmm5, kAddY16 |
|
631 movdqa xmm4, kARGBToY |
|
632 |
|
633 align 4 |
|
634 convertloop: |
|
635 movdqa xmm0, [eax] |
|
636 movdqa xmm1, [eax + 16] |
|
637 movdqa xmm2, [eax + 32] |
|
638 movdqa xmm3, [eax + 48] |
|
639 pmaddubsw xmm0, xmm4 |
|
640 pmaddubsw xmm1, xmm4 |
|
641 pmaddubsw xmm2, xmm4 |
|
642 pmaddubsw xmm3, xmm4 |
|
643 lea eax, [eax + 64] |
|
644 phaddw xmm0, xmm1 |
|
645 phaddw xmm2, xmm3 |
|
646 psrlw xmm0, 7 |
|
647 psrlw xmm2, 7 |
|
648 packuswb xmm0, xmm2 |
|
649 paddb xmm0, xmm5 |
|
650 sub ecx, 16 |
|
651 movdqa [edx], xmm0 |
|
652 lea edx, [edx + 16] |
|
653 jg convertloop |
|
654 ret |
|
655 } |
|
656 } |
|
657 |
|
658 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
|
659 __declspec(naked) __declspec(align(16)) |
|
660 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
661 __asm { |
|
662 mov eax, [esp + 4] /* src_argb */ |
|
663 mov edx, [esp + 8] /* dst_y */ |
|
664 mov ecx, [esp + 12] /* pix */ |
|
665 movdqa xmm4, kARGBToYJ |
|
666 movdqa xmm5, kAddYJ64 |
|
667 |
|
668 align 4 |
|
669 convertloop: |
|
670 movdqa xmm0, [eax] |
|
671 movdqa xmm1, [eax + 16] |
|
672 movdqa xmm2, [eax + 32] |
|
673 movdqa xmm3, [eax + 48] |
|
674 pmaddubsw xmm0, xmm4 |
|
675 pmaddubsw xmm1, xmm4 |
|
676 pmaddubsw xmm2, xmm4 |
|
677 pmaddubsw xmm3, xmm4 |
|
678 lea eax, [eax + 64] |
|
679 phaddw xmm0, xmm1 |
|
680 phaddw xmm2, xmm3 |
|
681 paddw xmm0, xmm5 // Add .5 for rounding. |
|
682 paddw xmm2, xmm5 |
|
683 psrlw xmm0, 7 |
|
684 psrlw xmm2, 7 |
|
685 packuswb xmm0, xmm2 |
|
686 sub ecx, 16 |
|
687 movdqa [edx], xmm0 |
|
688 lea edx, [edx + 16] |
|
689 jg convertloop |
|
690 ret |
|
691 } |
|
692 } |
|
693 |
|
694 #ifdef HAS_ARGBTOYROW_AVX2 |
|
695 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
|
696 __declspec(naked) __declspec(align(32)) |
|
697 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
|
698 __asm { |
|
699 mov eax, [esp + 4] /* src_argb */ |
|
700 mov edx, [esp + 8] /* dst_y */ |
|
701 mov ecx, [esp + 12] /* pix */ |
|
702 vbroadcastf128 ymm4, kARGBToY |
|
703 vbroadcastf128 ymm5, kAddY16 |
|
704 vmovdqa ymm6, kPermdARGBToY_AVX |
|
705 |
|
706 align 4 |
|
707 convertloop: |
|
708 vmovdqu ymm0, [eax] |
|
709 vmovdqu ymm1, [eax + 32] |
|
710 vmovdqu ymm2, [eax + 64] |
|
711 vmovdqu ymm3, [eax + 96] |
|
712 vpmaddubsw ymm0, ymm0, ymm4 |
|
713 vpmaddubsw ymm1, ymm1, ymm4 |
|
714 vpmaddubsw ymm2, ymm2, ymm4 |
|
715 vpmaddubsw ymm3, ymm3, ymm4 |
|
716 lea eax, [eax + 128] |
|
717 vphaddw ymm0, ymm0, ymm1 // mutates. |
|
718 vphaddw ymm2, ymm2, ymm3 |
|
719 vpsrlw ymm0, ymm0, 7 |
|
720 vpsrlw ymm2, ymm2, 7 |
|
721 vpackuswb ymm0, ymm0, ymm2 // mutates. |
|
722 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
|
723 vpaddb ymm0, ymm0, ymm5 |
|
724 sub ecx, 32 |
|
725 vmovdqu [edx], ymm0 |
|
726 lea edx, [edx + 32] |
|
727 jg convertloop |
|
728 vzeroupper |
|
729 ret |
|
730 } |
|
731 } |
|
732 #endif // HAS_ARGBTOYROW_AVX2 |
|
733 |
|
734 #ifdef HAS_ARGBTOYROW_AVX2 |
|
735 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
|
736 __declspec(naked) __declspec(align(32)) |
|
737 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { |
|
738 __asm { |
|
739 mov eax, [esp + 4] /* src_argb */ |
|
740 mov edx, [esp + 8] /* dst_y */ |
|
741 mov ecx, [esp + 12] /* pix */ |
|
742 vbroadcastf128 ymm4, kARGBToYJ |
|
743 vbroadcastf128 ymm5, kAddYJ64 |
|
744 vmovdqa ymm6, kPermdARGBToY_AVX |
|
745 |
|
746 align 4 |
|
747 convertloop: |
|
748 vmovdqu ymm0, [eax] |
|
749 vmovdqu ymm1, [eax + 32] |
|
750 vmovdqu ymm2, [eax + 64] |
|
751 vmovdqu ymm3, [eax + 96] |
|
752 vpmaddubsw ymm0, ymm0, ymm4 |
|
753 vpmaddubsw ymm1, ymm1, ymm4 |
|
754 vpmaddubsw ymm2, ymm2, ymm4 |
|
755 vpmaddubsw ymm3, ymm3, ymm4 |
|
756 lea eax, [eax + 128] |
|
757 vphaddw ymm0, ymm0, ymm1 // mutates. |
|
758 vphaddw ymm2, ymm2, ymm3 |
|
759 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. |
|
760 vpaddw ymm2, ymm2, ymm5 |
|
761 vpsrlw ymm0, ymm0, 7 |
|
762 vpsrlw ymm2, ymm2, 7 |
|
763 vpackuswb ymm0, ymm0, ymm2 // mutates. |
|
764 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. |
|
765 sub ecx, 32 |
|
766 vmovdqu [edx], ymm0 |
|
767 lea edx, [edx + 32] |
|
768 jg convertloop |
|
769 |
|
770 vzeroupper |
|
771 ret |
|
772 } |
|
773 } |
|
774 #endif // HAS_ARGBTOYJROW_AVX2 |
|
775 |
|
776 __declspec(naked) __declspec(align(16)) |
|
777 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
778 __asm { |
|
779 mov eax, [esp + 4] /* src_argb */ |
|
780 mov edx, [esp + 8] /* dst_y */ |
|
781 mov ecx, [esp + 12] /* pix */ |
|
782 movdqa xmm5, kAddY16 |
|
783 movdqa xmm4, kARGBToY |
|
784 |
|
785 align 4 |
|
786 convertloop: |
|
787 movdqu xmm0, [eax] |
|
788 movdqu xmm1, [eax + 16] |
|
789 movdqu xmm2, [eax + 32] |
|
790 movdqu xmm3, [eax + 48] |
|
791 pmaddubsw xmm0, xmm4 |
|
792 pmaddubsw xmm1, xmm4 |
|
793 pmaddubsw xmm2, xmm4 |
|
794 pmaddubsw xmm3, xmm4 |
|
795 lea eax, [eax + 64] |
|
796 phaddw xmm0, xmm1 |
|
797 phaddw xmm2, xmm3 |
|
798 psrlw xmm0, 7 |
|
799 psrlw xmm2, 7 |
|
800 packuswb xmm0, xmm2 |
|
801 paddb xmm0, xmm5 |
|
802 sub ecx, 16 |
|
803 movdqu [edx], xmm0 |
|
804 lea edx, [edx + 16] |
|
805 jg convertloop |
|
806 ret |
|
807 } |
|
808 } |
|
809 |
|
810 __declspec(naked) __declspec(align(16)) |
|
811 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
812 __asm { |
|
813 mov eax, [esp + 4] /* src_argb */ |
|
814 mov edx, [esp + 8] /* dst_y */ |
|
815 mov ecx, [esp + 12] /* pix */ |
|
816 movdqa xmm4, kARGBToYJ |
|
817 movdqa xmm5, kAddYJ64 |
|
818 |
|
819 align 4 |
|
820 convertloop: |
|
821 movdqu xmm0, [eax] |
|
822 movdqu xmm1, [eax + 16] |
|
823 movdqu xmm2, [eax + 32] |
|
824 movdqu xmm3, [eax + 48] |
|
825 pmaddubsw xmm0, xmm4 |
|
826 pmaddubsw xmm1, xmm4 |
|
827 pmaddubsw xmm2, xmm4 |
|
828 pmaddubsw xmm3, xmm4 |
|
829 lea eax, [eax + 64] |
|
830 phaddw xmm0, xmm1 |
|
831 phaddw xmm2, xmm3 |
|
832 paddw xmm0, xmm5 |
|
833 paddw xmm2, xmm5 |
|
834 psrlw xmm0, 7 |
|
835 psrlw xmm2, 7 |
|
836 packuswb xmm0, xmm2 |
|
837 sub ecx, 16 |
|
838 movdqu [edx], xmm0 |
|
839 lea edx, [edx + 16] |
|
840 jg convertloop |
|
841 ret |
|
842 } |
|
843 } |
|
844 |
|
845 __declspec(naked) __declspec(align(16)) |
|
846 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
847 __asm { |
|
848 mov eax, [esp + 4] /* src_argb */ |
|
849 mov edx, [esp + 8] /* dst_y */ |
|
850 mov ecx, [esp + 12] /* pix */ |
|
851 movdqa xmm5, kAddY16 |
|
852 movdqa xmm4, kBGRAToY |
|
853 |
|
854 align 4 |
|
855 convertloop: |
|
856 movdqa xmm0, [eax] |
|
857 movdqa xmm1, [eax + 16] |
|
858 movdqa xmm2, [eax + 32] |
|
859 movdqa xmm3, [eax + 48] |
|
860 pmaddubsw xmm0, xmm4 |
|
861 pmaddubsw xmm1, xmm4 |
|
862 pmaddubsw xmm2, xmm4 |
|
863 pmaddubsw xmm3, xmm4 |
|
864 lea eax, [eax + 64] |
|
865 phaddw xmm0, xmm1 |
|
866 phaddw xmm2, xmm3 |
|
867 psrlw xmm0, 7 |
|
868 psrlw xmm2, 7 |
|
869 packuswb xmm0, xmm2 |
|
870 paddb xmm0, xmm5 |
|
871 sub ecx, 16 |
|
872 movdqa [edx], xmm0 |
|
873 lea edx, [edx + 16] |
|
874 jg convertloop |
|
875 ret |
|
876 } |
|
877 } |
|
878 |
|
879 __declspec(naked) __declspec(align(16)) |
|
880 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
881 __asm { |
|
882 mov eax, [esp + 4] /* src_argb */ |
|
883 mov edx, [esp + 8] /* dst_y */ |
|
884 mov ecx, [esp + 12] /* pix */ |
|
885 movdqa xmm5, kAddY16 |
|
886 movdqa xmm4, kBGRAToY |
|
887 |
|
888 align 4 |
|
889 convertloop: |
|
890 movdqu xmm0, [eax] |
|
891 movdqu xmm1, [eax + 16] |
|
892 movdqu xmm2, [eax + 32] |
|
893 movdqu xmm3, [eax + 48] |
|
894 pmaddubsw xmm0, xmm4 |
|
895 pmaddubsw xmm1, xmm4 |
|
896 pmaddubsw xmm2, xmm4 |
|
897 pmaddubsw xmm3, xmm4 |
|
898 lea eax, [eax + 64] |
|
899 phaddw xmm0, xmm1 |
|
900 phaddw xmm2, xmm3 |
|
901 psrlw xmm0, 7 |
|
902 psrlw xmm2, 7 |
|
903 packuswb xmm0, xmm2 |
|
904 paddb xmm0, xmm5 |
|
905 sub ecx, 16 |
|
906 movdqu [edx], xmm0 |
|
907 lea edx, [edx + 16] |
|
908 jg convertloop |
|
909 ret |
|
910 } |
|
911 } |
|
912 |
|
913 __declspec(naked) __declspec(align(16)) |
|
914 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
915 __asm { |
|
916 mov eax, [esp + 4] /* src_argb */ |
|
917 mov edx, [esp + 8] /* dst_y */ |
|
918 mov ecx, [esp + 12] /* pix */ |
|
919 movdqa xmm5, kAddY16 |
|
920 movdqa xmm4, kABGRToY |
|
921 |
|
922 align 4 |
|
923 convertloop: |
|
924 movdqa xmm0, [eax] |
|
925 movdqa xmm1, [eax + 16] |
|
926 movdqa xmm2, [eax + 32] |
|
927 movdqa xmm3, [eax + 48] |
|
928 pmaddubsw xmm0, xmm4 |
|
929 pmaddubsw xmm1, xmm4 |
|
930 pmaddubsw xmm2, xmm4 |
|
931 pmaddubsw xmm3, xmm4 |
|
932 lea eax, [eax + 64] |
|
933 phaddw xmm0, xmm1 |
|
934 phaddw xmm2, xmm3 |
|
935 psrlw xmm0, 7 |
|
936 psrlw xmm2, 7 |
|
937 packuswb xmm0, xmm2 |
|
938 paddb xmm0, xmm5 |
|
939 sub ecx, 16 |
|
940 movdqa [edx], xmm0 |
|
941 lea edx, [edx + 16] |
|
942 jg convertloop |
|
943 ret |
|
944 } |
|
945 } |
|
946 |
|
947 __declspec(naked) __declspec(align(16)) |
|
948 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
949 __asm { |
|
950 mov eax, [esp + 4] /* src_argb */ |
|
951 mov edx, [esp + 8] /* dst_y */ |
|
952 mov ecx, [esp + 12] /* pix */ |
|
953 movdqa xmm5, kAddY16 |
|
954 movdqa xmm4, kABGRToY |
|
955 |
|
956 align 4 |
|
957 convertloop: |
|
958 movdqu xmm0, [eax] |
|
959 movdqu xmm1, [eax + 16] |
|
960 movdqu xmm2, [eax + 32] |
|
961 movdqu xmm3, [eax + 48] |
|
962 pmaddubsw xmm0, xmm4 |
|
963 pmaddubsw xmm1, xmm4 |
|
964 pmaddubsw xmm2, xmm4 |
|
965 pmaddubsw xmm3, xmm4 |
|
966 lea eax, [eax + 64] |
|
967 phaddw xmm0, xmm1 |
|
968 phaddw xmm2, xmm3 |
|
969 psrlw xmm0, 7 |
|
970 psrlw xmm2, 7 |
|
971 packuswb xmm0, xmm2 |
|
972 paddb xmm0, xmm5 |
|
973 sub ecx, 16 |
|
974 movdqu [edx], xmm0 |
|
975 lea edx, [edx + 16] |
|
976 jg convertloop |
|
977 ret |
|
978 } |
|
979 } |
|
980 |
|
981 __declspec(naked) __declspec(align(16)) |
|
982 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
983 __asm { |
|
984 mov eax, [esp + 4] /* src_argb */ |
|
985 mov edx, [esp + 8] /* dst_y */ |
|
986 mov ecx, [esp + 12] /* pix */ |
|
987 movdqa xmm5, kAddY16 |
|
988 movdqa xmm4, kRGBAToY |
|
989 |
|
990 align 4 |
|
991 convertloop: |
|
992 movdqa xmm0, [eax] |
|
993 movdqa xmm1, [eax + 16] |
|
994 movdqa xmm2, [eax + 32] |
|
995 movdqa xmm3, [eax + 48] |
|
996 pmaddubsw xmm0, xmm4 |
|
997 pmaddubsw xmm1, xmm4 |
|
998 pmaddubsw xmm2, xmm4 |
|
999 pmaddubsw xmm3, xmm4 |
|
1000 lea eax, [eax + 64] |
|
1001 phaddw xmm0, xmm1 |
|
1002 phaddw xmm2, xmm3 |
|
1003 psrlw xmm0, 7 |
|
1004 psrlw xmm2, 7 |
|
1005 packuswb xmm0, xmm2 |
|
1006 paddb xmm0, xmm5 |
|
1007 sub ecx, 16 |
|
1008 movdqa [edx], xmm0 |
|
1009 lea edx, [edx + 16] |
|
1010 jg convertloop |
|
1011 ret |
|
1012 } |
|
1013 } |
|
1014 |
|
1015 __declspec(naked) __declspec(align(16)) |
|
1016 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
|
1017 __asm { |
|
1018 mov eax, [esp + 4] /* src_argb */ |
|
1019 mov edx, [esp + 8] /* dst_y */ |
|
1020 mov ecx, [esp + 12] /* pix */ |
|
1021 movdqa xmm5, kAddY16 |
|
1022 movdqa xmm4, kRGBAToY |
|
1023 |
|
1024 align 4 |
|
1025 convertloop: |
|
1026 movdqu xmm0, [eax] |
|
1027 movdqu xmm1, [eax + 16] |
|
1028 movdqu xmm2, [eax + 32] |
|
1029 movdqu xmm3, [eax + 48] |
|
1030 pmaddubsw xmm0, xmm4 |
|
1031 pmaddubsw xmm1, xmm4 |
|
1032 pmaddubsw xmm2, xmm4 |
|
1033 pmaddubsw xmm3, xmm4 |
|
1034 lea eax, [eax + 64] |
|
1035 phaddw xmm0, xmm1 |
|
1036 phaddw xmm2, xmm3 |
|
1037 psrlw xmm0, 7 |
|
1038 psrlw xmm2, 7 |
|
1039 packuswb xmm0, xmm2 |
|
1040 paddb xmm0, xmm5 |
|
1041 sub ecx, 16 |
|
1042 movdqu [edx], xmm0 |
|
1043 lea edx, [edx + 16] |
|
1044 jg convertloop |
|
1045 ret |
|
1046 } |
|
1047 } |
|
1048 |
|
1049 __declspec(naked) __declspec(align(16)) |
|
1050 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1051 uint8* dst_u, uint8* dst_v, int width) { |
|
1052 __asm { |
|
1053 push esi |
|
1054 push edi |
|
1055 mov eax, [esp + 8 + 4] // src_argb |
|
1056 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1057 mov edx, [esp + 8 + 12] // dst_u |
|
1058 mov edi, [esp + 8 + 16] // dst_v |
|
1059 mov ecx, [esp + 8 + 20] // pix |
|
1060 movdqa xmm7, kARGBToU |
|
1061 movdqa xmm6, kARGBToV |
|
1062 movdqa xmm5, kAddUV128 |
|
1063 sub edi, edx // stride from u to v |
|
1064 |
|
1065 align 4 |
|
1066 convertloop: |
|
1067 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1068 movdqa xmm0, [eax] |
|
1069 movdqa xmm1, [eax + 16] |
|
1070 movdqa xmm2, [eax + 32] |
|
1071 movdqa xmm3, [eax + 48] |
|
1072 pavgb xmm0, [eax + esi] |
|
1073 pavgb xmm1, [eax + esi + 16] |
|
1074 pavgb xmm2, [eax + esi + 32] |
|
1075 pavgb xmm3, [eax + esi + 48] |
|
1076 lea eax, [eax + 64] |
|
1077 movdqa xmm4, xmm0 |
|
1078 shufps xmm0, xmm1, 0x88 |
|
1079 shufps xmm4, xmm1, 0xdd |
|
1080 pavgb xmm0, xmm4 |
|
1081 movdqa xmm4, xmm2 |
|
1082 shufps xmm2, xmm3, 0x88 |
|
1083 shufps xmm4, xmm3, 0xdd |
|
1084 pavgb xmm2, xmm4 |
|
1085 |
|
1086 // step 2 - convert to U and V |
|
1087 // from here down is very similar to Y code except |
|
1088 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1089 movdqa xmm1, xmm0 |
|
1090 movdqa xmm3, xmm2 |
|
1091 pmaddubsw xmm0, xmm7 // U |
|
1092 pmaddubsw xmm2, xmm7 |
|
1093 pmaddubsw xmm1, xmm6 // V |
|
1094 pmaddubsw xmm3, xmm6 |
|
1095 phaddw xmm0, xmm2 |
|
1096 phaddw xmm1, xmm3 |
|
1097 psraw xmm0, 8 |
|
1098 psraw xmm1, 8 |
|
1099 packsswb xmm0, xmm1 |
|
1100 paddb xmm0, xmm5 // -> unsigned |
|
1101 |
|
1102 // step 3 - store 8 U and 8 V values |
|
1103 sub ecx, 16 |
|
1104 movlps qword ptr [edx], xmm0 // U |
|
1105 movhps qword ptr [edx + edi], xmm0 // V |
|
1106 lea edx, [edx + 8] |
|
1107 jg convertloop |
|
1108 |
|
1109 pop edi |
|
1110 pop esi |
|
1111 ret |
|
1112 } |
|
1113 } |
|
1114 |
|
1115 __declspec(naked) __declspec(align(16)) |
|
1116 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1117 uint8* dst_u, uint8* dst_v, int width) { |
|
1118 __asm { |
|
1119 push esi |
|
1120 push edi |
|
1121 mov eax, [esp + 8 + 4] // src_argb |
|
1122 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1123 mov edx, [esp + 8 + 12] // dst_u |
|
1124 mov edi, [esp + 8 + 16] // dst_v |
|
1125 mov ecx, [esp + 8 + 20] // pix |
|
1126 movdqa xmm7, kARGBToUJ |
|
1127 movdqa xmm6, kARGBToVJ |
|
1128 movdqa xmm5, kAddUVJ128 |
|
1129 sub edi, edx // stride from u to v |
|
1130 |
|
1131 align 4 |
|
1132 convertloop: |
|
1133 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1134 movdqa xmm0, [eax] |
|
1135 movdqa xmm1, [eax + 16] |
|
1136 movdqa xmm2, [eax + 32] |
|
1137 movdqa xmm3, [eax + 48] |
|
1138 pavgb xmm0, [eax + esi] |
|
1139 pavgb xmm1, [eax + esi + 16] |
|
1140 pavgb xmm2, [eax + esi + 32] |
|
1141 pavgb xmm3, [eax + esi + 48] |
|
1142 lea eax, [eax + 64] |
|
1143 movdqa xmm4, xmm0 |
|
1144 shufps xmm0, xmm1, 0x88 |
|
1145 shufps xmm4, xmm1, 0xdd |
|
1146 pavgb xmm0, xmm4 |
|
1147 movdqa xmm4, xmm2 |
|
1148 shufps xmm2, xmm3, 0x88 |
|
1149 shufps xmm4, xmm3, 0xdd |
|
1150 pavgb xmm2, xmm4 |
|
1151 |
|
1152 // step 2 - convert to U and V |
|
1153 // from here down is very similar to Y code except |
|
1154 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1155 movdqa xmm1, xmm0 |
|
1156 movdqa xmm3, xmm2 |
|
1157 pmaddubsw xmm0, xmm7 // U |
|
1158 pmaddubsw xmm2, xmm7 |
|
1159 pmaddubsw xmm1, xmm6 // V |
|
1160 pmaddubsw xmm3, xmm6 |
|
1161 phaddw xmm0, xmm2 |
|
1162 phaddw xmm1, xmm3 |
|
1163 paddw xmm0, xmm5 // +.5 rounding -> unsigned |
|
1164 paddw xmm1, xmm5 |
|
1165 psraw xmm0, 8 |
|
1166 psraw xmm1, 8 |
|
1167 packsswb xmm0, xmm1 |
|
1168 |
|
1169 // step 3 - store 8 U and 8 V values |
|
1170 sub ecx, 16 |
|
1171 movlps qword ptr [edx], xmm0 // U |
|
1172 movhps qword ptr [edx + edi], xmm0 // V |
|
1173 lea edx, [edx + 8] |
|
1174 jg convertloop |
|
1175 |
|
1176 pop edi |
|
1177 pop esi |
|
1178 ret |
|
1179 } |
|
1180 } |
|
1181 |
|
1182 #ifdef HAS_ARGBTOUVROW_AVX2 |
|
1183 __declspec(naked) __declspec(align(32)) |
|
1184 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
|
1185 uint8* dst_u, uint8* dst_v, int width) { |
|
1186 __asm { |
|
1187 push esi |
|
1188 push edi |
|
1189 mov eax, [esp + 8 + 4] // src_argb |
|
1190 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1191 mov edx, [esp + 8 + 12] // dst_u |
|
1192 mov edi, [esp + 8 + 16] // dst_v |
|
1193 mov ecx, [esp + 8 + 20] // pix |
|
1194 vbroadcastf128 ymm5, kAddUV128 |
|
1195 vbroadcastf128 ymm6, kARGBToV |
|
1196 vbroadcastf128 ymm7, kARGBToU |
|
1197 sub edi, edx // stride from u to v |
|
1198 |
|
1199 align 4 |
|
1200 convertloop: |
|
1201 /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
|
1202 vmovdqu ymm0, [eax] |
|
1203 vmovdqu ymm1, [eax + 32] |
|
1204 vmovdqu ymm2, [eax + 64] |
|
1205 vmovdqu ymm3, [eax + 96] |
|
1206 vpavgb ymm0, ymm0, [eax + esi] |
|
1207 vpavgb ymm1, ymm1, [eax + esi + 32] |
|
1208 vpavgb ymm2, ymm2, [eax + esi + 64] |
|
1209 vpavgb ymm3, ymm3, [eax + esi + 96] |
|
1210 lea eax, [eax + 128] |
|
1211 vshufps ymm4, ymm0, ymm1, 0x88 |
|
1212 vshufps ymm0, ymm0, ymm1, 0xdd |
|
1213 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps |
|
1214 vshufps ymm4, ymm2, ymm3, 0x88 |
|
1215 vshufps ymm2, ymm2, ymm3, 0xdd |
|
1216 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
|
1217 |
|
1218 // step 2 - convert to U and V |
|
1219 // from here down is very similar to Y code except |
|
1220 // instead of 32 different pixels, its 16 pixels of U and 16 of V |
|
1221 vpmaddubsw ymm1, ymm0, ymm7 // U |
|
1222 vpmaddubsw ymm3, ymm2, ymm7 |
|
1223 vpmaddubsw ymm0, ymm0, ymm6 // V |
|
1224 vpmaddubsw ymm2, ymm2, ymm6 |
|
1225 vphaddw ymm1, ymm1, ymm3 // mutates |
|
1226 vphaddw ymm0, ymm0, ymm2 |
|
1227 vpsraw ymm1, ymm1, 8 |
|
1228 vpsraw ymm0, ymm0, 8 |
|
1229 vpacksswb ymm0, ymm1, ymm0 // mutates |
|
1230 vpermq ymm0, ymm0, 0xd8 // For vpacksswb |
|
1231 vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw |
|
1232 vpaddb ymm0, ymm0, ymm5 // -> unsigned |
|
1233 |
|
1234 // step 3 - store 16 U and 16 V values |
|
1235 sub ecx, 32 |
|
1236 vextractf128 [edx], ymm0, 0 // U |
|
1237 vextractf128 [edx + edi], ymm0, 1 // V |
|
1238 lea edx, [edx + 16] |
|
1239 jg convertloop |
|
1240 |
|
1241 pop edi |
|
1242 pop esi |
|
1243 vzeroupper |
|
1244 ret |
|
1245 } |
|
1246 } |
|
1247 #endif // HAS_ARGBTOUVROW_AVX2 |
|
1248 |
|
1249 __declspec(naked) __declspec(align(16)) |
|
1250 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1251 uint8* dst_u, uint8* dst_v, int width) { |
|
1252 __asm { |
|
1253 push esi |
|
1254 push edi |
|
1255 mov eax, [esp + 8 + 4] // src_argb |
|
1256 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1257 mov edx, [esp + 8 + 12] // dst_u |
|
1258 mov edi, [esp + 8 + 16] // dst_v |
|
1259 mov ecx, [esp + 8 + 20] // pix |
|
1260 movdqa xmm7, kARGBToU |
|
1261 movdqa xmm6, kARGBToV |
|
1262 movdqa xmm5, kAddUV128 |
|
1263 sub edi, edx // stride from u to v |
|
1264 |
|
1265 align 4 |
|
1266 convertloop: |
|
1267 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1268 movdqu xmm0, [eax] |
|
1269 movdqu xmm1, [eax + 16] |
|
1270 movdqu xmm2, [eax + 32] |
|
1271 movdqu xmm3, [eax + 48] |
|
1272 movdqu xmm4, [eax + esi] |
|
1273 pavgb xmm0, xmm4 |
|
1274 movdqu xmm4, [eax + esi + 16] |
|
1275 pavgb xmm1, xmm4 |
|
1276 movdqu xmm4, [eax + esi + 32] |
|
1277 pavgb xmm2, xmm4 |
|
1278 movdqu xmm4, [eax + esi + 48] |
|
1279 pavgb xmm3, xmm4 |
|
1280 lea eax, [eax + 64] |
|
1281 movdqa xmm4, xmm0 |
|
1282 shufps xmm0, xmm1, 0x88 |
|
1283 shufps xmm4, xmm1, 0xdd |
|
1284 pavgb xmm0, xmm4 |
|
1285 movdqa xmm4, xmm2 |
|
1286 shufps xmm2, xmm3, 0x88 |
|
1287 shufps xmm4, xmm3, 0xdd |
|
1288 pavgb xmm2, xmm4 |
|
1289 |
|
1290 // step 2 - convert to U and V |
|
1291 // from here down is very similar to Y code except |
|
1292 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1293 movdqa xmm1, xmm0 |
|
1294 movdqa xmm3, xmm2 |
|
1295 pmaddubsw xmm0, xmm7 // U |
|
1296 pmaddubsw xmm2, xmm7 |
|
1297 pmaddubsw xmm1, xmm6 // V |
|
1298 pmaddubsw xmm3, xmm6 |
|
1299 phaddw xmm0, xmm2 |
|
1300 phaddw xmm1, xmm3 |
|
1301 psraw xmm0, 8 |
|
1302 psraw xmm1, 8 |
|
1303 packsswb xmm0, xmm1 |
|
1304 paddb xmm0, xmm5 // -> unsigned |
|
1305 |
|
1306 // step 3 - store 8 U and 8 V values |
|
1307 sub ecx, 16 |
|
1308 movlps qword ptr [edx], xmm0 // U |
|
1309 movhps qword ptr [edx + edi], xmm0 // V |
|
1310 lea edx, [edx + 8] |
|
1311 jg convertloop |
|
1312 |
|
1313 pop edi |
|
1314 pop esi |
|
1315 ret |
|
1316 } |
|
1317 } |
|
1318 |
|
1319 __declspec(naked) __declspec(align(16)) |
|
1320 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1321 uint8* dst_u, uint8* dst_v, int width) { |
|
1322 __asm { |
|
1323 push esi |
|
1324 push edi |
|
1325 mov eax, [esp + 8 + 4] // src_argb |
|
1326 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1327 mov edx, [esp + 8 + 12] // dst_u |
|
1328 mov edi, [esp + 8 + 16] // dst_v |
|
1329 mov ecx, [esp + 8 + 20] // pix |
|
1330 movdqa xmm7, kARGBToUJ |
|
1331 movdqa xmm6, kARGBToVJ |
|
1332 movdqa xmm5, kAddUVJ128 |
|
1333 sub edi, edx // stride from u to v |
|
1334 |
|
1335 align 4 |
|
1336 convertloop: |
|
1337 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1338 movdqu xmm0, [eax] |
|
1339 movdqu xmm1, [eax + 16] |
|
1340 movdqu xmm2, [eax + 32] |
|
1341 movdqu xmm3, [eax + 48] |
|
1342 movdqu xmm4, [eax + esi] |
|
1343 pavgb xmm0, xmm4 |
|
1344 movdqu xmm4, [eax + esi + 16] |
|
1345 pavgb xmm1, xmm4 |
|
1346 movdqu xmm4, [eax + esi + 32] |
|
1347 pavgb xmm2, xmm4 |
|
1348 movdqu xmm4, [eax + esi + 48] |
|
1349 pavgb xmm3, xmm4 |
|
1350 lea eax, [eax + 64] |
|
1351 movdqa xmm4, xmm0 |
|
1352 shufps xmm0, xmm1, 0x88 |
|
1353 shufps xmm4, xmm1, 0xdd |
|
1354 pavgb xmm0, xmm4 |
|
1355 movdqa xmm4, xmm2 |
|
1356 shufps xmm2, xmm3, 0x88 |
|
1357 shufps xmm4, xmm3, 0xdd |
|
1358 pavgb xmm2, xmm4 |
|
1359 |
|
1360 // step 2 - convert to U and V |
|
1361 // from here down is very similar to Y code except |
|
1362 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1363 movdqa xmm1, xmm0 |
|
1364 movdqa xmm3, xmm2 |
|
1365 pmaddubsw xmm0, xmm7 // U |
|
1366 pmaddubsw xmm2, xmm7 |
|
1367 pmaddubsw xmm1, xmm6 // V |
|
1368 pmaddubsw xmm3, xmm6 |
|
1369 phaddw xmm0, xmm2 |
|
1370 phaddw xmm1, xmm3 |
|
1371 paddw xmm0, xmm5 // +.5 rounding -> unsigned |
|
1372 paddw xmm1, xmm5 |
|
1373 psraw xmm0, 8 |
|
1374 psraw xmm1, 8 |
|
1375 packsswb xmm0, xmm1 |
|
1376 |
|
1377 // step 3 - store 8 U and 8 V values |
|
1378 sub ecx, 16 |
|
1379 movlps qword ptr [edx], xmm0 // U |
|
1380 movhps qword ptr [edx + edi], xmm0 // V |
|
1381 lea edx, [edx + 8] |
|
1382 jg convertloop |
|
1383 |
|
1384 pop edi |
|
1385 pop esi |
|
1386 ret |
|
1387 } |
|
1388 } |
|
1389 |
|
1390 __declspec(naked) __declspec(align(16)) |
|
1391 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
|
1392 uint8* dst_u, uint8* dst_v, int width) { |
|
1393 __asm { |
|
1394 push edi |
|
1395 mov eax, [esp + 4 + 4] // src_argb |
|
1396 mov edx, [esp + 4 + 8] // dst_u |
|
1397 mov edi, [esp + 4 + 12] // dst_v |
|
1398 mov ecx, [esp + 4 + 16] // pix |
|
1399 movdqa xmm7, kARGBToU |
|
1400 movdqa xmm6, kARGBToV |
|
1401 movdqa xmm5, kAddUV128 |
|
1402 sub edi, edx // stride from u to v |
|
1403 |
|
1404 align 4 |
|
1405 convertloop: |
|
1406 /* convert to U and V */ |
|
1407 movdqa xmm0, [eax] // U |
|
1408 movdqa xmm1, [eax + 16] |
|
1409 movdqa xmm2, [eax + 32] |
|
1410 movdqa xmm3, [eax + 48] |
|
1411 pmaddubsw xmm0, xmm7 |
|
1412 pmaddubsw xmm1, xmm7 |
|
1413 pmaddubsw xmm2, xmm7 |
|
1414 pmaddubsw xmm3, xmm7 |
|
1415 phaddw xmm0, xmm1 |
|
1416 phaddw xmm2, xmm3 |
|
1417 psraw xmm0, 8 |
|
1418 psraw xmm2, 8 |
|
1419 packsswb xmm0, xmm2 |
|
1420 paddb xmm0, xmm5 |
|
1421 sub ecx, 16 |
|
1422 movdqa [edx], xmm0 |
|
1423 |
|
1424 movdqa xmm0, [eax] // V |
|
1425 movdqa xmm1, [eax + 16] |
|
1426 movdqa xmm2, [eax + 32] |
|
1427 movdqa xmm3, [eax + 48] |
|
1428 pmaddubsw xmm0, xmm6 |
|
1429 pmaddubsw xmm1, xmm6 |
|
1430 pmaddubsw xmm2, xmm6 |
|
1431 pmaddubsw xmm3, xmm6 |
|
1432 phaddw xmm0, xmm1 |
|
1433 phaddw xmm2, xmm3 |
|
1434 psraw xmm0, 8 |
|
1435 psraw xmm2, 8 |
|
1436 packsswb xmm0, xmm2 |
|
1437 paddb xmm0, xmm5 |
|
1438 lea eax, [eax + 64] |
|
1439 movdqa [edx + edi], xmm0 |
|
1440 lea edx, [edx + 16] |
|
1441 jg convertloop |
|
1442 |
|
1443 pop edi |
|
1444 ret |
|
1445 } |
|
1446 } |
|
1447 |
|
1448 __declspec(naked) __declspec(align(16)) |
|
1449 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, |
|
1450 uint8* dst_u, uint8* dst_v, int width) { |
|
1451 __asm { |
|
1452 push edi |
|
1453 mov eax, [esp + 4 + 4] // src_argb |
|
1454 mov edx, [esp + 4 + 8] // dst_u |
|
1455 mov edi, [esp + 4 + 12] // dst_v |
|
1456 mov ecx, [esp + 4 + 16] // pix |
|
1457 movdqa xmm7, kARGBToU |
|
1458 movdqa xmm6, kARGBToV |
|
1459 movdqa xmm5, kAddUV128 |
|
1460 sub edi, edx // stride from u to v |
|
1461 |
|
1462 align 4 |
|
1463 convertloop: |
|
1464 /* convert to U and V */ |
|
1465 movdqu xmm0, [eax] // U |
|
1466 movdqu xmm1, [eax + 16] |
|
1467 movdqu xmm2, [eax + 32] |
|
1468 movdqu xmm3, [eax + 48] |
|
1469 pmaddubsw xmm0, xmm7 |
|
1470 pmaddubsw xmm1, xmm7 |
|
1471 pmaddubsw xmm2, xmm7 |
|
1472 pmaddubsw xmm3, xmm7 |
|
1473 phaddw xmm0, xmm1 |
|
1474 phaddw xmm2, xmm3 |
|
1475 psraw xmm0, 8 |
|
1476 psraw xmm2, 8 |
|
1477 packsswb xmm0, xmm2 |
|
1478 paddb xmm0, xmm5 |
|
1479 sub ecx, 16 |
|
1480 movdqu [edx], xmm0 |
|
1481 |
|
1482 movdqu xmm0, [eax] // V |
|
1483 movdqu xmm1, [eax + 16] |
|
1484 movdqu xmm2, [eax + 32] |
|
1485 movdqu xmm3, [eax + 48] |
|
1486 pmaddubsw xmm0, xmm6 |
|
1487 pmaddubsw xmm1, xmm6 |
|
1488 pmaddubsw xmm2, xmm6 |
|
1489 pmaddubsw xmm3, xmm6 |
|
1490 phaddw xmm0, xmm1 |
|
1491 phaddw xmm2, xmm3 |
|
1492 psraw xmm0, 8 |
|
1493 psraw xmm2, 8 |
|
1494 packsswb xmm0, xmm2 |
|
1495 paddb xmm0, xmm5 |
|
1496 lea eax, [eax + 64] |
|
1497 movdqu [edx + edi], xmm0 |
|
1498 lea edx, [edx + 16] |
|
1499 jg convertloop |
|
1500 |
|
1501 pop edi |
|
1502 ret |
|
1503 } |
|
1504 } |
|
1505 |
|
1506 __declspec(naked) __declspec(align(16)) |
|
1507 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
|
1508 uint8* dst_u, uint8* dst_v, int width) { |
|
1509 __asm { |
|
1510 push edi |
|
1511 mov eax, [esp + 4 + 4] // src_argb |
|
1512 mov edx, [esp + 4 + 8] // dst_u |
|
1513 mov edi, [esp + 4 + 12] // dst_v |
|
1514 mov ecx, [esp + 4 + 16] // pix |
|
1515 movdqa xmm7, kARGBToU |
|
1516 movdqa xmm6, kARGBToV |
|
1517 movdqa xmm5, kAddUV128 |
|
1518 sub edi, edx // stride from u to v |
|
1519 |
|
1520 align 4 |
|
1521 convertloop: |
|
1522 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1523 movdqa xmm0, [eax] |
|
1524 movdqa xmm1, [eax + 16] |
|
1525 movdqa xmm2, [eax + 32] |
|
1526 movdqa xmm3, [eax + 48] |
|
1527 lea eax, [eax + 64] |
|
1528 movdqa xmm4, xmm0 |
|
1529 shufps xmm0, xmm1, 0x88 |
|
1530 shufps xmm4, xmm1, 0xdd |
|
1531 pavgb xmm0, xmm4 |
|
1532 movdqa xmm4, xmm2 |
|
1533 shufps xmm2, xmm3, 0x88 |
|
1534 shufps xmm4, xmm3, 0xdd |
|
1535 pavgb xmm2, xmm4 |
|
1536 |
|
1537 // step 2 - convert to U and V |
|
1538 // from here down is very similar to Y code except |
|
1539 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1540 movdqa xmm1, xmm0 |
|
1541 movdqa xmm3, xmm2 |
|
1542 pmaddubsw xmm0, xmm7 // U |
|
1543 pmaddubsw xmm2, xmm7 |
|
1544 pmaddubsw xmm1, xmm6 // V |
|
1545 pmaddubsw xmm3, xmm6 |
|
1546 phaddw xmm0, xmm2 |
|
1547 phaddw xmm1, xmm3 |
|
1548 psraw xmm0, 8 |
|
1549 psraw xmm1, 8 |
|
1550 packsswb xmm0, xmm1 |
|
1551 paddb xmm0, xmm5 // -> unsigned |
|
1552 |
|
1553 // step 3 - store 8 U and 8 V values |
|
1554 sub ecx, 16 |
|
1555 movlps qword ptr [edx], xmm0 // U |
|
1556 movhps qword ptr [edx + edi], xmm0 // V |
|
1557 lea edx, [edx + 8] |
|
1558 jg convertloop |
|
1559 |
|
1560 pop edi |
|
1561 ret |
|
1562 } |
|
1563 } |
|
1564 |
|
1565 __declspec(naked) __declspec(align(16)) |
|
1566 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, |
|
1567 uint8* dst_u, uint8* dst_v, int width) { |
|
1568 __asm { |
|
1569 push edi |
|
1570 mov eax, [esp + 4 + 4] // src_argb |
|
1571 mov edx, [esp + 4 + 8] // dst_u |
|
1572 mov edi, [esp + 4 + 12] // dst_v |
|
1573 mov ecx, [esp + 4 + 16] // pix |
|
1574 movdqa xmm7, kARGBToU |
|
1575 movdqa xmm6, kARGBToV |
|
1576 movdqa xmm5, kAddUV128 |
|
1577 sub edi, edx // stride from u to v |
|
1578 |
|
1579 align 4 |
|
1580 convertloop: |
|
1581 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1582 movdqu xmm0, [eax] |
|
1583 movdqu xmm1, [eax + 16] |
|
1584 movdqu xmm2, [eax + 32] |
|
1585 movdqu xmm3, [eax + 48] |
|
1586 lea eax, [eax + 64] |
|
1587 movdqa xmm4, xmm0 |
|
1588 shufps xmm0, xmm1, 0x88 |
|
1589 shufps xmm4, xmm1, 0xdd |
|
1590 pavgb xmm0, xmm4 |
|
1591 movdqa xmm4, xmm2 |
|
1592 shufps xmm2, xmm3, 0x88 |
|
1593 shufps xmm4, xmm3, 0xdd |
|
1594 pavgb xmm2, xmm4 |
|
1595 |
|
1596 // step 2 - convert to U and V |
|
1597 // from here down is very similar to Y code except |
|
1598 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1599 movdqa xmm1, xmm0 |
|
1600 movdqa xmm3, xmm2 |
|
1601 pmaddubsw xmm0, xmm7 // U |
|
1602 pmaddubsw xmm2, xmm7 |
|
1603 pmaddubsw xmm1, xmm6 // V |
|
1604 pmaddubsw xmm3, xmm6 |
|
1605 phaddw xmm0, xmm2 |
|
1606 phaddw xmm1, xmm3 |
|
1607 psraw xmm0, 8 |
|
1608 psraw xmm1, 8 |
|
1609 packsswb xmm0, xmm1 |
|
1610 paddb xmm0, xmm5 // -> unsigned |
|
1611 |
|
1612 // step 3 - store 8 U and 8 V values |
|
1613 sub ecx, 16 |
|
1614 movlps qword ptr [edx], xmm0 // U |
|
1615 movhps qword ptr [edx + edi], xmm0 // V |
|
1616 lea edx, [edx + 8] |
|
1617 jg convertloop |
|
1618 |
|
1619 pop edi |
|
1620 ret |
|
1621 } |
|
1622 } |
|
1623 |
|
1624 __declspec(naked) __declspec(align(16)) |
|
1625 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1626 uint8* dst_u, uint8* dst_v, int width) { |
|
1627 __asm { |
|
1628 push esi |
|
1629 push edi |
|
1630 mov eax, [esp + 8 + 4] // src_argb |
|
1631 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1632 mov edx, [esp + 8 + 12] // dst_u |
|
1633 mov edi, [esp + 8 + 16] // dst_v |
|
1634 mov ecx, [esp + 8 + 20] // pix |
|
1635 movdqa xmm7, kBGRAToU |
|
1636 movdqa xmm6, kBGRAToV |
|
1637 movdqa xmm5, kAddUV128 |
|
1638 sub edi, edx // stride from u to v |
|
1639 |
|
1640 align 4 |
|
1641 convertloop: |
|
1642 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1643 movdqa xmm0, [eax] |
|
1644 movdqa xmm1, [eax + 16] |
|
1645 movdqa xmm2, [eax + 32] |
|
1646 movdqa xmm3, [eax + 48] |
|
1647 pavgb xmm0, [eax + esi] |
|
1648 pavgb xmm1, [eax + esi + 16] |
|
1649 pavgb xmm2, [eax + esi + 32] |
|
1650 pavgb xmm3, [eax + esi + 48] |
|
1651 lea eax, [eax + 64] |
|
1652 movdqa xmm4, xmm0 |
|
1653 shufps xmm0, xmm1, 0x88 |
|
1654 shufps xmm4, xmm1, 0xdd |
|
1655 pavgb xmm0, xmm4 |
|
1656 movdqa xmm4, xmm2 |
|
1657 shufps xmm2, xmm3, 0x88 |
|
1658 shufps xmm4, xmm3, 0xdd |
|
1659 pavgb xmm2, xmm4 |
|
1660 |
|
1661 // step 2 - convert to U and V |
|
1662 // from here down is very similar to Y code except |
|
1663 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1664 movdqa xmm1, xmm0 |
|
1665 movdqa xmm3, xmm2 |
|
1666 pmaddubsw xmm0, xmm7 // U |
|
1667 pmaddubsw xmm2, xmm7 |
|
1668 pmaddubsw xmm1, xmm6 // V |
|
1669 pmaddubsw xmm3, xmm6 |
|
1670 phaddw xmm0, xmm2 |
|
1671 phaddw xmm1, xmm3 |
|
1672 psraw xmm0, 8 |
|
1673 psraw xmm1, 8 |
|
1674 packsswb xmm0, xmm1 |
|
1675 paddb xmm0, xmm5 // -> unsigned |
|
1676 |
|
1677 // step 3 - store 8 U and 8 V values |
|
1678 sub ecx, 16 |
|
1679 movlps qword ptr [edx], xmm0 // U |
|
1680 movhps qword ptr [edx + edi], xmm0 // V |
|
1681 lea edx, [edx + 8] |
|
1682 jg convertloop |
|
1683 |
|
1684 pop edi |
|
1685 pop esi |
|
1686 ret |
|
1687 } |
|
1688 } |
|
1689 |
|
1690 __declspec(naked) __declspec(align(16)) |
|
1691 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1692 uint8* dst_u, uint8* dst_v, int width) { |
|
1693 __asm { |
|
1694 push esi |
|
1695 push edi |
|
1696 mov eax, [esp + 8 + 4] // src_argb |
|
1697 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1698 mov edx, [esp + 8 + 12] // dst_u |
|
1699 mov edi, [esp + 8 + 16] // dst_v |
|
1700 mov ecx, [esp + 8 + 20] // pix |
|
1701 movdqa xmm7, kBGRAToU |
|
1702 movdqa xmm6, kBGRAToV |
|
1703 movdqa xmm5, kAddUV128 |
|
1704 sub edi, edx // stride from u to v |
|
1705 |
|
1706 align 4 |
|
1707 convertloop: |
|
1708 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1709 movdqu xmm0, [eax] |
|
1710 movdqu xmm1, [eax + 16] |
|
1711 movdqu xmm2, [eax + 32] |
|
1712 movdqu xmm3, [eax + 48] |
|
1713 movdqu xmm4, [eax + esi] |
|
1714 pavgb xmm0, xmm4 |
|
1715 movdqu xmm4, [eax + esi + 16] |
|
1716 pavgb xmm1, xmm4 |
|
1717 movdqu xmm4, [eax + esi + 32] |
|
1718 pavgb xmm2, xmm4 |
|
1719 movdqu xmm4, [eax + esi + 48] |
|
1720 pavgb xmm3, xmm4 |
|
1721 lea eax, [eax + 64] |
|
1722 movdqa xmm4, xmm0 |
|
1723 shufps xmm0, xmm1, 0x88 |
|
1724 shufps xmm4, xmm1, 0xdd |
|
1725 pavgb xmm0, xmm4 |
|
1726 movdqa xmm4, xmm2 |
|
1727 shufps xmm2, xmm3, 0x88 |
|
1728 shufps xmm4, xmm3, 0xdd |
|
1729 pavgb xmm2, xmm4 |
|
1730 |
|
1731 // step 2 - convert to U and V |
|
1732 // from here down is very similar to Y code except |
|
1733 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1734 movdqa xmm1, xmm0 |
|
1735 movdqa xmm3, xmm2 |
|
1736 pmaddubsw xmm0, xmm7 // U |
|
1737 pmaddubsw xmm2, xmm7 |
|
1738 pmaddubsw xmm1, xmm6 // V |
|
1739 pmaddubsw xmm3, xmm6 |
|
1740 phaddw xmm0, xmm2 |
|
1741 phaddw xmm1, xmm3 |
|
1742 psraw xmm0, 8 |
|
1743 psraw xmm1, 8 |
|
1744 packsswb xmm0, xmm1 |
|
1745 paddb xmm0, xmm5 // -> unsigned |
|
1746 |
|
1747 // step 3 - store 8 U and 8 V values |
|
1748 sub ecx, 16 |
|
1749 movlps qword ptr [edx], xmm0 // U |
|
1750 movhps qword ptr [edx + edi], xmm0 // V |
|
1751 lea edx, [edx + 8] |
|
1752 jg convertloop |
|
1753 |
|
1754 pop edi |
|
1755 pop esi |
|
1756 ret |
|
1757 } |
|
1758 } |
|
1759 |
|
1760 __declspec(naked) __declspec(align(16)) |
|
1761 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1762 uint8* dst_u, uint8* dst_v, int width) { |
|
1763 __asm { |
|
1764 push esi |
|
1765 push edi |
|
1766 mov eax, [esp + 8 + 4] // src_argb |
|
1767 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1768 mov edx, [esp + 8 + 12] // dst_u |
|
1769 mov edi, [esp + 8 + 16] // dst_v |
|
1770 mov ecx, [esp + 8 + 20] // pix |
|
1771 movdqa xmm7, kABGRToU |
|
1772 movdqa xmm6, kABGRToV |
|
1773 movdqa xmm5, kAddUV128 |
|
1774 sub edi, edx // stride from u to v |
|
1775 |
|
1776 align 4 |
|
1777 convertloop: |
|
1778 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1779 movdqa xmm0, [eax] |
|
1780 movdqa xmm1, [eax + 16] |
|
1781 movdqa xmm2, [eax + 32] |
|
1782 movdqa xmm3, [eax + 48] |
|
1783 pavgb xmm0, [eax + esi] |
|
1784 pavgb xmm1, [eax + esi + 16] |
|
1785 pavgb xmm2, [eax + esi + 32] |
|
1786 pavgb xmm3, [eax + esi + 48] |
|
1787 lea eax, [eax + 64] |
|
1788 movdqa xmm4, xmm0 |
|
1789 shufps xmm0, xmm1, 0x88 |
|
1790 shufps xmm4, xmm1, 0xdd |
|
1791 pavgb xmm0, xmm4 |
|
1792 movdqa xmm4, xmm2 |
|
1793 shufps xmm2, xmm3, 0x88 |
|
1794 shufps xmm4, xmm3, 0xdd |
|
1795 pavgb xmm2, xmm4 |
|
1796 |
|
1797 // step 2 - convert to U and V |
|
1798 // from here down is very similar to Y code except |
|
1799 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
|
1800 movdqa xmm1, xmm0 |
|
1801 movdqa xmm3, xmm2 |
|
1802 pmaddubsw xmm0, xmm7 // U |
|
1803 pmaddubsw xmm2, xmm7 |
|
1804 pmaddubsw xmm1, xmm6 // V |
|
1805 pmaddubsw xmm3, xmm6 |
|
1806 phaddw xmm0, xmm2 |
|
1807 phaddw xmm1, xmm3 |
|
1808 psraw xmm0, 8 |
|
1809 psraw xmm1, 8 |
|
1810 packsswb xmm0, xmm1 |
|
1811 paddb xmm0, xmm5 // -> unsigned |
|
1812 |
|
1813 // step 3 - store 8 U and 8 V values |
|
1814 sub ecx, 16 |
|
1815 movlps qword ptr [edx], xmm0 // U |
|
1816 movhps qword ptr [edx + edi], xmm0 // V |
|
1817 lea edx, [edx + 8] |
|
1818 jg convertloop |
|
1819 |
|
1820 pop edi |
|
1821 pop esi |
|
1822 ret |
|
1823 } |
|
1824 } |
|
1825 |
|
1826 __declspec(naked) __declspec(align(16)) |
|
1827 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1828 uint8* dst_u, uint8* dst_v, int width) { |
|
1829 __asm { |
|
1830 push esi |
|
1831 push edi |
|
1832 mov eax, [esp + 8 + 4] // src_argb |
|
1833 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1834 mov edx, [esp + 8 + 12] // dst_u |
|
1835 mov edi, [esp + 8 + 16] // dst_v |
|
1836 mov ecx, [esp + 8 + 20] // width |
|
1837 movdqa xmm7, kABGRToU |
|
1838 movdqa xmm6, kABGRToV |
|
1839 movdqa xmm5, kAddUV128 |
|
1840 sub edi, edx // stride from u to v |
|
1841 |
|
1842 align 4 |
|
1843 convertloop: |
|
1844 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1845 movdqu xmm0, [eax] |
|
1846 movdqu xmm1, [eax + 16] |
|
1847 movdqu xmm2, [eax + 32] |
|
1848 movdqu xmm3, [eax + 48] |
|
1849 movdqu xmm4, [eax + esi] |
|
1850 pavgb xmm0, xmm4 |
|
1851 movdqu xmm4, [eax + esi + 16] |
|
1852 pavgb xmm1, xmm4 |
|
1853 movdqu xmm4, [eax + esi + 32] |
|
1854 pavgb xmm2, xmm4 |
|
1855 movdqu xmm4, [eax + esi + 48] |
|
1856 pavgb xmm3, xmm4 |
|
1857 lea eax, [eax + 64] |
|
1858 movdqa xmm4, xmm0 |
|
1859 shufps xmm0, xmm1, 0x88 |
|
1860 shufps xmm4, xmm1, 0xdd |
|
1861 pavgb xmm0, xmm4 |
|
1862 movdqa xmm4, xmm2 |
|
1863 shufps xmm2, xmm3, 0x88 |
|
1864 shufps xmm4, xmm3, 0xdd |
|
1865 pavgb xmm2, xmm4 |
|
1866 |
|
1867 // step 2 - convert to U and V |
|
1868 // from here down is very similar to the Y code except |
|
1869 // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
|
1870 movdqa xmm1, xmm0 |
|
1871 movdqa xmm3, xmm2 |
|
1872 pmaddubsw xmm0, xmm7 // U |
|
1873 pmaddubsw xmm2, xmm7 |
|
1874 pmaddubsw xmm1, xmm6 // V |
|
1875 pmaddubsw xmm3, xmm6 |
|
1876 phaddw xmm0, xmm2 |
|
1877 phaddw xmm1, xmm3 |
|
1878 psraw xmm0, 8 |
|
1879 psraw xmm1, 8 |
|
1880 packsswb xmm0, xmm1 |
|
1881 paddb xmm0, xmm5 // -> unsigned |
|
1882 |
|
1883 // step 3 - store 8 U and 8 V values |
|
1884 sub ecx, 16 |
|
1885 movlps qword ptr [edx], xmm0 // U |
|
1886 movhps qword ptr [edx + edi], xmm0 // V |
|
1887 lea edx, [edx + 8] |
|
1888 jg convertloop |
|
1889 |
|
1890 pop edi |
|
1891 pop esi |
|
1892 ret |
|
1893 } |
|
1894 } |
|
1895 |
|
1896 __declspec(naked) __declspec(align(16)) |
|
1897 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1898 uint8* dst_u, uint8* dst_v, int width) { |
|
1899 __asm { |
|
1900 push esi |
|
1901 push edi |
|
1902 mov eax, [esp + 8 + 4] // src_argb |
|
1903 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1904 mov edx, [esp + 8 + 12] // dst_u |
|
1905 mov edi, [esp + 8 + 16] // dst_v |
|
1906 mov ecx, [esp + 8 + 20] // width |
|
1907 movdqa xmm7, kRGBAToU |
|
1908 movdqa xmm6, kRGBAToV |
|
1909 movdqa xmm5, kAddUV128 |
|
1910 sub edi, edx // stride from u to v |
|
1911 |
|
1912 align 4 |
|
1913 convertloop: |
|
1914 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1915 movdqa xmm0, [eax] |
|
1916 movdqa xmm1, [eax + 16] |
|
1917 movdqa xmm2, [eax + 32] |
|
1918 movdqa xmm3, [eax + 48] |
|
1919 pavgb xmm0, [eax + esi] |
|
1920 pavgb xmm1, [eax + esi + 16] |
|
1921 pavgb xmm2, [eax + esi + 32] |
|
1922 pavgb xmm3, [eax + esi + 48] |
|
1923 lea eax, [eax + 64] |
|
1924 movdqa xmm4, xmm0 |
|
1925 shufps xmm0, xmm1, 0x88 |
|
1926 shufps xmm4, xmm1, 0xdd |
|
1927 pavgb xmm0, xmm4 |
|
1928 movdqa xmm4, xmm2 |
|
1929 shufps xmm2, xmm3, 0x88 |
|
1930 shufps xmm4, xmm3, 0xdd |
|
1931 pavgb xmm2, xmm4 |
|
1932 |
|
1933 // step 2 - convert to U and V |
|
1934 // from here down is very similar to the Y code except |
|
1935 // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
|
1936 movdqa xmm1, xmm0 |
|
1937 movdqa xmm3, xmm2 |
|
1938 pmaddubsw xmm0, xmm7 // U |
|
1939 pmaddubsw xmm2, xmm7 |
|
1940 pmaddubsw xmm1, xmm6 // V |
|
1941 pmaddubsw xmm3, xmm6 |
|
1942 phaddw xmm0, xmm2 |
|
1943 phaddw xmm1, xmm3 |
|
1944 psraw xmm0, 8 |
|
1945 psraw xmm1, 8 |
|
1946 packsswb xmm0, xmm1 |
|
1947 paddb xmm0, xmm5 // -> unsigned |
|
1948 |
|
1949 // step 3 - store 8 U and 8 V values |
|
1950 sub ecx, 16 |
|
1951 movlps qword ptr [edx], xmm0 // U |
|
1952 movhps qword ptr [edx + edi], xmm0 // V |
|
1953 lea edx, [edx + 8] |
|
1954 jg convertloop |
|
1955 |
|
1956 pop edi |
|
1957 pop esi |
|
1958 ret |
|
1959 } |
|
1960 } |
|
1961 |
|
1962 __declspec(naked) __declspec(align(16)) |
|
1963 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
|
1964 uint8* dst_u, uint8* dst_v, int width) { |
|
1965 __asm { |
|
1966 push esi |
|
1967 push edi |
|
1968 mov eax, [esp + 8 + 4] // src_argb |
|
1969 mov esi, [esp + 8 + 8] // src_stride_argb |
|
1970 mov edx, [esp + 8 + 12] // dst_u |
|
1971 mov edi, [esp + 8 + 16] // dst_v |
|
1972 mov ecx, [esp + 8 + 20] // width |
|
1973 movdqa xmm7, kRGBAToU |
|
1974 movdqa xmm6, kRGBAToV |
|
1975 movdqa xmm5, kAddUV128 |
|
1976 sub edi, edx // stride from u to v |
|
1977 |
|
1978 align 4 |
|
1979 convertloop: |
|
1980 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
|
1981 movdqu xmm0, [eax] |
|
1982 movdqu xmm1, [eax + 16] |
|
1983 movdqu xmm2, [eax + 32] |
|
1984 movdqu xmm3, [eax + 48] |
|
1985 movdqu xmm4, [eax + esi] |
|
1986 pavgb xmm0, xmm4 |
|
1987 movdqu xmm4, [eax + esi + 16] |
|
1988 pavgb xmm1, xmm4 |
|
1989 movdqu xmm4, [eax + esi + 32] |
|
1990 pavgb xmm2, xmm4 |
|
1991 movdqu xmm4, [eax + esi + 48] |
|
1992 pavgb xmm3, xmm4 |
|
1993 lea eax, [eax + 64] |
|
1994 movdqa xmm4, xmm0 |
|
1995 shufps xmm0, xmm1, 0x88 |
|
1996 shufps xmm4, xmm1, 0xdd |
|
1997 pavgb xmm0, xmm4 |
|
1998 movdqa xmm4, xmm2 |
|
1999 shufps xmm2, xmm3, 0x88 |
|
2000 shufps xmm4, xmm3, 0xdd |
|
2001 pavgb xmm2, xmm4 |
|
2002 |
|
2003 // step 2 - convert to U and V |
|
2004 // from here down is very similar to the Y code except |
|
2005 // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
|
2006 movdqa xmm1, xmm0 |
|
2007 movdqa xmm3, xmm2 |
|
2008 pmaddubsw xmm0, xmm7 // U |
|
2009 pmaddubsw xmm2, xmm7 |
|
2010 pmaddubsw xmm1, xmm6 // V |
|
2011 pmaddubsw xmm3, xmm6 |
|
2012 phaddw xmm0, xmm2 |
|
2013 phaddw xmm1, xmm3 |
|
2014 psraw xmm0, 8 |
|
2015 psraw xmm1, 8 |
|
2016 packsswb xmm0, xmm1 |
|
2017 paddb xmm0, xmm5 // -> unsigned |
|
2018 |
|
2019 // step 3 - store 8 U and 8 V values |
|
2020 sub ecx, 16 |
|
2021 movlps qword ptr [edx], xmm0 // U |
|
2022 movhps qword ptr [edx + edi], xmm0 // V |
|
2023 lea edx, [edx + 8] |
|
2024 jg convertloop |
|
2025 |
|
2026 pop edi |
|
2027 pop esi |
|
2028 ret |
|
2029 } |
|
2030 } |
|
2031 #endif // HAS_ARGBTOYROW_SSSE3 |
|
2032 |
|
2033 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ |
|
2034 |
|
2035 #define UB 127 /* 2.018 * 64 = 129, clamped to int8 max 127 */ |
|
2036 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ |
|
2037 #define UR 0 |
|
2038 |
|
2039 #define VB 0 |
|
2040 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ |
|
2041 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ |
|
2042 |
|
2043 // Bias |
|
2044 #define BB (UB * 128 + VB * 128) |
|
2045 #define BG (UG * 128 + VG * 128) |
|
2046 #define BR (UR * 128 + VR * 128) |
|
2047 |
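// A scalar sketch of the fixed-point math the SIMD code below implements
// (illustrative only, per pixel):
//   b = Clamp(((u - 128) * UB + (v - 128) * VB + (y - 16) * YG) >> 6)
//   g = Clamp(((u - 128) * UG + (v - 128) * VG + (y - 16) * YG) >> 6)
//   r = Clamp(((u - 128) * UR + (v - 128) * VR + (y - 16) * YG) >> 6)
// pmaddubsw reads the interleaved UV bytes as unsigned 0..255, so the code
// multiplies first and then subtracts the BB/BG/BR biases (coefficient * 128)
// to recover the signed (u - 128) and (v - 128) terms.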
|
2048 #ifdef HAS_I422TOARGBROW_AVX2 |
|
2049 |
|
2050 static const lvec8 kUVToB_AVX = { |
|
2051 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, |
|
2052 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB |
|
2053 }; |
|
2054 static const lvec8 kUVToR_AVX = { |
|
2055 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, |
|
2056 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR |
|
2057 }; |
|
2058 static const lvec8 kUVToG_AVX = { |
|
2059 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
|
2060 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG |
|
2061 }; |
|
2062 static const lvec16 kYToRgb_AVX = { |
|
2063 YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG |
|
2064 }; |
|
2065 static const lvec16 kYSub16_AVX = { |
|
2066 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 |
|
2067 }; |
|
2068 static const lvec16 kUVBiasB_AVX = { |
|
2069 BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB |
|
2070 }; |
|
2071 static const lvec16 kUVBiasG_AVX = { |
|
2072 BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG |
|
2073 }; |
|
2074 static const lvec16 kUVBiasR_AVX = { |
|
2075 BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR |
|
2076 }; |
|
2077 |
|
2078 // 16 pixels |
|
2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
|
2080 __declspec(naked) __declspec(align(16)) |
|
2081 void I422ToARGBRow_AVX2(const uint8* y_buf, |
|
2082 const uint8* u_buf, |
|
2083 const uint8* v_buf, |
|
2084 uint8* dst_argb, |
|
2085 int width) { |
|
2086 __asm { |
|
2087 push esi |
|
2088 push edi |
|
2089 mov eax, [esp + 8 + 4] // Y |
|
2090 mov esi, [esp + 8 + 8] // U |
|
2091 mov edi, [esp + 8 + 12] // V |
|
2092 mov edx, [esp + 8 + 16] // argb |
|
2093 mov ecx, [esp + 8 + 20] // width |
|
2094 sub edi, esi |
|
2095 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
|
2096 vpxor ymm4, ymm4, ymm4 |
|
2097 |
|
2098 align 4 |
|
2099 convertloop: |
|
2100 vmovq xmm0, qword ptr [esi] // U |
|
2101 vmovq xmm1, qword ptr [esi + edi] // V |
|
2102 lea esi, [esi + 8] |
|
2103 vpunpcklbw ymm0, ymm0, ymm1 // UV |
|
2104 vpermq ymm0, ymm0, 0xd8 |
|
2105 vpunpcklwd ymm0, ymm0, ymm0 // UVUV |
|
2106 vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV |
|
2107 vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV |
|
2108 vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV |
|
2109 vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed |
|
2110 vpsubw ymm1, ymm1, kUVBiasG_AVX |
|
2111 vpsubw ymm0, ymm0, kUVBiasR_AVX |
|
2112 |
|
2113 // Step 2: Find Y contribution to 16 R,G,B values |
|
2114 vmovdqu xmm3, [eax] // NOLINT |
|
2115 lea eax, [eax + 16] |
|
2116 vpermq ymm3, ymm3, 0xd8 |
|
2117 vpunpcklbw ymm3, ymm3, ymm4 |
|
2118 vpsubsw ymm3, ymm3, kYSub16_AVX |
|
2119 vpmullw ymm3, ymm3, kYToRgb_AVX |
|
2120 vpaddsw ymm2, ymm2, ymm3 // B += Y |
|
2121 vpaddsw ymm1, ymm1, ymm3 // G += Y |
|
2122 vpaddsw ymm0, ymm0, ymm3 // R += Y |
|
2123 vpsraw ymm2, ymm2, 6 |
|
2124 vpsraw ymm1, ymm1, 6 |
|
2125 vpsraw ymm0, ymm0, 6 |
|
2126 vpackuswb ymm2, ymm2, ymm2 // B |
|
2127 vpackuswb ymm1, ymm1, ymm1 // G |
|
2128 vpackuswb ymm0, ymm0, ymm0 // R |
|
2129 |
|
2130 // Step 3: Weave into ARGB |
|
2131 vpunpcklbw ymm2, ymm2, ymm1 // BG |
|
2132 vpermq ymm2, ymm2, 0xd8 |
|
2133 vpunpcklbw ymm0, ymm0, ymm5 // RA |
|
2134 vpermq ymm0, ymm0, 0xd8 |
|
2135 vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels |
|
2136 vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels |
|
2137 vmovdqu [edx], ymm1 |
|
2138 vmovdqu [edx + 32], ymm2 |
|
2139 lea edx, [edx + 64] |
|
2140 sub ecx, 16 |
|
2141 jg convertloop |
|
2142 vzeroupper |
|
2143 |
|
2144 pop edi |
|
2145 pop esi |
|
2146 ret |
|
2147 } |
|
2148 } |
|
2149 #endif // HAS_I422TOARGBROW_AVX2 |
|
2150 |
|
2151 #ifdef HAS_I422TOARGBROW_SSSE3 |
|
2152 |
|
2153 static const vec8 kUVToB = { |
|
2154 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB |
|
2155 }; |
|
2156 |
|
2157 static const vec8 kUVToR = { |
|
2158 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR |
|
2159 }; |
|
2160 |
|
2161 static const vec8 kUVToG = { |
|
2162 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG |
|
2163 }; |
|
2164 |
|
2165 static const vec8 kVUToB = { |
|
2166 VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, |
|
2167 }; |
|
2168 |
|
2169 static const vec8 kVUToR = { |
|
2170 VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, |
|
2171 }; |
|
2172 |
|
2173 static const vec8 kVUToG = { |
|
2174 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
|
2175 }; |
|
2176 |
|
2177 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; |
|
2178 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; |
|
2179 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; |
|
2180 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; |
|
2181 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; |
|
2182 |
|
2183 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
|
2184 |
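// Register contract for the helpers below: the READ* macros load interleaved
// UV into xmm0 and advance the UV pointer; YUVTORGB/YVUTORGB expect xmm4 == 0
// and leave 8 B values in xmm0, 8 G values in xmm1 and 8 R values in xmm2
// (xmm3 is clobbered), ready for the per-format weave step.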
|
2185 // Read 8 UV from 444. |
|
2186 #define READYUV444 __asm { \ |
|
2187 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
|
2188 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ |
|
2189 __asm lea esi, [esi + 8] \ |
|
2190 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
|
2191 } |
|
2192 |
|
2193 // Read 4 UV from 422, upsample to 8 UV. |
|
2194 #define READYUV422 __asm { \ |
|
2195 __asm movd xmm0, [esi] /* U */ \ |
|
2196 __asm movd xmm1, [esi + edi] /* V */ \ |
|
2197 __asm lea esi, [esi + 4] \ |
|
2198 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
|
2199 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
|
2200 } |
|
2201 |
|
2202 // Read 2 UV from 411, upsample to 8 UV. |
|
2203 #define READYUV411 __asm { \ |
|
2204 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ |
|
2205 __asm movd xmm0, ebx \ |
|
2206 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ |
|
2207 __asm movd xmm1, ebx \ |
|
2208 __asm lea esi, [esi + 2] \ |
|
2209 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
|
2210 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
|
2211 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ |
|
2212 } |
|
2213 |
|
2214 // Read 4 UV from NV12, upsample to 8 UV. |
|
2215 #define READNV12 __asm { \ |
|
2216 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ |
|
2217 __asm lea esi, [esi + 8] \ |
|
2218 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
|
2219 } |
|
2220 |
|
2221 // Convert 8 pixels: 8 UV and 8 Y. |
|
2222 #define YUVTORGB __asm { \ |
|
2223 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ |
|
2224 __asm movdqa xmm1, xmm0 \ |
|
2225 __asm movdqa xmm2, xmm0 \ |
|
2226 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ |
|
2227 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ |
|
2228 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ |
|
2229 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ |
|
2230 __asm psubw xmm1, kUVBiasG \ |
|
2231 __asm psubw xmm2, kUVBiasR \ |
|
2232 /* Step 2: Find Y contribution to 8 R,G,B values */ \ |
|
2233 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ |
|
2234 __asm lea eax, [eax + 8] \ |
|
2235 __asm punpcklbw xmm3, xmm4 \ |
|
2236 __asm psubsw xmm3, kYSub16 \ |
|
2237 __asm pmullw xmm3, kYToRgb \ |
|
2238 __asm paddsw xmm0, xmm3 /* B += Y */ \ |
|
2239 __asm paddsw xmm1, xmm3 /* G += Y */ \ |
|
2240 __asm paddsw xmm2, xmm3 /* R += Y */ \ |
|
2241 __asm psraw xmm0, 6 \ |
|
2242 __asm psraw xmm1, 6 \ |
|
2243 __asm psraw xmm2, 6 \ |
|
2244 __asm packuswb xmm0, xmm0 /* B */ \ |
|
2245 __asm packuswb xmm1, xmm1 /* G */ \ |
|
2246 __asm packuswb xmm2, xmm2 /* R */ \ |
|
2247 } |
|
2248 |
|
2249 // Convert 8 pixels: 8 VU and 8 Y. |
|
2250 #define YVUTORGB __asm { \ |
|
2251 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ |
|
2252 __asm movdqa xmm1, xmm0 \ |
|
2253 __asm movdqa xmm2, xmm0 \ |
|
2254 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ |
|
2255 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ |
|
2256 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ |
|
2257 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ |
|
2258 __asm psubw xmm1, kUVBiasG \ |
|
2259 __asm psubw xmm2, kUVBiasR \ |
|
2260 /* Step 2: Find Y contribution to 8 R,G,B values */ \ |
|
2261 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ |
|
2262 __asm lea eax, [eax + 8] \ |
|
2263 __asm punpcklbw xmm3, xmm4 \ |
|
2264 __asm psubsw xmm3, kYSub16 \ |
|
2265 __asm pmullw xmm3, kYToRgb \ |
|
2266 __asm paddsw xmm0, xmm3 /* B += Y */ \ |
|
2267 __asm paddsw xmm1, xmm3 /* G += Y */ \ |
|
2268 __asm paddsw xmm2, xmm3 /* R += Y */ \ |
|
2269 __asm psraw xmm0, 6 \ |
|
2270 __asm psraw xmm1, 6 \ |
|
2271 __asm psraw xmm2, 6 \ |
|
2272 __asm packuswb xmm0, xmm0 /* B */ \ |
|
2273 __asm packuswb xmm1, xmm1 /* G */ \ |
|
2274 __asm packuswb xmm2, xmm2 /* R */ \ |
|
2275 } |
|
2276 |
|
2277 // 8 pixels, dest aligned 16. |
|
2278 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2279 __declspec(naked) __declspec(align(16)) |
|
2280 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
|
2281 const uint8* u_buf, |
|
2282 const uint8* v_buf, |
|
2283 uint8* dst_argb, |
|
2284 int width) { |
|
2285 __asm { |
|
2286 push esi |
|
2287 push edi |
|
2288 mov eax, [esp + 8 + 4] // Y |
|
2289 mov esi, [esp + 8 + 8] // U |
|
2290 mov edi, [esp + 8 + 12] // V |
|
2291 mov edx, [esp + 8 + 16] // argb |
|
2292 mov ecx, [esp + 8 + 20] // width |
|
2293 sub edi, esi |
|
2294 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2295 pxor xmm4, xmm4 |
|
2296 |
|
2297 align 4 |
|
2298 convertloop: |
|
2299 READYUV444 |
|
2300 YUVTORGB |
|
2301 |
|
2302 // Step 3: Weave into ARGB |
|
2303 punpcklbw xmm0, xmm1 // BG |
|
2304 punpcklbw xmm2, xmm5 // RA |
|
2305 movdqa xmm1, xmm0 |
|
2306 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2307 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2308 movdqa [edx], xmm0 |
|
2309 movdqa [edx + 16], xmm1 |
|
2310 lea edx, [edx + 32] |
|
2311 sub ecx, 8 |
|
2312 jg convertloop |
|
2313 |
|
2314 pop edi |
|
2315 pop esi |
|
2316 ret |
|
2317 } |
|
2318 } |
|
2319 |
|
2320 // 8 pixels. |
|
2321 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
|
2322 __declspec(naked) __declspec(align(16)) |
|
2323 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
|
2324 const uint8* u_buf, |
|
2325 const uint8* v_buf, |
|
2326 uint8* dst_rgb24, |
|
2327 int width) { |
|
2328 __asm { |
|
2329 push esi |
|
2330 push edi |
|
2331 mov eax, [esp + 8 + 4] // Y |
|
2332 mov esi, [esp + 8 + 8] // U |
|
2333 mov edi, [esp + 8 + 12] // V |
|
2334 mov edx, [esp + 8 + 16] // rgb24 |
|
2335 mov ecx, [esp + 8 + 20] // width |
|
2336 sub edi, esi |
|
2337 pxor xmm4, xmm4 |
|
2338 movdqa xmm5, kShuffleMaskARGBToRGB24_0 |
|
2339 movdqa xmm6, kShuffleMaskARGBToRGB24 |
|
2340 |
|
2341 align 4 |
|
2342 convertloop: |
|
2343 READYUV422 |
|
2344 YUVTORGB |
|
2345 |
|
2346 // Step 3: Weave into RRGB |
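// punpcklbw below duplicates R into the alpha position, so each dword holds
// B,G,R,R; the two shuffle masks then drop the duplicate byte and palignr
// splices the results into 24 contiguous bytes (8 three-byte pixels) for the
// stores that follow.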
|
2347 punpcklbw xmm0, xmm1 // BG |
|
2348 punpcklbw xmm2, xmm2 // RR |
|
2349 movdqa xmm1, xmm0 |
|
2350 punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
|
2351 punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
|
2352 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. |
|
2353 pshufb xmm1, xmm6 // Pack into first 12 bytes. |
|
2354 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 |
|
2355 movq qword ptr [edx], xmm0 // First 8 bytes |
|
2356 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. |
|
2357 lea edx, [edx + 24] |
|
2358 sub ecx, 8 |
|
2359 jg convertloop |
|
2360 |
|
2361 pop edi |
|
2362 pop esi |
|
2363 ret |
|
2364 } |
|
2365 } |
|
2366 |
|
2367 // 8 pixels. |
|
2368 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
|
2369 __declspec(naked) __declspec(align(16)) |
|
2370 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
|
2371 const uint8* u_buf, |
|
2372 const uint8* v_buf, |
|
2373 uint8* dst_raw, |
|
2374 int width) { |
|
2375 __asm { |
|
2376 push esi |
|
2377 push edi |
|
2378 mov eax, [esp + 8 + 4] // Y |
|
2379 mov esi, [esp + 8 + 8] // U |
|
2380 mov edi, [esp + 8 + 12] // V |
|
2381 mov edx, [esp + 8 + 16] // raw |
|
2382 mov ecx, [esp + 8 + 20] // width |
|
2383 sub edi, esi |
|
2384 pxor xmm4, xmm4 |
|
2385 movdqa xmm5, kShuffleMaskARGBToRAW_0 |
|
2386 movdqa xmm6, kShuffleMaskARGBToRAW |
|
2387 |
|
2388 align 4 |
|
2389 convertloop: |
|
2390 READYUV422 |
|
2391 YUVTORGB |
|
2392 |
|
2393 // Step 3: Weave into RRGB |
|
2394 punpcklbw xmm0, xmm1 // BG |
|
2395 punpcklbw xmm2, xmm2 // RR |
|
2396 movdqa xmm1, xmm0 |
|
2397 punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
|
2398 punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
|
2399 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. |
|
2400 pshufb xmm1, xmm6 // Pack into first 12 bytes. |
|
2401 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 |
|
2402 movq qword ptr [edx], xmm0 // First 8 bytes |
|
2403 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. |
|
2404 lea edx, [edx + 24] |
|
2405 sub ecx, 8 |
|
2406 jg convertloop |
|
2407 |
|
2408 pop edi |
|
2409 pop esi |
|
2410 ret |
|
2411 } |
|
2412 } |
|
2413 |
|
2414 // 8 pixels, dest unaligned. |
|
2415 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
|
2416 __declspec(naked) __declspec(align(16)) |
|
2417 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
|
2418 const uint8* u_buf, |
|
2419 const uint8* v_buf, |
|
2420 uint8* rgb565_buf, |
|
2421 int width) { |
|
2422 __asm { |
|
2423 push esi |
|
2424 push edi |
|
2425 mov eax, [esp + 8 + 4] // Y |
|
2426 mov esi, [esp + 8 + 8] // U |
|
2427 mov edi, [esp + 8 + 12] // V |
|
2428 mov edx, [esp + 8 + 16] // rgb565 |
|
2429 mov ecx, [esp + 8 + 20] // width |
|
2430 sub edi, esi |
|
2431 pxor xmm4, xmm4 |
|
2432 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
|
2433 psrld xmm5, 27 |
|
2434 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
|
2435 psrld xmm6, 26 |
|
2436 pslld xmm6, 5 |
|
2437 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
|
2438 pslld xmm7, 11 |
|
2439 |
|
2440 align 4 |
|
2441 convertloop: |
|
2442 READYUV422 |
|
2443 YUVTORGB |
|
2444 |
|
2445 // Step 3: Weave into RRGB |
|
2446 punpcklbw xmm0, xmm1 // BG |
|
2447 punpcklbw xmm2, xmm2 // RR |
|
2448 movdqa xmm1, xmm0 |
|
2449 punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
|
2450 punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
|
2451 |
|
2452 // Step 3b: RRGB -> RGB565 |
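// Each dword lane holds one pixel as B,G,R,R bytes. B is shifted right 3 into
// bits 0..4, G right 5 and masked into bits 5..10, and R is moved into bits
// 11..15; packssdw then narrows each lane to a single 16 bit RGB565 value.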
|
2453 movdqa xmm3, xmm0 // B first 4 pixels of argb |
|
2454 movdqa xmm2, xmm0 // G |
|
2455 pslld xmm0, 8 // R |
|
2456 psrld xmm3, 3 // B |
|
2457 psrld xmm2, 5 // G |
|
2458 psrad xmm0, 16 // R |
|
2459 pand xmm3, xmm5 // B |
|
2460 pand xmm2, xmm6 // G |
|
2461 pand xmm0, xmm7 // R |
|
2462 por xmm3, xmm2 // BG |
|
2463 por xmm0, xmm3 // BGR |
|
2464 movdqa xmm3, xmm1 // B next 4 pixels of argb |
|
2465 movdqa xmm2, xmm1 // G |
|
2466 pslld xmm1, 8 // R |
|
2467 psrld xmm3, 3 // B |
|
2468 psrld xmm2, 5 // G |
|
2469 psrad xmm1, 16 // R |
|
2470 pand xmm3, xmm5 // B |
|
2471 pand xmm2, xmm6 // G |
|
2472 pand xmm1, xmm7 // R |
|
2473 por xmm3, xmm2 // BG |
|
2474 por xmm1, xmm3 // BGR |
|
2475 packssdw xmm0, xmm1 |
|
2476 sub ecx, 8 |
|
2477 movdqu [edx], xmm0 // store 8 pixels of RGB565 |
|
2478 lea edx, [edx + 16] |
|
2479 jg convertloop |
|
2480 |
|
2481 pop edi |
|
2482 pop esi |
|
2483 ret |
|
2484 } |
|
2485 } |
|
2486 |
|
2487 // 8 pixels, dest aligned 16. |
|
2488 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2489 __declspec(naked) __declspec(align(16)) |
|
2490 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
|
2491 const uint8* u_buf, |
|
2492 const uint8* v_buf, |
|
2493 uint8* dst_argb, |
|
2494 int width) { |
|
2495 __asm { |
|
2496 push esi |
|
2497 push edi |
|
2498 mov eax, [esp + 8 + 4] // Y |
|
2499 mov esi, [esp + 8 + 8] // U |
|
2500 mov edi, [esp + 8 + 12] // V |
|
2501 mov edx, [esp + 8 + 16] // argb |
|
2502 mov ecx, [esp + 8 + 20] // width |
|
2503 sub edi, esi |
|
2504 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2505 pxor xmm4, xmm4 |
|
2506 |
|
2507 align 4 |
|
2508 convertloop: |
|
2509 READYUV422 |
|
2510 YUVTORGB |
|
2511 |
|
2512 // Step 3: Weave into ARGB |
|
2513 punpcklbw xmm0, xmm1 // BG |
|
2514 punpcklbw xmm2, xmm5 // RA |
|
2515 movdqa xmm1, xmm0 |
|
2516 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2517 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2518 movdqa [edx], xmm0 |
|
2519 movdqa [edx + 16], xmm1 |
|
2520 lea edx, [edx + 32] |
|
2521 sub ecx, 8 |
|
2522 jg convertloop |
|
2523 |
|
2524 pop edi |
|
2525 pop esi |
|
2526 ret |
|
2527 } |
|
2528 } |
|
2529 |
|
2530 // 8 pixels, dest aligned 16. |
|
2531 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2532 // Similar to I420 but duplicate UV once more. |
|
2533 __declspec(naked) __declspec(align(16)) |
|
2534 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
|
2535 const uint8* u_buf, |
|
2536 const uint8* v_buf, |
|
2537 uint8* dst_argb, |
|
2538 int width) { |
|
2539 __asm { |
|
2540 push ebx |
|
2541 push esi |
|
2542 push edi |
|
2543 mov eax, [esp + 12 + 4] // Y |
|
2544 mov esi, [esp + 12 + 8] // U |
|
2545 mov edi, [esp + 12 + 12] // V |
|
2546 mov edx, [esp + 12 + 16] // argb |
|
2547 mov ecx, [esp + 12 + 20] // width |
|
2548 sub edi, esi |
|
2549 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2550 pxor xmm4, xmm4 |
|
2551 |
|
2552 align 4 |
|
2553 convertloop: |
|
2554 READYUV411 // modifies EBX |
|
2555 YUVTORGB |
|
2556 |
|
2557 // Step 3: Weave into ARGB |
|
2558 punpcklbw xmm0, xmm1 // BG |
|
2559 punpcklbw xmm2, xmm5 // RA |
|
2560 movdqa xmm1, xmm0 |
|
2561 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2562 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2563 movdqa [edx], xmm0 |
|
2564 movdqa [edx + 16], xmm1 |
|
2565 lea edx, [edx + 32] |
|
2566 sub ecx, 8 |
|
2567 jg convertloop |
|
2568 |
|
2569 pop edi |
|
2570 pop esi |
|
2571 pop ebx |
|
2572 ret |
|
2573 } |
|
2574 } |
|
2575 |
|
2576 // 8 pixels, dest aligned 16. |
|
2577 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2578 __declspec(naked) __declspec(align(16)) |
|
2579 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
|
2580 const uint8* uv_buf, |
|
2581 uint8* dst_argb, |
|
2582 int width) { |
|
2583 __asm { |
|
2584 push esi |
|
2585 mov eax, [esp + 4 + 4] // Y |
|
2586 mov esi, [esp + 4 + 8] // UV |
|
2587 mov edx, [esp + 4 + 12] // argb |
|
2588 mov ecx, [esp + 4 + 16] // width |
|
2589 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2590 pxor xmm4, xmm4 |
|
2591 |
|
2592 align 4 |
|
2593 convertloop: |
|
2594 READNV12 |
|
2595 YUVTORGB |
|
2596 |
|
2597 // Step 3: Weave into ARGB |
|
2598 punpcklbw xmm0, xmm1 // BG |
|
2599 punpcklbw xmm2, xmm5 // RA |
|
2600 movdqa xmm1, xmm0 |
|
2601 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2602 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2603 movdqa [edx], xmm0 |
|
2604 movdqa [edx + 16], xmm1 |
|
2605 lea edx, [edx + 32] |
|
2606 sub ecx, 8 |
|
2607 jg convertloop |
|
2608 |
|
2609 pop esi |
|
2610 ret |
|
2611 } |
|
2612 } |
|
2613 |
|
2614 // 8 pixels, dest aligned 16. |
|
2615 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2616 __declspec(naked) __declspec(align(16)) |
|
2617 void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
|
2618 const uint8* uv_buf, |
|
2619 uint8* dst_argb, |
|
2620 int width) { |
|
2621 __asm { |
|
2622 push esi |
|
2623 mov eax, [esp + 4 + 4] // Y |
|
2624 mov esi, [esp + 4 + 8] // VU |
|
2625 mov edx, [esp + 4 + 12] // argb |
|
2626 mov ecx, [esp + 4 + 16] // width |
|
2627 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2628 pxor xmm4, xmm4 |
|
2629 |
|
2630 align 4 |
|
2631 convertloop: |
|
2632 READNV12 |
|
2633 YVUTORGB |
|
2634 |
|
2635 // Step 3: Weave into ARGB |
|
2636 punpcklbw xmm0, xmm1 // BG |
|
2637 punpcklbw xmm2, xmm5 // RA |
|
2638 movdqa xmm1, xmm0 |
|
2639 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2640 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2641 movdqa [edx], xmm0 |
|
2642 movdqa [edx + 16], xmm1 |
|
2643 lea edx, [edx + 32] |
|
2644 sub ecx, 8 |
|
2645 jg convertloop |
|
2646 |
|
2647 pop esi |
|
2648 ret |
|
2649 } |
|
2650 } |
|
2651 |
|
2652 // 8 pixels, unaligned. |
|
2653 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2654 __declspec(naked) __declspec(align(16)) |
|
2655 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2656 const uint8* u_buf, |
|
2657 const uint8* v_buf, |
|
2658 uint8* dst_argb, |
|
2659 int width) { |
|
2660 __asm { |
|
2661 push esi |
|
2662 push edi |
|
2663 mov eax, [esp + 8 + 4] // Y |
|
2664 mov esi, [esp + 8 + 8] // U |
|
2665 mov edi, [esp + 8 + 12] // V |
|
2666 mov edx, [esp + 8 + 16] // argb |
|
2667 mov ecx, [esp + 8 + 20] // width |
|
2668 sub edi, esi |
|
2669 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2670 pxor xmm4, xmm4 |
|
2671 |
|
2672 align 4 |
|
2673 convertloop: |
|
2674 READYUV444 |
|
2675 YUVTORGB |
|
2676 |
|
2677 // Step 3: Weave into ARGB |
|
2678 punpcklbw xmm0, xmm1 // BG |
|
2679 punpcklbw xmm2, xmm5 // RA |
|
2680 movdqa xmm1, xmm0 |
|
2681 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2682 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2683 movdqu [edx], xmm0 |
|
2684 movdqu [edx + 16], xmm1 |
|
2685 lea edx, [edx + 32] |
|
2686 sub ecx, 8 |
|
2687 jg convertloop |
|
2688 |
|
2689 pop edi |
|
2690 pop esi |
|
2691 ret |
|
2692 } |
|
2693 } |
|
2694 |
|
2695 // 8 pixels, unaligned. |
|
2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2697 __declspec(naked) __declspec(align(16)) |
|
2698 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2699 const uint8* u_buf, |
|
2700 const uint8* v_buf, |
|
2701 uint8* dst_argb, |
|
2702 int width) { |
|
2703 __asm { |
|
2704 push esi |
|
2705 push edi |
|
2706 mov eax, [esp + 8 + 4] // Y |
|
2707 mov esi, [esp + 8 + 8] // U |
|
2708 mov edi, [esp + 8 + 12] // V |
|
2709 mov edx, [esp + 8 + 16] // argb |
|
2710 mov ecx, [esp + 8 + 20] // width |
|
2711 sub edi, esi |
|
2712 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2713 pxor xmm4, xmm4 |
|
2714 |
|
2715 align 4 |
|
2716 convertloop: |
|
2717 READYUV422 |
|
2718 YUVTORGB |
|
2719 |
|
2720 // Step 3: Weave into ARGB |
|
2721 punpcklbw xmm0, xmm1 // BG |
|
2722 punpcklbw xmm2, xmm5 // RA |
|
2723 movdqa xmm1, xmm0 |
|
2724 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2725 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2726 movdqu [edx], xmm0 |
|
2727 movdqu [edx + 16], xmm1 |
|
2728 lea edx, [edx + 32] |
|
2729 sub ecx, 8 |
|
2730 jg convertloop |
|
2731 |
|
2732 pop edi |
|
2733 pop esi |
|
2734 ret |
|
2735 } |
|
2736 } |
|
2737 |
|
2738 // 8 pixels, unaligned. |
|
2739 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2740 // Similar to I420 but duplicate UV once more. |
|
2741 __declspec(naked) __declspec(align(16)) |
|
2742 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2743 const uint8* u_buf, |
|
2744 const uint8* v_buf, |
|
2745 uint8* dst_argb, |
|
2746 int width) { |
|
2747 __asm { |
|
2748 push ebx |
|
2749 push esi |
|
2750 push edi |
|
2751 mov eax, [esp + 12 + 4] // Y |
|
2752 mov esi, [esp + 12 + 8] // U |
|
2753 mov edi, [esp + 12 + 12] // V |
|
2754 mov edx, [esp + 12 + 16] // argb |
|
2755 mov ecx, [esp + 12 + 20] // width |
|
2756 sub edi, esi |
|
2757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2758 pxor xmm4, xmm4 |
|
2759 |
|
2760 align 4 |
|
2761 convertloop: |
|
2762 READYUV411 // modifies EBX |
|
2763 YUVTORGB |
|
2764 |
|
2765 // Step 3: Weave into ARGB |
|
2766 punpcklbw xmm0, xmm1 // BG |
|
2767 punpcklbw xmm2, xmm5 // RA |
|
2768 movdqa xmm1, xmm0 |
|
2769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2771 movdqu [edx], xmm0 |
|
2772 movdqu [edx + 16], xmm1 |
|
2773 lea edx, [edx + 32] |
|
2774 sub ecx, 8 |
|
2775 jg convertloop |
|
2776 |
|
2777 pop edi |
|
2778 pop esi |
|
2779 pop ebx |
|
2780 ret |
|
2781 } |
|
2782 } |
|
2783 |
|
2784 // 8 pixels, dest unaligned. |
|
2785 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2786 __declspec(naked) __declspec(align(16)) |
|
2787 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2788 const uint8* uv_buf, |
|
2789 uint8* dst_argb, |
|
2790 int width) { |
|
2791 __asm { |
|
2792 push esi |
|
2793 mov eax, [esp + 4 + 4] // Y |
|
2794 mov esi, [esp + 4 + 8] // UV |
|
2795 mov edx, [esp + 4 + 12] // argb |
|
2796 mov ecx, [esp + 4 + 16] // width |
|
2797 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2798 pxor xmm4, xmm4 |
|
2799 |
|
2800 align 4 |
|
2801 convertloop: |
|
2802 READNV12 |
|
2803 YUVTORGB |
|
2804 |
|
2805 // Step 3: Weave into ARGB |
|
2806 punpcklbw xmm0, xmm1 // BG |
|
2807 punpcklbw xmm2, xmm5 // RA |
|
2808 movdqa xmm1, xmm0 |
|
2809 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2810 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2811 movdqu [edx], xmm0 |
|
2812 movdqu [edx + 16], xmm1 |
|
2813 lea edx, [edx + 32] |
|
2814 sub ecx, 8 |
|
2815 jg convertloop |
|
2816 |
|
2817 pop esi |
|
2818 ret |
|
2819 } |
|
2820 } |
|
2821 |
|
2822 // 8 pixels, dest unaligned. |
|
2823 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
|
2824 __declspec(naked) __declspec(align(16)) |
|
2825 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2826 const uint8* uv_buf, |
|
2827 uint8* dst_argb, |
|
2828 int width) { |
|
2829 __asm { |
|
2830 push esi |
|
2831 mov eax, [esp + 4 + 4] // Y |
|
2832 mov esi, [esp + 4 + 8] // VU |
|
2833 mov edx, [esp + 4 + 12] // argb |
|
2834 mov ecx, [esp + 4 + 16] // width |
|
2835 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2836 pxor xmm4, xmm4 |
|
2837 |
|
2838 align 4 |
|
2839 convertloop: |
|
2840 READNV12 |
|
2841 YVUTORGB |
|
2842 |
|
2843 // Step 3: Weave into ARGB |
|
2844 punpcklbw xmm0, xmm1 // BG |
|
2845 punpcklbw xmm2, xmm5 // RA |
|
2846 movdqa xmm1, xmm0 |
|
2847 punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
|
2848 punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
|
2849 movdqu [edx], xmm0 |
|
2850 movdqu [edx + 16], xmm1 |
|
2851 lea edx, [edx + 32] |
|
2852 sub ecx, 8 |
|
2853 jg convertloop |
|
2854 |
|
2855 pop esi |
|
2856 ret |
|
2857 } |
|
2858 } |
|
2859 |
|
2860 __declspec(naked) __declspec(align(16)) |
|
2861 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
|
2862 const uint8* u_buf, |
|
2863 const uint8* v_buf, |
|
2864 uint8* dst_bgra, |
|
2865 int width) { |
|
2866 __asm { |
|
2867 push esi |
|
2868 push edi |
|
2869 mov eax, [esp + 8 + 4] // Y |
|
2870 mov esi, [esp + 8 + 8] // U |
|
2871 mov edi, [esp + 8 + 12] // V |
|
2872 mov edx, [esp + 8 + 16] // bgra |
|
2873 mov ecx, [esp + 8 + 20] // width |
|
2874 sub edi, esi |
|
2875 pxor xmm4, xmm4 |
|
2876 |
|
2877 align 4 |
|
2878 convertloop: |
|
2879 READYUV422 |
|
2880 YUVTORGB |
|
2881 |
|
2882 // Step 3: Weave into BGRA |
|
2883 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2884 punpcklbw xmm1, xmm0 // GB |
|
2885 punpcklbw xmm5, xmm2 // AR |
|
2886 movdqa xmm0, xmm5 |
|
2887 punpcklwd xmm5, xmm1 // BGRA first 4 pixels |
|
2888 punpckhwd xmm0, xmm1 // BGRA next 4 pixels |
|
2889 movdqa [edx], xmm5 |
|
2890 movdqa [edx + 16], xmm0 |
|
2891 lea edx, [edx + 32] |
|
2892 sub ecx, 8 |
|
2893 jg convertloop |
|
2894 |
|
2895 pop edi |
|
2896 pop esi |
|
2897 ret |
|
2898 } |
|
2899 } |
|
2900 |
|
2901 __declspec(naked) __declspec(align(16)) |
|
2902 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, |
|
2903 const uint8* u_buf, |
|
2904 const uint8* v_buf, |
|
2905 uint8* dst_bgra, |
|
2906 int width) { |
|
2907 __asm { |
|
2908 push esi |
|
2909 push edi |
|
2910 mov eax, [esp + 8 + 4] // Y |
|
2911 mov esi, [esp + 8 + 8] // U |
|
2912 mov edi, [esp + 8 + 12] // V |
|
2913 mov edx, [esp + 8 + 16] // bgra |
|
2914 mov ecx, [esp + 8 + 20] // width |
|
2915 sub edi, esi |
|
2916 pxor xmm4, xmm4 |
|
2917 |
|
2918 align 4 |
|
2919 convertloop: |
|
2920 READYUV422 |
|
2921 YUVTORGB |
|
2922 |
|
2923 // Step 3: Weave into BGRA |
|
2924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2925 punpcklbw xmm1, xmm0 // GB |
|
2926 punpcklbw xmm5, xmm2 // AR |
|
2927 movdqa xmm0, xmm5 |
|
2928 punpcklwd xmm5, xmm1 // BGRA first 4 pixels |
|
2929 punpckhwd xmm0, xmm1 // BGRA next 4 pixels |
|
2930 movdqu [edx], xmm5 |
|
2931 movdqu [edx + 16], xmm0 |
|
2932 lea edx, [edx + 32] |
|
2933 sub ecx, 8 |
|
2934 jg convertloop |
|
2935 |
|
2936 pop edi |
|
2937 pop esi |
|
2938 ret |
|
2939 } |
|
2940 } |
|
2941 |
|
2942 __declspec(naked) __declspec(align(16)) |
|
2943 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
|
2944 const uint8* u_buf, |
|
2945 const uint8* v_buf, |
|
2946 uint8* dst_abgr, |
|
2947 int width) { |
|
2948 __asm { |
|
2949 push esi |
|
2950 push edi |
|
2951 mov eax, [esp + 8 + 4] // Y |
|
2952 mov esi, [esp + 8 + 8] // U |
|
2953 mov edi, [esp + 8 + 12] // V |
|
2954 mov edx, [esp + 8 + 16] // abgr |
|
2955 mov ecx, [esp + 8 + 20] // width |
|
2956 sub edi, esi |
|
2957 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2958 pxor xmm4, xmm4 |
|
2959 |
|
2960 align 4 |
|
2961 convertloop: |
|
2962 READYUV422 |
|
2963 YUVTORGB |
|
2964 |
|
2965 // Step 3: Weave into ABGR |
|
2966 punpcklbw xmm2, xmm1 // RG |
|
2967 punpcklbw xmm0, xmm5 // BA |
|
2968 movdqa xmm1, xmm2 |
|
2969 punpcklwd xmm2, xmm0 // RGBA first 4 pixels |
|
2970 punpckhwd xmm1, xmm0 // RGBA next 4 pixels |
|
2971 movdqa [edx], xmm2 |
|
2972 movdqa [edx + 16], xmm1 |
|
2973 lea edx, [edx + 32] |
|
2974 sub ecx, 8 |
|
2975 jg convertloop |
|
2976 |
|
2977 pop edi |
|
2978 pop esi |
|
2979 ret |
|
2980 } |
|
2981 } |
|
2982 |
|
2983 __declspec(naked) __declspec(align(16)) |
|
2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, |
|
2985 const uint8* u_buf, |
|
2986 const uint8* v_buf, |
|
2987 uint8* dst_abgr, |
|
2988 int width) { |
|
2989 __asm { |
|
2990 push esi |
|
2991 push edi |
|
2992 mov eax, [esp + 8 + 4] // Y |
|
2993 mov esi, [esp + 8 + 8] // U |
|
2994 mov edi, [esp + 8 + 12] // V |
|
2995 mov edx, [esp + 8 + 16] // abgr |
|
2996 mov ecx, [esp + 8 + 20] // width |
|
2997 sub edi, esi |
|
2998 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
2999 pxor xmm4, xmm4 |
|
3000 |
|
3001 align 4 |
|
3002 convertloop: |
|
3003 READYUV422 |
|
3004 YUVTORGB |
|
3005 |
|
3006 // Step 3: Weave into ABGR |
|
3007 punpcklbw xmm2, xmm1 // RG |
|
3008 punpcklbw xmm0, xmm5 // BA |
|
3009 movdqa xmm1, xmm2 |
|
3010 punpcklwd xmm2, xmm0 // RGBA first 4 pixels |
|
3011 punpckhwd xmm1, xmm0 // RGBA next 4 pixels |
|
3012 movdqu [edx], xmm2 |
|
3013 movdqu [edx + 16], xmm1 |
|
3014 lea edx, [edx + 32] |
|
3015 sub ecx, 8 |
|
3016 jg convertloop |
|
3017 |
|
3018 pop edi |
|
3019 pop esi |
|
3020 ret |
|
3021 } |
|
3022 } |
|
3023 |
|
3024 __declspec(naked) __declspec(align(16)) |
|
3025 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
|
3026 const uint8* u_buf, |
|
3027 const uint8* v_buf, |
|
3028 uint8* dst_rgba, |
|
3029 int width) { |
|
3030 __asm { |
|
3031 push esi |
|
3032 push edi |
|
3033 mov eax, [esp + 8 + 4] // Y |
|
3034 mov esi, [esp + 8 + 8] // U |
|
3035 mov edi, [esp + 8 + 12] // V |
|
3036 mov edx, [esp + 8 + 16] // rgba |
|
3037 mov ecx, [esp + 8 + 20] // width |
|
3038 sub edi, esi |
|
3039 pxor xmm4, xmm4 |
|
3040 |
|
3041 align 4 |
|
3042 convertloop: |
|
3043 READYUV422 |
|
3044 YUVTORGB |
|
3045 |
|
3046 // Step 3: Weave into RGBA |
|
3047 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
3048 punpcklbw xmm1, xmm2 // GR |
|
3049 punpcklbw xmm5, xmm0 // AB |
|
3050 movdqa xmm0, xmm5 |
|
3051 punpcklwd xmm5, xmm1 // RGBA first 4 pixels |
|
3052 punpckhwd xmm0, xmm1 // RGBA next 4 pixels |
|
3053 movdqa [edx], xmm5 |
|
3054 movdqa [edx + 16], xmm0 |
|
3055 lea edx, [edx + 32] |
|
3056 sub ecx, 8 |
|
3057 jg convertloop |
|
3058 |
|
3059 pop edi |
|
3060 pop esi |
|
3061 ret |
|
3062 } |
|
3063 } |
|
3064 |
|
3065 __declspec(naked) __declspec(align(16)) |
|
3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, |
|
3067 const uint8* u_buf, |
|
3068 const uint8* v_buf, |
|
3069 uint8* dst_rgba, |
|
3070 int width) { |
|
3071 __asm { |
|
3072 push esi |
|
3073 push edi |
|
3074 mov eax, [esp + 8 + 4] // Y |
|
3075 mov esi, [esp + 8 + 8] // U |
|
3076 mov edi, [esp + 8 + 12] // V |
|
3077 mov edx, [esp + 8 + 16] // rgba |
|
3078 mov ecx, [esp + 8 + 20] // width |
|
3079 sub edi, esi |
|
3080 pxor xmm4, xmm4 |
|
3081 |
|
3082 align 4 |
|
3083 convertloop: |
|
3084 READYUV422 |
|
3085 YUVTORGB |
|
3086 |
|
3087 // Step 3: Weave into RGBA |
|
3088 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
|
3089 punpcklbw xmm1, xmm2 // GR |
|
3090 punpcklbw xmm5, xmm0 // AB |
|
3091 movdqa xmm0, xmm5 |
|
3092 punpcklwd xmm5, xmm1 // RGBA first 4 pixels |
|
3093 punpckhwd xmm0, xmm1 // RGBA next 4 pixels |
|
3094 movdqu [edx], xmm5 |
|
3095 movdqu [edx + 16], xmm0 |
|
3096 lea edx, [edx + 32] |
|
3097 sub ecx, 8 |
|
3098 jg convertloop |
|
3099 |
|
3100 pop edi |
|
3101 pop esi |
|
3102 ret |
|
3103 } |
|
3104 } |
|
3105 |
|
3106 #endif // HAS_I422TOARGBROW_SSSE3 |
|
3107 |
|
3108 #ifdef HAS_YTOARGBROW_SSE2 |
|
3109 __declspec(naked) __declspec(align(16)) |
|
3110 void YToARGBRow_SSE2(const uint8* y_buf, |
|
3111 uint8* rgb_buf, |
|
3112 int width) { |
|
3113 __asm { |
|
3114 pxor xmm5, xmm5 |
|
3115 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
|
3116 pslld xmm4, 24 |
|
3117 mov eax, 0x00100010 |
|
3118 movd xmm3, eax |
|
3119 pshufd xmm3, xmm3, 0 |
|
3120 mov eax, 0x004a004a // 74 |
|
3121 movd xmm2, eax |
|
3122 pshufd xmm2, xmm2, 0 |
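// xmm3 holds the bias 16 in every 16 bit lane and xmm2 holds 74 (1.164 * 64),
// so each pixel becomes (y - 16) * 74 >> 6, replicated into B, G and R below.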
|
3123 mov eax, [esp + 4] // Y |
|
3124 mov edx, [esp + 8] // rgb |
|
3125 mov ecx, [esp + 12] // width |
|
3126 |
|
3127 align 4 |
|
3128 convertloop: |
|
3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 |
|
3130 movq xmm0, qword ptr [eax] |
|
3131 lea eax, [eax + 8] |
|
3132 punpcklbw xmm0, xmm5 // 0.Y |
|
3133 psubusw xmm0, xmm3 |
|
3134 pmullw xmm0, xmm2 |
|
3135 psrlw xmm0, 6 |
|
3136 packuswb xmm0, xmm0 // G |
|
3137 |
|
3138 // Step 2: Weave into ARGB |
|
3139 punpcklbw xmm0, xmm0 // GG |
|
3140 movdqa xmm1, xmm0 |
|
3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels |
|
3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels |
|
3143 por xmm0, xmm4 |
|
3144 por xmm1, xmm4 |
|
3145 movdqa [edx], xmm0 |
|
3146 movdqa [edx + 16], xmm1 |
|
3147 lea edx, [edx + 32] |
|
3148 sub ecx, 8 |
|
3149 jg convertloop |
|
3150 |
|
3151 ret |
|
3152 } |
|
3153 } |
|
3154 #endif // HAS_YTOARGBROW_SSE2 |
|
3155 |
|
3156 #ifdef HAS_MIRRORROW_SSSE3 |
|
3157 // Shuffle table for reversing the bytes. |
|
3158 static const uvec8 kShuffleMirror = { |
|
3159 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
|
3160 }; |
|
3161 |
|
3162 __declspec(naked) __declspec(align(16)) |
|
3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
|
3164 __asm { |
|
3165 mov eax, [esp + 4] // src |
|
3166 mov edx, [esp + 8] // dst |
|
3167 mov ecx, [esp + 12] // width |
|
3168 movdqa xmm5, kShuffleMirror |
|
3169 lea eax, [eax - 16] |
|
3170 |
|
3171 align 4 |
|
3172 convertloop: |
|
3173 movdqa xmm0, [eax + ecx] |
|
3174 pshufb xmm0, xmm5 |
|
3175 sub ecx, 16 |
|
3176 movdqa [edx], xmm0 |
|
3177 lea edx, [edx + 16] |
|
3178 jg convertloop |
|
3179 ret |
|
3180 } |
|
3181 } |
|
3182 #endif // HAS_MIRRORROW_SSSE3 |
|
3183 |
|
3184 #ifdef HAS_MIRRORROW_AVX2 |
|
3185 // Shuffle table for reversing the bytes. |
|
3186 static const ulvec8 kShuffleMirror_AVX2 = { |
|
3187 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, |
|
3188 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
|
3189 }; |
|
3190 |
|
3191 __declspec(naked) __declspec(align(16)) |
|
3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
|
3193 __asm { |
|
3194 mov eax, [esp + 4] // src |
|
3195 mov edx, [esp + 8] // dst |
|
3196 mov ecx, [esp + 12] // width |
|
3197 vmovdqa ymm5, kShuffleMirror_AVX2 |
|
3198 lea eax, [eax - 32] |
|
3199 |
|
3200 align 4 |
|
3201 convertloop: |
|
3202 vmovdqu ymm0, [eax + ecx] |
|
3203 vpshufb ymm0, ymm0, ymm5 |
|
3204 vpermq ymm0, ymm0, 0x4e // swap high and low halves |
|
3205 sub ecx, 32 |
|
3206 vmovdqu [edx], ymm0 |
|
3207 lea edx, [edx + 32] |
|
3208 jg convertloop |
|
3209 vzeroupper |
|
3210 ret |
|
3211 } |
|
3212 } |
|
3213 #endif // HAS_MIRRORROW_AVX2 |
|
3214 |
|
3215 #ifdef HAS_MIRRORROW_SSE2 |
|
3216 // The SSE2 version uses movdqu so it can be used on unaligned buffers when the |
|
3217 // SSSE3 version cannot. |
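// The row is reversed in three stages: swap bytes within each word, swap
// words within each qword (pshuflw/pshufhw 0x1b), then swap the two qwords
// (pshufd 0x4e).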
|
3218 __declspec(naked) __declspec(align(16)) |
|
3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
|
3220 __asm { |
|
3221 mov eax, [esp + 4] // src |
|
3222 mov edx, [esp + 8] // dst |
|
3223 mov ecx, [esp + 12] // width |
|
3224 lea eax, [eax - 16] |
|
3225 |
|
3226 align 4 |
|
3227 convertloop: |
|
3228 movdqu xmm0, [eax + ecx] |
|
3229 movdqa xmm1, xmm0 // swap bytes |
|
3230 psllw xmm0, 8 |
|
3231 psrlw xmm1, 8 |
|
3232 por xmm0, xmm1 |
|
3233 pshuflw xmm0, xmm0, 0x1b // swap words |
|
3234 pshufhw xmm0, xmm0, 0x1b |
|
3235 pshufd xmm0, xmm0, 0x4e // swap qwords |
|
3236 sub ecx, 16 |
|
3237 movdqu [edx], xmm0 |
|
3238 lea edx, [edx + 16] |
|
3239 jg convertloop |
|
3240 ret |
|
3241 } |
|
3242 } |
|
3243 #endif // HAS_MIRRORROW_SSE2 |
|
3244 |
|
3245 #ifdef HAS_MIRRORROW_UV_SSSE3 |
|
3246 // Shuffle table for reversing the bytes of UV channels. |
|
3247 static const uvec8 kShuffleMirrorUV = { |
|
3248 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
|
3249 }; |
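// A single pshufb with this table both reverses the row and deinterleaves it:
// the even (U) bytes land reversed in the low 8 bytes and the odd (V) bytes
// in the high 8 bytes, matching the movlpd/movhpd stores below.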
|
3250 |
|
3251 __declspec(naked) __declspec(align(16)) |
|
3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
|
3253 int width) { |
|
3254 __asm { |
|
3255 push edi |
|
3256 mov eax, [esp + 4 + 4] // src |
|
3257 mov edx, [esp + 4 + 8] // dst_u |
|
3258 mov edi, [esp + 4 + 12] // dst_v |
|
3259 mov ecx, [esp + 4 + 16] // width |
|
3260 movdqa xmm1, kShuffleMirrorUV |
|
3261 lea eax, [eax + ecx * 2 - 16] |
|
3262 sub edi, edx |
|
3263 |
|
3264 align 4 |
|
3265 convertloop: |
|
3266 movdqa xmm0, [eax] |
|
3267 lea eax, [eax - 16] |
|
3268 pshufb xmm0, xmm1 |
|
3269 sub ecx, 8 |
|
3270 movlpd qword ptr [edx], xmm0 |
|
3271 movhpd qword ptr [edx + edi], xmm0 |
|
3272 lea edx, [edx + 8] |
|
3273 jg convertloop |
|
3274 |
|
3275 pop edi |
|
3276 ret |
|
3277 } |
|
3278 } |
|
3279 #endif // HAS_MIRRORROW_UV_SSSE3 |
|
3280 |
|
3281 #ifdef HAS_ARGBMIRRORROW_SSSE3 |
|
3282 // Shuffle table for reversing the bytes. |
|
3283 static const uvec8 kARGBShuffleMirror = { |
|
3284 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u |
|
3285 }; |
|
3286 |
|
3287 __declspec(naked) __declspec(align(16)) |
|
3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
|
3289 __asm { |
|
3290 mov eax, [esp + 4] // src |
|
3291 mov edx, [esp + 8] // dst |
|
3292 mov ecx, [esp + 12] // width |
|
3293 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
|
3294 movdqa xmm5, kARGBShuffleMirror |
|
3295 |
|
3296 align 4 |
|
3297 convertloop: |
|
3298 movdqa xmm0, [eax] |
|
3299 lea eax, [eax - 16] |
|
3300 pshufb xmm0, xmm5 |
|
3301 sub ecx, 4 |
|
3302 movdqa [edx], xmm0 |
|
3303 lea edx, [edx + 16] |
|
3304 jg convertloop |
|
3305 ret |
|
3306 } |
|
3307 } |
|
3308 #endif // HAS_ARGBMIRRORROW_SSSE3 |
|
3309 |
|
3310 #ifdef HAS_ARGBMIRRORROW_AVX2 |
|
3311 // Shuffle table for reversing the bytes. |
|
3312 static const ulvec32 kARGBShuffleMirror_AVX2 = { |
|
3313 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
|
3314 }; |
|
3315 |
|
3316 __declspec(naked) __declspec(align(16)) |
|
3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
|
3318 __asm { |
|
3319 mov eax, [esp + 4] // src |
|
3320 mov edx, [esp + 8] // dst |
|
3321 mov ecx, [esp + 12] // width |
|
3322 lea eax, [eax - 32] |
|
3323 vmovdqa ymm5, kARGBShuffleMirror_AVX2 |
|
3324 |
|
3325 align 4 |
|
3326 convertloop: |
|
3327 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order |
|
3328 sub ecx, 8 |
|
3329 vmovdqu [edx], ymm0 |
|
3330 lea edx, [edx + 32] |
|
3331 jg convertloop |
|
3332 vzeroupper |
|
3333 ret |
|
3334 } |
|
3335 } |
|
3336 #endif // HAS_ARGBMIRRORROW_AVX2 |
|
3337 |
|
3338 #ifdef HAS_SPLITUVROW_SSE2 |
|
3339 __declspec(naked) __declspec(align(16)) |
|
3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
|
3341 __asm { |
|
3342 push edi |
|
3343 mov eax, [esp + 4 + 4] // src_uv |
|
3344 mov edx, [esp + 4 + 8] // dst_u |
|
3345 mov edi, [esp + 4 + 12] // dst_v |
|
3346 mov ecx, [esp + 4 + 16] // pix |
|
3347 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
3348 psrlw xmm5, 8 |
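// The 0x00ff00ff mask keeps the even (U) bytes while shifting right 8 exposes
// the odd (V) bytes; packuswb then repacks each stream into 16 contiguous
// bytes.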
|
3349 sub edi, edx |
|
3350 |
|
3351 align 4 |
|
3352 convertloop: |
|
3353 movdqa xmm0, [eax] |
|
3354 movdqa xmm1, [eax + 16] |
|
3355 lea eax, [eax + 32] |
|
3356 movdqa xmm2, xmm0 |
|
3357 movdqa xmm3, xmm1 |
|
3358 pand xmm0, xmm5 // even bytes |
|
3359 pand xmm1, xmm5 |
|
3360 packuswb xmm0, xmm1 |
|
3361 psrlw xmm2, 8 // odd bytes |
|
3362 psrlw xmm3, 8 |
|
3363 packuswb xmm2, xmm3 |
|
3364 movdqa [edx], xmm0 |
|
3365 movdqa [edx + edi], xmm2 |
|
3366 lea edx, [edx + 16] |
|
3367 sub ecx, 16 |
|
3368 jg convertloop |
|
3369 |
|
3370 pop edi |
|
3371 ret |
|
3372 } |
|
3373 } |
|
3374 |
|
3375 __declspec(naked) __declspec(align(16)) |
|
3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
|
3377 int pix) { |
|
3378 __asm { |
|
3379 push edi |
|
3380 mov eax, [esp + 4 + 4] // src_uv |
|
3381 mov edx, [esp + 4 + 8] // dst_u |
|
3382 mov edi, [esp + 4 + 12] // dst_v |
|
3383 mov ecx, [esp + 4 + 16] // pix |
|
3384 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
3385 psrlw xmm5, 8 |
|
3386 sub edi, edx |
|
3387 |
|
3388 align 4 |
|
3389 convertloop: |
|
3390 movdqu xmm0, [eax] |
|
3391 movdqu xmm1, [eax + 16] |
|
3392 lea eax, [eax + 32] |
|
3393 movdqa xmm2, xmm0 |
|
3394 movdqa xmm3, xmm1 |
|
3395 pand xmm0, xmm5 // even bytes |
|
3396 pand xmm1, xmm5 |
|
3397 packuswb xmm0, xmm1 |
|
3398 psrlw xmm2, 8 // odd bytes |
|
3399 psrlw xmm3, 8 |
|
3400 packuswb xmm2, xmm3 |
|
3401 movdqu [edx], xmm0 |
|
3402 movdqu [edx + edi], xmm2 |
|
3403 lea edx, [edx + 16] |
|
3404 sub ecx, 16 |
|
3405 jg convertloop |
|
3406 |
|
3407 pop edi |
|
3408 ret |
|
3409 } |
|
3410 } |
|
3411 #endif // HAS_SPLITUVROW_SSE2 |
|
3412 |
|
3413 #ifdef HAS_SPLITUVROW_AVX2 |
|
3414 __declspec(naked) __declspec(align(16)) |
|
3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
|
3416 __asm { |
|
3417 push edi |
|
3418 mov eax, [esp + 4 + 4] // src_uv |
|
3419 mov edx, [esp + 4 + 8] // dst_u |
|
3420 mov edi, [esp + 4 + 12] // dst_v |
|
3421 mov ecx, [esp + 4 + 16] // pix |
|
3422 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3423 vpsrlw ymm5, ymm5, 8 |
|
3424 sub edi, edx |
|
3425 |
|
3426 align 4 |
|
3427 convertloop: |
|
3428 vmovdqu ymm0, [eax] |
|
3429 vmovdqu ymm1, [eax + 32] |
|
3430 lea eax, [eax + 64] |
|
3431 vpsrlw ymm2, ymm0, 8 // odd bytes |
|
3432 vpsrlw ymm3, ymm1, 8 |
|
3433 vpand ymm0, ymm0, ymm5 // even bytes |
|
3434 vpand ymm1, ymm1, ymm5 |
|
3435 vpackuswb ymm0, ymm0, ymm1 |
|
3436 vpackuswb ymm2, ymm2, ymm3 |
|
3437 vpermq ymm0, ymm0, 0xd8 |
|
3438 vpermq ymm2, ymm2, 0xd8 |
|
3439 vmovdqu [edx], ymm0 |
|
3440 vmovdqu [edx + edi], ymm2 |
|
3441 lea edx, [edx + 32] |
|
3442 sub ecx, 32 |
|
3443 jg convertloop |
|
3444 |
|
3445 pop edi |
|
3446 vzeroupper |
|
3447 ret |
|
3448 } |
|
3449 } |
|
3450 #endif // HAS_SPLITUVROW_AVX2 |
|
3451 |
|
3452 #ifdef HAS_MERGEUVROW_SSE2 |
|
3453 __declspec(naked) __declspec(align(16)) |
|
3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
|
3455 int width) { |
|
3456 __asm { |
|
3457 push edi |
|
3458 mov eax, [esp + 4 + 4] // src_u |
|
3459 mov edx, [esp + 4 + 8] // src_v |
|
3460 mov edi, [esp + 4 + 12] // dst_uv |
|
3461 mov ecx, [esp + 4 + 16] // width |
|
3462 sub edx, eax |
|
3463 |
|
3464 align 4 |
|
3465 convertloop: |
|
3466 movdqa xmm0, [eax] // read 16 U's |
|
3467 movdqa xmm1, [eax + edx] // and 16 V's |
|
3468 lea eax, [eax + 16] |
|
3469 movdqa xmm2, xmm0 |
|
3470 punpcklbw xmm0, xmm1 // first 8 UV pairs |
|
3471 punpckhbw xmm2, xmm1 // next 8 UV pairs |
|
3472 movdqa [edi], xmm0 |
|
3473 movdqa [edi + 16], xmm2 |
|
3474 lea edi, [edi + 32] |
|
3475 sub ecx, 16 |
|
3476 jg convertloop |
|
3477 |
|
3478 pop edi |
|
3479 ret |
|
3480 } |
|
3481 } |
|
3482 |
|
3483 __declspec(naked) __declspec(align(16)) |
|
3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, |
|
3485 uint8* dst_uv, int width) { |
|
3486 __asm { |
|
3487 push edi |
|
3488 mov eax, [esp + 4 + 4] // src_u |
|
3489 mov edx, [esp + 4 + 8] // src_v |
|
3490 mov edi, [esp + 4 + 12] // dst_uv |
|
3491 mov ecx, [esp + 4 + 16] // width |
|
3492 sub edx, eax |
|
3493 |
|
3494 align 4 |
|
3495 convertloop: |
|
3496 movdqu xmm0, [eax] // read 16 U's |
|
3497 movdqu xmm1, [eax + edx] // and 16 V's |
|
3498 lea eax, [eax + 16] |
|
3499 movdqa xmm2, xmm0 |
|
3500 punpcklbw xmm0, xmm1 // first 8 UV pairs |
|
3501 punpckhbw xmm2, xmm1 // next 8 UV pairs |
|
3502 movdqu [edi], xmm0 |
|
3503 movdqu [edi + 16], xmm2 |
|
3504 lea edi, [edi + 32] |
|
3505 sub ecx, 16 |
|
3506 jg convertloop |
|
3507 |
|
3508 pop edi |
|
3509 ret |
|
3510 } |
|
3511 } |
|
3512 #endif // HAS_MERGEUVROW_SSE2 |
|
3513 |
|
3514 #ifdef HAS_MERGEUVROW_AVX2 |
|
3515 __declspec(naked) __declspec(align(16)) |
|
3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
|
3517 int width) { |
|
3518 __asm { |
|
3519 push edi |
|
3520 mov eax, [esp + 4 + 4] // src_u |
|
3521 mov edx, [esp + 4 + 8] // src_v |
|
3522 mov edi, [esp + 4 + 12] // dst_uv |
|
3523 mov ecx, [esp + 4 + 16] // width |
|
3524 sub edx, eax |
|
3525 |
|
3526 align 4 |
|
3527 convertloop: |
|
3528 vmovdqu ymm0, [eax] // read 32 U's |
|
3529 vmovdqu ymm1, [eax + edx] // and 32 V's |
|
3530 lea eax, [eax + 32] |
|
3531 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 |
|
3532 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 |
|
3533 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 |
|
3534 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 |
|
3535 vmovdqu [edi], ymm1 |
|
3536 vmovdqu [edi + 32], ymm2 |
|
3537 lea edi, [edi + 64] |
|
3538 sub ecx, 32 |
|
3539 jg convertloop |
|
3540 |
|
3541 pop edi |
|
3542 vzeroupper |
|
3543 ret |
|
3544 } |
|
3545 } |
|
3546 #endif // HAS_MERGEUVROW_AVX2 |
|
3547 |
|
3548 #ifdef HAS_COPYROW_SSE2 |
|
3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time. |
|
3550 __declspec(naked) __declspec(align(16)) |
|
3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
|
3552 __asm { |
|
3553 mov eax, [esp + 4] // src |
|
3554 mov edx, [esp + 8] // dst |
|
3555 mov ecx, [esp + 12] // count |
|
3556 |
|
3557 align 4 |
|
3558 convertloop: |
|
3559 movdqa xmm0, [eax] |
|
3560 movdqa xmm1, [eax + 16] |
|
3561 lea eax, [eax + 32] |
|
3562 movdqa [edx], xmm0 |
|
3563 movdqa [edx + 16], xmm1 |
|
3564 lea edx, [edx + 32] |
|
3565 sub ecx, 32 |
|
3566 jg convertloop |
|
3567 ret |
|
3568 } |
|
3569 } |
|
3570 #endif // HAS_COPYROW_SSE2 |
|
3571 |
|
3572 // Handles unaligned buffers and any byte count (multiple of 1). |
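// rep movsb is fast on CPUs with enhanced rep movsb (ERMS) support; esi and
// edi are stashed in eax/edx rather than on the stack since no other
// registers are needed.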
|
3573 __declspec(naked) __declspec(align(16)) |
|
3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { |
|
3575 __asm { |
|
3576 mov eax, esi |
|
3577 mov edx, edi |
|
3578 mov esi, [esp + 4] // src |
|
3579 mov edi, [esp + 8] // dst |
|
3580 mov ecx, [esp + 12] // count |
|
3581 rep movsb |
|
3582 mov edi, edx |
|
3583 mov esi, eax |
|
3584 ret |
|
3585 } |
|
3586 } |
|
3587 |
|
3588 #ifdef HAS_COPYROW_X86 |
|
3589 __declspec(naked) __declspec(align(16)) |
|
3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) { |
|
3591 __asm { |
|
3592 mov eax, esi |
|
3593 mov edx, edi |
|
3594 mov esi, [esp + 4] // src |
|
3595 mov edi, [esp + 8] // dst |
|
3596 mov ecx, [esp + 12] // count |
|
3597 shr ecx, 2 |
|
3598 rep movsd |
|
3599 mov edi, edx |
|
3600 mov esi, eax |
|
3601 ret |
|
3602 } |
|
3603 } |
|
3604 #endif // HAS_COPYROW_X86 |
|
3605 |
|
3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
|
3607 // width in pixels |
|
3608 __declspec(naked) __declspec(align(16)) |
|
3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
|
3610 __asm { |
|
3611 mov eax, [esp + 4] // src |
|
3612 mov edx, [esp + 8] // dst |
|
3613 mov ecx, [esp + 12] // count |
|
3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
|
3615 pslld xmm0, 24 |
|
3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
|
3617 psrld xmm1, 8 |
|
3618 |
|
3619 align 4 |
|
3620 convertloop: |
|
3621 movdqa xmm2, [eax] |
|
3622 movdqa xmm3, [eax + 16] |
|
3623 lea eax, [eax + 32] |
|
3624 movdqa xmm4, [edx] |
|
3625 movdqa xmm5, [edx + 16] |
|
3626 pand xmm2, xmm0 |
|
3627 pand xmm3, xmm0 |
|
3628 pand xmm4, xmm1 |
|
3629 pand xmm5, xmm1 |
|
3630 por xmm2, xmm4 |
|
3631 por xmm3, xmm5 |
|
3632 movdqa [edx], xmm2 |
|
3633 movdqa [edx + 16], xmm3 |
|
3634 lea edx, [edx + 32] |
|
3635 sub ecx, 8 |
|
3636 jg convertloop |
|
3637 |
|
3638 ret |
|
3639 } |
|
3640 } |
|
3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
|
3642 |
|
3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
|
3644 // width in pixels |
|
3645 __declspec(naked) __declspec(align(16)) |
|
3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
|
3647 __asm { |
|
3648 mov eax, [esp + 4] // src |
|
3649 mov edx, [esp + 8] // dst |
|
3650 mov ecx, [esp + 12] // count |
|
3651 vpcmpeqb ymm0, ymm0, ymm0 |
|
3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
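// vpblendvb takes the destination byte wherever the mask byte's high bit is
// set, so the stores below keep dst's B, G and R and copy only the alpha byte
// from src.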
|
3653 |
|
3654 align 4 |
|
3655 convertloop: |
|
3656 vmovdqu ymm1, [eax] |
|
3657 vmovdqu ymm2, [eax + 32] |
|
3658 lea eax, [eax + 64] |
|
3659 vpblendvb ymm1, ymm1, [edx], ymm0 |
|
3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
|
3661 vmovdqu [edx], ymm1 |
|
3662 vmovdqu [edx + 32], ymm2 |
|
3663 lea edx, [edx + 64] |
|
3664 sub ecx, 16 |
|
3665 jg convertloop |
|
3666 |
|
3667 vzeroupper |
|
3668 ret |
|
3669 } |
|
3670 } |
|
3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
|
3672 |
|
3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
|
3674 // width in pixels |
|
3675 __declspec(naked) __declspec(align(16)) |
|
3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
|
3677 __asm { |
|
3678 mov eax, [esp + 4] // src |
|
3679 mov edx, [esp + 8] // dst |
|
3680 mov ecx, [esp + 12] // count |
|
3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
|
3682 pslld xmm0, 24 |
|
3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
|
3684 psrld xmm1, 8 |
|
3685 |
|
3686 align 4 |
|
3687 convertloop: |
|
3688 movq xmm2, qword ptr [eax] // 8 Y's |
|
3689 lea eax, [eax + 8] |
|
3690 punpcklbw xmm2, xmm2 |
|
3691 punpckhwd xmm3, xmm2 |
|
3692 punpcklwd xmm2, xmm2 |
|
3693 movdqa xmm4, [edx] |
|
3694 movdqa xmm5, [edx + 16] |
|
3695 pand xmm2, xmm0 |
|
3696 pand xmm3, xmm0 |
|
3697 pand xmm4, xmm1 |
|
3698 pand xmm5, xmm1 |
|
3699 por xmm2, xmm4 |
|
3700 por xmm3, xmm5 |
|
3701 movdqa [edx], xmm2 |
|
3702 movdqa [edx + 16], xmm3 |
|
3703 lea edx, [edx + 32] |
|
3704 sub ecx, 8 |
|
3705 jg convertloop |
|
3706 |
|
3707 ret |
|
3708 } |
|
3709 } |
|
3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
|
3711 |
|
3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
|
3713 // width in pixels |
|
3714 __declspec(naked) __declspec(align(16)) |
|
3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
|
3716 __asm { |
|
3717 mov eax, [esp + 4] // src |
|
3718 mov edx, [esp + 8] // dst |
|
3719 mov ecx, [esp + 12] // count |
|
3720 vpcmpeqb ymm0, ymm0, ymm0 |
|
3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
|
3722 |
|
3723 align 4 |
|
3724 convertloop: |
|
3725 vpmovzxbd ymm1, qword ptr [eax] |
|
3726 vpmovzxbd ymm2, qword ptr [eax + 8] |
|
3727 lea eax, [eax + 16] |
|
3728 vpslld ymm1, ymm1, 24 |
|
3729 vpslld ymm2, ymm2, 24 |
|
3730 vpblendvb ymm1, ymm1, [edx], ymm0 |
|
3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
|
3732 vmovdqu [edx], ymm1 |
|
3733 vmovdqu [edx + 32], ymm2 |
|
3734 lea edx, [edx + 64] |
|
3735 sub ecx, 16 |
|
3736 jg convertloop |
|
3737 |
|
3738 vzeroupper |
|
3739 ret |
|
3740 } |
|
3741 } |
|
3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
|
3743 |
|
3744 #ifdef HAS_SETROW_X86 |
|
3745 // SetRow writes 'count' bytes using a repeated 32 bit value; 'count' is expected to be a multiple of 4. |
|
3746 __declspec(naked) __declspec(align(16)) |
|
3747 void SetRow_X86(uint8* dst, uint32 v32, int count) { |
|
3748 __asm { |
|
3749 mov edx, edi |
|
3750 mov edi, [esp + 4] // dst |
|
3751 mov eax, [esp + 8] // v32 |
|
3752 mov ecx, [esp + 12] // count |
|
3753 shr ecx, 2 |
|
3754 rep stosd |
|
3755 mov edi, edx |
|
3756 ret |
|
3757 } |
|
3758 } |
|
3759 |
|
3760 // ARGBSetRows writes 'width' 32 bit values per row for 'height' rows, stepping by 'dst_stride'. |
|
3761 __declspec(naked) __declspec(align(16)) |
|
3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, |
|
3763 int dst_stride, int height) { |
|
3764 __asm { |
|
3765 push esi |
|
3766 push edi |
|
3767 push ebp |
|
3768 mov edi, [esp + 12 + 4] // dst |
|
3769 mov eax, [esp + 12 + 8] // v32 |
|
3770 mov ebp, [esp + 12 + 12] // width |
|
3771 mov edx, [esp + 12 + 16] // dst_stride |
|
3772 mov esi, [esp + 12 + 20] // height |
|
3773 lea ecx, [ebp * 4] |
|
3774 sub edx, ecx // stride - width * 4 |
|
3775 |
|
3776 align 4 |
|
3777 convertloop: |
|
3778 mov ecx, ebp |
|
3779 rep stosd |
|
3780 add edi, edx |
|
3781 sub esi, 1 |
|
3782 jg convertloop |
|
3783 |
|
3784 pop ebp |
|
3785 pop edi |
|
3786 pop esi |
|
3787 ret |
|
3788 } |
|
3789 } |
|
3790 #endif // HAS_SETROW_X86 |
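
// ARGBSetRows_X86 above fills a rectangle: 'width' copies of the 32 bit value
// per row, 'height' rows, advancing by 'dst_stride' bytes per row ('rep stosd'
// leaves edi just past the pixels written, so only stride - width * 4 is
// added). A scalar sketch of the same loop; the _Sketch name is illustrative
// only:
static void ARGBSetRows_Sketch(uint8* dst, uint32 v32, int width,
                               int dst_stride, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    uint32* d = (uint32*)(dst + y * dst_stride);
    for (x = 0; x < width; ++x) {
      d[x] = v32;  // store the ARGB value once per pixel.
    }
  }
}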
|
3791 |
|
3792 #ifdef HAS_YUY2TOYROW_AVX2 |
|
3793 __declspec(naked) __declspec(align(16)) |
|
3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2, |
|
3795 uint8* dst_y, int pix) { |
|
3796 __asm { |
|
3797 mov eax, [esp + 4] // src_yuy2 |
|
3798 mov edx, [esp + 8] // dst_y |
|
3799 mov ecx, [esp + 12] // pix |
|
3800 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3801 vpsrlw ymm5, ymm5, 8 |
|
3802 |
|
3803 align 4 |
|
3804 convertloop: |
|
3805 vmovdqu ymm0, [eax] |
|
3806 vmovdqu ymm1, [eax + 32] |
|
3807 lea eax, [eax + 64] |
|
3808 vpand ymm0, ymm0, ymm5 // even bytes are Y |
|
3809 vpand ymm1, ymm1, ymm5 |
|
3810 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3811 vpermq ymm0, ymm0, 0xd8 |
|
3812 sub ecx, 32 |
|
3813 vmovdqu [edx], ymm0 |
|
3814 lea edx, [edx + 32] |
|
3815 jg convertloop |
|
3816 vzeroupper |
|
3817 ret |
|
3818 } |
|
3819 } |
|
3820 |
|
3821 __declspec(naked) __declspec(align(16)) |
|
3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
|
3823 uint8* dst_u, uint8* dst_v, int pix) { |
|
3824 __asm { |
|
3825 push esi |
|
3826 push edi |
|
3827 mov eax, [esp + 8 + 4] // src_yuy2 |
|
3828 mov esi, [esp + 8 + 8] // stride_yuy2 |
|
3829 mov edx, [esp + 8 + 12] // dst_u |
|
3830 mov edi, [esp + 8 + 16] // dst_v |
|
3831 mov ecx, [esp + 8 + 20] // pix |
|
3832 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3833 vpsrlw ymm5, ymm5, 8 |
|
3834 sub edi, edx |
|
3835 |
|
3836 align 4 |
|
3837 convertloop: |
|
3838 vmovdqu ymm0, [eax] |
|
3839 vmovdqu ymm1, [eax + 32] |
|
3840 vpavgb ymm0, ymm0, [eax + esi] |
|
3841 vpavgb ymm1, ymm1, [eax + esi + 32] |
|
3842 lea eax, [eax + 64] |
|
3843 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
|
3844 vpsrlw ymm1, ymm1, 8 |
|
3845 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3846 vpermq ymm0, ymm0, 0xd8 |
|
3847 vpand ymm1, ymm0, ymm5 // U |
|
3848 vpsrlw ymm0, ymm0, 8 // V |
|
3849 vpackuswb ymm1, ymm1, ymm1 // mutates. |
|
3850 vpackuswb ymm0, ymm0, ymm0 // mutates. |
|
3851 vpermq ymm1, ymm1, 0xd8 |
|
3852 vpermq ymm0, ymm0, 0xd8 |
|
3853 vextractf128 [edx], ymm1, 0 // U |
|
3854 vextractf128 [edx + edi], ymm0, 0 // V |
|
3855 lea edx, [edx + 16] |
|
3856 sub ecx, 32 |
|
3857 jg convertloop |
|
3858 |
|
3859 pop edi |
|
3860 pop esi |
|
3861 vzeroupper |
|
3862 ret |
|
3863 } |
|
3864 } |
|
3865 |
|
3866 __declspec(naked) __declspec(align(16)) |
|
3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
|
3868 uint8* dst_u, uint8* dst_v, int pix) { |
|
3869 __asm { |
|
3870 push edi |
|
3871 mov eax, [esp + 4 + 4] // src_yuy2 |
|
3872 mov edx, [esp + 4 + 8] // dst_u |
|
3873 mov edi, [esp + 4 + 12] // dst_v |
|
3874 mov ecx, [esp + 4 + 16] // pix |
|
3875 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3876 vpsrlw ymm5, ymm5, 8 |
|
3877 sub edi, edx |
|
3878 |
|
3879 align 4 |
|
3880 convertloop: |
|
3881 vmovdqu ymm0, [eax] |
|
3882 vmovdqu ymm1, [eax + 32] |
|
3883 lea eax, [eax + 64] |
|
3884 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
|
3885 vpsrlw ymm1, ymm1, 8 |
|
3886 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3887 vpermq ymm0, ymm0, 0xd8 |
|
3888 vpand ymm1, ymm0, ymm5 // U |
|
3889 vpsrlw ymm0, ymm0, 8 // V |
|
3890 vpackuswb ymm1, ymm1, ymm1 // mutates. |
|
3891 vpackuswb ymm0, ymm0, ymm0 // mutates. |
|
3892 vpermq ymm1, ymm1, 0xd8 |
|
3893 vpermq ymm0, ymm0, 0xd8 |
|
3894 vextractf128 [edx], ymm1, 0 // U |
|
3895 vextractf128 [edx + edi], ymm0, 0 // V |
|
3896 lea edx, [edx + 16] |
|
3897 sub ecx, 32 |
|
3898 jg convertloop |
|
3899 |
|
3900 pop edi |
|
3901 vzeroupper |
|
3902 ret |
|
3903 } |
|
3904 } |
|
3905 |
|
3906 __declspec(naked) __declspec(align(16)) |
|
3907 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
|
3908 uint8* dst_y, int pix) { |
|
3909 __asm { |
|
3910 mov eax, [esp + 4] // src_uyvy |
|
3911 mov edx, [esp + 8] // dst_y |
|
3912 mov ecx, [esp + 12] // pix |
|
3913 |
|
3914 align 4 |
|
3915 convertloop: |
|
3916 vmovdqu ymm0, [eax] |
|
3917 vmovdqu ymm1, [eax + 32] |
|
3918 lea eax, [eax + 64] |
|
3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
|
3920 vpsrlw ymm1, ymm1, 8 |
|
3921 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3922 vpermq ymm0, ymm0, 0xd8 |
|
3923 sub ecx, 32 |
|
3924 vmovdqu [edx], ymm0 |
|
3925 lea edx, [edx + 32] |
|
3926 jg convertloop |
|
3927 vzeroupper |
|
3928 ret |
|
3929 } |
|
3930 } |
|
3931 |
|
3932 __declspec(naked) __declspec(align(16)) |
|
3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
|
3934 uint8* dst_u, uint8* dst_v, int pix) { |
|
3935 __asm { |
|
3936 push esi |
|
3937 push edi |
|
3938 mov eax, [esp + 8 + 4] // src_uyvy |
|
3939 mov esi, [esp + 8 + 8] // stride_uyvy |
|
3940 mov edx, [esp + 8 + 12] // dst_u |
|
3941 mov edi, [esp + 8 + 16] // dst_v |
|
3942 mov ecx, [esp + 8 + 20] // pix |
|
3943 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3944 vpsrlw ymm5, ymm5, 8 |
|
3945 sub edi, edx |
|
3946 |
|
3947 align 4 |
|
3948 convertloop: |
|
3949 vmovdqu ymm0, [eax] |
|
3950 vmovdqu ymm1, [eax + 32] |
|
3951 vpavgb ymm0, ymm0, [eax + esi] |
|
3952 vpavgb ymm1, ymm1, [eax + esi + 32] |
|
3953 lea eax, [eax + 64] |
|
3954 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
|
3955 vpand ymm1, ymm1, ymm5 |
|
3956 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3957 vpermq ymm0, ymm0, 0xd8 |
|
3958 vpand ymm1, ymm0, ymm5 // U |
|
3959 vpsrlw ymm0, ymm0, 8 // V |
|
3960 vpackuswb ymm1, ymm1, ymm1 // mutates. |
|
3961 vpackuswb ymm0, ymm0, ymm0 // mutates. |
|
3962 vpermq ymm1, ymm1, 0xd8 |
|
3963 vpermq ymm0, ymm0, 0xd8 |
|
3964 vextractf128 [edx], ymm1, 0 // U |
|
3965 vextractf128 [edx + edi], ymm0, 0 // V |
|
3966 lea edx, [edx + 16] |
|
3967 sub ecx, 32 |
|
3968 jg convertloop |
|
3969 |
|
3970 pop edi |
|
3971 pop esi |
|
3972 vzeroupper |
|
3973 ret |
|
3974 } |
|
3975 } |
|
3976 |
|
3977 __declspec(naked) __declspec(align(16)) |
|
3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
|
3979 uint8* dst_u, uint8* dst_v, int pix) { |
|
3980 __asm { |
|
3981 push edi |
|
3982 mov eax, [esp + 4 + 4] // src_uyvy |
|
3983 mov edx, [esp + 4 + 8] // dst_u |
|
3984 mov edi, [esp + 4 + 12] // dst_v |
|
3985 mov ecx, [esp + 4 + 16] // pix |
|
3986 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
|
3987 vpsrlw ymm5, ymm5, 8 |
|
3988 sub edi, edx |
|
3989 |
|
3990 align 4 |
|
3991 convertloop: |
|
3992 vmovdqu ymm0, [eax] |
|
3993 vmovdqu ymm1, [eax + 32] |
|
3994 lea eax, [eax + 64] |
|
3995 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
|
3996 vpand ymm1, ymm1, ymm5 |
|
3997 vpackuswb ymm0, ymm0, ymm1 // mutates. |
|
3998 vpermq ymm0, ymm0, 0xd8 |
|
3999 vpand ymm1, ymm0, ymm5 // U |
|
4000 vpsrlw ymm0, ymm0, 8 // V |
|
4001 vpackuswb ymm1, ymm1, ymm1 // mutates. |
|
4002 vpackuswb ymm0, ymm0, ymm0 // mutates. |
|
4003 vpermq ymm1, ymm1, 0xd8 |
|
4004 vpermq ymm0, ymm0, 0xd8 |
|
4005 vextractf128 [edx], ymm1, 0 // U |
|
4006 vextractf128 [edx + edi], ymm0, 0 // V |
|
4007 lea edx, [edx + 16] |
|
4008 sub ecx, 32 |
|
4009 jg convertloop |
|
4010 |
|
4011 pop edi |
|
4012 vzeroupper |
|
4013 ret |
|
4014 } |
|
4015 } |
|
4016 #endif // HAS_YUY2TOYROW_AVX2 |
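
// YUY2 packs two pixels into 4 bytes as Y0, U, Y1, V. The row functions above
// therefore take Y from the even bytes and U/V from the odd bytes, and the
// UVRow variants average two source rows (pavgb rounds) for the vertically
// subsampled chroma. A scalar sketch of YUY2ToUVRow under those assumptions;
// the _Sketch name is illustrative only:
static void YUY2ToUVRow_Sketch(const uint8* src_yuy2, int stride_yuy2,
                               uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {  // one U and one V per 2 pixels (4 bytes).
    const uint8* row0 = src_yuy2 + x * 2;
    const uint8* row1 = row0 + stride_yuy2;
    dst_u[x / 2] = (uint8)((row0[1] + row1[1] + 1) >> 1);  // rounded average of U.
    dst_v[x / 2] = (uint8)((row0[3] + row1[3] + 1) >> 1);  // rounded average of V.
  }
}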
|
4017 |
|
4018 #ifdef HAS_YUY2TOYROW_SSE2 |
|
4019 __declspec(naked) __declspec(align(16)) |
|
4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
|
4021 uint8* dst_y, int pix) { |
|
4022 __asm { |
|
4023 mov eax, [esp + 4] // src_yuy2 |
|
4024 mov edx, [esp + 8] // dst_y |
|
4025 mov ecx, [esp + 12] // pix |
|
4026 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4027 psrlw xmm5, 8 |
|
4028 |
|
4029 align 4 |
|
4030 convertloop: |
|
4031 movdqa xmm0, [eax] |
|
4032 movdqa xmm1, [eax + 16] |
|
4033 lea eax, [eax + 32] |
|
4034 pand xmm0, xmm5 // even bytes are Y |
|
4035 pand xmm1, xmm5 |
|
4036 packuswb xmm0, xmm1 |
|
4037 sub ecx, 16 |
|
4038 movdqa [edx], xmm0 |
|
4039 lea edx, [edx + 16] |
|
4040 jg convertloop |
|
4041 ret |
|
4042 } |
|
4043 } |
|
4044 |
|
4045 __declspec(naked) __declspec(align(16)) |
|
4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
|
4047 uint8* dst_u, uint8* dst_v, int pix) { |
|
4048 __asm { |
|
4049 push esi |
|
4050 push edi |
|
4051 mov eax, [esp + 8 + 4] // src_yuy2 |
|
4052 mov esi, [esp + 8 + 8] // stride_yuy2 |
|
4053 mov edx, [esp + 8 + 12] // dst_u |
|
4054 mov edi, [esp + 8 + 16] // dst_v |
|
4055 mov ecx, [esp + 8 + 20] // pix |
|
4056 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4057 psrlw xmm5, 8 |
|
4058 sub edi, edx |
|
4059 |
|
4060 align 4 |
|
4061 convertloop: |
|
4062 movdqa xmm0, [eax] |
|
4063 movdqa xmm1, [eax + 16] |
|
4064 movdqa xmm2, [eax + esi] |
|
4065 movdqa xmm3, [eax + esi + 16] |
|
4066 lea eax, [eax + 32] |
|
4067 pavgb xmm0, xmm2 |
|
4068 pavgb xmm1, xmm3 |
|
4069 psrlw xmm0, 8 // YUYV -> UVUV |
|
4070 psrlw xmm1, 8 |
|
4071 packuswb xmm0, xmm1 |
|
4072 movdqa xmm1, xmm0 |
|
4073 pand xmm0, xmm5 // U |
|
4074 packuswb xmm0, xmm0 |
|
4075 psrlw xmm1, 8 // V |
|
4076 packuswb xmm1, xmm1 |
|
4077 movq qword ptr [edx], xmm0 |
|
4078 movq qword ptr [edx + edi], xmm1 |
|
4079 lea edx, [edx + 8] |
|
4080 sub ecx, 16 |
|
4081 jg convertloop |
|
4082 |
|
4083 pop edi |
|
4084 pop esi |
|
4085 ret |
|
4086 } |
|
4087 } |
|
4088 |
|
4089 __declspec(naked) __declspec(align(16)) |
|
4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
|
4091 uint8* dst_u, uint8* dst_v, int pix) { |
|
4092 __asm { |
|
4093 push edi |
|
4094 mov eax, [esp + 4 + 4] // src_yuy2 |
|
4095 mov edx, [esp + 4 + 8] // dst_u |
|
4096 mov edi, [esp + 4 + 12] // dst_v |
|
4097 mov ecx, [esp + 4 + 16] // pix |
|
4098 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4099 psrlw xmm5, 8 |
|
4100 sub edi, edx |
|
4101 |
|
4102 align 4 |
|
4103 convertloop: |
|
4104 movdqa xmm0, [eax] |
|
4105 movdqa xmm1, [eax + 16] |
|
4106 lea eax, [eax + 32] |
|
4107 psrlw xmm0, 8 // YUYV -> UVUV |
|
4108 psrlw xmm1, 8 |
|
4109 packuswb xmm0, xmm1 |
|
4110 movdqa xmm1, xmm0 |
|
4111 pand xmm0, xmm5 // U |
|
4112 packuswb xmm0, xmm0 |
|
4113 psrlw xmm1, 8 // V |
|
4114 packuswb xmm1, xmm1 |
|
4115 movq qword ptr [edx], xmm0 |
|
4116 movq qword ptr [edx + edi], xmm1 |
|
4117 lea edx, [edx + 8] |
|
4118 sub ecx, 16 |
|
4119 jg convertloop |
|
4120 |
|
4121 pop edi |
|
4122 ret |
|
4123 } |
|
4124 } |
|
4125 |
|
4126 __declspec(naked) __declspec(align(16)) |
|
4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, |
|
4128 uint8* dst_y, int pix) { |
|
4129 __asm { |
|
4130 mov eax, [esp + 4] // src_yuy2 |
|
4131 mov edx, [esp + 8] // dst_y |
|
4132 mov ecx, [esp + 12] // pix |
|
4133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4134 psrlw xmm5, 8 |
|
4135 |
|
4136 align 4 |
|
4137 convertloop: |
|
4138 movdqu xmm0, [eax] |
|
4139 movdqu xmm1, [eax + 16] |
|
4140 lea eax, [eax + 32] |
|
4141 pand xmm0, xmm5 // even bytes are Y |
|
4142 pand xmm1, xmm5 |
|
4143 packuswb xmm0, xmm1 |
|
4144 sub ecx, 16 |
|
4145 movdqu [edx], xmm0 |
|
4146 lea edx, [edx + 16] |
|
4147 jg convertloop |
|
4148 ret |
|
4149 } |
|
4150 } |
|
4151 |
|
4152 __declspec(naked) __declspec(align(16)) |
|
4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, |
|
4154 uint8* dst_u, uint8* dst_v, int pix) { |
|
4155 __asm { |
|
4156 push esi |
|
4157 push edi |
|
4158 mov eax, [esp + 8 + 4] // src_yuy2 |
|
4159 mov esi, [esp + 8 + 8] // stride_yuy2 |
|
4160 mov edx, [esp + 8 + 12] // dst_u |
|
4161 mov edi, [esp + 8 + 16] // dst_v |
|
4162 mov ecx, [esp + 8 + 20] // pix |
|
4163 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4164 psrlw xmm5, 8 |
|
4165 sub edi, edx |
|
4166 |
|
4167 align 4 |
|
4168 convertloop: |
|
4169 movdqu xmm0, [eax] |
|
4170 movdqu xmm1, [eax + 16] |
|
4171 movdqu xmm2, [eax + esi] |
|
4172 movdqu xmm3, [eax + esi + 16] |
|
4173 lea eax, [eax + 32] |
|
4174 pavgb xmm0, xmm2 |
|
4175 pavgb xmm1, xmm3 |
|
4176 psrlw xmm0, 8 // YUYV -> UVUV |
|
4177 psrlw xmm1, 8 |
|
4178 packuswb xmm0, xmm1 |
|
4179 movdqa xmm1, xmm0 |
|
4180 pand xmm0, xmm5 // U |
|
4181 packuswb xmm0, xmm0 |
|
4182 psrlw xmm1, 8 // V |
|
4183 packuswb xmm1, xmm1 |
|
4184 movq qword ptr [edx], xmm0 |
|
4185 movq qword ptr [edx + edi], xmm1 |
|
4186 lea edx, [edx + 8] |
|
4187 sub ecx, 16 |
|
4188 jg convertloop |
|
4189 |
|
4190 pop edi |
|
4191 pop esi |
|
4192 ret |
|
4193 } |
|
4194 } |
|
4195 |
|
4196 __declspec(naked) __declspec(align(16)) |
|
4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, |
|
4198 uint8* dst_u, uint8* dst_v, int pix) { |
|
4199 __asm { |
|
4200 push edi |
|
4201 mov eax, [esp + 4 + 4] // src_yuy2 |
|
4202 mov edx, [esp + 4 + 8] // dst_u |
|
4203 mov edi, [esp + 4 + 12] // dst_v |
|
4204 mov ecx, [esp + 4 + 16] // pix |
|
4205 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4206 psrlw xmm5, 8 |
|
4207 sub edi, edx |
|
4208 |
|
4209 align 4 |
|
4210 convertloop: |
|
4211 movdqu xmm0, [eax] |
|
4212 movdqu xmm1, [eax + 16] |
|
4213 lea eax, [eax + 32] |
|
4214 psrlw xmm0, 8 // YUYV -> UVUV |
|
4215 psrlw xmm1, 8 |
|
4216 packuswb xmm0, xmm1 |
|
4217 movdqa xmm1, xmm0 |
|
4218 pand xmm0, xmm5 // U |
|
4219 packuswb xmm0, xmm0 |
|
4220 psrlw xmm1, 8 // V |
|
4221 packuswb xmm1, xmm1 |
|
4222 movq qword ptr [edx], xmm0 |
|
4223 movq qword ptr [edx + edi], xmm1 |
|
4224 lea edx, [edx + 8] |
|
4225 sub ecx, 16 |
|
4226 jg convertloop |
|
4227 |
|
4228 pop edi |
|
4229 ret |
|
4230 } |
|
4231 } |
|
4232 |
|
4233 __declspec(naked) __declspec(align(16)) |
|
4234 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
|
4235 uint8* dst_y, int pix) { |
|
4236 __asm { |
|
4237 mov eax, [esp + 4] // src_uyvy |
|
4238 mov edx, [esp + 8] // dst_y |
|
4239 mov ecx, [esp + 12] // pix |
|
4240 |
|
4241 align 4 |
|
4242 convertloop: |
|
4243 movdqa xmm0, [eax] |
|
4244 movdqa xmm1, [eax + 16] |
|
4245 lea eax, [eax + 32] |
|
4246 psrlw xmm0, 8 // odd bytes are Y |
|
4247 psrlw xmm1, 8 |
|
4248 packuswb xmm0, xmm1 |
|
4249 sub ecx, 16 |
|
4250 movdqa [edx], xmm0 |
|
4251 lea edx, [edx + 16] |
|
4252 jg convertloop |
|
4253 ret |
|
4254 } |
|
4255 } |
|
4256 |
|
4257 __declspec(naked) __declspec(align(16)) |
|
4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
|
4259 uint8* dst_u, uint8* dst_v, int pix) { |
|
4260 __asm { |
|
4261 push esi |
|
4262 push edi |
|
4263 mov eax, [esp + 8 + 4] // src_uyvy |
|
4264 mov esi, [esp + 8 + 8] // stride_uyvy |
|
4265 mov edx, [esp + 8 + 12] // dst_u |
|
4266 mov edi, [esp + 8 + 16] // dst_v |
|
4267 mov ecx, [esp + 8 + 20] // pix |
|
4268 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4269 psrlw xmm5, 8 |
|
4270 sub edi, edx |
|
4271 |
|
4272 align 4 |
|
4273 convertloop: |
|
4274 movdqa xmm0, [eax] |
|
4275 movdqa xmm1, [eax + 16] |
|
4276 movdqa xmm2, [eax + esi] |
|
4277 movdqa xmm3, [eax + esi + 16] |
|
4278 lea eax, [eax + 32] |
|
4279 pavgb xmm0, xmm2 |
|
4280 pavgb xmm1, xmm3 |
|
4281 pand xmm0, xmm5 // UYVY -> UVUV |
|
4282 pand xmm1, xmm5 |
|
4283 packuswb xmm0, xmm1 |
|
4284 movdqa xmm1, xmm0 |
|
4285 pand xmm0, xmm5 // U |
|
4286 packuswb xmm0, xmm0 |
|
4287 psrlw xmm1, 8 // V |
|
4288 packuswb xmm1, xmm1 |
|
4289 movq qword ptr [edx], xmm0 |
|
4290 movq qword ptr [edx + edi], xmm1 |
|
4291 lea edx, [edx + 8] |
|
4292 sub ecx, 16 |
|
4293 jg convertloop |
|
4294 |
|
4295 pop edi |
|
4296 pop esi |
|
4297 ret |
|
4298 } |
|
4299 } |
|
4300 |
|
4301 __declspec(naked) __declspec(align(16)) |
|
4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
|
4303 uint8* dst_u, uint8* dst_v, int pix) { |
|
4304 __asm { |
|
4305 push edi |
|
4306 mov eax, [esp + 4 + 4] // src_uyvy |
|
4307 mov edx, [esp + 4 + 8] // dst_u |
|
4308 mov edi, [esp + 4 + 12] // dst_v |
|
4309 mov ecx, [esp + 4 + 16] // pix |
|
4310 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4311 psrlw xmm5, 8 |
|
4312 sub edi, edx |
|
4313 |
|
4314 align 4 |
|
4315 convertloop: |
|
4316 movdqa xmm0, [eax] |
|
4317 movdqa xmm1, [eax + 16] |
|
4318 lea eax, [eax + 32] |
|
4319 pand xmm0, xmm5 // UYVY -> UVUV |
|
4320 pand xmm1, xmm5 |
|
4321 packuswb xmm0, xmm1 |
|
4322 movdqa xmm1, xmm0 |
|
4323 pand xmm0, xmm5 // U |
|
4324 packuswb xmm0, xmm0 |
|
4325 psrlw xmm1, 8 // V |
|
4326 packuswb xmm1, xmm1 |
|
4327 movq qword ptr [edx], xmm0 |
|
4328 movq qword ptr [edx + edi], xmm1 |
|
4329 lea edx, [edx + 8] |
|
4330 sub ecx, 16 |
|
4331 jg convertloop |
|
4332 |
|
4333 pop edi |
|
4334 ret |
|
4335 } |
|
4336 } |
|
4337 |
|
4338 __declspec(naked) __declspec(align(16)) |
|
4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, |
|
4340 uint8* dst_y, int pix) { |
|
4341 __asm { |
|
4342 mov eax, [esp + 4] // src_uyvy |
|
4343 mov edx, [esp + 8] // dst_y |
|
4344 mov ecx, [esp + 12] // pix |
|
4345 |
|
4346 align 4 |
|
4347 convertloop: |
|
4348 movdqu xmm0, [eax] |
|
4349 movdqu xmm1, [eax + 16] |
|
4350 lea eax, [eax + 32] |
|
4351 psrlw xmm0, 8 // odd bytes are Y |
|
4352 psrlw xmm1, 8 |
|
4353 packuswb xmm0, xmm1 |
|
4354 sub ecx, 16 |
|
4355 movdqu [edx], xmm0 |
|
4356 lea edx, [edx + 16] |
|
4357 jg convertloop |
|
4358 ret |
|
4359 } |
|
4360 } |
|
4361 |
|
4362 __declspec(naked) __declspec(align(16)) |
|
4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, |
|
4364 uint8* dst_u, uint8* dst_v, int pix) { |
|
4365 __asm { |
|
4366 push esi |
|
4367 push edi |
|
4368 mov eax, [esp + 8 + 4] // src_uyvy |
|
4369 mov esi, [esp + 8 + 8] // stride_uyvy |
|
4370 mov edx, [esp + 8 + 12] // dst_u |
|
4371 mov edi, [esp + 8 + 16] // dst_v |
|
4372 mov ecx, [esp + 8 + 20] // pix |
|
4373 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4374 psrlw xmm5, 8 |
|
4375 sub edi, edx |
|
4376 |
|
4377 align 4 |
|
4378 convertloop: |
|
4379 movdqu xmm0, [eax] |
|
4380 movdqu xmm1, [eax + 16] |
|
4381 movdqu xmm2, [eax + esi] |
|
4382 movdqu xmm3, [eax + esi + 16] |
|
4383 lea eax, [eax + 32] |
|
4384 pavgb xmm0, xmm2 |
|
4385 pavgb xmm1, xmm3 |
|
4386 pand xmm0, xmm5 // UYVY -> UVUV |
|
4387 pand xmm1, xmm5 |
|
4388 packuswb xmm0, xmm1 |
|
4389 movdqa xmm1, xmm0 |
|
4390 pand xmm0, xmm5 // U |
|
4391 packuswb xmm0, xmm0 |
|
4392 psrlw xmm1, 8 // V |
|
4393 packuswb xmm1, xmm1 |
|
4394 movq qword ptr [edx], xmm0 |
|
4395 movq qword ptr [edx + edi], xmm1 |
|
4396 lea edx, [edx + 8] |
|
4397 sub ecx, 16 |
|
4398 jg convertloop |
|
4399 |
|
4400 pop edi |
|
4401 pop esi |
|
4402 ret |
|
4403 } |
|
4404 } |
|
4405 |
|
4406 __declspec(naked) __declspec(align(16)) |
|
4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, |
|
4408 uint8* dst_u, uint8* dst_v, int pix) { |
|
4409 __asm { |
|
4410 push edi |
|
4411 mov eax, [esp + 4 + 4] // src_uyvy |
|
4412 mov edx, [esp + 4 + 8] // dst_u |
|
4413 mov edi, [esp + 4 + 12] // dst_v |
|
4414 mov ecx, [esp + 4 + 16] // pix |
|
4415 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
|
4416 psrlw xmm5, 8 |
|
4417 sub edi, edx |
|
4418 |
|
4419 align 4 |
|
4420 convertloop: |
|
4421 movdqu xmm0, [eax] |
|
4422 movdqu xmm1, [eax + 16] |
|
4423 lea eax, [eax + 32] |
|
4424 pand xmm0, xmm5 // UYVY -> UVUV |
|
4425 pand xmm1, xmm5 |
|
4426 packuswb xmm0, xmm1 |
|
4427 movdqa xmm1, xmm0 |
|
4428 pand xmm0, xmm5 // U |
|
4429 packuswb xmm0, xmm0 |
|
4430 psrlw xmm1, 8 // V |
|
4431 packuswb xmm1, xmm1 |
|
4432 movq qword ptr [edx], xmm0 |
|
4433 movq qword ptr [edx + edi], xmm1 |
|
4434 lea edx, [edx + 8] |
|
4435 sub ecx, 16 |
|
4436 jg convertloop |
|
4437 |
|
4438 pop edi |
|
4439 ret |
|
4440 } |
|
4441 } |
|
4442 #endif // HAS_YUY2TOYROW_SSE2 |
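
// UYVY is the byte-swapped ordering U, Y0, V, Y1, which is why the UYVY
// functions above take Y from the odd bytes (psrlw by 8) and U/V from the
// even bytes (pand with the 0x00ff00ff mask). A scalar sketch of UYVYToYRow;
// the _Sketch name is illustrative only:
static void UYVYToYRow_Sketch(const uint8* src_uyvy, uint8* dst_y, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];  // odd bytes hold the luma samples.
  }
}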
|
4443 |
|
4444 #ifdef HAS_ARGBBLENDROW_SSE2 |
|
4445 // Blend 4 ARGB pixels (16 bytes) at a time. |
|
4446 __declspec(naked) __declspec(align(16)) |
|
4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
|
4448 uint8* dst_argb, int width) { |
|
4449 __asm { |
|
4450 push esi |
|
4451 mov eax, [esp + 4 + 4] // src_argb0 |
|
4452 mov esi, [esp + 4 + 8] // src_argb1 |
|
4453 mov edx, [esp + 4 + 12] // dst_argb |
|
4454 mov ecx, [esp + 4 + 16] // width |
|
4455 pcmpeqb xmm7, xmm7 // generate constant 1 |
|
4456 psrlw xmm7, 15 |
|
4457 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
|
4458 psrlw xmm6, 8 |
|
4459 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
|
4460 psllw xmm5, 8 |
|
4461 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
|
4462 pslld xmm4, 24 |
|
4463 |
|
4464 sub ecx, 1 |
|
4465 je convertloop1 // only 1 pixel? |
|
4466 jl convertloop1b |
|
4467 |
|
4468 // 1 pixel loop until destination pointer is aligned. |
|
4469 alignloop1: |
|
4470 test edx, 15 // aligned? |
|
4471 je alignloop1b |
|
4472 movd xmm3, [eax] |
|
4473 lea eax, [eax + 4] |
|
4474 movdqa xmm0, xmm3 // src argb |
|
4475 pxor xmm3, xmm4 // ~alpha |
|
4476 movd xmm2, [esi] // _r_b |
|
4477 psrlw xmm3, 8 // alpha |
|
4478 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
|
4479 pshuflw xmm3, xmm3, 0F5h |
|
4480 pand xmm2, xmm6 // _r_b |
|
4481 paddw xmm3, xmm7 // 256 - alpha |
|
4482 pmullw xmm2, xmm3 // _r_b * alpha |
|
4483 movd xmm1, [esi] // _a_g |
|
4484 lea esi, [esi + 4] |
|
4485 psrlw xmm1, 8 // _a_g |
|
4486 por xmm0, xmm4 // set alpha to 255 |
|
4487 pmullw xmm1, xmm3 // _a_g * alpha |
|
4488 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4489 paddusb xmm0, xmm2 // + src argb |
|
4490 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4491 paddusb xmm0, xmm1 // + src argb |
|
4492 sub ecx, 1 |
|
4493 movd [edx], xmm0 |
|
4494 lea edx, [edx + 4] |
|
4495 jge alignloop1 |
|
4496 |
|
4497 alignloop1b: |
|
4498 add ecx, 1 - 4 |
|
4499 jl convertloop4b |
|
4500 |
|
4501 // 4 pixel loop. |
|
4502 convertloop4: |
|
4503 movdqu xmm3, [eax] // src argb |
|
4504 lea eax, [eax + 16] |
|
4505 movdqa xmm0, xmm3 // src argb |
|
4506 pxor xmm3, xmm4 // ~alpha |
|
4507 movdqu xmm2, [esi] // _r_b |
|
4508 psrlw xmm3, 8 // alpha |
|
4509 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
|
4510 pshuflw xmm3, xmm3, 0F5h |
|
4511 pand xmm2, xmm6 // _r_b |
|
4512 paddw xmm3, xmm7 // 256 - alpha |
|
4513 pmullw xmm2, xmm3 // _r_b * alpha |
|
4514 movdqu xmm1, [esi] // _a_g |
|
4515 lea esi, [esi + 16] |
|
4516 psrlw xmm1, 8 // _a_g |
|
4517 por xmm0, xmm4 // set alpha to 255 |
|
4518 pmullw xmm1, xmm3 // _a_g * alpha |
|
4519 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4520 paddusb xmm0, xmm2 // + src argb |
|
4521 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4522 paddusb xmm0, xmm1 // + src argb |
|
4523 sub ecx, 4 |
|
4524 movdqa [edx], xmm0 |
|
4525 lea edx, [edx + 16] |
|
4526 jge convertloop4 |
|
4527 |
|
4528 convertloop4b: |
|
4529 add ecx, 4 - 1 |
|
4530 jl convertloop1b |
|
4531 |
|
4532 // 1 pixel loop. |
|
4533 convertloop1: |
|
4534 movd xmm3, [eax] // src argb |
|
4535 lea eax, [eax + 4] |
|
4536 movdqa xmm0, xmm3 // src argb |
|
4537 pxor xmm3, xmm4 // ~alpha |
|
4538 movd xmm2, [esi] // _r_b |
|
4539 psrlw xmm3, 8 // alpha |
|
4540 pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
|
4541 pshuflw xmm3, xmm3, 0F5h |
|
4542 pand xmm2, xmm6 // _r_b |
|
4543 paddw xmm3, xmm7 // 256 - alpha |
|
4544 pmullw xmm2, xmm3 // _r_b * alpha |
|
4545 movd xmm1, [esi] // _a_g |
|
4546 lea esi, [esi + 4] |
|
4547 psrlw xmm1, 8 // _a_g |
|
4548 por xmm0, xmm4 // set alpha to 255 |
|
4549 pmullw xmm1, xmm3 // _a_g * alpha |
|
4550 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4551 paddusb xmm0, xmm2 // + src argb |
|
4552 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4553 paddusb xmm0, xmm1 // + src argb |
|
4554 sub ecx, 1 |
|
4555 movd [edx], xmm0 |
|
4556 lea edx, [edx + 4] |
|
4557 jge convertloop1 |
|
4558 |
|
4559 convertloop1b: |
|
4560 pop esi |
|
4561 ret |
|
4562 } |
|
4563 } |
|
4564 #endif // HAS_ARGBBLENDROW_SSE2 |
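
// The blend above computes a source-over composite in fixed point:
// result = src + (dst * (256 - src_alpha)) >> 8 per color channel, with the
// result alpha forced to 255 and the additions saturated (paddusb). A scalar
// sketch of one pixel using the same fixed point approximation; the _Sketch
// name is illustrative only:
static void ARGBBlendPixel_Sketch(const uint8* src, const uint8* dst,
                                  uint8* out) {
  int a = src[3];                 // source alpha.
  int c;
  for (c = 0; c < 3; ++c) {       // B, G, R channels.
    int v = src[c] + ((dst[c] * (256 - a)) >> 8);
    out[c] = (uint8)(v > 255 ? 255 : v);  // saturate like paddusb.
  }
  out[3] = 255;                   // destination alpha is forced to opaque.
}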
|
4565 |
|
4566 #ifdef HAS_ARGBBLENDROW_SSSE3 |
|
4567 // Shuffle table for isolating alpha. |
|
4568 static const uvec8 kShuffleAlpha = { |
|
4569 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
|
4570 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
|
4571 }; |
|
4572 // Same as SSE2, but replaces: |
|
4573 // psrlw xmm3, 8 // alpha |
|
4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
|
4575 // pshuflw xmm3, xmm3, 0F5h |
|
4576 // with: |
|
4577 //    pshufb     xmm3, kShuffleAlpha // alpha |
|
4578 // Blend 4 ARGB pixels (16 bytes) at a time. |
|
4579 |
|
4580 __declspec(naked) __declspec(align(16)) |
|
4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
|
4582 uint8* dst_argb, int width) { |
|
4583 __asm { |
|
4584 push esi |
|
4585 mov eax, [esp + 4 + 4] // src_argb0 |
|
4586 mov esi, [esp + 4 + 8] // src_argb1 |
|
4587 mov edx, [esp + 4 + 12] // dst_argb |
|
4588 mov ecx, [esp + 4 + 16] // width |
|
4589 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
|
4590 psrlw xmm7, 15 |
|
4591 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
|
4592 psrlw xmm6, 8 |
|
4593 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
|
4594 psllw xmm5, 8 |
|
4595 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
|
4596 pslld xmm4, 24 |
|
4597 |
|
4598 sub ecx, 1 |
|
4599 je convertloop1 // only 1 pixel? |
|
4600 jl convertloop1b |
|
4601 |
|
4602 // 1 pixel loop until destination pointer is aligned. |
|
4603 alignloop1: |
|
4604 test edx, 15 // aligned? |
|
4605 je alignloop1b |
|
4606 movd xmm3, [eax] |
|
4607 lea eax, [eax + 4] |
|
4608 movdqa xmm0, xmm3 // src argb |
|
4609 pxor xmm3, xmm4 // ~alpha |
|
4610 movd xmm2, [esi] // _r_b |
|
4611 pshufb xmm3, kShuffleAlpha // alpha |
|
4612 pand xmm2, xmm6 // _r_b |
|
4613 paddw xmm3, xmm7 // 256 - alpha |
|
4614 pmullw xmm2, xmm3 // _r_b * alpha |
|
4615 movd xmm1, [esi] // _a_g |
|
4616 lea esi, [esi + 4] |
|
4617 psrlw xmm1, 8 // _a_g |
|
4618 por xmm0, xmm4 // set alpha to 255 |
|
4619 pmullw xmm1, xmm3 // _a_g * alpha |
|
4620 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4621 paddusb xmm0, xmm2 // + src argb |
|
4622 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4623 paddusb xmm0, xmm1 // + src argb |
|
4624 sub ecx, 1 |
|
4625 movd [edx], xmm0 |
|
4626 lea edx, [edx + 4] |
|
4627 jge alignloop1 |
|
4628 |
|
4629 alignloop1b: |
|
4630 add ecx, 1 - 4 |
|
4631 jl convertloop4b |
|
4632 |
|
4633 test eax, 15 // unaligned? |
|
4634 jne convertuloop4 |
|
4635 test esi, 15 // unaligned? |
|
4636 jne convertuloop4 |
|
4637 |
|
4638 // 4 pixel loop. |
|
4639 convertloop4: |
|
4640 movdqa xmm3, [eax] // src argb |
|
4641 lea eax, [eax + 16] |
|
4642 movdqa xmm0, xmm3 // src argb |
|
4643 pxor xmm3, xmm4 // ~alpha |
|
4644 movdqa xmm2, [esi] // _r_b |
|
4645 pshufb xmm3, kShuffleAlpha // alpha |
|
4646 pand xmm2, xmm6 // _r_b |
|
4647 paddw xmm3, xmm7 // 256 - alpha |
|
4648 pmullw xmm2, xmm3 // _r_b * alpha |
|
4649 movdqa xmm1, [esi] // _a_g |
|
4650 lea esi, [esi + 16] |
|
4651 psrlw xmm1, 8 // _a_g |
|
4652 por xmm0, xmm4 // set alpha to 255 |
|
4653 pmullw xmm1, xmm3 // _a_g * alpha |
|
4654 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4655 paddusb xmm0, xmm2 // + src argb |
|
4656 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4657 paddusb xmm0, xmm1 // + src argb |
|
4658 sub ecx, 4 |
|
4659 movdqa [edx], xmm0 |
|
4660 lea edx, [edx + 16] |
|
4661 jge convertloop4 |
|
4662 jmp convertloop4b |
|
4663 |
|
4664 // 4 pixel unaligned loop. |
|
4665 convertuloop4: |
|
4666 movdqu xmm3, [eax] // src argb |
|
4667 lea eax, [eax + 16] |
|
4668 movdqa xmm0, xmm3 // src argb |
|
4669 pxor xmm3, xmm4 // ~alpha |
|
4670 movdqu xmm2, [esi] // _r_b |
|
4671 pshufb xmm3, kShuffleAlpha // alpha |
|
4672 pand xmm2, xmm6 // _r_b |
|
4673 paddw xmm3, xmm7 // 256 - alpha |
|
4674 pmullw xmm2, xmm3 // _r_b * alpha |
|
4675 movdqu xmm1, [esi] // _a_g |
|
4676 lea esi, [esi + 16] |
|
4677 psrlw xmm1, 8 // _a_g |
|
4678 por xmm0, xmm4 // set alpha to 255 |
|
4679 pmullw xmm1, xmm3 // _a_g * alpha |
|
4680 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4681 paddusb xmm0, xmm2 // + src argb |
|
4682 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4683 paddusb xmm0, xmm1 // + src argb |
|
4684 sub ecx, 4 |
|
4685 movdqa [edx], xmm0 |
|
4686 lea edx, [edx + 16] |
|
4687 jge convertuloop4 |
|
4688 |
|
4689 convertloop4b: |
|
4690 add ecx, 4 - 1 |
|
4691 jl convertloop1b |
|
4692 |
|
4693 // 1 pixel loop. |
|
4694 convertloop1: |
|
4695 movd xmm3, [eax] // src argb |
|
4696 lea eax, [eax + 4] |
|
4697 movdqa xmm0, xmm3 // src argb |
|
4698 pxor xmm3, xmm4 // ~alpha |
|
4699 movd xmm2, [esi] // _r_b |
|
4700 pshufb xmm3, kShuffleAlpha // alpha |
|
4701 pand xmm2, xmm6 // _r_b |
|
4702 paddw xmm3, xmm7 // 256 - alpha |
|
4703 pmullw xmm2, xmm3 // _r_b * alpha |
|
4704 movd xmm1, [esi] // _a_g |
|
4705 lea esi, [esi + 4] |
|
4706 psrlw xmm1, 8 // _a_g |
|
4707 por xmm0, xmm4 // set alpha to 255 |
|
4708 pmullw xmm1, xmm3 // _a_g * alpha |
|
4709 psrlw xmm2, 8 // _r_b convert to 8 bits again |
|
4710 paddusb xmm0, xmm2 // + src argb |
|
4711 pand xmm1, xmm5 // a_g_ convert to 8 bits again |
|
4712 paddusb xmm0, xmm1 // + src argb |
|
4713 sub ecx, 1 |
|
4714 movd [edx], xmm0 |
|
4715 lea edx, [edx + 4] |
|
4716 jge convertloop1 |
|
4717 |
|
4718 convertloop1b: |
|
4719 pop esi |
|
4720 ret |
|
4721 } |
|
4722 } |
|
4723 #endif // HAS_ARGBBLENDROW_SSSE3 |
|
4724 |
|
4725 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
|
4726 // Attenuate 4 pixels at a time. |
|
4727 // Aligned to 16 bytes. |
|
4728 __declspec(naked) __declspec(align(16)) |
|
4729 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
|
4730 __asm { |
|
4731 mov eax, [esp + 4] // src_argb0 |
|
4732 mov edx, [esp + 8] // dst_argb |
|
4733 mov ecx, [esp + 12] // width |
|
4734 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
|
4735 pslld xmm4, 24 |
|
4736 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff |
|
4737 psrld xmm5, 8 |
|
4738 |
|
4739 align 4 |
|
4740 convertloop: |
|
4741 movdqa xmm0, [eax] // read 4 pixels |
|
4742 punpcklbw xmm0, xmm0 // first 2 |
|
4743 pshufhw xmm2, xmm0, 0FFh // 8 alpha words |
|
4744 pshuflw xmm2, xmm2, 0FFh |
|
4745 pmulhuw xmm0, xmm2 // rgb * a |
|
4746 movdqa xmm1, [eax] // read 4 pixels |
|
4747 punpckhbw xmm1, xmm1 // next 2 pixels |
|
4748 pshufhw xmm2, xmm1, 0FFh // 8 alpha words |
|
4749 pshuflw xmm2, xmm2, 0FFh |
|
4750 pmulhuw xmm1, xmm2 // rgb * a |
|
4751 movdqa xmm2, [eax] // alphas |
|
4752 lea eax, [eax + 16] |
|
4753 psrlw xmm0, 8 |
|
4754 pand xmm2, xmm4 |
|
4755 psrlw xmm1, 8 |
|
4756 packuswb xmm0, xmm1 |
|
4757 pand xmm0, xmm5 // keep original alphas |
|
4758 por xmm0, xmm2 |
|
4759 sub ecx, 4 |
|
4760 movdqa [edx], xmm0 |
|
4761 lea edx, [edx + 16] |
|
4762 jg convertloop |
|
4763 |
|
4764 ret |
|
4765 } |
|
4766 } |
|
4767 #endif // HAS_ARGBATTENUATEROW_SSE2 |
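
// The attenuate loops rely on a fixed point trick: unpacking a byte against
// itself (punpcklbw reg, reg) turns a value v into the 16 bit word v * 257,
// and pmulhuw of two such words followed by a final shift right of 8 yields
// (v * 257 * a * 257) >> 24, which is approximately v * a / 255. A scalar
// sketch of the per-channel result; the _Sketch name is illustrative only:
static void ARGBAttenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  int c;
  for (c = 0; c < 3; ++c) {  // premultiply B, G, R by alpha.
    dst[c] = (uint8)(((uint32)(src[c] * 257) * (a * 257)) >> 24);
  }
  dst[3] = (uint8)a;         // alpha is carried through unchanged.
}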
|
4768 |
|
4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
|
4770 // Shuffle table duplicating alpha. |
|
4771 static const uvec8 kShuffleAlpha0 = { |
|
4772 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
|
4773 }; |
|
4774 static const uvec8 kShuffleAlpha1 = { |
|
4775 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
|
4776 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
|
4777 }; |
|
4778 __declspec(naked) __declspec(align(16)) |
|
4779 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
|
4780 __asm { |
|
4781 mov eax, [esp + 4] // src_argb0 |
|
4782 mov edx, [esp + 8] // dst_argb |
|
4783 mov ecx, [esp + 12] // width |
|
4784 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
|
4785 pslld xmm3, 24 |
|
4786 movdqa xmm4, kShuffleAlpha0 |
|
4787 movdqa xmm5, kShuffleAlpha1 |
|
4788 |
|
4789 align 4 |
|
4790 convertloop: |
|
4791 movdqu xmm0, [eax] // read 4 pixels |
|
4792 pshufb xmm0, xmm4 // isolate first 2 alphas |
|
4793 movdqu xmm1, [eax] // read 4 pixels |
|
4794 punpcklbw xmm1, xmm1 // first 2 pixel rgbs |
|
4795 pmulhuw xmm0, xmm1 // rgb * a |
|
4796 movdqu xmm1, [eax] // read 4 pixels |
|
4797 pshufb xmm1, xmm5 // isolate next 2 alphas |
|
4798 movdqu xmm2, [eax] // read 4 pixels |
|
4799 punpckhbw xmm2, xmm2 // next 2 pixel rgbs |
|
4800 pmulhuw xmm1, xmm2 // rgb * a |
|
4801 movdqu xmm2, [eax] // mask original alpha |
|
4802 lea eax, [eax + 16] |
|
4803 pand xmm2, xmm3 |
|
4804 psrlw xmm0, 8 |
|
4805 psrlw xmm1, 8 |
|
4806 packuswb xmm0, xmm1 |
|
4807 por xmm0, xmm2 // copy original alpha |
|
4808 sub ecx, 4 |
|
4809 movdqu [edx], xmm0 |
|
4810 lea edx, [edx + 16] |
|
4811 jg convertloop |
|
4812 |
|
4813 ret |
|
4814 } |
|
4815 } |
|
4816 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
|
4817 |
|
4818 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
|
4819 // Shuffle table duplicating alpha. |
|
4820 static const ulvec8 kShuffleAlpha_AVX2 = { |
|
4821 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, |
|
4822 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, |
|
4823 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, |
|
4824 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, |
|
4825 }; |
|
4826 __declspec(naked) __declspec(align(16)) |
|
4827 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
|
4828 __asm { |
|
4829 mov eax, [esp + 4] // src_argb0 |
|
4830 mov edx, [esp + 8] // dst_argb |
|
4831 mov ecx, [esp + 12] // width |
|
4832 sub edx, eax |
|
4833 vmovdqa ymm4, kShuffleAlpha_AVX2 |
|
4834 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
|
4835 vpslld ymm5, ymm5, 24 |
|
4836 |
|
4837 align 4 |
|
4838 convertloop: |
|
4839 vmovdqu ymm6, [eax] // read 8 pixels. |
|
4840 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
|
4841 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
|
4842 vpshufb ymm2, ymm0, ymm4 // low 4 alphas |
|
4843 vpshufb ymm3, ymm1, ymm4 // high 4 alphas |
|
4844 vpmulhuw ymm0, ymm0, ymm2 // rgb * a |
|
4845 vpmulhuw ymm1, ymm1, ymm3 // rgb * a |
|
4846 vpand ymm6, ymm6, ymm5 // isolate alpha |
|
4847 vpsrlw ymm0, ymm0, 8 |
|
4848 vpsrlw ymm1, ymm1, 8 |
|
4849 vpackuswb ymm0, ymm0, ymm1 // unmutated. |
|
4850 vpor ymm0, ymm0, ymm6 // copy original alpha |
|
4851 sub ecx, 8 |
|
4852 vmovdqu [eax + edx], ymm0 |
|
4853 lea eax, [eax + 32] |
|
4854 jg convertloop |
|
4855 |
|
4856 vzeroupper |
|
4857 ret |
|
4858 } |
|
4859 } |
|
4860 #endif // HAS_ARGBATTENUATEROW_AVX2 |
|
4861 |
|
4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
|
4863 // Unattenuate 4 pixels at a time. |
|
4864 // Aligned to 16 bytes. |
|
4865 __declspec(naked) __declspec(align(16)) |
|
4866 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
|
4867 int width) { |
|
4868 __asm { |
|
4869 push esi |
|
4870 push edi |
|
4871 mov eax, [esp + 8 + 4] // src_argb0 |
|
4872 mov edx, [esp + 8 + 8] // dst_argb |
|
4873 mov ecx, [esp + 8 + 12] // width |
|
4874 |
|
4875 align 4 |
|
4876 convertloop: |
|
4877 movdqu xmm0, [eax] // read 4 pixels |
|
4878 movzx esi, byte ptr [eax + 3] // first alpha |
|
4879 movzx edi, byte ptr [eax + 7] // second alpha |
|
4880 punpcklbw xmm0, xmm0 // first 2 |
|
4881 movd xmm2, dword ptr fixed_invtbl8[esi * 4] |
|
4882 movd xmm3, dword ptr fixed_invtbl8[edi * 4] |
|
4883 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a |
|
4884 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
|
4885 movlhps xmm2, xmm3 |
|
4886 pmulhuw xmm0, xmm2 // rgb * ia |
|
4887 |
|
4888 movdqu xmm1, [eax] // read 4 pixels |
|
4889 movzx esi, byte ptr [eax + 11] // third alpha |
|
4890 movzx edi, byte ptr [eax + 15] // fourth alpha |
|
4891 punpckhbw xmm1, xmm1 // next 2 |
|
4892 movd xmm2, dword ptr fixed_invtbl8[esi * 4] |
|
4893 movd xmm3, dword ptr fixed_invtbl8[edi * 4] |
|
4894 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words |
|
4895 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
|
4896 movlhps xmm2, xmm3 |
|
4897 pmulhuw xmm1, xmm2 // rgb * ia |
|
4898 lea eax, [eax + 16] |
|
4899 |
|
4900 packuswb xmm0, xmm1 |
|
4901 sub ecx, 4 |
|
4902 movdqu [edx], xmm0 |
|
4903 lea edx, [edx + 16] |
|
4904 jg convertloop |
|
4905 pop edi |
|
4906 pop esi |
|
4907 ret |
|
4908 } |
|
4909 } |
|
4910 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
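
// Unattenuation undoes the premultiply: each color channel is scaled back up
// by the reciprocal of alpha and clamped to 255. The SIMD code uses the
// precomputed reciprocal table fixed_invtbl8 instead of dividing; the sketch
// below uses a plain divide to show only the intended result, not the exact
// fixed point rounding of the table. The _Sketch name is illustrative only:
static void ARGBUnattenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  int c;
  for (c = 0; c < 3; ++c) {
    int v = a ? (src[c] * 255 + a / 2) / a : src[c];  // scale up by 255 / alpha.
    dst[c] = (uint8)(v > 255 ? 255 : v);              // clamp; color may exceed alpha.
  }
  dst[3] = (uint8)a;
}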
|
4911 |
|
4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
|
4913 // Shuffle table duplicating alpha. |
|
4914 static const ulvec8 kUnattenShuffleAlpha_AVX2 = { |
|
4915 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, |
|
4916 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, |
|
4917 }; |
|
4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
|
4919 // USE_GATHER is off by default because vpgatherdd is slow on current hardware. |
|
4920 #ifdef USE_GATHER |
|
4921 __declspec(naked) __declspec(align(16)) |
|
4922 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
|
4923 int width) { |
|
4924 __asm { |
|
4925 mov eax, [esp + 4] // src_argb0 |
|
4926 mov edx, [esp + 8] // dst_argb |
|
4927 mov ecx, [esp + 12] // width |
|
4928 sub edx, eax |
|
4929 vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 |
|
4930 |
|
4931 align 4 |
|
4932 convertloop: |
|
4933 vmovdqu ymm6, [eax] // read 8 pixels. |
|
4934 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. |
|
4935 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. |
|
4936 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
|
4937 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
|
4938 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a |
|
4939 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
|
4940 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
|
4941 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a |
|
4942 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas |
|
4943 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
|
4944 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
|
4945 vpackuswb ymm0, ymm0, ymm1 // unmutated. |
|
4946 sub ecx, 8 |
|
4947 vmovdqu [eax + edx], ymm0 |
|
4948 lea eax, [eax + 32] |
|
4949 jg convertloop |
|
4950 |
|
4951 vzeroupper |
|
4952 ret |
|
4953 } |
|
4954 } |
|
4955 #else // USE_GATHER |
|
4956 __declspec(naked) __declspec(align(16)) |
|
4957 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
|
4958 int width) { |
|
4959 __asm { |
|
4960 |
|
4961 mov eax, [esp + 4] // src_argb0 |
|
4962 mov edx, [esp + 8] // dst_argb |
|
4963 mov ecx, [esp + 12] // width |
|
4964 sub edx, eax |
|
4965 vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 |
|
4966 |
|
4967 push esi |
|
4968 push edi |
|
4969 |
|
4970 align 4 |
|
4971 convertloop: |
|
4972 // replace VPGATHER |
|
4973 movzx esi, byte ptr [eax + 3] // alpha0 |
|
4974 movzx edi, byte ptr [eax + 7] // alpha1 |
|
4975 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] |
|
4976 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] |
|
4977 movzx esi, byte ptr [eax + 11] // alpha2 |
|
4978 movzx edi, byte ptr [eax + 15] // alpha3 |
|
4979 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] |
|
4980 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] |
|
4981 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] |
|
4982 movzx esi, byte ptr [eax + 19] // alpha4 |
|
4983 movzx edi, byte ptr [eax + 23] // alpha5 |
|
4984 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] |
|
4985 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] |
|
4986 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] |
|
4987 movzx esi, byte ptr [eax + 27] // alpha6 |
|
4988 movzx edi, byte ptr [eax + 31] // alpha7 |
|
4989 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] |
|
4990 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] |
|
4991 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] |
|
4992 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] |
|
4993 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] |
|
4994 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] |
|
4995 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] |
|
4996 // end of VPGATHER |
|
4997 |
|
4998 vmovdqu ymm6, [eax] // read 8 pixels. |
|
4999 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
|
5000 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
|
5001 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
|
5002 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
|
5003 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a |
|
5004 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas |
|
5005 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
|
5006 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
|
5007 vpackuswb ymm0, ymm0, ymm1 // unmutated. |
|
5008 sub ecx, 8 |
|
5009 vmovdqu [eax + edx], ymm0 |
|
5010 lea eax, [eax + 32] |
|
5011 jg convertloop |
|
5012 |
|
5013 pop edi |
|
5014 pop esi |
|
5015 vzeroupper |
|
5016 ret |
|
5017 } |
|
5018 } |
|
5019 #endif // USE_GATHER |
|
5020 #endif // HAS_ARGBATTENUATEROW_AVX2 |
|
5021 |
|
5022 #ifdef HAS_ARGBGRAYROW_SSSE3 |
|
5023 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. |
|
5024 __declspec(naked) __declspec(align(16)) |
|
5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
|
5026 __asm { |
|
5027 mov eax, [esp + 4] /* src_argb */ |
|
5028 mov edx, [esp + 8] /* dst_argb */ |
|
5029 mov ecx, [esp + 12] /* width */ |
|
5030 movdqa xmm4, kARGBToYJ |
|
5031 movdqa xmm5, kAddYJ64 |
|
5032 |
|
5033 align 4 |
|
5034 convertloop: |
|
5035 movdqa xmm0, [eax] // G |
|
5036 movdqa xmm1, [eax + 16] |
|
5037 pmaddubsw xmm0, xmm4 |
|
5038 pmaddubsw xmm1, xmm4 |
|
5039 phaddw xmm0, xmm1 |
|
5040 paddw xmm0, xmm5 // Add .5 for rounding. |
|
5041 psrlw xmm0, 7 |
|
5042 packuswb xmm0, xmm0 // 8 G bytes |
|
5043 movdqa xmm2, [eax] // A |
|
5044 movdqa xmm3, [eax + 16] |
|
5045 lea eax, [eax + 32] |
|
5046 psrld xmm2, 24 |
|
5047 psrld xmm3, 24 |
|
5048 packuswb xmm2, xmm3 |
|
5049 packuswb xmm2, xmm2 // 8 A bytes |
|
5050 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA |
|
5051 punpcklbw xmm0, xmm0 // 8 GG words |
|
5052 punpcklbw xmm3, xmm2 // 8 GA words |
|
5053 movdqa xmm1, xmm0 |
|
5054 punpcklwd xmm0, xmm3 // GGGA first 4 |
|
5055 punpckhwd xmm1, xmm3 // GGGA next 4 |
|
5056 sub ecx, 8 |
|
5057 movdqa [edx], xmm0 |
|
5058 movdqa [edx + 16], xmm1 |
|
5059 lea edx, [edx + 32] |
|
5060 jg convertloop |
|
5061 ret |
|
5062 } |
|
5063 } |
|
5064 #endif // HAS_ARGBGRAYROW_SSSE3 |
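
// The gray conversion uses the JPEG range luma coefficients kARGBToYJ
// (B * 15 + G * 75 + R * 38, coefficients summing to 128), adds 64 for
// rounding, shifts right by 7, and then replicates that luma into B, G and R
// while keeping the original alpha. A scalar sketch per pixel; the _Sketch
// name is illustrative only:
static void ARGBGrayPixel_Sketch(const uint8* src, uint8* dst) {
  uint8 y = (uint8)((src[0] * 15 + src[1] * 75 + src[2] * 38 + 64) >> 7);
  dst[0] = y;       // B
  dst[1] = y;       // G
  dst[2] = y;       // R
  dst[3] = src[3];  // keep alpha.
}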
|
5065 |
|
5066 #ifdef HAS_ARGBSEPIAROW_SSSE3 |
|
5067 // b = (r * 35 + g * 68 + b * 17) >> 7 |
|
5068 // g = (r * 45 + g * 88 + b * 22) >> 7 |
|
5069 // r = (r * 50 + g * 98 + b * 24) >> 7 |
|
5070 // Constant for ARGB color to sepia tone. |
|
5071 static const vec8 kARGBToSepiaB = { |
|
5072 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 |
|
5073 }; |
|
5074 |
|
5075 static const vec8 kARGBToSepiaG = { |
|
5076 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
|
5077 }; |
|
5078 |
|
5079 static const vec8 kARGBToSepiaR = { |
|
5080 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
|
5081 }; |
|
5082 |
|
5083 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
|
5084 __declspec(naked) __declspec(align(16)) |
|
5085 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
|
5086 __asm { |
|
5087 mov eax, [esp + 4] /* dst_argb */ |
|
5088 mov ecx, [esp + 8] /* width */ |
|
5089 movdqa xmm2, kARGBToSepiaB |
|
5090 movdqa xmm3, kARGBToSepiaG |
|
5091 movdqa xmm4, kARGBToSepiaR |
|
5092 |
|
5093 align 4 |
|
5094 convertloop: |
|
5095 movdqa xmm0, [eax] // B |
|
5096 movdqa xmm6, [eax + 16] |
|
5097 pmaddubsw xmm0, xmm2 |
|
5098 pmaddubsw xmm6, xmm2 |
|
5099 phaddw xmm0, xmm6 |
|
5100 psrlw xmm0, 7 |
|
5101 packuswb xmm0, xmm0 // 8 B values |
|
5102 movdqa xmm5, [eax] // G |
|
5103 movdqa xmm1, [eax + 16] |
|
5104 pmaddubsw xmm5, xmm3 |
|
5105 pmaddubsw xmm1, xmm3 |
|
5106 phaddw xmm5, xmm1 |
|
5107 psrlw xmm5, 7 |
|
5108 packuswb xmm5, xmm5 // 8 G values |
|
5109 punpcklbw xmm0, xmm5 // 8 BG values |
|
5110 movdqa xmm5, [eax] // R |
|
5111 movdqa xmm1, [eax + 16] |
|
5112 pmaddubsw xmm5, xmm4 |
|
5113 pmaddubsw xmm1, xmm4 |
|
5114 phaddw xmm5, xmm1 |
|
5115 psrlw xmm5, 7 |
|
5116 packuswb xmm5, xmm5 // 8 R values |
|
5117 movdqa xmm6, [eax] // A |
|
5118 movdqa xmm1, [eax + 16] |
|
5119 psrld xmm6, 24 |
|
5120 psrld xmm1, 24 |
|
5121 packuswb xmm6, xmm1 |
|
5122 packuswb xmm6, xmm6 // 8 A values |
|
5123 punpcklbw xmm5, xmm6 // 8 RA values |
|
5124 movdqa xmm1, xmm0 // Weave BG, RA together |
|
5125 punpcklwd xmm0, xmm5 // BGRA first 4 |
|
5126 punpckhwd xmm1, xmm5 // BGRA next 4 |
|
5127 sub ecx, 8 |
|
5128 movdqa [eax], xmm0 |
|
5129 movdqa [eax + 16], xmm1 |
|
5130 lea eax, [eax + 32] |
|
5131 jg convertloop |
|
5132 ret |
|
5133 } |
|
5134 } |
|
5135 #endif // HAS_ARGBSEPIAROW_SSSE3 |
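
// ARGBSepiaRow applies the three weighted sums documented above to every
// pixel in place, saturating each result to 255 (packuswb) and leaving alpha
// untouched. A scalar sketch of one pixel; the _Sketch name is illustrative
// only:
static void ARGBSepiaPixel_Sketch(uint8* p) {
  int b = (p[0] * 17 + p[1] * 68 + p[2] * 35) >> 7;  // new blue
  int g = (p[0] * 22 + p[1] * 88 + p[2] * 45) >> 7;  // new green
  int r = (p[0] * 24 + p[1] * 98 + p[2] * 50) >> 7;  // new red
  p[0] = (uint8)(b > 255 ? 255 : b);
  p[1] = (uint8)(g > 255 ? 255 : g);
  p[2] = (uint8)(r > 255 ? 255 : r);                 // alpha p[3] is unchanged.
}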
|
5136 |
|
5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
|
5138 // Transform 8 ARGB pixels (32 bytes) with color matrix. |
|
5139 // Same as Sepia except matrix is provided. |
|
5140 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R |
|
5141 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
|
5142 __declspec(naked) __declspec(align(16)) |
|
5143 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
|
5144 const int8* matrix_argb, int width) { |
|
5145 __asm { |
|
5146 mov eax, [esp + 4] /* src_argb */ |
|
5147 mov edx, [esp + 8] /* dst_argb */ |
|
5148 mov ecx, [esp + 12] /* matrix_argb */ |
|
5149 movdqu xmm5, [ecx] |
|
5150 pshufd xmm2, xmm5, 0x00 |
|
5151 pshufd xmm3, xmm5, 0x55 |
|
5152 pshufd xmm4, xmm5, 0xaa |
|
5153 pshufd xmm5, xmm5, 0xff |
|
5154 mov ecx, [esp + 16] /* width */ |
|
5155 |
|
5156 align 4 |
|
5157 convertloop: |
|
5158 movdqa xmm0, [eax] // B |
|
5159 movdqa xmm7, [eax + 16] |
|
5160 pmaddubsw xmm0, xmm2 |
|
5161 pmaddubsw xmm7, xmm2 |
|
5162 movdqa xmm6, [eax] // G |
|
5163 movdqa xmm1, [eax + 16] |
|
5164 pmaddubsw xmm6, xmm3 |
|
5165 pmaddubsw xmm1, xmm3 |
|
5166 phaddsw xmm0, xmm7 // B |
|
5167 phaddsw xmm6, xmm1 // G |
|
5168 psraw xmm0, 6 // B |
|
5169 psraw xmm6, 6 // G |
|
5170 packuswb xmm0, xmm0 // 8 B values |
|
5171 packuswb xmm6, xmm6 // 8 G values |
|
5172 punpcklbw xmm0, xmm6 // 8 BG values |
|
5173 movdqa xmm1, [eax] // R |
|
5174 movdqa xmm7, [eax + 16] |
|
5175 pmaddubsw xmm1, xmm4 |
|
5176 pmaddubsw xmm7, xmm4 |
|
5177 phaddsw xmm1, xmm7 // R |
|
5178 movdqa xmm6, [eax] // A |
|
5179 movdqa xmm7, [eax + 16] |
|
5180 pmaddubsw xmm6, xmm5 |
|
5181 pmaddubsw xmm7, xmm5 |
|
5182 phaddsw xmm6, xmm7 // A |
|
5183 psraw xmm1, 6 // R |
|
5184 psraw xmm6, 6 // A |
|
5185 packuswb xmm1, xmm1 // 8 R values |
|
5186 packuswb xmm6, xmm6 // 8 A values |
|
5187 punpcklbw xmm1, xmm6 // 8 RA values |
|
5188 movdqa xmm6, xmm0 // Weave BG, RA together |
|
5189 punpcklwd xmm0, xmm1 // BGRA first 4 |
|
5190 punpckhwd xmm6, xmm1 // BGRA next 4 |
|
5191 sub ecx, 8 |
|
5192 movdqa [edx], xmm0 |
|
5193 movdqa [edx + 16], xmm6 |
|
5194 lea eax, [eax + 32] |
|
5195 lea edx, [edx + 32] |
|
5196 jg convertloop |
|
5197 ret |
|
5198 } |
|
5199 } |
|
5200 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
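
// The color matrix kernel treats 'matrix_argb' as four rows of four signed
// coefficients. Each output channel is the dot product of the input B, G, R, A
// bytes with one row, shifted right by 6 and saturated to 0..255
// (pmaddubsw/phaddsw then packuswb). A scalar sketch of one pixel, ignoring
// the intermediate 16 bit saturation; the _Sketch name is illustrative only:
static void ARGBColorMatrixPixel_Sketch(const uint8* src, uint8* dst,
                                        const int8* m) {
  int c;
  for (c = 0; c < 4; ++c) {  // c = 0..3 produces B, G, R, A.
    int v = (src[0] * m[c * 4 + 0] + src[1] * m[c * 4 + 1] +
             src[2] * m[c * 4 + 2] + src[3] * m[c * 4 + 3]) >> 6;
    dst[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}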
|
5201 |
|
5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
|
5203 // Quantize 4 ARGB pixels (16 bytes). |
|
5204 // Aligned to 16 bytes. |
|
5205 __declspec(naked) __declspec(align(16)) |
|
5206 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
|
5207 int interval_offset, int width) { |
|
5208 __asm { |
|
5209 mov eax, [esp + 4] /* dst_argb */ |
|
5210 movd xmm2, [esp + 8] /* scale */ |
|
5211 movd xmm3, [esp + 12] /* interval_size */ |
|
5212 movd xmm4, [esp + 16] /* interval_offset */ |
|
5213 mov ecx, [esp + 20] /* width */ |
|
5214 pshuflw xmm2, xmm2, 040h |
|
5215 pshufd xmm2, xmm2, 044h |
|
5216 pshuflw xmm3, xmm3, 040h |
|
5217 pshufd xmm3, xmm3, 044h |
|
5218 pshuflw xmm4, xmm4, 040h |
|
5219 pshufd xmm4, xmm4, 044h |
|
5220 pxor xmm5, xmm5 // constant 0 |
|
5221 pcmpeqb xmm6, xmm6 // generate mask 0xff000000 |
|
5222 pslld xmm6, 24 |
|
5223 |
|
5224 align 4 |
|
5225 convertloop: |
|
5226 movdqa xmm0, [eax] // read 4 pixels |
|
5227 punpcklbw xmm0, xmm5 // first 2 pixels |
|
5228 pmulhuw xmm0, xmm2 // pixel * scale >> 16 |
|
5229 movdqa xmm1, [eax] // read 4 pixels |
|
5230 punpckhbw xmm1, xmm5 // next 2 pixels |
|
5231 pmulhuw xmm1, xmm2 |
|
5232 pmullw xmm0, xmm3 // * interval_size |
|
5233 movdqa xmm7, [eax] // read 4 pixels |
|
5234 pmullw xmm1, xmm3 |
|
5235 pand xmm7, xmm6 // mask alpha |
|
5236 paddw xmm0, xmm4 // + interval_offset |
|
5237 paddw xmm1, xmm4 |
|
5238 packuswb xmm0, xmm1 |
|
5239 por xmm0, xmm7 |
|
5240 sub ecx, 4 |
|
5241 movdqa [eax], xmm0 |
|
5242 lea eax, [eax + 16] |
|
5243 jg convertloop |
|
5244 ret |
|
5245 } |
|
5246 } |
|
5247 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
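
// Quantization maps each color channel to a bucket:
// v = (v * scale >> 16) * interval_size + interval_offset, with the alpha
// byte preserved. A scalar sketch of the intended per-pixel math; the SSE2
// code above computes it with pmulhuw/pmullw/paddw on words, and the _Sketch
// name is illustrative only:
static void ARGBQuantizePixel_Sketch(uint8* p, int scale, int interval_size,
                                     int interval_offset) {
  int c;
  for (c = 0; c < 3; ++c) {  // quantize B, G, R; leave alpha (p[3]) as is.
    p[c] = (uint8)(((p[c] * scale) >> 16) * interval_size + interval_offset);
  }
}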
|
5248 |
|
5249 #ifdef HAS_ARGBSHADEROW_SSE2 |
|
5250 // Shade 4 pixels at a time by specified value. |
|
5251 // Aligned to 16 bytes. |
|
5252 __declspec(naked) __declspec(align(16)) |
|
5253 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
|
5254 uint32 value) { |
|
5255 __asm { |
|
5256 mov eax, [esp + 4] // src_argb |
|
5257 mov edx, [esp + 8] // dst_argb |
|
5258 mov ecx, [esp + 12] // width |
|
5259 movd xmm2, [esp + 16] // value |
|
5260 punpcklbw xmm2, xmm2 |
|
5261 punpcklqdq xmm2, xmm2 |
|
5262 |
|
5263 align 4 |
|
5264 convertloop: |
|
5265 movdqa xmm0, [eax] // read 4 pixels |
|
5266 lea eax, [eax + 16] |
|
5267 movdqa xmm1, xmm0 |
|
5268 punpcklbw xmm0, xmm0 // first 2 |
|
5269 punpckhbw xmm1, xmm1 // next 2 |
|
5270 pmulhuw xmm0, xmm2 // argb * value |
|
5271 pmulhuw xmm1, xmm2 // argb * value |
|
5272 psrlw xmm0, 8 |
|
5273 psrlw xmm1, 8 |
|
5274 packuswb xmm0, xmm1 |
|
5275 sub ecx, 4 |
|
5276 movdqa [edx], xmm0 |
|
5277 lea edx, [edx + 16] |
|
5278 jg convertloop |
|
5279 |
|
5280 ret |
|
5281 } |
|
5282 } |
|
5283 #endif // HAS_ARGBSHADEROW_SSE2 |
|
5284 |
|
5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
|
5286 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
|
5287 __declspec(naked) __declspec(align(16)) |
|
5288 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
|
5289 uint8* dst_argb, int width) { |
|
5290 __asm { |
|
5291 push esi |
|
5292 mov eax, [esp + 4 + 4] // src_argb0 |
|
5293 mov esi, [esp + 4 + 8] // src_argb1 |
|
5294 mov edx, [esp + 4 + 12] // dst_argb |
|
5295 mov ecx, [esp + 4 + 16] // width |
|
5296 pxor xmm5, xmm5 // constant 0 |
|
5297 |
|
5298 align 4 |
|
5299 convertloop: |
|
5300 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
|
5301 movdqu xmm2, [esi] // read 4 pixels from src_argb1 |
|
5302 movdqu xmm1, xmm0 |
|
5303 movdqu xmm3, xmm2 |
|
5304 punpcklbw xmm0, xmm0 // first 2 |
|
5305 punpckhbw xmm1, xmm1 // next 2 |
|
5306 punpcklbw xmm2, xmm5 // first 2 |
|
5307 punpckhbw xmm3, xmm5 // next 2 |
|
5308 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 |
|
5309 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 |
|
5310 lea eax, [eax + 16] |
|
5311 lea esi, [esi + 16] |
|
5312 packuswb xmm0, xmm1 |
|
5313 sub ecx, 4 |
|
5314 movdqu [edx], xmm0 |
|
5315 lea edx, [edx + 16] |
|
5316 jg convertloop |
|
5317 |
|
5318 pop esi |
|
5319 ret |
|
5320 } |
|
5321 } |
|
5322 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
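
// ARGBMultiplyRow multiplies the two sources channel by channel in fixed
// point: one operand is widened by byte duplication (value * 257), the other
// is zero extended, and pmulhuw keeps the high 16 bits, giving roughly
// (a * b) / 255 with truncation. A scalar sketch of that fixed point step;
// the _Sketch name is illustrative only:
static void ARGBMultiplyPixel_Sketch(const uint8* src0, const uint8* src1,
                                     uint8* dst) {
  int c;
  for (c = 0; c < 4; ++c) {  // all four channels, including alpha.
    dst[c] = (uint8)(((uint32)(src0[c] * 257) * src1[c]) >> 16);
  }
}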
|
5323 |
|
5324 #ifdef HAS_ARGBADDROW_SSE2 |
|
5325 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
|
5326 // TODO(fbarchard): Port this to posix, neon and other math functions. |
|
5327 __declspec(naked) __declspec(align(16)) |
|
5328 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
|
5329 uint8* dst_argb, int width) { |
|
5330 __asm { |
|
5331 push esi |
|
5332 mov eax, [esp + 4 + 4] // src_argb0 |
|
5333 mov esi, [esp + 4 + 8] // src_argb1 |
|
5334 mov edx, [esp + 4 + 12] // dst_argb |
|
5335 mov ecx, [esp + 4 + 16] // width |
|
5336 |
|
5337 sub ecx, 4 |
|
5338 jl convertloop49 |
|
5339 |
|
5340 align 4 |
|
5341 convertloop4: |
|
5342 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
|
5343 lea eax, [eax + 16] |
|
5344 movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
|
5345 lea esi, [esi + 16] |
|
5346 paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
|
5347 sub ecx, 4 |
|
5348 movdqu [edx], xmm0 |
|
5349 lea edx, [edx + 16] |
|
5350 jge convertloop4 |
|
5351 |
|
5352 convertloop49: |
|
5353 add ecx, 4 - 1 |
|
5354 jl convertloop19 |
|
5355 |
|
5356 convertloop1: |
|
5357 movd xmm0, [eax] // read 1 pixels from src_argb0 |
|
5358 lea eax, [eax + 4] |
|
5359 movd xmm1, [esi] // read 1 pixels from src_argb1 |
|
5360 lea esi, [esi + 4] |
|
5361 paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
|
5362 sub ecx, 1 |
|
5363 movd [edx], xmm0 |
|
5364 lea edx, [edx + 4] |
|
5365 jge convertloop1 |
|
5366 |
|
5367 convertloop19: |
|
5368 pop esi |
|
5369 ret |
|
5370 } |
|
5371 } |
|
5372 #endif // HAS_ARGBADDROW_SSE2 |
|
5373 |
|
5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
|
5375 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. |
|
5376 __declspec(naked) __declspec(align(16)) |
|
5377 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
|
5378 uint8* dst_argb, int width) { |
|
5379 __asm { |
|
5380 push esi |
|
5381 mov eax, [esp + 4 + 4] // src_argb0 |
|
5382 mov esi, [esp + 4 + 8] // src_argb1 |
|
5383 mov edx, [esp + 4 + 12] // dst_argb |
|
5384 mov ecx, [esp + 4 + 16] // width |
|
5385 |
|
5386 align 4 |
|
5387 convertloop: |
|
5388 movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
|
5389 lea eax, [eax + 16] |
|
5390 movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
|
5391 lea esi, [esi + 16] |
|
5392 psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
|
5393 sub ecx, 4 |
|
5394 movdqu [edx], xmm0 |
|
5395 lea edx, [edx + 16] |
|
5396 jg convertloop |
|
5397 |
|
5398 pop esi |
|
5399 ret |
|
5400 } |
|
5401 } |
|
5402 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
|
5403 |
|
5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
|
5405 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
|
5406 __declspec(naked) __declspec(align(16)) |
|
5407 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
|
5408 uint8* dst_argb, int width) { |
|
5409 __asm { |
|
5410 push esi |
|
5411 mov eax, [esp + 4 + 4] // src_argb0 |
|
5412 mov esi, [esp + 4 + 8] // src_argb1 |
|
5413 mov edx, [esp + 4 + 12] // dst_argb |
|
5414 mov ecx, [esp + 4 + 16] // width |
|
5415 vpxor ymm5, ymm5, ymm5 // constant 0 |
|
5416 |
|
5417 align 4 |
|
5418 convertloop: |
|
5419 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 |
|
5420 lea eax, [eax + 32] |
|
5421 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 |
|
5422 lea esi, [esi + 32] |
|
5423 vpunpcklbw ymm0, ymm1, ymm1 // low 4 |
|
5424 vpunpckhbw ymm1, ymm1, ymm1 // high 4 |
|
5425 vpunpcklbw ymm2, ymm3, ymm5 // low 4 |
|
5426 vpunpckhbw ymm3, ymm3, ymm5 // high 4 |
|
5427 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 |
|
5428 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 |
|
5429 vpackuswb ymm0, ymm0, ymm1 |
|
5430 vmovdqu [edx], ymm0 |
|
5431 lea edx, [edx + 32] |
|
5432 sub ecx, 8 |
|
5433 jg convertloop |
|
5434 |
|
5435 pop esi |
|
5436 vzeroupper |
|
5437 ret |
|
5438 } |
|
5439 } |
|
5440 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
|
5441 |
|
5442 #ifdef HAS_ARGBADDROW_AVX2 |
|
5443 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
|
5444 __declspec(naked) __declspec(align(16)) |
|
5445 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
|
5446 uint8* dst_argb, int width) { |
|
5447 __asm { |
|
5448 push esi |
|
5449 mov eax, [esp + 4 + 4] // src_argb0 |
|
5450 mov esi, [esp + 4 + 8] // src_argb1 |
|
5451 mov edx, [esp + 4 + 12] // dst_argb |
|
5452 mov ecx, [esp + 4 + 16] // width |
|
5453 |
|
5454 align 4 |
|
5455 convertloop: |
|
5456 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
|
5457 lea eax, [eax + 32] |
|
5458 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
|
5459 lea esi, [esi + 32] |
|
5460 vmovdqu [edx], ymm0 |
|
5461 lea edx, [edx + 32] |
|
5462 sub ecx, 8 |
|
5463 jg convertloop |
|
5464 |
|
5465 pop esi |
|
5466 vzeroupper |
|
5467 ret |
|
5468 } |
|
5469 } |
|
5470 #endif // HAS_ARGBADDROW_AVX2 |
|
5471 |
|
5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
|
5473 // Subtract one row of ARGB pixels from another, 8 pixels at a time. |
|
5474 __declspec(naked) __declspec(align(16)) |
|
5475 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
|
5476 uint8* dst_argb, int width) { |
|
5477 __asm { |
|
5478 push esi |
|
5479 mov eax, [esp + 4 + 4] // src_argb0 |
|
5480 mov esi, [esp + 4 + 8] // src_argb1 |
|
5481 mov edx, [esp + 4 + 12] // dst_argb |
|
5482 mov ecx, [esp + 4 + 16] // width |
|
5483 |
|
5484 align 4 |
|
5485 convertloop: |
|
5486 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
|
5487 lea eax, [eax + 32] |
|
5488 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 |
|
5489 lea esi, [esi + 32] |
|
5490 vmovdqu [edx], ymm0 |
|
5491 lea edx, [edx + 32] |
|
5492 sub ecx, 8 |
|
5493 jg convertloop |
|
5494 |
|
5495 pop esi |
|
5496 vzeroupper |
|
5497 ret |
|
5498 } |
|
5499 } |
|
5500 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
|
5501 |
|
5502 #ifdef HAS_SOBELXROW_SSE2 |
|
5503 // SobelX as a matrix is |
|
5504 // -1 0 1 |
|
5505 // -2 0 2 |
|
5506 // -1 0 1 |
|
5507 __declspec(naked) __declspec(align(16)) |
|
5508 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
|
5509 const uint8* src_y2, uint8* dst_sobelx, int width) { |
|
5510 __asm { |
|
5511 push esi |
|
5512 push edi |
|
5513 mov eax, [esp + 8 + 4] // src_y0 |
|
5514 mov esi, [esp + 8 + 8] // src_y1 |
|
5515 mov edi, [esp + 8 + 12] // src_y2 |
|
5516 mov edx, [esp + 8 + 16] // dst_sobelx |
|
5517 mov ecx, [esp + 8 + 20] // width |
|
5518 sub esi, eax |
|
5519 sub edi, eax |
|
5520 sub edx, eax |
|
5521 pxor xmm5, xmm5 // constant 0 |
|
5522 |
|
5523 align 4 |
|
5524 convertloop: |
|
5525 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
|
5526 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
|
5527 punpcklbw xmm0, xmm5 |
|
5528 punpcklbw xmm1, xmm5 |
|
5529 psubw xmm0, xmm1 |
|
5530 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
|
5531 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
|
5532 punpcklbw xmm1, xmm5 |
|
5533 punpcklbw xmm2, xmm5 |
|
5534 psubw xmm1, xmm2 |
|
5535 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] |
|
5536 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] |
|
5537 punpcklbw xmm2, xmm5 |
|
5538 punpcklbw xmm3, xmm5 |
|
5539 psubw xmm2, xmm3 |
|
5540 paddw xmm0, xmm2 |
|
5541 paddw xmm0, xmm1 |
|
5542 paddw xmm0, xmm1 |
|
5543 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
|
5544 psubw xmm1, xmm0 |
|
5545 pmaxsw xmm0, xmm1 |
|
5546 packuswb xmm0, xmm0 |
|
5547 sub ecx, 8 |
|
5548 movq qword ptr [eax + edx], xmm0 |
|
5549 lea eax, [eax + 8] |
|
5550 jg convertloop |
|
5551 |
|
5552 pop edi |
|
5553 pop esi |
|
5554 ret |
|
5555 } |
|
5556 } |
|
5557 #endif // HAS_SOBELXROW_SSE2 |
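
// Reference sketch (illustrative only; not part of libyuv): per output byte
// the loop above forms the horizontal gradient
// |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|, saturated to 255:
//
// static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
//                              const uint8* src_y2, uint8* dst_sobelx,
//                              int width) {
//   int i;
//   for (i = 0; i < width; ++i) {
//     int a = src_y0[i] - src_y0[i + 2];
//     int b = src_y1[i] - src_y1[i + 2];
//     int c = src_y2[i] - src_y2[i + 2];
//     int sobel = a + b * 2 + c;
//     if (sobel < 0) sobel = -sobel;
//     dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
//   }
// }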
|
5558 |
|
5559 #ifdef HAS_SOBELYROW_SSE2 |
|
5560 // SobelY as a matrix is |
|
5561 // -1 -2 -1 |
|
5562 // 0 0 0 |
|
5563 // 1 2 1 |
|
5564 __declspec(naked) __declspec(align(16)) |
|
5565 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
|
5566 uint8* dst_sobely, int width) { |
|
5567 __asm { |
|
5568 push esi |
|
5569 mov eax, [esp + 4 + 4] // src_y0 |
|
5570 mov esi, [esp + 4 + 8] // src_y1 |
|
5571 mov edx, [esp + 4 + 12] // dst_sobely |
|
5572 mov ecx, [esp + 4 + 16] // width |
|
5573 sub esi, eax |
|
5574 sub edx, eax |
|
5575 pxor xmm5, xmm5 // constant 0 |
|
5576 |
|
5577 align 4 |
|
5578 convertloop: |
|
5579 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
|
5580 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
|
5581 punpcklbw xmm0, xmm5 |
|
5582 punpcklbw xmm1, xmm5 |
|
5583 psubw xmm0, xmm1 |
|
5584 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] |
|
5585 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] |
|
5586 punpcklbw xmm1, xmm5 |
|
5587 punpcklbw xmm2, xmm5 |
|
5588 psubw xmm1, xmm2 |
|
5589 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
|
5590 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
|
5591 punpcklbw xmm2, xmm5 |
|
5592 punpcklbw xmm3, xmm5 |
|
5593 psubw xmm2, xmm3 |
|
5594 paddw xmm0, xmm2 |
|
5595 paddw xmm0, xmm1 |
|
5596 paddw xmm0, xmm1 |
|
5597 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
|
5598 psubw xmm1, xmm0 |
|
5599 pmaxsw xmm0, xmm1 |
|
5600 packuswb xmm0, xmm0 |
|
5601 sub ecx, 8 |
|
5602 movq qword ptr [eax + edx], xmm0 |
|
5603 lea eax, [eax + 8] |
|
5604 jg convertloop |
|
5605 |
|
5606 pop esi |
|
5607 ret |
|
5608 } |
|
5609 } |
|
5610 #endif // HAS_SOBELYROW_SSE2 |
|
5611 |
|
5612 #ifdef HAS_SOBELROW_SSE2 |
|
5613 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
|
5614 // A = 255 |
|
5615 // R = Sobel |
|
5616 // G = Sobel |
|
5617 // B = Sobel |
|
5618 __declspec(naked) __declspec(align(16)) |
|
5619 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
|
5620 uint8* dst_argb, int width) { |
|
5621 __asm { |
|
5622 push esi |
|
5623 mov eax, [esp + 4 + 4] // src_sobelx |
|
5624 mov esi, [esp + 4 + 8] // src_sobely |
|
5625 mov edx, [esp + 4 + 12] // dst_argb |
|
5626 mov ecx, [esp + 4 + 16] // width |
|
5627 sub esi, eax |
|
5628 pcmpeqb xmm5, xmm5 // alpha 255 |
|
5629 pslld xmm5, 24 // 0xff000000 |
|
5630 |
|
5631 align 4 |
|
5632 convertloop: |
|
5633 movdqa xmm0, [eax] // read 16 pixels src_sobelx |
|
5634 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
|
5635 lea eax, [eax + 16] |
|
5636 paddusb xmm0, xmm1 // sobel = sobelx + sobely |
|
5637 movdqa xmm2, xmm0 // GG |
|
5638 punpcklbw xmm2, xmm0 // First 8 |
|
5639 punpckhbw xmm0, xmm0 // Next 8 |
|
5640 movdqa xmm1, xmm2 // GGGG |
|
5641 punpcklwd xmm1, xmm2 // First 4 |
|
5642 punpckhwd xmm2, xmm2 // Next 4 |
|
5643 por xmm1, xmm5 // GGGA |
|
5644 por xmm2, xmm5 |
|
5645 movdqa xmm3, xmm0 // GGGG |
|
5646 punpcklwd xmm3, xmm0 // Next 4 |
|
5647 punpckhwd xmm0, xmm0 // Last 4 |
|
5648 por xmm3, xmm5 // GGGA |
|
5649 por xmm0, xmm5 |
|
5650 sub ecx, 16 |
|
5651 movdqa [edx], xmm1 |
|
5652 movdqa [edx + 16], xmm2 |
|
5653 movdqa [edx + 32], xmm3 |
|
5654 movdqa [edx + 48], xmm0 |
|
5655 lea edx, [edx + 64] |
|
5656 jg convertloop |
|
5657 |
|
5658 pop esi |
|
5659 ret |
|
5660 } |
|
5661 } |
|
5662 #endif // HAS_SOBELROW_SSE2 |
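
// Reference sketch (illustrative only; not part of libyuv): the unpack
// sequence above replicates s = min(255, sobelx + sobely) into B, G and R
// and forces alpha to 255:
//
// static void SobelRow_Sketch(const uint8* src_sobelx,
//                             const uint8* src_sobely,
//                             uint8* dst_argb, int width) {
//   int i;
//   for (i = 0; i < width; ++i) {
//     int s = src_sobelx[i] + src_sobely[i];
//     if (s > 255) s = 255;
//     dst_argb[i * 4 + 0] = (uint8)s;  // B
//     dst_argb[i * 4 + 1] = (uint8)s;  // G
//     dst_argb[i * 4 + 2] = (uint8)s;  // R
//     dst_argb[i * 4 + 3] = 255u;      // A
//   }
// }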
|
5663 |
|
5664 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
|
5665 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
|
5666 __declspec(naked) __declspec(align(16)) |
|
5667 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
|
5668 uint8* dst_y, int width) { |
|
5669 __asm { |
|
5670 push esi |
|
5671 mov eax, [esp + 4 + 4] // src_sobelx |
|
5672 mov esi, [esp + 4 + 8] // src_sobely |
|
5673     mov        edx, [esp + 4 + 12]   // dst_y |
|
5674 mov ecx, [esp + 4 + 16] // width |
|
5675 sub esi, eax |
|
5676 |
|
5677 align 4 |
|
5678 convertloop: |
|
5679 movdqa xmm0, [eax] // read 16 pixels src_sobelx |
|
5680 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
|
5681 lea eax, [eax + 16] |
|
5682 paddusb xmm0, xmm1 // sobel = sobelx + sobely |
|
5683 sub ecx, 16 |
|
5684 movdqa [edx], xmm0 |
|
5685 lea edx, [edx + 16] |
|
5686 jg convertloop |
|
5687 |
|
5688 pop esi |
|
5689 ret |
|
5690 } |
|
5691 } |
|
5692 #endif // HAS_SOBELTOPLANEROW_SSE2 |
|
5693 |
|
5694 #ifdef HAS_SOBELXYROW_SSE2 |
|
5695 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
|
5696 // A = 255 |
|
5697 // R = Sobel X |
|
5698 // G = Sobel |
|
5699 // B = Sobel Y |
|
5700 __declspec(naked) __declspec(align(16)) |
|
5701 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
|
5702 uint8* dst_argb, int width) { |
|
5703 __asm { |
|
5704 push esi |
|
5705 mov eax, [esp + 4 + 4] // src_sobelx |
|
5706 mov esi, [esp + 4 + 8] // src_sobely |
|
5707 mov edx, [esp + 4 + 12] // dst_argb |
|
5708 mov ecx, [esp + 4 + 16] // width |
|
5709 sub esi, eax |
|
5710 pcmpeqb xmm5, xmm5 // alpha 255 |
|
5711 |
|
5712 align 4 |
|
5713 convertloop: |
|
5714 movdqa xmm0, [eax] // read 16 pixels src_sobelx |
|
5715 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
|
5716 lea eax, [eax + 16] |
|
5717 movdqa xmm2, xmm0 |
|
5718 paddusb xmm2, xmm1 // sobel = sobelx + sobely |
|
5719 movdqa xmm3, xmm0 // XA |
|
5720 punpcklbw xmm3, xmm5 |
|
5721 punpckhbw xmm0, xmm5 |
|
5722 movdqa xmm4, xmm1 // YS |
|
5723 punpcklbw xmm4, xmm2 |
|
5724 punpckhbw xmm1, xmm2 |
|
5725 movdqa xmm6, xmm4 // YSXA |
|
5726 punpcklwd xmm6, xmm3 // First 4 |
|
5727 punpckhwd xmm4, xmm3 // Next 4 |
|
5728 movdqa xmm7, xmm1 // YSXA |
|
5729 punpcklwd xmm7, xmm0 // Next 4 |
|
5730 punpckhwd xmm1, xmm0 // Last 4 |
|
5731 sub ecx, 16 |
|
5732 movdqa [edx], xmm6 |
|
5733 movdqa [edx + 16], xmm4 |
|
5734 movdqa [edx + 32], xmm7 |
|
5735 movdqa [edx + 48], xmm1 |
|
5736 lea edx, [edx + 64] |
|
5737 jg convertloop |
|
5738 |
|
5739 pop esi |
|
5740 ret |
|
5741 } |
|
5742 } |
|
5743 #endif // HAS_SOBELXYROW_SSE2 |
|
5744 |
|
5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
|
5746 // Consider float CumulativeSum. |
|
5747 // Consider calling CumulativeSum one row at a time as needed. |
|
5748 // Consider circular CumulativeSum buffer of radius * 2 + 1 height. |
|
5749 // Convert cumulative sum for an area to an average for 1 pixel. |
|
5750 // topleft is pointer to top left of CumulativeSum buffer for area. |
|
5751 // botleft is pointer to bottom left of CumulativeSum buffer. |
|
5752 // width is offset from left to right of area in CumulativeSum buffer measured |
|
5753 // in number of ints. |
|
5754 // area is the number of pixels in the area being averaged. |
|
5755 // dst points to pixel to store result to. |
|
5756 // count is number of averaged pixels to produce. |
|
5757 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte |
|
5758 // aligned. |
|
5759 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, |
|
5760 int width, int area, uint8* dst, |
|
5761 int count) { |
|
5762 __asm { |
|
5763 mov eax, topleft // eax topleft |
|
5764 mov esi, botleft // esi botleft |
|
5765 mov edx, width |
|
5766 movd xmm5, area |
|
5767 mov edi, dst |
|
5768 mov ecx, count |
|
5769 cvtdq2ps xmm5, xmm5 |
|
5770 rcpss xmm4, xmm5 // 1.0f / area |
|
5771 pshufd xmm4, xmm4, 0 |
|
5772 sub ecx, 4 |
|
5773 jl l4b |
|
5774 |
|
5775 cmp area, 128 // 128 pixels will not overflow 15 bits. |
|
5776 ja l4 |
|
5777 |
|
5778 pshufd xmm5, xmm5, 0 // area |
|
5779 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 |
|
5780 psrld xmm6, 16 |
|
5781 cvtdq2ps xmm6, xmm6 |
|
5782 addps xmm5, xmm6 // (65536.0 + area - 1) |
|
5783 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area |
|
5784 cvtps2dq xmm5, xmm5 // 0.16 fixed point |
|
5785 packssdw xmm5, xmm5 // 16 bit shorts |
|
5786 |
|
5787 // 4 pixel loop small blocks. |
|
5788 align 4 |
|
5789 s4: |
|
5790 // top left |
|
5791 movdqa xmm0, [eax] |
|
5792 movdqa xmm1, [eax + 16] |
|
5793 movdqa xmm2, [eax + 32] |
|
5794 movdqa xmm3, [eax + 48] |
|
5795 |
|
5796 // - top right |
|
5797 psubd xmm0, [eax + edx * 4] |
|
5798 psubd xmm1, [eax + edx * 4 + 16] |
|
5799 psubd xmm2, [eax + edx * 4 + 32] |
|
5800 psubd xmm3, [eax + edx * 4 + 48] |
|
5801 lea eax, [eax + 64] |
|
5802 |
|
5803 // - bottom left |
|
5804 psubd xmm0, [esi] |
|
5805 psubd xmm1, [esi + 16] |
|
5806 psubd xmm2, [esi + 32] |
|
5807 psubd xmm3, [esi + 48] |
|
5808 |
|
5809 // + bottom right |
|
5810 paddd xmm0, [esi + edx * 4] |
|
5811 paddd xmm1, [esi + edx * 4 + 16] |
|
5812 paddd xmm2, [esi + edx * 4 + 32] |
|
5813 paddd xmm3, [esi + edx * 4 + 48] |
|
5814 lea esi, [esi + 64] |
|
5815 |
|
5816 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers |
|
5817 packssdw xmm2, xmm3 |
|
5818 |
|
5819 pmulhuw xmm0, xmm5 |
|
5820 pmulhuw xmm2, xmm5 |
|
5821 |
|
5822 packuswb xmm0, xmm2 |
|
5823 movdqu [edi], xmm0 |
|
5824 lea edi, [edi + 16] |
|
5825 sub ecx, 4 |
|
5826 jge s4 |
|
5827 |
|
5828 jmp l4b |
|
5829 |
|
5830 // 4 pixel loop |
|
5831 align 4 |
|
5832 l4: |
|
5833 // top left |
|
5834 movdqa xmm0, [eax] |
|
5835 movdqa xmm1, [eax + 16] |
|
5836 movdqa xmm2, [eax + 32] |
|
5837 movdqa xmm3, [eax + 48] |
|
5838 |
|
5839 // - top right |
|
5840 psubd xmm0, [eax + edx * 4] |
|
5841 psubd xmm1, [eax + edx * 4 + 16] |
|
5842 psubd xmm2, [eax + edx * 4 + 32] |
|
5843 psubd xmm3, [eax + edx * 4 + 48] |
|
5844 lea eax, [eax + 64] |
|
5845 |
|
5846 // - bottom left |
|
5847 psubd xmm0, [esi] |
|
5848 psubd xmm1, [esi + 16] |
|
5849 psubd xmm2, [esi + 32] |
|
5850 psubd xmm3, [esi + 48] |
|
5851 |
|
5852 // + bottom right |
|
5853 paddd xmm0, [esi + edx * 4] |
|
5854 paddd xmm1, [esi + edx * 4 + 16] |
|
5855 paddd xmm2, [esi + edx * 4 + 32] |
|
5856 paddd xmm3, [esi + edx * 4 + 48] |
|
5857 lea esi, [esi + 64] |
|
5858 |
|
5859 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area |
|
5860 cvtdq2ps xmm1, xmm1 |
|
5861 mulps xmm0, xmm4 |
|
5862 mulps xmm1, xmm4 |
|
5863 cvtdq2ps xmm2, xmm2 |
|
5864 cvtdq2ps xmm3, xmm3 |
|
5865 mulps xmm2, xmm4 |
|
5866 mulps xmm3, xmm4 |
|
5867 cvtps2dq xmm0, xmm0 |
|
5868 cvtps2dq xmm1, xmm1 |
|
5869 cvtps2dq xmm2, xmm2 |
|
5870 cvtps2dq xmm3, xmm3 |
|
5871 packssdw xmm0, xmm1 |
|
5872 packssdw xmm2, xmm3 |
|
5873 packuswb xmm0, xmm2 |
|
5874 movdqu [edi], xmm0 |
|
5875 lea edi, [edi + 16] |
|
5876 sub ecx, 4 |
|
5877 jge l4 |
|
5878 |
|
5879 l4b: |
|
5880 add ecx, 4 - 1 |
|
5881 jl l1b |
|
5882 |
|
5883 // 1 pixel loop |
|
5884 align 4 |
|
5885 l1: |
|
5886 movdqa xmm0, [eax] |
|
5887 psubd xmm0, [eax + edx * 4] |
|
5888 lea eax, [eax + 16] |
|
5889 psubd xmm0, [esi] |
|
5890 paddd xmm0, [esi + edx * 4] |
|
5891 lea esi, [esi + 16] |
|
5892 cvtdq2ps xmm0, xmm0 |
|
5893 mulps xmm0, xmm4 |
|
5894 cvtps2dq xmm0, xmm0 |
|
5895 packssdw xmm0, xmm0 |
|
5896 packuswb xmm0, xmm0 |
|
5897 movd dword ptr [edi], xmm0 |
|
5898 lea edi, [edi + 4] |
|
5899 sub ecx, 1 |
|
5900 jge l1 |
|
5901 l1b: |
|
5902 } |
|
5903 } |
|
5904 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
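
// Reference sketch (illustrative only; not part of libyuv): each output pixel
// is the box sum TL - TR - BL + BR from the cumulative-sum buffer scaled by
// 1 / area; the small-area path above does the same in 0.16 fixed point, so
// rounding differs slightly from this float version:
//
// static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
//                                              const int32* botleft,
//                                              int width, int area,
//                                              uint8* dst, int count) {
//   float ooa = 1.0f / area;
//   int i, j;
//   for (i = 0; i < count; ++i) {
//     for (j = 0; j < 4; ++j) {  // 4 int32 sums (B, G, R, A) per pixel.
//       int32 sum = topleft[j] - topleft[width + j] -
//                   botleft[j] + botleft[width + j];
//       dst[j] = (uint8)(sum * ooa);
//     }
//     topleft += 4;
//     botleft += 4;
//     dst += 4;
//   }
// }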
|
5905 |
|
5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
|
5907 // Creates a table of cumulative sums where each value is a sum of all values |
|
5908 // above and to the left of the value. |
|
5909 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, |
|
5910 const int32* previous_cumsum, int width) { |
|
5911 __asm { |
|
5912 mov eax, row |
|
5913 mov edx, cumsum |
|
5914 mov esi, previous_cumsum |
|
5915 mov ecx, width |
|
5916 pxor xmm0, xmm0 |
|
5917 pxor xmm1, xmm1 |
|
5918 |
|
5919 sub ecx, 4 |
|
5920 jl l4b |
|
5921 test edx, 15 |
|
5922 jne l4b |
|
5923 |
|
5924 // 4 pixel loop |
|
5925 align 4 |
|
5926 l4: |
|
5927 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. |
|
5928 lea eax, [eax + 16] |
|
5929 movdqa xmm4, xmm2 |
|
5930 |
|
5931 punpcklbw xmm2, xmm1 |
|
5932 movdqa xmm3, xmm2 |
|
5933 punpcklwd xmm2, xmm1 |
|
5934 punpckhwd xmm3, xmm1 |
|
5935 |
|
5936 punpckhbw xmm4, xmm1 |
|
5937 movdqa xmm5, xmm4 |
|
5938 punpcklwd xmm4, xmm1 |
|
5939 punpckhwd xmm5, xmm1 |
|
5940 |
|
5941 paddd xmm0, xmm2 |
|
5942 movdqa xmm2, [esi] // previous row above. |
|
5943 paddd xmm2, xmm0 |
|
5944 |
|
5945 paddd xmm0, xmm3 |
|
5946 movdqa xmm3, [esi + 16] |
|
5947 paddd xmm3, xmm0 |
|
5948 |
|
5949 paddd xmm0, xmm4 |
|
5950 movdqa xmm4, [esi + 32] |
|
5951 paddd xmm4, xmm0 |
|
5952 |
|
5953 paddd xmm0, xmm5 |
|
5954 movdqa xmm5, [esi + 48] |
|
5955 lea esi, [esi + 64] |
|
5956 paddd xmm5, xmm0 |
|
5957 |
|
5958 movdqa [edx], xmm2 |
|
5959 movdqa [edx + 16], xmm3 |
|
5960 movdqa [edx + 32], xmm4 |
|
5961 movdqa [edx + 48], xmm5 |
|
5962 |
|
5963 lea edx, [edx + 64] |
|
5964 sub ecx, 4 |
|
5965 jge l4 |
|
5966 |
|
5967 l4b: |
|
5968 add ecx, 4 - 1 |
|
5969 jl l1b |
|
5970 |
|
5971 // 1 pixel loop |
|
5972 align 4 |
|
5973 l1: |
|
5974 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. |
|
5975 lea eax, [eax + 4] |
|
5976 punpcklbw xmm2, xmm1 |
|
5977 punpcklwd xmm2, xmm1 |
|
5978 paddd xmm0, xmm2 |
|
5979 movdqu xmm2, [esi] |
|
5980 lea esi, [esi + 16] |
|
5981 paddd xmm2, xmm0 |
|
5982 movdqu [edx], xmm2 |
|
5983 lea edx, [edx + 16] |
|
5984 sub ecx, 1 |
|
5985 jge l1 |
|
5986 |
|
5987 l1b: |
|
5988 } |
|
5989 } |
|
5990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
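
// Reference sketch (illustrative only; not part of libyuv): a running
// per-channel sum of the current row is added to the entry directly above,
// building the 2-D cumulative sum one row at a time:
//
// static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
//                                            const int32* previous_cumsum,
//                                            int width) {
//   int32 sum[4] = {0, 0, 0, 0};
//   int x, c;
//   for (x = 0; x < width; ++x) {
//     for (c = 0; c < 4; ++c) {
//       sum[c] += row[x * 4 + c];
//       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
//     }
//   }
// }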
|
5991 |
|
5992 #ifdef HAS_ARGBAFFINEROW_SSE2 |
|
5993 // Copy ARGB pixels from source image with slope to a row of destination. |
|
5994 __declspec(naked) __declspec(align(16)) |
|
5995 LIBYUV_API |
|
5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
|
5997 uint8* dst_argb, const float* uv_dudv, int width) { |
|
5998 __asm { |
|
5999 push esi |
|
6000 push edi |
|
6001 mov eax, [esp + 12] // src_argb |
|
6002 mov esi, [esp + 16] // stride |
|
6003 mov edx, [esp + 20] // dst_argb |
|
6004 mov ecx, [esp + 24] // pointer to uv_dudv |
|
6005 movq xmm2, qword ptr [ecx] // uv |
|
6006 movq xmm7, qword ptr [ecx + 8] // dudv |
|
6007 mov ecx, [esp + 28] // width |
|
6008     shl        esi, 16          // stride to high 16 bits; 4 added below for pmaddwd |
|
6009 add esi, 4 |
|
6010 movd xmm5, esi |
|
6011 sub ecx, 4 |
|
6012 jl l4b |
|
6013 |
|
6014 // setup for 4 pixel loop |
|
6015 pshufd xmm7, xmm7, 0x44 // dup dudv |
|
6016 pshufd xmm5, xmm5, 0 // dup 4, stride |
|
6017 movdqa xmm0, xmm2 // x0, y0, x1, y1 |
|
6018 addps xmm0, xmm7 |
|
6019 movlhps xmm2, xmm0 |
|
6020 movdqa xmm4, xmm7 |
|
6021 addps xmm4, xmm4 // dudv *= 2 |
|
6022 movdqa xmm3, xmm2 // x2, y2, x3, y3 |
|
6023 addps xmm3, xmm4 |
|
6024 addps xmm4, xmm4 // dudv *= 4 |
|
6025 |
|
6026 // 4 pixel loop |
|
6027 align 4 |
|
6028 l4: |
|
6029 cvttps2dq xmm0, xmm2 // x, y float to int first 2 |
|
6030 cvttps2dq xmm1, xmm3 // x, y float to int next 2 |
|
6031 packssdw xmm0, xmm1 // x, y as 8 shorts |
|
6032 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. |
|
6033 movd esi, xmm0 |
|
6034 pshufd xmm0, xmm0, 0x39 // shift right |
|
6035 movd edi, xmm0 |
|
6036 pshufd xmm0, xmm0, 0x39 // shift right |
|
6037 movd xmm1, [eax + esi] // read pixel 0 |
|
6038 movd xmm6, [eax + edi] // read pixel 1 |
|
6039 punpckldq xmm1, xmm6 // combine pixel 0 and 1 |
|
6040 addps xmm2, xmm4 // x, y += dx, dy first 2 |
|
6041 movq qword ptr [edx], xmm1 |
|
6042 movd esi, xmm0 |
|
6043 pshufd xmm0, xmm0, 0x39 // shift right |
|
6044 movd edi, xmm0 |
|
6045 movd xmm6, [eax + esi] // read pixel 2 |
|
6046 movd xmm0, [eax + edi] // read pixel 3 |
|
6047 punpckldq xmm6, xmm0 // combine pixel 2 and 3 |
|
6048 addps xmm3, xmm4 // x, y += dx, dy next 2 |
|
6049 sub ecx, 4 |
|
6050 movq qword ptr 8[edx], xmm6 |
|
6051 lea edx, [edx + 16] |
|
6052 jge l4 |
|
6053 |
|
6054 l4b: |
|
6055 add ecx, 4 - 1 |
|
6056 jl l1b |
|
6057 |
|
6058 // 1 pixel loop |
|
6059 align 4 |
|
6060 l1: |
|
6061 cvttps2dq xmm0, xmm2 // x, y float to int |
|
6062 packssdw xmm0, xmm0 // x, y as shorts |
|
6063 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride |
|
6064 addps xmm2, xmm7 // x, y += dx, dy |
|
6065 movd esi, xmm0 |
|
6066 movd xmm0, [eax + esi] // copy a pixel |
|
6067 sub ecx, 1 |
|
6068 movd [edx], xmm0 |
|
6069 lea edx, [edx + 4] |
|
6070 jge l1 |
|
6071 l1b: |
|
6072 pop edi |
|
6073 pop esi |
|
6074 ret |
|
6075 } |
|
6076 } |
|
6077 #endif // HAS_ARGBAFFINEROW_SSE2 |
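
// Reference sketch (illustrative only; not part of libyuv): a source
// coordinate (u, v) is stepped by (du, dv) per destination pixel and the
// nearest source ARGB pixel is copied; pmaddwd computes the byte offset
// u * 4 + v * stride:
//
// static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
//                                  uint8* dst_argb, const float* uv_dudv,
//                                  int width) {
//   float u = uv_dudv[0];
//   float v = uv_dudv[1];
//   int i;
//   for (i = 0; i < width; ++i) {
//     int offset = (int)u * 4 + (int)v * src_argb_stride;
//     *(uint32*)(dst_argb + i * 4) = *(const uint32*)(src_argb + offset);
//     u += uv_dudv[2];
//     v += uv_dudv[3];
//   }
// }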
|
6078 |
|
6079 #ifdef HAS_INTERPOLATEROW_AVX2 |
|
6080 // Bilinear filter 16x2 -> 16x1 |
|
6081 __declspec(naked) __declspec(align(16)) |
|
6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, |
|
6083 ptrdiff_t src_stride, int dst_width, |
|
6084 int source_y_fraction) { |
|
6085 __asm { |
|
6086 push esi |
|
6087 push edi |
|
6088 mov edi, [esp + 8 + 4] // dst_ptr |
|
6089 mov esi, [esp + 8 + 8] // src_ptr |
|
6090 mov edx, [esp + 8 + 12] // src_stride |
|
6091 mov ecx, [esp + 8 + 16] // dst_width |
|
6092 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
|
6093 shr eax, 1 |
|
6094 // Dispatch to specialized filters if applicable. |
|
6095 cmp eax, 0 |
|
6096 je xloop100 // 0 / 128. Blend 100 / 0. |
|
6097 sub edi, esi |
|
6098 cmp eax, 32 |
|
6099 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
|
6100 cmp eax, 64 |
|
6101 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
|
6102 cmp eax, 96 |
|
6103 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
|
6104 |
|
6105 vmovd xmm0, eax // high fraction 0..127 |
|
6106 neg eax |
|
6107 add eax, 128 |
|
6108 vmovd xmm5, eax // low fraction 128..1 |
|
6109 vpunpcklbw xmm5, xmm5, xmm0 |
|
6110 vpunpcklwd xmm5, xmm5, xmm5 |
|
6111 vpxor ymm0, ymm0, ymm0 |
|
6112 vpermd ymm5, ymm0, ymm5 |
|
6113 |
|
6114 align 4 |
|
6115 xloop: |
|
6116 vmovdqu ymm0, [esi] |
|
6117 vmovdqu ymm2, [esi + edx] |
|
6118 vpunpckhbw ymm1, ymm0, ymm2 // mutates |
|
6119 vpunpcklbw ymm0, ymm0, ymm2 // mutates |
|
6120 vpmaddubsw ymm0, ymm0, ymm5 |
|
6121 vpmaddubsw ymm1, ymm1, ymm5 |
|
6122 vpsrlw ymm0, ymm0, 7 |
|
6123 vpsrlw ymm1, ymm1, 7 |
|
6124 vpackuswb ymm0, ymm0, ymm1 // unmutates |
|
6125 sub ecx, 32 |
|
6126 vmovdqu [esi + edi], ymm0 |
|
6127 lea esi, [esi + 32] |
|
6128 jg xloop |
|
6129 jmp xloop99 |
|
6130 |
|
6131 // Blend 25 / 75. |
|
6132 align 4 |
|
6133 xloop25: |
|
6134 vmovdqu ymm0, [esi] |
|
6135 vpavgb ymm0, ymm0, [esi + edx] |
|
6136 vpavgb ymm0, ymm0, [esi + edx] |
|
6137 sub ecx, 32 |
|
6138 vmovdqu [esi + edi], ymm0 |
|
6139 lea esi, [esi + 32] |
|
6140 jg xloop25 |
|
6141 jmp xloop99 |
|
6142 |
|
6143 // Blend 50 / 50. |
|
6144 align 4 |
|
6145 xloop50: |
|
6146 vmovdqu ymm0, [esi] |
|
6147 vpavgb ymm0, ymm0, [esi + edx] |
|
6148 sub ecx, 32 |
|
6149 vmovdqu [esi + edi], ymm0 |
|
6150 lea esi, [esi + 32] |
|
6151 jg xloop50 |
|
6152 jmp xloop99 |
|
6153 |
|
6154 // Blend 75 / 25. |
|
6155 align 4 |
|
6156 xloop75: |
|
6157 vmovdqu ymm0, [esi + edx] |
|
6158 vpavgb ymm0, ymm0, [esi] |
|
6159 vpavgb ymm0, ymm0, [esi] |
|
6160 sub ecx, 32 |
|
6161 vmovdqu [esi + edi], ymm0 |
|
6162 lea esi, [esi + 32] |
|
6163 jg xloop75 |
|
6164 jmp xloop99 |
|
6165 |
|
6166 // Blend 100 / 0 - Copy row unchanged. |
|
6167 align 4 |
|
6168 xloop100: |
|
6169 rep movsb |
|
6170 |
|
6171 xloop99: |
|
6172 pop edi |
|
6173 pop esi |
|
6174 vzeroupper |
|
6175 ret |
|
6176 } |
|
6177 } |
|
6178 #endif // HAS_INTERPOLATEROW_AVX2 |
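
// Reference sketch (illustrative only; not part of libyuv): the
// InterpolateRow variants (AVX2 above, SSSE3/SSE2 below) blend two rows with
// a fixed vertical fraction; the 0%, 25%, 50% and 75% cases are special cased
// with pavgb, and the general case is approximately:
//
// static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
//                                   ptrdiff_t src_stride, int width,
//                                   int source_y_fraction) {  // 0..255
//   int y1 = source_y_fraction >> 1;  // lower row weight, 0..127
//   int y0 = 128 - y1;                // upper row weight
//   int x;
//   for (x = 0; x < width; ++x) {     // width is in bytes.
//     dst_ptr[x] =
//         (uint8)((src_ptr[x] * y0 + src_ptr[x + src_stride] * y1) >> 7);
//   }
// }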
|
6179 |
|
6180 #ifdef HAS_INTERPOLATEROW_SSSE3 |
|
6181 // Bilinear filter 16x2 -> 16x1 |
|
6182 __declspec(naked) __declspec(align(16)) |
|
6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
|
6184 ptrdiff_t src_stride, int dst_width, |
|
6185 int source_y_fraction) { |
|
6186 __asm { |
|
6187 push esi |
|
6188 push edi |
|
6189 mov edi, [esp + 8 + 4] // dst_ptr |
|
6190 mov esi, [esp + 8 + 8] // src_ptr |
|
6191 mov edx, [esp + 8 + 12] // src_stride |
|
6192 mov ecx, [esp + 8 + 16] // dst_width |
|
6193 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
|
6194 sub edi, esi |
|
6195 shr eax, 1 |
|
6196 // Dispatch to specialized filters if applicable. |
|
6197 cmp eax, 0 |
|
6198 je xloop100 // 0 / 128. Blend 100 / 0. |
|
6199 cmp eax, 32 |
|
6200 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
|
6201 cmp eax, 64 |
|
6202 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
|
6203 cmp eax, 96 |
|
6204 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
|
6205 |
|
6206 movd xmm0, eax // high fraction 0..127 |
|
6207 neg eax |
|
6208 add eax, 128 |
|
6209 movd xmm5, eax // low fraction 128..1 |
|
6210 punpcklbw xmm5, xmm0 |
|
6211 punpcklwd xmm5, xmm5 |
|
6212 pshufd xmm5, xmm5, 0 |
|
6213 |
|
6214 align 4 |
|
6215 xloop: |
|
6216 movdqa xmm0, [esi] |
|
6217 movdqa xmm2, [esi + edx] |
|
6218 movdqa xmm1, xmm0 |
|
6219 punpcklbw xmm0, xmm2 |
|
6220 punpckhbw xmm1, xmm2 |
|
6221 pmaddubsw xmm0, xmm5 |
|
6222 pmaddubsw xmm1, xmm5 |
|
6223 psrlw xmm0, 7 |
|
6224 psrlw xmm1, 7 |
|
6225 packuswb xmm0, xmm1 |
|
6226 sub ecx, 16 |
|
6227 movdqa [esi + edi], xmm0 |
|
6228 lea esi, [esi + 16] |
|
6229 jg xloop |
|
6230 jmp xloop99 |
|
6231 |
|
6232 // Blend 25 / 75. |
|
6233 align 4 |
|
6234 xloop25: |
|
6235 movdqa xmm0, [esi] |
|
6236 movdqa xmm1, [esi + edx] |
|
6237 pavgb xmm0, xmm1 |
|
6238 pavgb xmm0, xmm1 |
|
6239 sub ecx, 16 |
|
6240 movdqa [esi + edi], xmm0 |
|
6241 lea esi, [esi + 16] |
|
6242 jg xloop25 |
|
6243 jmp xloop99 |
|
6244 |
|
6245 // Blend 50 / 50. |
|
6246 align 4 |
|
6247 xloop50: |
|
6248 movdqa xmm0, [esi] |
|
6249 movdqa xmm1, [esi + edx] |
|
6250 pavgb xmm0, xmm1 |
|
6251 sub ecx, 16 |
|
6252 movdqa [esi + edi], xmm0 |
|
6253 lea esi, [esi + 16] |
|
6254 jg xloop50 |
|
6255 jmp xloop99 |
|
6256 |
|
6257 // Blend 75 / 25. |
|
6258 align 4 |
|
6259 xloop75: |
|
6260 movdqa xmm1, [esi] |
|
6261 movdqa xmm0, [esi + edx] |
|
6262 pavgb xmm0, xmm1 |
|
6263 pavgb xmm0, xmm1 |
|
6264 sub ecx, 16 |
|
6265 movdqa [esi + edi], xmm0 |
|
6266 lea esi, [esi + 16] |
|
6267 jg xloop75 |
|
6268 jmp xloop99 |
|
6269 |
|
6270 // Blend 100 / 0 - Copy row unchanged. |
|
6271 align 4 |
|
6272 xloop100: |
|
6273 movdqa xmm0, [esi] |
|
6274 sub ecx, 16 |
|
6275 movdqa [esi + edi], xmm0 |
|
6276 lea esi, [esi + 16] |
|
6277 jg xloop100 |
|
6278 |
|
6279 xloop99: |
|
6280 pop edi |
|
6281 pop esi |
|
6282 ret |
|
6283 } |
|
6284 } |
|
6285 #endif // HAS_INTERPOLATEROW_SSSE3 |
|
6286 |
|
6287 #ifdef HAS_INTERPOLATEROW_SSE2 |
|
6288 // Bilinear filter 16x2 -> 16x1 |
|
6289 __declspec(naked) __declspec(align(16)) |
|
6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
|
6291 ptrdiff_t src_stride, int dst_width, |
|
6292 int source_y_fraction) { |
|
6293 __asm { |
|
6294 push esi |
|
6295 push edi |
|
6296 mov edi, [esp + 8 + 4] // dst_ptr |
|
6297 mov esi, [esp + 8 + 8] // src_ptr |
|
6298 mov edx, [esp + 8 + 12] // src_stride |
|
6299 mov ecx, [esp + 8 + 16] // dst_width |
|
6300 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
|
6301 sub edi, esi |
|
6302 // Dispatch to specialized filters if applicable. |
|
6303 cmp eax, 0 |
|
6304 je xloop100 // 0 / 256. Blend 100 / 0. |
|
6305 cmp eax, 64 |
|
6306 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. |
|
6307 cmp eax, 128 |
|
6308 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
|
6309 cmp eax, 192 |
|
6310 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. |
|
6311 |
|
6312 movd xmm5, eax // xmm5 = y fraction |
|
6313 punpcklbw xmm5, xmm5 |
|
6314 psrlw xmm5, 1 |
|
6315 punpcklwd xmm5, xmm5 |
|
6316 punpckldq xmm5, xmm5 |
|
6317 punpcklqdq xmm5, xmm5 |
|
6318 pxor xmm4, xmm4 |
|
6319 |
|
6320 align 4 |
|
6321 xloop: |
|
6322 movdqa xmm0, [esi] // row0 |
|
6323 movdqa xmm2, [esi + edx] // row1 |
|
6324 movdqa xmm1, xmm0 |
|
6325 movdqa xmm3, xmm2 |
|
6326 punpcklbw xmm2, xmm4 |
|
6327 punpckhbw xmm3, xmm4 |
|
6328 punpcklbw xmm0, xmm4 |
|
6329 punpckhbw xmm1, xmm4 |
|
6330 psubw xmm2, xmm0 // row1 - row0 |
|
6331 psubw xmm3, xmm1 |
|
6332 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 |
|
6333 paddw xmm3, xmm3 |
|
6334 pmulhw xmm2, xmm5 // scale diff |
|
6335 pmulhw xmm3, xmm5 |
|
6336 paddw xmm0, xmm2 // sum rows |
|
6337 paddw xmm1, xmm3 |
|
6338 packuswb xmm0, xmm1 |
|
6339 sub ecx, 16 |
|
6340 movdqa [esi + edi], xmm0 |
|
6341 lea esi, [esi + 16] |
|
6342 jg xloop |
|
6343 jmp xloop99 |
|
6344 |
|
6345 // Blend 25 / 75. |
|
6346 align 4 |
|
6347 xloop25: |
|
6348 movdqa xmm0, [esi] |
|
6349 movdqa xmm1, [esi + edx] |
|
6350 pavgb xmm0, xmm1 |
|
6351 pavgb xmm0, xmm1 |
|
6352 sub ecx, 16 |
|
6353 movdqa [esi + edi], xmm0 |
|
6354 lea esi, [esi + 16] |
|
6355 jg xloop25 |
|
6356 jmp xloop99 |
|
6357 |
|
6358 // Blend 50 / 50. |
|
6359 align 4 |
|
6360 xloop50: |
|
6361 movdqa xmm0, [esi] |
|
6362 movdqa xmm1, [esi + edx] |
|
6363 pavgb xmm0, xmm1 |
|
6364 sub ecx, 16 |
|
6365 movdqa [esi + edi], xmm0 |
|
6366 lea esi, [esi + 16] |
|
6367 jg xloop50 |
|
6368 jmp xloop99 |
|
6369 |
|
6370 // Blend 75 / 25. |
|
6371 align 4 |
|
6372 xloop75: |
|
6373 movdqa xmm1, [esi] |
|
6374 movdqa xmm0, [esi + edx] |
|
6375 pavgb xmm0, xmm1 |
|
6376 pavgb xmm0, xmm1 |
|
6377 sub ecx, 16 |
|
6378 movdqa [esi + edi], xmm0 |
|
6379 lea esi, [esi + 16] |
|
6380 jg xloop75 |
|
6381 jmp xloop99 |
|
6382 |
|
6383 // Blend 100 / 0 - Copy row unchanged. |
|
6384 align 4 |
|
6385 xloop100: |
|
6386 movdqa xmm0, [esi] |
|
6387 sub ecx, 16 |
|
6388 movdqa [esi + edi], xmm0 |
|
6389 lea esi, [esi + 16] |
|
6390 jg xloop100 |
|
6391 |
|
6392 xloop99: |
|
6393 pop edi |
|
6394 pop esi |
|
6395 ret |
|
6396 } |
|
6397 } |
|
6398 #endif // HAS_INTERPOLATEROW_SSE2 |
|
6399 |
|
6400 // Bilinear filter 16x2 -> 16x1 |
|
6401 __declspec(naked) __declspec(align(16)) |
|
6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
|
6403 ptrdiff_t src_stride, int dst_width, |
|
6404 int source_y_fraction) { |
|
6405 __asm { |
|
6406 push esi |
|
6407 push edi |
|
6408 mov edi, [esp + 8 + 4] // dst_ptr |
|
6409 mov esi, [esp + 8 + 8] // src_ptr |
|
6410 mov edx, [esp + 8 + 12] // src_stride |
|
6411 mov ecx, [esp + 8 + 16] // dst_width |
|
6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
|
6413 sub edi, esi |
|
6414 shr eax, 1 |
|
6415 // Dispatch to specialized filters if applicable. |
|
6416 cmp eax, 0 |
|
6417 je xloop100 // 0 / 128. Blend 100 / 0. |
|
6418 cmp eax, 32 |
|
6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
|
6420 cmp eax, 64 |
|
6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
|
6422 cmp eax, 96 |
|
6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
|
6424 |
|
6425 movd xmm0, eax // high fraction 0..127 |
|
6426 neg eax |
|
6427 add eax, 128 |
|
6428 movd xmm5, eax // low fraction 128..1 |
|
6429 punpcklbw xmm5, xmm0 |
|
6430 punpcklwd xmm5, xmm5 |
|
6431 pshufd xmm5, xmm5, 0 |
|
6432 |
|
6433 align 4 |
|
6434 xloop: |
|
6435 movdqu xmm0, [esi] |
|
6436 movdqu xmm2, [esi + edx] |
|
6437 movdqu xmm1, xmm0 |
|
6438 punpcklbw xmm0, xmm2 |
|
6439 punpckhbw xmm1, xmm2 |
|
6440 pmaddubsw xmm0, xmm5 |
|
6441 pmaddubsw xmm1, xmm5 |
|
6442 psrlw xmm0, 7 |
|
6443 psrlw xmm1, 7 |
|
6444 packuswb xmm0, xmm1 |
|
6445 sub ecx, 16 |
|
6446 movdqu [esi + edi], xmm0 |
|
6447 lea esi, [esi + 16] |
|
6448 jg xloop |
|
6449 jmp xloop99 |
|
6450 |
|
6451 // Blend 25 / 75. |
|
6452 align 4 |
|
6453 xloop25: |
|
6454 movdqu xmm0, [esi] |
|
6455 movdqu xmm1, [esi + edx] |
|
6456 pavgb xmm0, xmm1 |
|
6457 pavgb xmm0, xmm1 |
|
6458 sub ecx, 16 |
|
6459 movdqu [esi + edi], xmm0 |
|
6460 lea esi, [esi + 16] |
|
6461 jg xloop25 |
|
6462 jmp xloop99 |
|
6463 |
|
6464 // Blend 50 / 50. |
|
6465 align 4 |
|
6466 xloop50: |
|
6467 movdqu xmm0, [esi] |
|
6468 movdqu xmm1, [esi + edx] |
|
6469 pavgb xmm0, xmm1 |
|
6470 sub ecx, 16 |
|
6471 movdqu [esi + edi], xmm0 |
|
6472 lea esi, [esi + 16] |
|
6473 jg xloop50 |
|
6474 jmp xloop99 |
|
6475 |
|
6476 // Blend 75 / 25. |
|
6477 align 4 |
|
6478 xloop75: |
|
6479 movdqu xmm1, [esi] |
|
6480 movdqu xmm0, [esi + edx] |
|
6481 pavgb xmm0, xmm1 |
|
6482 pavgb xmm0, xmm1 |
|
6483 sub ecx, 16 |
|
6484 movdqu [esi + edi], xmm0 |
|
6485 lea esi, [esi + 16] |
|
6486 jg xloop75 |
|
6487 jmp xloop99 |
|
6488 |
|
6489 // Blend 100 / 0 - Copy row unchanged. |
|
6490 align 4 |
|
6491 xloop100: |
|
6492 movdqu xmm0, [esi] |
|
6493 sub ecx, 16 |
|
6494 movdqu [esi + edi], xmm0 |
|
6495 lea esi, [esi + 16] |
|
6496 jg xloop100 |
|
6497 |
|
6498 xloop99: |
|
6499 pop edi |
|
6500 pop esi |
|
6501 ret |
|
6502 } |
|
6503 } |
|
6504 |
|
6505 #ifdef HAS_INTERPOLATEROW_SSE2 |
|
6506 // Bilinear filter 16x2 -> 16x1 |
|
6507 __declspec(naked) __declspec(align(16)) |
|
6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
|
6509 ptrdiff_t src_stride, int dst_width, |
|
6510 int source_y_fraction) { |
|
6511 __asm { |
|
6512 push esi |
|
6513 push edi |
|
6514 mov edi, [esp + 8 + 4] // dst_ptr |
|
6515 mov esi, [esp + 8 + 8] // src_ptr |
|
6516 mov edx, [esp + 8 + 12] // src_stride |
|
6517 mov ecx, [esp + 8 + 16] // dst_width |
|
6518 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
|
6519 sub edi, esi |
|
6520 // Dispatch to specialized filters if applicable. |
|
6521 cmp eax, 0 |
|
6522 je xloop100 // 0 / 256. Blend 100 / 0. |
|
6523 cmp eax, 64 |
|
6524 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. |
|
6525 cmp eax, 128 |
|
6526 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
|
6527 cmp eax, 192 |
|
6528 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. |
|
6529 |
|
6530 movd xmm5, eax // xmm5 = y fraction |
|
6531 punpcklbw xmm5, xmm5 |
|
6532 psrlw xmm5, 1 |
|
6533 punpcklwd xmm5, xmm5 |
|
6534 punpckldq xmm5, xmm5 |
|
6535 punpcklqdq xmm5, xmm5 |
|
6536 pxor xmm4, xmm4 |
|
6537 |
|
6538 align 4 |
|
6539 xloop: |
|
6540 movdqu xmm0, [esi] // row0 |
|
6541 movdqu xmm2, [esi + edx] // row1 |
|
6542 movdqu xmm1, xmm0 |
|
6543 movdqu xmm3, xmm2 |
|
6544 punpcklbw xmm2, xmm4 |
|
6545 punpckhbw xmm3, xmm4 |
|
6546 punpcklbw xmm0, xmm4 |
|
6547 punpckhbw xmm1, xmm4 |
|
6548 psubw xmm2, xmm0 // row1 - row0 |
|
6549 psubw xmm3, xmm1 |
|
6550 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 |
|
6551 paddw xmm3, xmm3 |
|
6552 pmulhw xmm2, xmm5 // scale diff |
|
6553 pmulhw xmm3, xmm5 |
|
6554 paddw xmm0, xmm2 // sum rows |
|
6555 paddw xmm1, xmm3 |
|
6556 packuswb xmm0, xmm1 |
|
6557 sub ecx, 16 |
|
6558 movdqu [esi + edi], xmm0 |
|
6559 lea esi, [esi + 16] |
|
6560 jg xloop |
|
6561 jmp xloop99 |
|
6562 |
|
6563 // Blend 25 / 75. |
|
6564 align 4 |
|
6565 xloop25: |
|
6566 movdqu xmm0, [esi] |
|
6567 movdqu xmm1, [esi + edx] |
|
6568 pavgb xmm0, xmm1 |
|
6569 pavgb xmm0, xmm1 |
|
6570 sub ecx, 16 |
|
6571 movdqu [esi + edi], xmm0 |
|
6572 lea esi, [esi + 16] |
|
6573 jg xloop25 |
|
6574 jmp xloop99 |
|
6575 |
|
6576 // Blend 50 / 50. |
|
6577 align 4 |
|
6578 xloop50: |
|
6579 movdqu xmm0, [esi] |
|
6580 movdqu xmm1, [esi + edx] |
|
6581 pavgb xmm0, xmm1 |
|
6582 sub ecx, 16 |
|
6583 movdqu [esi + edi], xmm0 |
|
6584 lea esi, [esi + 16] |
|
6585 jg xloop50 |
|
6586 jmp xloop99 |
|
6587 |
|
6588 // Blend 75 / 25. |
|
6589 align 4 |
|
6590 xloop75: |
|
6591 movdqu xmm1, [esi] |
|
6592 movdqu xmm0, [esi + edx] |
|
6593 pavgb xmm0, xmm1 |
|
6594 pavgb xmm0, xmm1 |
|
6595 sub ecx, 16 |
|
6596 movdqu [esi + edi], xmm0 |
|
6597 lea esi, [esi + 16] |
|
6598 jg xloop75 |
|
6599 jmp xloop99 |
|
6600 |
|
6601 // Blend 100 / 0 - Copy row unchanged. |
|
6602 align 4 |
|
6603 xloop100: |
|
6604 movdqu xmm0, [esi] |
|
6605 sub ecx, 16 |
|
6606 movdqu [esi + edi], xmm0 |
|
6607 lea esi, [esi + 16] |
|
6608 jg xloop100 |
|
6609 |
|
6610 xloop99: |
|
6611 pop edi |
|
6612 pop esi |
|
6613 ret |
|
6614 } |
|
6615 } |
|
6616 #endif // HAS_INTERPOLATEROW_SSE2 |
|
6617 |
|
6618 __declspec(naked) __declspec(align(16)) |
|
6619 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, |
|
6620 uint8* dst_uv, int pix) { |
|
6621 __asm { |
|
6622 push edi |
|
6623 mov eax, [esp + 4 + 4] // src_uv |
|
6624 mov edx, [esp + 4 + 8] // src_uv_stride |
|
6625     mov        edi, [esp + 4 + 12]    // dst_uv |
|
6626 mov ecx, [esp + 4 + 16] // pix |
|
6627 sub edi, eax |
|
6628 |
|
6629 align 4 |
|
6630 convertloop: |
|
6631 movdqa xmm0, [eax] |
|
6632 pavgb xmm0, [eax + edx] |
|
6633 sub ecx, 16 |
|
6634 movdqa [eax + edi], xmm0 |
|
6635 lea eax, [eax + 16] |
|
6636 jg convertloop |
|
6637 pop edi |
|
6638 ret |
|
6639 } |
|
6640 } |
|
6641 |
|
6642 #ifdef HAS_HALFROW_AVX2 |
|
6643 __declspec(naked) __declspec(align(16)) |
|
6644 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, |
|
6645 uint8* dst_uv, int pix) { |
|
6646 __asm { |
|
6647 push edi |
|
6648 mov eax, [esp + 4 + 4] // src_uv |
|
6649 mov edx, [esp + 4 + 8] // src_uv_stride |
|
6650     mov        edi, [esp + 4 + 12]    // dst_uv |
|
6651 mov ecx, [esp + 4 + 16] // pix |
|
6652 sub edi, eax |
|
6653 |
|
6654 align 4 |
|
6655 convertloop: |
|
6656 vmovdqu ymm0, [eax] |
|
6657 vpavgb ymm0, ymm0, [eax + edx] |
|
6658 sub ecx, 32 |
|
6659 vmovdqu [eax + edi], ymm0 |
|
6660 lea eax, [eax + 32] |
|
6661 jg convertloop |
|
6662 |
|
6663 pop edi |
|
6664 vzeroupper |
|
6665 ret |
|
6666 } |
|
6667 } |
|
6668 #endif // HAS_HALFROW_AVX2 |
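
// Reference sketch (illustrative only; not part of libyuv): both HalfRow
// variants average each byte with the byte one stride below it; pavgb rounds
// up, i.e. (a + b + 1) >> 1:
//
// static void HalfRow_Sketch(const uint8* src_uv, int src_uv_stride,
//                            uint8* dst_uv, int pix) {
//   int x;
//   for (x = 0; x < pix; ++x) {
//     dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
//   }
// }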
|
6669 |
|
6670 __declspec(naked) __declspec(align(16)) |
|
6671 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, |
|
6672 uint32 selector, int pix) { |
|
6673 __asm { |
|
6674 mov eax, [esp + 4] // src_argb |
|
6675 mov edx, [esp + 8] // dst_bayer |
|
6676 movd xmm5, [esp + 12] // selector |
|
6677 mov ecx, [esp + 16] // pix |
|
6678 pshufd xmm5, xmm5, 0 |
|
6679 |
|
6680 align 4 |
|
6681 wloop: |
|
6682 movdqa xmm0, [eax] |
|
6683 movdqa xmm1, [eax + 16] |
|
6684 lea eax, [eax + 32] |
|
6685 pshufb xmm0, xmm5 |
|
6686 pshufb xmm1, xmm5 |
|
6687 punpckldq xmm0, xmm1 |
|
6688 sub ecx, 8 |
|
6689 movq qword ptr [edx], xmm0 |
|
6690 lea edx, [edx + 8] |
|
6691 jg wloop |
|
6692 ret |
|
6693 } |
|
6694 } |
|
6695 |
|
6696 // Specialized ARGB to Bayer that just isolates G channel. |
|
6697 __declspec(naked) __declspec(align(16)) |
|
6698 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, |
|
6699 uint32 selector, int pix) { |
|
6700 __asm { |
|
6701 mov eax, [esp + 4] // src_argb |
|
6702 mov edx, [esp + 8] // dst_bayer |
|
6703                          // selector at [esp + 12] is unused |
|
6704 mov ecx, [esp + 16] // pix |
|
6705 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff |
|
6706 psrld xmm5, 24 |
|
6707 |
|
6708 align 4 |
|
6709 wloop: |
|
6710 movdqa xmm0, [eax] |
|
6711 movdqa xmm1, [eax + 16] |
|
6712 lea eax, [eax + 32] |
|
6713 psrld xmm0, 8 // Move green to bottom. |
|
6714 psrld xmm1, 8 |
|
6715 pand xmm0, xmm5 |
|
6716 pand xmm1, xmm5 |
|
6717 packssdw xmm0, xmm1 |
|
6718 packuswb xmm0, xmm1 |
|
6719 sub ecx, 8 |
|
6720 movq qword ptr [edx], xmm0 |
|
6721 lea edx, [edx + 8] |
|
6722 jg wloop |
|
6723 ret |
|
6724 } |
|
6725 } |
|
6726 |
|
6727 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
|
6728 __declspec(naked) __declspec(align(16)) |
|
6729 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
|
6730 const uint8* shuffler, int pix) { |
|
6731 __asm { |
|
6732 mov eax, [esp + 4] // src_argb |
|
6733 mov edx, [esp + 8] // dst_argb |
|
6734 mov ecx, [esp + 12] // shuffler |
|
6735 movdqa xmm5, [ecx] |
|
6736 mov ecx, [esp + 16] // pix |
|
6737 |
|
6738 align 4 |
|
6739 wloop: |
|
6740 movdqa xmm0, [eax] |
|
6741 movdqa xmm1, [eax + 16] |
|
6742 lea eax, [eax + 32] |
|
6743 pshufb xmm0, xmm5 |
|
6744 pshufb xmm1, xmm5 |
|
6745 sub ecx, 8 |
|
6746 movdqa [edx], xmm0 |
|
6747 movdqa [edx + 16], xmm1 |
|
6748 lea edx, [edx + 32] |
|
6749 jg wloop |
|
6750 ret |
|
6751 } |
|
6752 } |
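
// Reference sketch (illustrative only; not part of libyuv): the shuffler is a
// pshufb control mask whose first four bytes give the destination channel
// order, so the ARGBShuffleRow variants amount to:
//
// static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
//                                   const uint8* shuffler, int pix) {
//   int i, j;
//   for (i = 0; i < pix; ++i) {
//     for (j = 0; j < 4; ++j) {
//       dst_argb[i * 4 + j] = src_argb[i * 4 + shuffler[j]];
//     }
//   }
// }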
|
6753 |
|
6754 __declspec(naked) __declspec(align(16)) |
|
6755 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, |
|
6756 const uint8* shuffler, int pix) { |
|
6757 __asm { |
|
6758 mov eax, [esp + 4] // src_argb |
|
6759 mov edx, [esp + 8] // dst_argb |
|
6760 mov ecx, [esp + 12] // shuffler |
|
6761 movdqa xmm5, [ecx] |
|
6762 mov ecx, [esp + 16] // pix |
|
6763 |
|
6764 align 4 |
|
6765 wloop: |
|
6766 movdqu xmm0, [eax] |
|
6767 movdqu xmm1, [eax + 16] |
|
6768 lea eax, [eax + 32] |
|
6769 pshufb xmm0, xmm5 |
|
6770 pshufb xmm1, xmm5 |
|
6771 sub ecx, 8 |
|
6772 movdqu [edx], xmm0 |
|
6773 movdqu [edx + 16], xmm1 |
|
6774 lea edx, [edx + 32] |
|
6775 jg wloop |
|
6776 ret |
|
6777 } |
|
6778 } |
|
6779 |
|
6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
|
6781 __declspec(naked) __declspec(align(16)) |
|
6782 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
|
6783 const uint8* shuffler, int pix) { |
|
6784 __asm { |
|
6785 mov eax, [esp + 4] // src_argb |
|
6786 mov edx, [esp + 8] // dst_argb |
|
6787 mov ecx, [esp + 12] // shuffler |
|
6788 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
|
6789 mov ecx, [esp + 16] // pix |
|
6790 |
|
6791 align 4 |
|
6792 wloop: |
|
6793 vmovdqu ymm0, [eax] |
|
6794 vmovdqu ymm1, [eax + 32] |
|
6795 lea eax, [eax + 64] |
|
6796 vpshufb ymm0, ymm0, ymm5 |
|
6797 vpshufb ymm1, ymm1, ymm5 |
|
6798 sub ecx, 16 |
|
6799 vmovdqu [edx], ymm0 |
|
6800 vmovdqu [edx + 32], ymm1 |
|
6801 lea edx, [edx + 64] |
|
6802 jg wloop |
|
6803 |
|
6804 vzeroupper |
|
6805 ret |
|
6806 } |
|
6807 } |
|
6808 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
|
6809 |
|
6810 __declspec(naked) __declspec(align(16)) |
|
6811 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
|
6812 const uint8* shuffler, int pix) { |
|
6813 __asm { |
|
6814 push ebx |
|
6815 push esi |
|
6816 mov eax, [esp + 8 + 4] // src_argb |
|
6817 mov edx, [esp + 8 + 8] // dst_argb |
|
6818 mov esi, [esp + 8 + 12] // shuffler |
|
6819 mov ecx, [esp + 8 + 16] // pix |
|
6820 pxor xmm5, xmm5 |
|
6821 |
|
6822 mov ebx, [esi] // shuffler |
|
6823 cmp ebx, 0x03000102 |
|
6824 je shuf_3012 |
|
6825 cmp ebx, 0x00010203 |
|
6826 je shuf_0123 |
|
6827 cmp ebx, 0x00030201 |
|
6828 je shuf_0321 |
|
6829 cmp ebx, 0x02010003 |
|
6830 je shuf_2103 |
|
6831 |
|
6832 // TODO(fbarchard): Use one source pointer and 3 offsets. |
|
6833 shuf_any1: |
|
6834 movzx ebx, byte ptr [esi] |
|
6835 movzx ebx, byte ptr [eax + ebx] |
|
6836 mov [edx], bl |
|
6837 movzx ebx, byte ptr [esi + 1] |
|
6838 movzx ebx, byte ptr [eax + ebx] |
|
6839 mov [edx + 1], bl |
|
6840 movzx ebx, byte ptr [esi + 2] |
|
6841 movzx ebx, byte ptr [eax + ebx] |
|
6842 mov [edx + 2], bl |
|
6843 movzx ebx, byte ptr [esi + 3] |
|
6844 movzx ebx, byte ptr [eax + ebx] |
|
6845 mov [edx + 3], bl |
|
6846 lea eax, [eax + 4] |
|
6847 lea edx, [edx + 4] |
|
6848 sub ecx, 1 |
|
6849 jg shuf_any1 |
|
6850 jmp shuf99 |
|
6851 |
|
6852 align 4 |
|
6853 shuf_0123: |
|
6854 movdqu xmm0, [eax] |
|
6855 lea eax, [eax + 16] |
|
6856 movdqa xmm1, xmm0 |
|
6857 punpcklbw xmm0, xmm5 |
|
6858 punpckhbw xmm1, xmm5 |
|
6859 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB |
|
6860 pshuflw xmm0, xmm0, 01Bh |
|
6861 pshufhw xmm1, xmm1, 01Bh |
|
6862 pshuflw xmm1, xmm1, 01Bh |
|
6863 packuswb xmm0, xmm1 |
|
6864 sub ecx, 4 |
|
6865 movdqu [edx], xmm0 |
|
6866 lea edx, [edx + 16] |
|
6867 jg shuf_0123 |
|
6868 jmp shuf99 |
|
6869 |
|
6870 align 4 |
|
6871 shuf_0321: |
|
6872 movdqu xmm0, [eax] |
|
6873 lea eax, [eax + 16] |
|
6874 movdqa xmm1, xmm0 |
|
6875 punpcklbw xmm0, xmm5 |
|
6876 punpckhbw xmm1, xmm5 |
|
6877 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB |
|
6878 pshuflw xmm0, xmm0, 039h |
|
6879 pshufhw xmm1, xmm1, 039h |
|
6880 pshuflw xmm1, xmm1, 039h |
|
6881 packuswb xmm0, xmm1 |
|
6882 sub ecx, 4 |
|
6883 movdqu [edx], xmm0 |
|
6884 lea edx, [edx + 16] |
|
6885 jg shuf_0321 |
|
6886 jmp shuf99 |
|
6887 |
|
6888 align 4 |
|
6889 shuf_2103: |
|
6890 movdqu xmm0, [eax] |
|
6891 lea eax, [eax + 16] |
|
6892 movdqa xmm1, xmm0 |
|
6893 punpcklbw xmm0, xmm5 |
|
6894 punpckhbw xmm1, xmm5 |
|
6895 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA |
|
6896 pshuflw xmm0, xmm0, 093h |
|
6897 pshufhw xmm1, xmm1, 093h |
|
6898 pshuflw xmm1, xmm1, 093h |
|
6899 packuswb xmm0, xmm1 |
|
6900 sub ecx, 4 |
|
6901 movdqu [edx], xmm0 |
|
6902 lea edx, [edx + 16] |
|
6903 jg shuf_2103 |
|
6904 jmp shuf99 |
|
6905 |
|
6906 align 4 |
|
6907 shuf_3012: |
|
6908 movdqu xmm0, [eax] |
|
6909 lea eax, [eax + 16] |
|
6910 movdqa xmm1, xmm0 |
|
6911 punpcklbw xmm0, xmm5 |
|
6912 punpckhbw xmm1, xmm5 |
|
6913 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB |
|
6914 pshuflw xmm0, xmm0, 0C6h |
|
6915 pshufhw xmm1, xmm1, 0C6h |
|
6916 pshuflw xmm1, xmm1, 0C6h |
|
6917 packuswb xmm0, xmm1 |
|
6918 sub ecx, 4 |
|
6919 movdqu [edx], xmm0 |
|
6920 lea edx, [edx + 16] |
|
6921 jg shuf_3012 |
|
6922 |
|
6923 shuf99: |
|
6924 pop esi |
|
6925 pop ebx |
|
6926 ret |
|
6927 } |
|
6928 } |
|
6929 |
|
6930 // YUY2 - Macro-pixel = 2 image pixels |
|
6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
|
6932 |
|
6933 // UYVY - Macro-pixel = 2 image pixels |
|
6934 // U0Y0V0Y1 |
|
6935 |
|
6936 __declspec(naked) __declspec(align(16)) |
|
6937 void I422ToYUY2Row_SSE2(const uint8* src_y, |
|
6938 const uint8* src_u, |
|
6939 const uint8* src_v, |
|
6940 uint8* dst_frame, int width) { |
|
6941 __asm { |
|
6942 push esi |
|
6943 push edi |
|
6944 mov eax, [esp + 8 + 4] // src_y |
|
6945 mov esi, [esp + 8 + 8] // src_u |
|
6946 mov edx, [esp + 8 + 12] // src_v |
|
6947 mov edi, [esp + 8 + 16] // dst_frame |
|
6948 mov ecx, [esp + 8 + 20] // width |
|
6949 sub edx, esi |
|
6950 |
|
6951 align 4 |
|
6952 convertloop: |
|
6953 movq xmm2, qword ptr [esi] // U |
|
6954 movq xmm3, qword ptr [esi + edx] // V |
|
6955 lea esi, [esi + 8] |
|
6956 punpcklbw xmm2, xmm3 // UV |
|
6957 movdqu xmm0, [eax] // Y |
|
6958 lea eax, [eax + 16] |
|
6959 movdqa xmm1, xmm0 |
|
6960 punpcklbw xmm0, xmm2 // YUYV |
|
6961 punpckhbw xmm1, xmm2 |
|
6962 movdqu [edi], xmm0 |
|
6963 movdqu [edi + 16], xmm1 |
|
6964 lea edi, [edi + 32] |
|
6965 sub ecx, 16 |
|
6966 jg convertloop |
|
6967 |
|
6968 pop edi |
|
6969 pop esi |
|
6970 ret |
|
6971 } |
|
6972 } |
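
// Reference sketch (illustrative only; not part of libyuv; the odd-width tail
// is omitted): two Y samples share one U and one V in each 4-byte YUY2
// macro-pixel:
//
// static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
//                                  const uint8* src_v, uint8* dst_frame,
//                                  int width) {
//   int x;
//   for (x = 0; x < width - 1; x += 2) {
//     dst_frame[0] = src_y[0];
//     dst_frame[1] = src_u[0];
//     dst_frame[2] = src_y[1];
//     dst_frame[3] = src_v[0];
//     src_y += 2;
//     src_u += 1;
//     src_v += 1;
//     dst_frame += 4;
//   }
// }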
|
6973 |
|
6974 __declspec(naked) __declspec(align(16)) |
|
6975 void I422ToUYVYRow_SSE2(const uint8* src_y, |
|
6976 const uint8* src_u, |
|
6977 const uint8* src_v, |
|
6978 uint8* dst_frame, int width) { |
|
6979 __asm { |
|
6980 push esi |
|
6981 push edi |
|
6982 mov eax, [esp + 8 + 4] // src_y |
|
6983 mov esi, [esp + 8 + 8] // src_u |
|
6984 mov edx, [esp + 8 + 12] // src_v |
|
6985 mov edi, [esp + 8 + 16] // dst_frame |
|
6986 mov ecx, [esp + 8 + 20] // width |
|
6987 sub edx, esi |
|
6988 |
|
6989 align 4 |
|
6990 convertloop: |
|
6991 movq xmm2, qword ptr [esi] // U |
|
6992 movq xmm3, qword ptr [esi + edx] // V |
|
6993 lea esi, [esi + 8] |
|
6994 punpcklbw xmm2, xmm3 // UV |
|
6995 movdqu xmm0, [eax] // Y |
|
6996 movdqa xmm1, xmm2 |
|
6997 lea eax, [eax + 16] |
|
6998 punpcklbw xmm1, xmm0 // UYVY |
|
6999 punpckhbw xmm2, xmm0 |
|
7000 movdqu [edi], xmm1 |
|
7001 movdqu [edi + 16], xmm2 |
|
7002 lea edi, [edi + 32] |
|
7003 sub ecx, 16 |
|
7004 jg convertloop |
|
7005 |
|
7006 pop edi |
|
7007 pop esi |
|
7008 ret |
|
7009 } |
|
7010 } |
|
7011 |
|
7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
|
7013 __declspec(naked) __declspec(align(16)) |
|
7014 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
|
7015 uint8* dst_argb, const float* poly, |
|
7016 int width) { |
|
7017 __asm { |
|
7018 push esi |
|
7019 mov eax, [esp + 4 + 4] /* src_argb */ |
|
7020 mov edx, [esp + 4 + 8] /* dst_argb */ |
|
7021 mov esi, [esp + 4 + 12] /* poly */ |
|
7022 mov ecx, [esp + 4 + 16] /* width */ |
|
7023 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
|
7024 |
|
7025 // 2 pixel loop. |
|
7026 align 4 |
|
7027 convertloop: |
|
7028 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel |
|
7029 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel |
|
7030 movq xmm0, qword ptr [eax] // BGRABGRA |
|
7031 lea eax, [eax + 8] |
|
7032 punpcklbw xmm0, xmm3 |
|
7033 movdqa xmm4, xmm0 |
|
7034 punpcklwd xmm0, xmm3 // pixel 0 |
|
7035 punpckhwd xmm4, xmm3 // pixel 1 |
|
7036 cvtdq2ps xmm0, xmm0 // 4 floats |
|
7037 cvtdq2ps xmm4, xmm4 |
|
7038 movdqa xmm1, xmm0 // X |
|
7039 movdqa xmm5, xmm4 |
|
7040 mulps xmm0, [esi + 16] // C1 * X |
|
7041 mulps xmm4, [esi + 16] |
|
7042 addps xmm0, [esi] // result = C0 + C1 * X |
|
7043 addps xmm4, [esi] |
|
7044 movdqa xmm2, xmm1 |
|
7045 movdqa xmm6, xmm5 |
|
7046 mulps xmm2, xmm1 // X * X |
|
7047 mulps xmm6, xmm5 |
|
7048 mulps xmm1, xmm2 // X * X * X |
|
7049 mulps xmm5, xmm6 |
|
7050 mulps xmm2, [esi + 32] // C2 * X * X |
|
7051 mulps xmm6, [esi + 32] |
|
7052 mulps xmm1, [esi + 48] // C3 * X * X * X |
|
7053 mulps xmm5, [esi + 48] |
|
7054 addps xmm0, xmm2 // result += C2 * X * X |
|
7055 addps xmm4, xmm6 |
|
7056 addps xmm0, xmm1 // result += C3 * X * X * X |
|
7057 addps xmm4, xmm5 |
|
7058 cvttps2dq xmm0, xmm0 |
|
7059 cvttps2dq xmm4, xmm4 |
|
7060 packuswb xmm0, xmm4 |
|
7061 packuswb xmm0, xmm0 |
|
7062 sub ecx, 2 |
|
7063 movq qword ptr [edx], xmm0 |
|
7064 lea edx, [edx + 8] |
|
7065 jg convertloop |
|
7066 pop esi |
|
7067 ret |
|
7068 } |
|
7069 } |
|
7070 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
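
// Reference sketch (illustrative only; not part of libyuv): poly holds four
// coefficient sets (C0..C3), each as 4 floats for B, G, R, A; every channel x
// is mapped to C0 + C1*x + C2*x*x + C3*x*x*x and clamped to 0..255:
//
// static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
//                                      const float* poly, int width) {
//   int i;
//   for (i = 0; i < width * 4; ++i) {
//     int c = i & 3;  // channel: 0 = B, 1 = G, 2 = R, 3 = A.
//     float x = (float)src_argb[i];
//     float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
//               poly[c + 12] * x * x * x;
//     if (r < 0.f) r = 0.f;
//     if (r > 255.f) r = 255.f;
//     dst_argb[i] = (uint8)r;
//   }
// }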
|
7071 |
|
7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
|
7073 __declspec(naked) __declspec(align(16)) |
|
7074 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
|
7075 uint8* dst_argb, const float* poly, |
|
7076 int width) { |
|
7077 __asm { |
|
7078 mov eax, [esp + 4] /* src_argb */ |
|
7079 mov edx, [esp + 8] /* dst_argb */ |
|
7080 mov ecx, [esp + 12] /* poly */ |
|
7081 vbroadcastf128 ymm4, [ecx] // C0 |
|
7082 vbroadcastf128 ymm5, [ecx + 16] // C1 |
|
7083 vbroadcastf128 ymm6, [ecx + 32] // C2 |
|
7084 vbroadcastf128 ymm7, [ecx + 48] // C3 |
|
7085 mov ecx, [esp + 16] /* width */ |
|
7086 |
|
7087 // 2 pixel loop. |
|
7088 align 4 |
|
7089 convertloop: |
|
7090 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels |
|
7091 lea eax, [eax + 8] |
|
7092 vcvtdq2ps ymm0, ymm0 // X 8 floats |
|
7093 vmulps ymm2, ymm0, ymm0 // X * X |
|
7094 vmulps ymm3, ymm0, ymm7 // C3 * X |
|
7095 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X |
|
7096 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X |
|
7097 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X |
|
7098 vcvttps2dq ymm0, ymm0 |
|
7099 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 |
|
7100 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 |
|
7101 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 |
|
7102 sub ecx, 2 |
|
7103 vmovq qword ptr [edx], xmm0 |
|
7104 lea edx, [edx + 8] |
|
7105 jg convertloop |
|
7106 vzeroupper |
|
7107 ret |
|
7108 } |
|
7109 } |
|
7110 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
|
7111 |
|
7112 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
|
7113 // Transform ARGB pixels with color table. |
|
7114 __declspec(naked) __declspec(align(16)) |
|
7115 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
|
7116 int width) { |
|
7117 __asm { |
|
7118 push esi |
|
7119 mov eax, [esp + 4 + 4] /* dst_argb */ |
|
7120 mov esi, [esp + 4 + 8] /* table_argb */ |
|
7121 mov ecx, [esp + 4 + 12] /* width */ |
|
7122 |
|
7123 // 1 pixel loop. |
|
7124 align 4 |
|
7125 convertloop: |
|
7126 movzx edx, byte ptr [eax] |
|
7127 lea eax, [eax + 4] |
|
7128 movzx edx, byte ptr [esi + edx * 4] |
|
7129 mov byte ptr [eax - 4], dl |
|
7130 movzx edx, byte ptr [eax - 4 + 1] |
|
7131 movzx edx, byte ptr [esi + edx * 4 + 1] |
|
7132 mov byte ptr [eax - 4 + 1], dl |
|
7133 movzx edx, byte ptr [eax - 4 + 2] |
|
7134 movzx edx, byte ptr [esi + edx * 4 + 2] |
|
7135 mov byte ptr [eax - 4 + 2], dl |
|
7136 movzx edx, byte ptr [eax - 4 + 3] |
|
7137 movzx edx, byte ptr [esi + edx * 4 + 3] |
|
7138 mov byte ptr [eax - 4 + 3], dl |
|
7139 dec ecx |
|
7140 jg convertloop |
|
7141 pop esi |
|
7142 ret |
|
7143 } |
|
7144 } |
|
7145 #endif // HAS_ARGBCOLORTABLEROW_X86 |
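
// Reference sketch (illustrative only; not part of libyuv): the table is laid
// out as table_argb[value * 4 + channel] and each channel is replaced in
// place:
//
// static void ARGBColorTableRow_Sketch(uint8* dst_argb,
//                                      const uint8* table_argb, int width) {
//   int i, c;
//   for (i = 0; i < width; ++i) {
//     for (c = 0; c < 4; ++c) {
//       dst_argb[i * 4 + c] = table_argb[dst_argb[i * 4 + c] * 4 + c];
//     }
//   }
// }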
|
7146 |
|
7147 #ifdef HAS_RGBCOLORTABLEROW_X86 |
|
7148 // Transform RGB pixels with color table. |
|
7149 __declspec(naked) __declspec(align(16)) |
|
7150 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
|
7151 __asm { |
|
7152 push esi |
|
7153 mov eax, [esp + 4 + 4] /* dst_argb */ |
|
7154 mov esi, [esp + 4 + 8] /* table_argb */ |
|
7155 mov ecx, [esp + 4 + 12] /* width */ |
|
7156 |
|
7157 // 1 pixel loop. |
|
7158 align 4 |
|
7159 convertloop: |
|
7160 movzx edx, byte ptr [eax] |
|
7161 lea eax, [eax + 4] |
|
7162 movzx edx, byte ptr [esi + edx * 4] |
|
7163 mov byte ptr [eax - 4], dl |
|
7164 movzx edx, byte ptr [eax - 4 + 1] |
|
7165 movzx edx, byte ptr [esi + edx * 4 + 1] |
|
7166 mov byte ptr [eax - 4 + 1], dl |
|
7167 movzx edx, byte ptr [eax - 4 + 2] |
|
7168 movzx edx, byte ptr [esi + edx * 4 + 2] |
|
7169 mov byte ptr [eax - 4 + 2], dl |
|
7170 dec ecx |
|
7171 jg convertloop |
|
7172 |
|
7173 pop esi |
|
7174 ret |
|
7175 } |
|
7176 } |
|
7177 #endif // HAS_RGBCOLORTABLEROW_X86 |
|
7178 |
|
7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
|
7180 // Transform RGB pixels with luma table. |
|
7181 __declspec(naked) __declspec(align(16)) |
|
7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
|
7183 int width, |
|
7184 const uint8* luma, uint32 lumacoeff) { |
|
7185 __asm { |
|
7186 push esi |
|
7187 push edi |
|
7188 mov eax, [esp + 8 + 4] /* src_argb */ |
|
7189 mov edi, [esp + 8 + 8] /* dst_argb */ |
|
7190 mov ecx, [esp + 8 + 12] /* width */ |
|
7191 movd xmm2, dword ptr [esp + 8 + 16] // luma table |
|
7192 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff |
|
7193 pshufd xmm2, xmm2, 0 |
|
7194 pshufd xmm3, xmm3, 0 |
|
7195 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 |
|
7196 psllw xmm4, 8 |
|
7197 pxor xmm5, xmm5 |
|
7198 |
|
7199 // 4 pixel loop. |
|
7200 align 4 |
|
7201 convertloop: |
|
7202 movdqu xmm0, qword ptr [eax] // generate luma ptr |
|
7203 pmaddubsw xmm0, xmm3 |
|
7204 phaddw xmm0, xmm0 |
|
7205 pand xmm0, xmm4 // mask out low bits |
|
7206 punpcklwd xmm0, xmm5 |
|
7207 paddd xmm0, xmm2 // add table base |
|
7208 movd esi, xmm0 |
|
7209 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
|
7210 |
|
7211 movzx edx, byte ptr [eax] |
|
7212 movzx edx, byte ptr [esi + edx] |
|
7213 mov byte ptr [edi], dl |
|
7214 movzx edx, byte ptr [eax + 1] |
|
7215 movzx edx, byte ptr [esi + edx] |
|
7216 mov byte ptr [edi + 1], dl |
|
7217 movzx edx, byte ptr [eax + 2] |
|
7218 movzx edx, byte ptr [esi + edx] |
|
7219 mov byte ptr [edi + 2], dl |
|
7220 movzx edx, byte ptr [eax + 3] // copy alpha. |
|
7221 mov byte ptr [edi + 3], dl |
|
7222 |
|
7223 movd esi, xmm0 |
|
7224 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
|
7225 |
|
7226 movzx edx, byte ptr [eax + 4] |
|
7227 movzx edx, byte ptr [esi + edx] |
|
7228 mov byte ptr [edi + 4], dl |
|
7229 movzx edx, byte ptr [eax + 5] |
|
7230 movzx edx, byte ptr [esi + edx] |
|
7231 mov byte ptr [edi + 5], dl |
|
7232 movzx edx, byte ptr [eax + 6] |
|
7233 movzx edx, byte ptr [esi + edx] |
|
7234 mov byte ptr [edi + 6], dl |
|
7235 movzx edx, byte ptr [eax + 7] // copy alpha. |
|
7236 mov byte ptr [edi + 7], dl |
|
7237 |
|
7238 movd esi, xmm0 |
|
7239 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
|
7240 |
|
7241 movzx edx, byte ptr [eax + 8] |
|
7242 movzx edx, byte ptr [esi + edx] |
|
7243 mov byte ptr [edi + 8], dl |
|
7244 movzx edx, byte ptr [eax + 9] |
|
7245 movzx edx, byte ptr [esi + edx] |
|
7246 mov byte ptr [edi + 9], dl |
|
7247 movzx edx, byte ptr [eax + 10] |
|
7248 movzx edx, byte ptr [esi + edx] |
|
7249 mov byte ptr [edi + 10], dl |
|
7250 movzx edx, byte ptr [eax + 11] // copy alpha. |
|
7251 mov byte ptr [edi + 11], dl |
|
7252 |
|
7253 movd esi, xmm0 |
|
7254 |
|
7255 movzx edx, byte ptr [eax + 12] |
|
7256 movzx edx, byte ptr [esi + edx] |
|
7257 mov byte ptr [edi + 12], dl |
|
7258 movzx edx, byte ptr [eax + 13] |
|
7259 movzx edx, byte ptr [esi + edx] |
|
7260 mov byte ptr [edi + 13], dl |
|
7261 movzx edx, byte ptr [eax + 14] |
|
7262 movzx edx, byte ptr [esi + edx] |
|
7263 mov byte ptr [edi + 14], dl |
|
7264 movzx edx, byte ptr [eax + 15] // copy alpha. |
|
7265 mov byte ptr [edi + 15], dl |
|
7266 |
|
7267 sub ecx, 4 |
|
7268 lea eax, [eax + 16] |
|
7269 lea edi, [edi + 16] |
|
7270 jg convertloop |
|
7271 |
|
7272 pop edi |
|
7273 pop esi |
|
7274 ret |
|
7275 } |
|
7276 } |
|
7277 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
|
7278 |
|
7279 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
|
7280 |
|
7281 #ifdef __cplusplus |
|
7282 } // extern "C" |
|
7283 } // namespace libyuv |
|
7284 #endif |