// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5 #include "yuv_row.h" |
|
6 #include "mozilla/SSE.h" |
|
7 |
|
8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048 |
|
9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096 |
|
10 |
|
11 extern "C" { |
|
12 |
|
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
14 __declspec(naked) |
|
15 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
16 const uint8* u_buf, |
|
17 const uint8* v_buf, |
|
18 uint8* rgb_buf, |
|
19 int width) { |
|
20 __asm { |
|
21 pushad |
|
22 mov edx, [esp + 32 + 4] // Y |
|
23 mov edi, [esp + 32 + 8] // U |
|
24 mov esi, [esp + 32 + 12] // V |
|
25 mov ebp, [esp + 32 + 16] // rgb |
|
26 mov ecx, [esp + 32 + 20] // width |
|
27 jmp convertend |
|
28 |
|
29 convertloop : |
|
30 movzx eax, byte ptr [edi] |
|
31 add edi, 1 |
|
32 movzx ebx, byte ptr [esi] |
|
33 add esi, 1 |
|
34 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
35 movzx eax, byte ptr [edx] |
|
36 paddsw mm0, [kCoefficientsRgbV + 8 * ebx] |
|
37 movzx ebx, byte ptr [edx + 1] |
|
38 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
39 add edx, 2 |
|
40 movq mm2, [kCoefficientsRgbY + 8 * ebx] |
|
41 paddsw mm1, mm0 |
|
42 paddsw mm2, mm0 |
|
43 psraw mm1, 6 |
|
44 psraw mm2, 6 |
|
45 packuswb mm1, mm2 |
|
46 movntq [ebp], mm1 |
|
47 add ebp, 8 |
|
48 convertend : |
|
49 sub ecx, 2 |
|
50 jns convertloop |
|
51 |
|
52 and ecx, 1 // odd number of pixels? |
|
53 jz convertdone |
|
54 |
|
55 movzx eax, byte ptr [edi] |
|
56 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
57 movzx eax, byte ptr [esi] |
|
58 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
59 movzx eax, byte ptr [edx] |
|
60 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
61 paddsw mm1, mm0 |
|
62 psraw mm1, 6 |
|
63 packuswb mm1, mm1 |
|
64 movd [ebp], mm1 |
|
65 convertdone : |
|
66 |
|
67 popad |
|
68 ret |
|
69 } |
|
70 } |
|
71 |
|
72 __declspec(naked) |
|
73 void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
74 const uint8* u_buf, |
|
75 const uint8* v_buf, |
|
76 uint8* rgb_buf, |
|
77 int width, |
|
78 int step) { |
|
79 __asm { |
|
80 pushad |
|
81 mov edx, [esp + 32 + 4] // Y |
|
82 mov edi, [esp + 32 + 8] // U |
|
83 mov esi, [esp + 32 + 12] // V |
|
84 mov ebp, [esp + 32 + 16] // rgb |
|
85 mov ecx, [esp + 32 + 20] // width |
|
86 mov ebx, [esp + 32 + 24] // step |
|
87 jmp wend |
|
88 |
|
89 wloop : |
|
90 movzx eax, byte ptr [edi] |
|
91 add edi, ebx |
|
92 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
93 movzx eax, byte ptr [esi] |
|
94 add esi, ebx |
|
95 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
96 movzx eax, byte ptr [edx] |
|
97 add edx, ebx |
|
98 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
99 movzx eax, byte ptr [edx] |
|
100 add edx, ebx |
|
101 movq mm2, [kCoefficientsRgbY + 8 * eax] |
|
102 paddsw mm1, mm0 |
|
103 paddsw mm2, mm0 |
|
104 psraw mm1, 6 |
|
105 psraw mm2, 6 |
|
106 packuswb mm1, mm2 |
|
107 movntq [ebp], mm1 |
|
108 add ebp, 8 |
|
109 wend : |
|
110 sub ecx, 2 |
|
111 jns wloop |
|
112 |
|
113 and ecx, 1 // odd number of pixels? |
|
114 jz wdone |
|
115 |
|
116 movzx eax, byte ptr [edi] |
|
117 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
118 movzx eax, byte ptr [esi] |
|
119 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
120 movzx eax, byte ptr [edx] |
|
121 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
122 paddsw mm1, mm0 |
|
123 psraw mm1, 6 |
|
124 packuswb mm1, mm1 |
|
125 movd [ebp], mm1 |
|
126 wdone : |
|
127 |
|
128 popad |
|
129 ret |
|
130 } |
|
131 } |
|
132 |
|
133 __declspec(naked) |
|
134 void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
135 const uint8* u_buf, |
|
136 const uint8* v_buf, |
|
137 uint8* rgb_buf, |
|
138 int width, |
|
139 int ystep, |
|
140 int uvstep) { |
|
141 __asm { |
|
142 pushad |
|
143 mov edx, [esp + 32 + 4] // Y |
|
144 mov edi, [esp + 32 + 8] // U |
|
145 mov esi, [esp + 32 + 12] // V |
|
146 mov ebp, [esp + 32 + 16] // rgb |
|
147 mov ecx, [esp + 32 + 20] // width |
|
148 jmp wend |
|
149 |
|
150 wloop : |
|
151 movzx eax, byte ptr [edi] |
|
152 mov ebx, [esp + 32 + 28] // uvstep |
|
153 add edi, ebx |
|
154 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
155 movzx eax, byte ptr [esi] |
|
156 add esi, ebx |
|
157 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
158 movzx eax, byte ptr [edx] |
|
159 mov ebx, [esp + 32 + 24] // ystep |
|
160 add edx, ebx |
|
161 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
162 movzx eax, byte ptr [edx] |
|
163 add edx, ebx |
|
164 movq mm2, [kCoefficientsRgbY + 8 * eax] |
|
165 paddsw mm1, mm0 |
|
166 paddsw mm2, mm0 |
|
167 psraw mm1, 6 |
|
168 psraw mm2, 6 |
|
169 packuswb mm1, mm2 |
|
170 movntq [ebp], mm1 |
|
171 add ebp, 8 |
|
172 wend : |
|
173 sub ecx, 2 |
|
174 jns wloop |
|
175 |
|
176 and ecx, 1 // odd number of pixels? |
|
177 jz wdone |
|
178 |
|
179 movzx eax, byte ptr [edi] |
|
180 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
181 movzx eax, byte ptr [esi] |
|
182 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
183 movzx eax, byte ptr [edx] |
|
184 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
185 paddsw mm1, mm0 |
|
186 psraw mm1, 6 |
|
187 packuswb mm1, mm1 |
|
188 movd [ebp], mm1 |
|
189 wdone : |
|
190 |
|
191 popad |
|
192 ret |
|
193 } |
|
194 } |
|
195 |
|
196 __declspec(naked) |
|
197 void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
198 const uint8* u_buf, |
|
199 const uint8* v_buf, |
|
200 uint8* rgb_buf, |
|
201 int width) { |
|
202 __asm { |
|
203 pushad |
|
204 mov edx, [esp + 32 + 4] // Y |
|
205 mov edi, [esp + 32 + 8] // U |
|
206 mov esi, [esp + 32 + 12] // V |
|
207 mov ebp, [esp + 32 + 16] // rgb |
|
208 mov ecx, [esp + 32 + 20] // width |
|
209 jmp wend |
|
210 |
|
211 wloop : |
|
212 movzx eax, byte ptr [edi] |
|
213 add edi, 1 |
|
214 movzx ebx, byte ptr [esi] |
|
215 add esi, 1 |
|
216 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
217 movzx eax, byte ptr [edx] |
|
218 paddsw mm0, [kCoefficientsRgbV + 8 * ebx] |
|
219 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
220 paddsw mm1, mm0 |
|
221 psraw mm1, 6 |
|
222 packuswb mm1, mm1 |
|
223 punpckldq mm1, mm1 |
|
224 movntq [ebp], mm1 |
|
225 |
|
226 movzx ebx, byte ptr [edx + 1] |
|
227 add edx, 2 |
|
228 paddsw mm0, [kCoefficientsRgbY + 8 * ebx] |
|
229 psraw mm0, 6 |
|
230 packuswb mm0, mm0 |
|
231 punpckldq mm0, mm0 |
|
232 movntq [ebp+8], mm0 |
|
233 add ebp, 16 |
|
234 wend : |
|
235 sub ecx, 4 |
|
236 jns wloop |
|
237 |
|
238 add ecx, 4 |
|
239 jz wdone |
|
240 |
|
241 movzx eax, byte ptr [edi] |
|
242 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
243 movzx eax, byte ptr [esi] |
|
244 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
245 movzx eax, byte ptr [edx] |
|
246 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
247 paddsw mm1, mm0 |
|
248 psraw mm1, 6 |
|
249 packuswb mm1, mm1 |
|
250 jmp wend1 |
|
251 |
|
252 wloop1 : |
|
253 movd [ebp], mm1 |
|
254 add ebp, 4 |
|
255 wend1 : |
|
256 sub ecx, 1 |
|
257 jns wloop1 |
|
258 wdone : |
|
259 popad |
|
260 ret |
|
261 } |
|
262 } |
|
263 |
|
264 // This version does general purpose scaling by any amount, up or down. |
|
265 // The only thing it cannot do is rotation by 90 or 270. |
|
266 // For performance the chroma is under-sampled, reducing cost of a 3x |
|
267 // 1080p scale from 8.4 ms to 5.4 ms. |
|
268 __declspec(naked) |
|
269 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
270 const uint8* u_buf, |
|
271 const uint8* v_buf, |
|
272 uint8* rgb_buf, |
|
273 int width, |
|
274 int source_dx) { |
|
275 __asm { |
|
276 pushad |
|
277 mov edx, [esp + 32 + 4] // Y |
|
278 mov edi, [esp + 32 + 8] // U |
|
279 mov esi, [esp + 32 + 12] // V |
|
280 mov ebp, [esp + 32 + 16] // rgb |
|
281 mov ecx, [esp + 32 + 20] // width |
|
282 xor ebx, ebx // x |
|
283 jmp scaleend |
|
284 |
|
285 scaleloop : |
|
286 mov eax, ebx |
|
287 sar eax, 17 |
|
288 movzx eax, byte ptr [edi + eax] |
|
289 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
290 mov eax, ebx |
|
291 sar eax, 17 |
|
292 movzx eax, byte ptr [esi + eax] |
|
293 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
294 mov eax, ebx |
|
295 add ebx, [esp + 32 + 24] // x += source_dx |
|
296 sar eax, 16 |
|
297 movzx eax, byte ptr [edx + eax] |
|
298 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
299 mov eax, ebx |
|
300 add ebx, [esp + 32 + 24] // x += source_dx |
|
301 sar eax, 16 |
|
302 movzx eax, byte ptr [edx + eax] |
|
303 movq mm2, [kCoefficientsRgbY + 8 * eax] |
|
304 paddsw mm1, mm0 |
|
305 paddsw mm2, mm0 |
|
306 psraw mm1, 6 |
|
307 psraw mm2, 6 |
|
308 packuswb mm1, mm2 |
|
309 movntq [ebp], mm1 |
|
310 add ebp, 8 |
|
311 scaleend : |
|
312 sub ecx, 2 |
|
313 jns scaleloop |
|
314 |
|
315 and ecx, 1 // odd number of pixels? |
|
316 jz scaledone |
|
317 |
|
318 mov eax, ebx |
|
319 sar eax, 17 |
|
320 movzx eax, byte ptr [edi + eax] |
|
321 movq mm0, [kCoefficientsRgbU + 8 * eax] |
|
322 mov eax, ebx |
|
323 sar eax, 17 |
|
324 movzx eax, byte ptr [esi + eax] |
|
325 paddsw mm0, [kCoefficientsRgbV + 8 * eax] |
|
326 mov eax, ebx |
|
327 sar eax, 16 |
|
328 movzx eax, byte ptr [edx + eax] |
|
329 movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
330 paddsw mm1, mm0 |
|
331 psraw mm1, 6 |
|
332 packuswb mm1, mm1 |
|
333 movd [ebp], mm1 |
|
334 |
|
335 scaledone : |
|
336 popad |
|
337 ret |
|
338 } |
|
339 } |
|
340 |
|
341 __declspec(naked) |
|
342 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
343 const uint8* u_buf, |
|
344 const uint8* v_buf, |
|
345 uint8* rgb_buf, |
|
346 int width, |
|
347 int source_dx) { |
|
348 __asm { |
|
349 pushad |
|
350 mov edx, [esp + 32 + 4] // Y |
|
351 mov edi, [esp + 32 + 8] // U |
|
352 // [esp + 32 + 12] // V |
|
353 mov ebp, [esp + 32 + 16] // rgb |
|
354 mov ecx, [esp + 32 + 20] // width |
|
355 imul ecx, [esp + 32 + 24] // source_dx |
|
356 mov [esp + 32 + 20], ecx // source_width = width * source_dx |
|
357 mov ecx, [esp + 32 + 24] // source_dx |
|
358 xor ebx, ebx // x = 0 |
|
359 cmp ecx, 0x20000 |
|
360 jl lscaleend |
|
361 mov ebx, 0x8000 // x = 0.5 for 1/2 or less |
|
362 jmp lscaleend |
|
363 lscaleloop: |
|
364 mov eax, ebx |
|
365 sar eax, 0x11 |
|
366 |
|
367 movzx ecx, byte ptr [edi + eax] |
|
368 movzx esi, byte ptr [edi + eax + 1] |
|
369 mov eax, ebx |
|
370 and eax, 0x1fffe |
|
371 imul esi, eax |
|
372 xor eax, 0x1fffe |
|
373 imul ecx, eax |
|
374 add ecx, esi |
|
375 shr ecx, 17 |
|
376 movq mm0, [kCoefficientsRgbU + 8 * ecx] |
|
377 |
|
378 mov esi, [esp + 32 + 12] |
|
379 mov eax, ebx |
|
380 sar eax, 0x11 |
|
381 |
|
382 movzx ecx, byte ptr [esi + eax] |
|
383 movzx esi, byte ptr [esi + eax + 1] |
|
384 mov eax, ebx |
|
385 and eax, 0x1fffe |
|
386 imul esi, eax |
|
387 xor eax, 0x1fffe |
|
388 imul ecx, eax |
|
389 add ecx, esi |
|
390 shr ecx, 17 |
|
391 paddsw mm0, [kCoefficientsRgbV + 8 * ecx] |
|
392 |
|
393 mov eax, ebx |
|
394 sar eax, 0x10 |
|
395 movzx ecx, byte ptr [edx + eax] |
|
396 movzx esi, byte ptr [1 + edx + eax] |
|
397 mov eax, ebx |
|
398 add ebx, [esp + 32 + 24] |
|
399 and eax, 0xffff |
|
400 imul esi, eax |
|
401 xor eax, 0xffff |
|
402 imul ecx, eax |
|
403 add ecx, esi |
|
404 shr ecx, 16 |
|
405 movq mm1, [kCoefficientsRgbY + 8 * ecx] |
|
406 |
|
407 cmp ebx, [esp + 32 + 20] |
|
408 jge lscalelastpixel |
|
409 |
|
410 mov eax, ebx |
|
411 sar eax, 0x10 |
|
412 movzx ecx, byte ptr [edx + eax] |
|
413 movzx esi, byte ptr [edx + eax + 1] |
|
414 mov eax, ebx |
|
415 add ebx, [esp + 32 + 24] |
|
416 and eax, 0xffff |
|
417 imul esi, eax |
|
418 xor eax, 0xffff |
|
419 imul ecx, eax |
|
420 add ecx, esi |
|
421 shr ecx, 16 |
|
422 movq mm2, [kCoefficientsRgbY + 8 * ecx] |
|
423 |
|
424 paddsw mm1, mm0 |
|
425 paddsw mm2, mm0 |
|
426 psraw mm1, 0x6 |
|
427 psraw mm2, 0x6 |
|
428 packuswb mm1, mm2 |
|
429 movntq [ebp], mm1 |
|
430 add ebp, 0x8 |
|
431 |
|
432 lscaleend: |
|
433 cmp ebx, [esp + 32 + 20] |
|
434 jl lscaleloop |
|
435 popad |
|
436 ret |
|
437 |
|
438 lscalelastpixel: |
|
439 paddsw mm1, mm0 |
|
440 psraw mm1, 6 |
|
441 packuswb mm1, mm1 |
|
442 movd [ebp], mm1 |
|
443 popad |
|
444 ret |
|
445 }; |
|
446 } |
|
447 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
448 |
|
449 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
450 const uint8* u_buf, |
|
451 const uint8* v_buf, |
|
452 uint8* rgb_buf, |
|
453 int width) { |
|
454 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
455 if (mozilla::supports_sse()) { |
|
456 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
|
457 return; |
|
458 } |
|
459 #endif |
|
460 |
|
461 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
|
462 } |
|
463 |
|
464 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
465 const uint8* u_buf, |
|
466 const uint8* v_buf, |
|
467 uint8* rgb_buf, |
|
468 int width, |
|
469 int source_dx) { |
|
470 |
|
471 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
472 if (mozilla::supports_sse()) { |
|
473 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
474 return; |
|
475 } |
|
476 #endif |
|
477 |
|
478 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
479 } |
|
480 |
|
481 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
482 const uint8* u_buf, |
|
483 const uint8* v_buf, |
|
484 uint8* rgb_buf, |
|
485 int width, |
|
486 int source_dx) { |
|
487 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
488 if (mozilla::supports_sse()) { |
|
489 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
|
490 source_dx); |
|
491 return; |
|
492 } |
|
493 #endif |
|
494 |
|
495 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
496 } |
|
497 |
|
498 } // extern "C" |