1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/ycbcr/yuv_row_win.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,498 @@ 1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.5 +// Use of this source code is governed by a BSD-style license that can be 1.6 +// found in the LICENSE file. 1.7 + 1.8 +#include "yuv_row.h" 1.9 +#include "mozilla/SSE.h" 1.10 + 1.11 +#define kCoefficientsRgbU kCoefficientsRgbY + 2048 1.12 +#define kCoefficientsRgbV kCoefficientsRgbY + 4096 1.13 + 1.14 +extern "C" { 1.15 + 1.16 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.17 +__declspec(naked) 1.18 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.19 + const uint8* u_buf, 1.20 + const uint8* v_buf, 1.21 + uint8* rgb_buf, 1.22 + int width) { 1.23 + __asm { 1.24 + pushad 1.25 + mov edx, [esp + 32 + 4] // Y 1.26 + mov edi, [esp + 32 + 8] // U 1.27 + mov esi, [esp + 32 + 12] // V 1.28 + mov ebp, [esp + 32 + 16] // rgb 1.29 + mov ecx, [esp + 32 + 20] // width 1.30 + jmp convertend 1.31 + 1.32 + convertloop : 1.33 + movzx eax, byte ptr [edi] 1.34 + add edi, 1 1.35 + movzx ebx, byte ptr [esi] 1.36 + add esi, 1 1.37 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.38 + movzx eax, byte ptr [edx] 1.39 + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] 1.40 + movzx ebx, byte ptr [edx + 1] 1.41 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.42 + add edx, 2 1.43 + movq mm2, [kCoefficientsRgbY + 8 * ebx] 1.44 + paddsw mm1, mm0 1.45 + paddsw mm2, mm0 1.46 + psraw mm1, 6 1.47 + psraw mm2, 6 1.48 + packuswb mm1, mm2 1.49 + movntq [ebp], mm1 1.50 + add ebp, 8 1.51 + convertend : 1.52 + sub ecx, 2 1.53 + jns convertloop 1.54 + 1.55 + and ecx, 1 // odd number of pixels? 1.56 + jz convertdone 1.57 + 1.58 + movzx eax, byte ptr [edi] 1.59 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.60 + movzx eax, byte ptr [esi] 1.61 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.62 + movzx eax, byte ptr [edx] 1.63 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.64 + paddsw mm1, mm0 1.65 + psraw mm1, 6 1.66 + packuswb mm1, mm1 1.67 + movd [ebp], mm1 1.68 + convertdone : 1.69 + 1.70 + popad 1.71 + ret 1.72 + } 1.73 +} 1.74 + 1.75 +__declspec(naked) 1.76 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.77 + const uint8* u_buf, 1.78 + const uint8* v_buf, 1.79 + uint8* rgb_buf, 1.80 + int width, 1.81 + int step) { 1.82 + __asm { 1.83 + pushad 1.84 + mov edx, [esp + 32 + 4] // Y 1.85 + mov edi, [esp + 32 + 8] // U 1.86 + mov esi, [esp + 32 + 12] // V 1.87 + mov ebp, [esp + 32 + 16] // rgb 1.88 + mov ecx, [esp + 32 + 20] // width 1.89 + mov ebx, [esp + 32 + 24] // step 1.90 + jmp wend 1.91 + 1.92 + wloop : 1.93 + movzx eax, byte ptr [edi] 1.94 + add edi, ebx 1.95 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.96 + movzx eax, byte ptr [esi] 1.97 + add esi, ebx 1.98 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.99 + movzx eax, byte ptr [edx] 1.100 + add edx, ebx 1.101 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.102 + movzx eax, byte ptr [edx] 1.103 + add edx, ebx 1.104 + movq mm2, [kCoefficientsRgbY + 8 * eax] 1.105 + paddsw mm1, mm0 1.106 + paddsw mm2, mm0 1.107 + psraw mm1, 6 1.108 + psraw mm2, 6 1.109 + packuswb mm1, mm2 1.110 + movntq [ebp], mm1 1.111 + add ebp, 8 1.112 + wend : 1.113 + sub ecx, 2 1.114 + jns wloop 1.115 + 1.116 + and ecx, 1 // odd number of pixels? 1.117 + jz wdone 1.118 + 1.119 + movzx eax, byte ptr [edi] 1.120 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.121 + movzx eax, byte ptr [esi] 1.122 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.123 + movzx eax, byte ptr [edx] 1.124 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.125 + paddsw mm1, mm0 1.126 + psraw mm1, 6 1.127 + packuswb mm1, mm1 1.128 + movd [ebp], mm1 1.129 + wdone : 1.130 + 1.131 + popad 1.132 + ret 1.133 + } 1.134 +} 1.135 + 1.136 +__declspec(naked) 1.137 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.138 + const uint8* u_buf, 1.139 + const uint8* v_buf, 1.140 + uint8* rgb_buf, 1.141 + int width, 1.142 + int ystep, 1.143 + int uvstep) { 1.144 + __asm { 1.145 + pushad 1.146 + mov edx, [esp + 32 + 4] // Y 1.147 + mov edi, [esp + 32 + 8] // U 1.148 + mov esi, [esp + 32 + 12] // V 1.149 + mov ebp, [esp + 32 + 16] // rgb 1.150 + mov ecx, [esp + 32 + 20] // width 1.151 + jmp wend 1.152 + 1.153 + wloop : 1.154 + movzx eax, byte ptr [edi] 1.155 + mov ebx, [esp + 32 + 28] // uvstep 1.156 + add edi, ebx 1.157 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.158 + movzx eax, byte ptr [esi] 1.159 + add esi, ebx 1.160 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.161 + movzx eax, byte ptr [edx] 1.162 + mov ebx, [esp + 32 + 24] // ystep 1.163 + add edx, ebx 1.164 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.165 + movzx eax, byte ptr [edx] 1.166 + add edx, ebx 1.167 + movq mm2, [kCoefficientsRgbY + 8 * eax] 1.168 + paddsw mm1, mm0 1.169 + paddsw mm2, mm0 1.170 + psraw mm1, 6 1.171 + psraw mm2, 6 1.172 + packuswb mm1, mm2 1.173 + movntq [ebp], mm1 1.174 + add ebp, 8 1.175 + wend : 1.176 + sub ecx, 2 1.177 + jns wloop 1.178 + 1.179 + and ecx, 1 // odd number of pixels? 1.180 + jz wdone 1.181 + 1.182 + movzx eax, byte ptr [edi] 1.183 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.184 + movzx eax, byte ptr [esi] 1.185 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.186 + movzx eax, byte ptr [edx] 1.187 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.188 + paddsw mm1, mm0 1.189 + psraw mm1, 6 1.190 + packuswb mm1, mm1 1.191 + movd [ebp], mm1 1.192 + wdone : 1.193 + 1.194 + popad 1.195 + ret 1.196 + } 1.197 +} 1.198 + 1.199 +__declspec(naked) 1.200 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, 1.201 + const uint8* u_buf, 1.202 + const uint8* v_buf, 1.203 + uint8* rgb_buf, 1.204 + int width) { 1.205 + __asm { 1.206 + pushad 1.207 + mov edx, [esp + 32 + 4] // Y 1.208 + mov edi, [esp + 32 + 8] // U 1.209 + mov esi, [esp + 32 + 12] // V 1.210 + mov ebp, [esp + 32 + 16] // rgb 1.211 + mov ecx, [esp + 32 + 20] // width 1.212 + jmp wend 1.213 + 1.214 + wloop : 1.215 + movzx eax, byte ptr [edi] 1.216 + add edi, 1 1.217 + movzx ebx, byte ptr [esi] 1.218 + add esi, 1 1.219 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.220 + movzx eax, byte ptr [edx] 1.221 + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] 1.222 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.223 + paddsw mm1, mm0 1.224 + psraw mm1, 6 1.225 + packuswb mm1, mm1 1.226 + punpckldq mm1, mm1 1.227 + movntq [ebp], mm1 1.228 + 1.229 + movzx ebx, byte ptr [edx + 1] 1.230 + add edx, 2 1.231 + paddsw mm0, [kCoefficientsRgbY + 8 * ebx] 1.232 + psraw mm0, 6 1.233 + packuswb mm0, mm0 1.234 + punpckldq mm0, mm0 1.235 + movntq [ebp+8], mm0 1.236 + add ebp, 16 1.237 + wend : 1.238 + sub ecx, 4 1.239 + jns wloop 1.240 + 1.241 + add ecx, 4 1.242 + jz wdone 1.243 + 1.244 + movzx eax, byte ptr [edi] 1.245 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.246 + movzx eax, byte ptr [esi] 1.247 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.248 + movzx eax, byte ptr [edx] 1.249 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.250 + paddsw mm1, mm0 1.251 + psraw mm1, 6 1.252 + packuswb mm1, mm1 1.253 + jmp wend1 1.254 + 1.255 + wloop1 : 1.256 + movd [ebp], mm1 1.257 + add ebp, 4 1.258 + wend1 : 1.259 + sub ecx, 1 1.260 + jns wloop1 1.261 + wdone : 1.262 + popad 1.263 + ret 1.264 + } 1.265 +} 1.266 + 1.267 +// This version does general purpose scaling by any amount, up or down. 1.268 +// The only thing it cannot do is rotation by 90 or 270. 1.269 +// For performance the chroma is under-sampled, reducing cost of a 3x 1.270 +// 1080p scale from 8.4 ms to 5.4 ms. 1.271 +__declspec(naked) 1.272 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.273 + const uint8* u_buf, 1.274 + const uint8* v_buf, 1.275 + uint8* rgb_buf, 1.276 + int width, 1.277 + int source_dx) { 1.278 + __asm { 1.279 + pushad 1.280 + mov edx, [esp + 32 + 4] // Y 1.281 + mov edi, [esp + 32 + 8] // U 1.282 + mov esi, [esp + 32 + 12] // V 1.283 + mov ebp, [esp + 32 + 16] // rgb 1.284 + mov ecx, [esp + 32 + 20] // width 1.285 + xor ebx, ebx // x 1.286 + jmp scaleend 1.287 + 1.288 + scaleloop : 1.289 + mov eax, ebx 1.290 + sar eax, 17 1.291 + movzx eax, byte ptr [edi + eax] 1.292 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.293 + mov eax, ebx 1.294 + sar eax, 17 1.295 + movzx eax, byte ptr [esi + eax] 1.296 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.297 + mov eax, ebx 1.298 + add ebx, [esp + 32 + 24] // x += source_dx 1.299 + sar eax, 16 1.300 + movzx eax, byte ptr [edx + eax] 1.301 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.302 + mov eax, ebx 1.303 + add ebx, [esp + 32 + 24] // x += source_dx 1.304 + sar eax, 16 1.305 + movzx eax, byte ptr [edx + eax] 1.306 + movq mm2, [kCoefficientsRgbY + 8 * eax] 1.307 + paddsw mm1, mm0 1.308 + paddsw mm2, mm0 1.309 + psraw mm1, 6 1.310 + psraw mm2, 6 1.311 + packuswb mm1, mm2 1.312 + movntq [ebp], mm1 1.313 + add ebp, 8 1.314 + scaleend : 1.315 + sub ecx, 2 1.316 + jns scaleloop 1.317 + 1.318 + and ecx, 1 // odd number of pixels? 1.319 + jz scaledone 1.320 + 1.321 + mov eax, ebx 1.322 + sar eax, 17 1.323 + movzx eax, byte ptr [edi + eax] 1.324 + movq mm0, [kCoefficientsRgbU + 8 * eax] 1.325 + mov eax, ebx 1.326 + sar eax, 17 1.327 + movzx eax, byte ptr [esi + eax] 1.328 + paddsw mm0, [kCoefficientsRgbV + 8 * eax] 1.329 + mov eax, ebx 1.330 + sar eax, 16 1.331 + movzx eax, byte ptr [edx + eax] 1.332 + movq mm1, [kCoefficientsRgbY + 8 * eax] 1.333 + paddsw mm1, mm0 1.334 + psraw mm1, 6 1.335 + packuswb mm1, mm1 1.336 + movd [ebp], mm1 1.337 + 1.338 + scaledone : 1.339 + popad 1.340 + ret 1.341 + } 1.342 +} 1.343 + 1.344 +__declspec(naked) 1.345 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.346 + const uint8* u_buf, 1.347 + const uint8* v_buf, 1.348 + uint8* rgb_buf, 1.349 + int width, 1.350 + int source_dx) { 1.351 + __asm { 1.352 + pushad 1.353 + mov edx, [esp + 32 + 4] // Y 1.354 + mov edi, [esp + 32 + 8] // U 1.355 + // [esp + 32 + 12] // V 1.356 + mov ebp, [esp + 32 + 16] // rgb 1.357 + mov ecx, [esp + 32 + 20] // width 1.358 + imul ecx, [esp + 32 + 24] // source_dx 1.359 + mov [esp + 32 + 20], ecx // source_width = width * source_dx 1.360 + mov ecx, [esp + 32 + 24] // source_dx 1.361 + xor ebx, ebx // x = 0 1.362 + cmp ecx, 0x20000 1.363 + jl lscaleend 1.364 + mov ebx, 0x8000 // x = 0.5 for 1/2 or less 1.365 + jmp lscaleend 1.366 +lscaleloop: 1.367 + mov eax, ebx 1.368 + sar eax, 0x11 1.369 + 1.370 + movzx ecx, byte ptr [edi + eax] 1.371 + movzx esi, byte ptr [edi + eax + 1] 1.372 + mov eax, ebx 1.373 + and eax, 0x1fffe 1.374 + imul esi, eax 1.375 + xor eax, 0x1fffe 1.376 + imul ecx, eax 1.377 + add ecx, esi 1.378 + shr ecx, 17 1.379 + movq mm0, [kCoefficientsRgbU + 8 * ecx] 1.380 + 1.381 + mov esi, [esp + 32 + 12] 1.382 + mov eax, ebx 1.383 + sar eax, 0x11 1.384 + 1.385 + movzx ecx, byte ptr [esi + eax] 1.386 + movzx esi, byte ptr [esi + eax + 1] 1.387 + mov eax, ebx 1.388 + and eax, 0x1fffe 1.389 + imul esi, eax 1.390 + xor eax, 0x1fffe 1.391 + imul ecx, eax 1.392 + add ecx, esi 1.393 + shr ecx, 17 1.394 + paddsw mm0, [kCoefficientsRgbV + 8 * ecx] 1.395 + 1.396 + mov eax, ebx 1.397 + sar eax, 0x10 1.398 + movzx ecx, byte ptr [edx + eax] 1.399 + movzx esi, byte ptr [1 + edx + eax] 1.400 + mov eax, ebx 1.401 + add ebx, [esp + 32 + 24] 1.402 + and eax, 0xffff 1.403 + imul esi, eax 1.404 + xor eax, 0xffff 1.405 + imul ecx, eax 1.406 + add ecx, esi 1.407 + shr ecx, 16 1.408 + movq mm1, [kCoefficientsRgbY + 8 * ecx] 1.409 + 1.410 + cmp ebx, [esp + 32 + 20] 1.411 + jge lscalelastpixel 1.412 + 1.413 + mov eax, ebx 1.414 + sar eax, 0x10 1.415 + movzx ecx, byte ptr [edx + eax] 1.416 + movzx esi, byte ptr [edx + eax + 1] 1.417 + mov eax, ebx 1.418 + add ebx, [esp + 32 + 24] 1.419 + and eax, 0xffff 1.420 + imul esi, eax 1.421 + xor eax, 0xffff 1.422 + imul ecx, eax 1.423 + add ecx, esi 1.424 + shr ecx, 16 1.425 + movq mm2, [kCoefficientsRgbY + 8 * ecx] 1.426 + 1.427 + paddsw mm1, mm0 1.428 + paddsw mm2, mm0 1.429 + psraw mm1, 0x6 1.430 + psraw mm2, 0x6 1.431 + packuswb mm1, mm2 1.432 + movntq [ebp], mm1 1.433 + add ebp, 0x8 1.434 + 1.435 +lscaleend: 1.436 + cmp ebx, [esp + 32 + 20] 1.437 + jl lscaleloop 1.438 + popad 1.439 + ret 1.440 + 1.441 +lscalelastpixel: 1.442 + paddsw mm1, mm0 1.443 + psraw mm1, 6 1.444 + packuswb mm1, mm1 1.445 + movd [ebp], mm1 1.446 + popad 1.447 + ret 1.448 + }; 1.449 +} 1.450 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.451 + 1.452 +void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.453 + const uint8* u_buf, 1.454 + const uint8* v_buf, 1.455 + uint8* rgb_buf, 1.456 + int width) { 1.457 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.458 + if (mozilla::supports_sse()) { 1.459 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); 1.460 + return; 1.461 + } 1.462 +#endif 1.463 + 1.464 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.465 +} 1.466 + 1.467 +void ScaleYUVToRGB32Row(const uint8* y_buf, 1.468 + const uint8* u_buf, 1.469 + const uint8* v_buf, 1.470 + uint8* rgb_buf, 1.471 + int width, 1.472 + int source_dx) { 1.473 + 1.474 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.475 + if (mozilla::supports_sse()) { 1.476 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.477 + return; 1.478 + } 1.479 +#endif 1.480 + 1.481 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.482 +} 1.483 + 1.484 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.485 + const uint8* u_buf, 1.486 + const uint8* v_buf, 1.487 + uint8* rgb_buf, 1.488 + int width, 1.489 + int source_dx) { 1.490 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.491 + if (mozilla::supports_sse()) { 1.492 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.493 + source_dx); 1.494 + return; 1.495 + } 1.496 +#endif 1.497 + 1.498 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.499 +} 1.500 + 1.501 +} // extern "C"