1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/ycbcr/yuv_row_posix.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,915 @@ 1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.5 +// Use of this source code is governed by a BSD-style license that can be 1.6 +// found in the LICENSE file. 1.7 + 1.8 +#include "yuv_row.h" 1.9 +#include "mozilla/SSE.h" 1.10 + 1.11 +#define DCHECK(a) 1.12 + 1.13 +extern "C" { 1.14 + 1.15 +#if defined(ARCH_CPU_X86_64) 1.16 + 1.17 +// We don't need CPUID guards here, since x86-64 implies SSE2. 1.18 + 1.19 +// AMD64 ABI uses register paremters. 1.20 +void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi 1.21 + const uint8* u_buf, // rsi 1.22 + const uint8* v_buf, // rdx 1.23 + uint8* rgb_buf, // rcx 1.24 + int width) { // r8 1.25 + asm( 1.26 + "jmp 1f\n" 1.27 +"0:" 1.28 + "movzb (%1),%%r10\n" 1.29 + "add $0x1,%1\n" 1.30 + "movzb (%2),%%r11\n" 1.31 + "add $0x1,%2\n" 1.32 + "movq 2048(%5,%%r10,8),%%xmm0\n" 1.33 + "movzb (%0),%%r10\n" 1.34 + "movq 4096(%5,%%r11,8),%%xmm1\n" 1.35 + "movzb 0x1(%0),%%r11\n" 1.36 + "paddsw %%xmm1,%%xmm0\n" 1.37 + "movq (%5,%%r10,8),%%xmm2\n" 1.38 + "add $0x2,%0\n" 1.39 + "movq (%5,%%r11,8),%%xmm3\n" 1.40 + "paddsw %%xmm0,%%xmm2\n" 1.41 + "paddsw %%xmm0,%%xmm3\n" 1.42 + "shufps $0x44,%%xmm3,%%xmm2\n" 1.43 + "psraw $0x6,%%xmm2\n" 1.44 + "packuswb %%xmm2,%%xmm2\n" 1.45 + "movq %%xmm2,0x0(%3)\n" 1.46 + "add $0x8,%3\n" 1.47 +"1:" 1.48 + "sub $0x2,%4\n" 1.49 + "jns 0b\n" 1.50 + 1.51 +"2:" 1.52 + "add $0x1,%4\n" 1.53 + "js 3f\n" 1.54 + 1.55 + "movzb (%1),%%r10\n" 1.56 + "movq 2048(%5,%%r10,8),%%xmm0\n" 1.57 + "movzb (%2),%%r10\n" 1.58 + "movq 4096(%5,%%r10,8),%%xmm1\n" 1.59 + "paddsw %%xmm1,%%xmm0\n" 1.60 + "movzb (%0),%%r10\n" 1.61 + "movq (%5,%%r10,8),%%xmm1\n" 1.62 + "paddsw %%xmm0,%%xmm1\n" 1.63 + "psraw $0x6,%%xmm1\n" 1.64 + "packuswb %%xmm1,%%xmm1\n" 1.65 + "movd %%xmm1,0x0(%3)\n" 1.66 +"3:" 1.67 + : 1.68 + : "r"(y_buf), // %0 1.69 + "r"(u_buf), // %1 1.70 + "r"(v_buf), // %2 1.71 + "r"(rgb_buf), // %3 1.72 + "r"(width), // %4 1.73 + "r" (kCoefficientsRgbY) // %5 1.74 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" 1.75 +); 1.76 +} 1.77 + 1.78 +void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi 1.79 + const uint8* u_buf, // rsi 1.80 + const uint8* v_buf, // rdx 1.81 + uint8* rgb_buf, // rcx 1.82 + int width, // r8 1.83 + int source_dx) { // r9 1.84 + asm( 1.85 + "xor %%r11,%%r11\n" 1.86 + "sub $0x2,%4\n" 1.87 + "js 1f\n" 1.88 + 1.89 +"0:" 1.90 + "mov %%r11,%%r10\n" 1.91 + "sar $0x11,%%r10\n" 1.92 + "movzb (%1,%%r10,1),%%rax\n" 1.93 + "movq 2048(%5,%%rax,8),%%xmm0\n" 1.94 + "movzb (%2,%%r10,1),%%rax\n" 1.95 + "movq 4096(%5,%%rax,8),%%xmm1\n" 1.96 + "lea (%%r11,%6),%%r10\n" 1.97 + "sar $0x10,%%r11\n" 1.98 + "movzb (%0,%%r11,1),%%rax\n" 1.99 + "paddsw %%xmm1,%%xmm0\n" 1.100 + "movq (%5,%%rax,8),%%xmm1\n" 1.101 + "lea (%%r10,%6),%%r11\n" 1.102 + "sar $0x10,%%r10\n" 1.103 + "movzb (%0,%%r10,1),%%rax\n" 1.104 + "movq (%5,%%rax,8),%%xmm2\n" 1.105 + "paddsw %%xmm0,%%xmm1\n" 1.106 + "paddsw %%xmm0,%%xmm2\n" 1.107 + "shufps $0x44,%%xmm2,%%xmm1\n" 1.108 + "psraw $0x6,%%xmm1\n" 1.109 + "packuswb %%xmm1,%%xmm1\n" 1.110 + "movq %%xmm1,0x0(%3)\n" 1.111 + "add $0x8,%3\n" 1.112 + "sub $0x2,%4\n" 1.113 + "jns 0b\n" 1.114 + 1.115 +"1:" 1.116 + "add $0x1,%4\n" 1.117 + "js 2f\n" 1.118 + 1.119 + "mov %%r11,%%r10\n" 1.120 + "sar $0x11,%%r10\n" 1.121 + "movzb (%1,%%r10,1),%%rax\n" 1.122 + "movq 2048(%5,%%rax,8),%%xmm0\n" 1.123 + "movzb (%2,%%r10,1),%%rax\n" 1.124 + "movq 4096(%5,%%rax,8),%%xmm1\n" 1.125 + "paddsw %%xmm1,%%xmm0\n" 1.126 + "sar $0x10,%%r11\n" 1.127 + "movzb (%0,%%r11,1),%%rax\n" 1.128 + "movq (%5,%%rax,8),%%xmm1\n" 1.129 + "paddsw %%xmm0,%%xmm1\n" 1.130 + "psraw $0x6,%%xmm1\n" 1.131 + "packuswb %%xmm1,%%xmm1\n" 1.132 + "movd %%xmm1,0x0(%3)\n" 1.133 + 1.134 +"2:" 1.135 + : 1.136 + : "r"(y_buf), // %0 1.137 + "r"(u_buf), // %1 1.138 + "r"(v_buf), // %2 1.139 + "r"(rgb_buf), // %3 1.140 + "r"(width), // %4 1.141 + "r" (kCoefficientsRgbY), // %5 1.142 + "r"(static_cast<long>(source_dx)) // %6 1.143 + : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" 1.144 +); 1.145 +} 1.146 + 1.147 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.148 + const uint8* u_buf, 1.149 + const uint8* v_buf, 1.150 + uint8* rgb_buf, 1.151 + int width, 1.152 + int source_dx) { 1.153 + asm( 1.154 + "xor %%r11,%%r11\n" // x = 0 1.155 + "sub $0x2,%4\n" 1.156 + "js 2f\n" 1.157 + "cmp $0x20000,%6\n" // if source_dx >= 2.0 1.158 + "jl 0f\n" 1.159 + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less 1.160 +"0:" 1.161 + 1.162 +"1:" 1.163 + "mov %%r11,%%r10\n" 1.164 + "sar $0x11,%%r10\n" 1.165 + 1.166 + "movzb (%1, %%r10, 1), %%r13 \n" 1.167 + "movzb 1(%1, %%r10, 1), %%r14 \n" 1.168 + "mov %%r11, %%rax \n" 1.169 + "and $0x1fffe, %%rax \n" 1.170 + "imul %%rax, %%r14 \n" 1.171 + "xor $0x1fffe, %%rax \n" 1.172 + "imul %%rax, %%r13 \n" 1.173 + "add %%r14, %%r13 \n" 1.174 + "shr $17, %%r13 \n" 1.175 + "movq 2048(%5,%%r13,8), %%xmm0\n" 1.176 + 1.177 + "movzb (%2, %%r10, 1), %%r13 \n" 1.178 + "movzb 1(%2, %%r10, 1), %%r14 \n" 1.179 + "mov %%r11, %%rax \n" 1.180 + "and $0x1fffe, %%rax \n" 1.181 + "imul %%rax, %%r14 \n" 1.182 + "xor $0x1fffe, %%rax \n" 1.183 + "imul %%rax, %%r13 \n" 1.184 + "add %%r14, %%r13 \n" 1.185 + "shr $17, %%r13 \n" 1.186 + "movq 4096(%5,%%r13,8), %%xmm1\n" 1.187 + 1.188 + "mov %%r11, %%rax \n" 1.189 + "lea (%%r11,%6),%%r10\n" 1.190 + "sar $0x10,%%r11\n" 1.191 + "paddsw %%xmm1,%%xmm0\n" 1.192 + 1.193 + "movzb (%0, %%r11, 1), %%r13 \n" 1.194 + "movzb 1(%0, %%r11, 1), %%r14 \n" 1.195 + "and $0xffff, %%rax \n" 1.196 + "imul %%rax, %%r14 \n" 1.197 + "xor $0xffff, %%rax \n" 1.198 + "imul %%rax, %%r13 \n" 1.199 + "add %%r14, %%r13 \n" 1.200 + "shr $16, %%r13 \n" 1.201 + "movq (%5,%%r13,8),%%xmm1\n" 1.202 + 1.203 + "mov %%r10, %%rax \n" 1.204 + "lea (%%r10,%6),%%r11\n" 1.205 + "sar $0x10,%%r10\n" 1.206 + 1.207 + "movzb (%0,%%r10,1), %%r13 \n" 1.208 + "movzb 1(%0,%%r10,1), %%r14 \n" 1.209 + "and $0xffff, %%rax \n" 1.210 + "imul %%rax, %%r14 \n" 1.211 + "xor $0xffff, %%rax \n" 1.212 + "imul %%rax, %%r13 \n" 1.213 + "add %%r14, %%r13 \n" 1.214 + "shr $16, %%r13 \n" 1.215 + "movq (%5,%%r13,8),%%xmm2\n" 1.216 + 1.217 + "paddsw %%xmm0,%%xmm1\n" 1.218 + "paddsw %%xmm0,%%xmm2\n" 1.219 + "shufps $0x44,%%xmm2,%%xmm1\n" 1.220 + "psraw $0x6,%%xmm1\n" 1.221 + "packuswb %%xmm1,%%xmm1\n" 1.222 + "movq %%xmm1,0x0(%3)\n" 1.223 + "add $0x8,%3\n" 1.224 + "sub $0x2,%4\n" 1.225 + "jns 1b\n" 1.226 + 1.227 +"2:" 1.228 + "add $0x1,%4\n" 1.229 + "js 3f\n" 1.230 + 1.231 + "mov %%r11,%%r10\n" 1.232 + "sar $0x11,%%r10\n" 1.233 + 1.234 + "movzb (%1,%%r10,1), %%r13 \n" 1.235 + "movq 2048(%5,%%r13,8),%%xmm0\n" 1.236 + 1.237 + "movzb (%2,%%r10,1), %%r13 \n" 1.238 + "movq 4096(%5,%%r13,8),%%xmm1\n" 1.239 + 1.240 + "paddsw %%xmm1,%%xmm0\n" 1.241 + "sar $0x10,%%r11\n" 1.242 + 1.243 + "movzb (%0,%%r11,1), %%r13 \n" 1.244 + "movq (%5,%%r13,8),%%xmm1\n" 1.245 + 1.246 + "paddsw %%xmm0,%%xmm1\n" 1.247 + "psraw $0x6,%%xmm1\n" 1.248 + "packuswb %%xmm1,%%xmm1\n" 1.249 + "movd %%xmm1,0x0(%3)\n" 1.250 + 1.251 +"3:" 1.252 + : 1.253 + : "r"(y_buf), // %0 1.254 + "r"(u_buf), // %1 1.255 + "r"(v_buf), // %2 1.256 + "r"(rgb_buf), // %3 1.257 + "r"(width), // %4 1.258 + "r" (kCoefficientsRgbY), // %5 1.259 + "r"(static_cast<long>(source_dx)) // %6 1.260 + : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" 1.261 +); 1.262 +} 1.263 + 1.264 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) 1.265 + 1.266 +// PIC version is slower because less registers are available, so 1.267 +// non-PIC is used on platforms where it is possible. 1.268 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.269 + const uint8* u_buf, 1.270 + const uint8* v_buf, 1.271 + uint8* rgb_buf, 1.272 + int width); 1.273 + asm( 1.274 + ".text\n" 1.275 + ".global FastConvertYUVToRGB32Row_SSE\n" 1.276 + ".type FastConvertYUVToRGB32Row_SSE, @function\n" 1.277 +"FastConvertYUVToRGB32Row_SSE:\n" 1.278 + "pusha\n" 1.279 + "mov 0x24(%esp),%edx\n" 1.280 + "mov 0x28(%esp),%edi\n" 1.281 + "mov 0x2c(%esp),%esi\n" 1.282 + "mov 0x30(%esp),%ebp\n" 1.283 + "mov 0x34(%esp),%ecx\n" 1.284 + "jmp 1f\n" 1.285 + 1.286 +"0:" 1.287 + "movzbl (%edi),%eax\n" 1.288 + "add $0x1,%edi\n" 1.289 + "movzbl (%esi),%ebx\n" 1.290 + "add $0x1,%esi\n" 1.291 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.292 + "movzbl (%edx),%eax\n" 1.293 + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" 1.294 + "movzbl 0x1(%edx),%ebx\n" 1.295 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.296 + "add $0x2,%edx\n" 1.297 + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" 1.298 + "paddsw %mm0,%mm1\n" 1.299 + "paddsw %mm0,%mm2\n" 1.300 + "psraw $0x6,%mm1\n" 1.301 + "psraw $0x6,%mm2\n" 1.302 + "packuswb %mm2,%mm1\n" 1.303 + "movntq %mm1,0x0(%ebp)\n" 1.304 + "add $0x8,%ebp\n" 1.305 +"1:" 1.306 + "sub $0x2,%ecx\n" 1.307 + "jns 0b\n" 1.308 + 1.309 + "and $0x1,%ecx\n" 1.310 + "je 2f\n" 1.311 + 1.312 + "movzbl (%edi),%eax\n" 1.313 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.314 + "movzbl (%esi),%eax\n" 1.315 + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.316 + "movzbl (%edx),%eax\n" 1.317 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.318 + "paddsw %mm0,%mm1\n" 1.319 + "psraw $0x6,%mm1\n" 1.320 + "packuswb %mm1,%mm1\n" 1.321 + "movd %mm1,0x0(%ebp)\n" 1.322 +"2:" 1.323 + "popa\n" 1.324 + "ret\n" 1.325 +#if !defined(XP_MACOSX) 1.326 + ".previous\n" 1.327 +#endif 1.328 +); 1.329 + 1.330 +void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.331 + const uint8* u_buf, 1.332 + const uint8* v_buf, 1.333 + uint8* rgb_buf, 1.334 + int width) 1.335 +{ 1.336 + if (mozilla::supports_sse()) { 1.337 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); 1.338 + return; 1.339 + } 1.340 + 1.341 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.342 +} 1.343 + 1.344 + 1.345 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.346 + const uint8* u_buf, 1.347 + const uint8* v_buf, 1.348 + uint8* rgb_buf, 1.349 + int width, 1.350 + int source_dx); 1.351 + asm( 1.352 + ".text\n" 1.353 + ".global ScaleYUVToRGB32Row_SSE\n" 1.354 + ".type ScaleYUVToRGB32Row_SSE, @function\n" 1.355 +"ScaleYUVToRGB32Row_SSE:\n" 1.356 + "pusha\n" 1.357 + "mov 0x24(%esp),%edx\n" 1.358 + "mov 0x28(%esp),%edi\n" 1.359 + "mov 0x2c(%esp),%esi\n" 1.360 + "mov 0x30(%esp),%ebp\n" 1.361 + "mov 0x34(%esp),%ecx\n" 1.362 + "xor %ebx,%ebx\n" 1.363 + "jmp 1f\n" 1.364 + 1.365 +"0:" 1.366 + "mov %ebx,%eax\n" 1.367 + "sar $0x11,%eax\n" 1.368 + "movzbl (%edi,%eax,1),%eax\n" 1.369 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.370 + "mov %ebx,%eax\n" 1.371 + "sar $0x11,%eax\n" 1.372 + "movzbl (%esi,%eax,1),%eax\n" 1.373 + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.374 + "mov %ebx,%eax\n" 1.375 + "add 0x38(%esp),%ebx\n" 1.376 + "sar $0x10,%eax\n" 1.377 + "movzbl (%edx,%eax,1),%eax\n" 1.378 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.379 + "mov %ebx,%eax\n" 1.380 + "add 0x38(%esp),%ebx\n" 1.381 + "sar $0x10,%eax\n" 1.382 + "movzbl (%edx,%eax,1),%eax\n" 1.383 + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" 1.384 + "paddsw %mm0,%mm1\n" 1.385 + "paddsw %mm0,%mm2\n" 1.386 + "psraw $0x6,%mm1\n" 1.387 + "psraw $0x6,%mm2\n" 1.388 + "packuswb %mm2,%mm1\n" 1.389 + "movntq %mm1,0x0(%ebp)\n" 1.390 + "add $0x8,%ebp\n" 1.391 +"1:" 1.392 + "sub $0x2,%ecx\n" 1.393 + "jns 0b\n" 1.394 + 1.395 + "and $0x1,%ecx\n" 1.396 + "je 2f\n" 1.397 + 1.398 + "mov %ebx,%eax\n" 1.399 + "sar $0x11,%eax\n" 1.400 + "movzbl (%edi,%eax,1),%eax\n" 1.401 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.402 + "mov %ebx,%eax\n" 1.403 + "sar $0x11,%eax\n" 1.404 + "movzbl (%esi,%eax,1),%eax\n" 1.405 + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.406 + "mov %ebx,%eax\n" 1.407 + "sar $0x10,%eax\n" 1.408 + "movzbl (%edx,%eax,1),%eax\n" 1.409 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.410 + "paddsw %mm0,%mm1\n" 1.411 + "psraw $0x6,%mm1\n" 1.412 + "packuswb %mm1,%mm1\n" 1.413 + "movd %mm1,0x0(%ebp)\n" 1.414 + 1.415 +"2:" 1.416 + "popa\n" 1.417 + "ret\n" 1.418 +#if !defined(XP_MACOSX) 1.419 + ".previous\n" 1.420 +#endif 1.421 +); 1.422 + 1.423 +void ScaleYUVToRGB32Row(const uint8* y_buf, 1.424 + const uint8* u_buf, 1.425 + const uint8* v_buf, 1.426 + uint8* rgb_buf, 1.427 + int width, 1.428 + int source_dx) 1.429 +{ 1.430 + if (mozilla::supports_sse()) { 1.431 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 1.432 + width, source_dx); 1.433 + } 1.434 + 1.435 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 1.436 + width, source_dx); 1.437 +} 1.438 + 1.439 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.440 + const uint8* u_buf, 1.441 + const uint8* v_buf, 1.442 + uint8* rgb_buf, 1.443 + int width, 1.444 + int source_dx); 1.445 + asm( 1.446 + ".text\n" 1.447 + ".global LinearScaleYUVToRGB32Row_SSE\n" 1.448 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n" 1.449 +"LinearScaleYUVToRGB32Row_SSE:\n" 1.450 + "pusha\n" 1.451 + "mov 0x24(%esp),%edx\n" 1.452 + "mov 0x28(%esp),%edi\n" 1.453 + "mov 0x30(%esp),%ebp\n" 1.454 + 1.455 + // source_width = width * source_dx + ebx 1.456 + "mov 0x34(%esp), %ecx\n" 1.457 + "imull 0x38(%esp), %ecx\n" 1.458 + "mov %ecx, 0x34(%esp)\n" 1.459 + 1.460 + "mov 0x38(%esp), %ecx\n" 1.461 + "xor %ebx,%ebx\n" // x = 0 1.462 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.463 + "jl 1f\n" 1.464 + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.465 + "jmp 1f\n" 1.466 + 1.467 +"0:" 1.468 + "mov %ebx,%eax\n" 1.469 + "sar $0x11,%eax\n" 1.470 + 1.471 + "movzbl (%edi,%eax,1),%ecx\n" 1.472 + "movzbl 1(%edi,%eax,1),%esi\n" 1.473 + "mov %ebx,%eax\n" 1.474 + "andl $0x1fffe, %eax \n" 1.475 + "imul %eax, %esi \n" 1.476 + "xorl $0x1fffe, %eax \n" 1.477 + "imul %eax, %ecx \n" 1.478 + "addl %esi, %ecx \n" 1.479 + "shrl $17, %ecx \n" 1.480 + "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" 1.481 + 1.482 + "mov 0x2c(%esp),%esi\n" 1.483 + "mov %ebx,%eax\n" 1.484 + "sar $0x11,%eax\n" 1.485 + 1.486 + "movzbl (%esi,%eax,1),%ecx\n" 1.487 + "movzbl 1(%esi,%eax,1),%esi\n" 1.488 + "mov %ebx,%eax\n" 1.489 + "andl $0x1fffe, %eax \n" 1.490 + "imul %eax, %esi \n" 1.491 + "xorl $0x1fffe, %eax \n" 1.492 + "imul %eax, %ecx \n" 1.493 + "addl %esi, %ecx \n" 1.494 + "shrl $17, %ecx \n" 1.495 + "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" 1.496 + 1.497 + "mov %ebx,%eax\n" 1.498 + "sar $0x10,%eax\n" 1.499 + "movzbl (%edx,%eax,1),%ecx\n" 1.500 + "movzbl 1(%edx,%eax,1),%esi\n" 1.501 + "mov %ebx,%eax\n" 1.502 + "add 0x38(%esp),%ebx\n" 1.503 + "andl $0xffff, %eax \n" 1.504 + "imul %eax, %esi \n" 1.505 + "xorl $0xffff, %eax \n" 1.506 + "imul %eax, %ecx \n" 1.507 + "addl %esi, %ecx \n" 1.508 + "shrl $16, %ecx \n" 1.509 + "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" 1.510 + 1.511 + "cmp 0x34(%esp), %ebx\n" 1.512 + "jge 2f\n" 1.513 + 1.514 + "mov %ebx,%eax\n" 1.515 + "sar $0x10,%eax\n" 1.516 + "movzbl (%edx,%eax,1),%ecx\n" 1.517 + "movzbl 1(%edx,%eax,1),%esi\n" 1.518 + "mov %ebx,%eax\n" 1.519 + "add 0x38(%esp),%ebx\n" 1.520 + "andl $0xffff, %eax \n" 1.521 + "imul %eax, %esi \n" 1.522 + "xorl $0xffff, %eax \n" 1.523 + "imul %eax, %ecx \n" 1.524 + "addl %esi, %ecx \n" 1.525 + "shrl $16, %ecx \n" 1.526 + "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" 1.527 + 1.528 + "paddsw %mm0,%mm1\n" 1.529 + "paddsw %mm0,%mm2\n" 1.530 + "psraw $0x6,%mm1\n" 1.531 + "psraw $0x6,%mm2\n" 1.532 + "packuswb %mm2,%mm1\n" 1.533 + "movntq %mm1,0x0(%ebp)\n" 1.534 + "add $0x8,%ebp\n" 1.535 + 1.536 +"1:" 1.537 + "cmp 0x34(%esp), %ebx\n" 1.538 + "jl 0b\n" 1.539 + "popa\n" 1.540 + "ret\n" 1.541 + 1.542 +"2:" 1.543 + "paddsw %mm0, %mm1\n" 1.544 + "psraw $6, %mm1\n" 1.545 + "packuswb %mm1, %mm1\n" 1.546 + "movd %mm1, (%ebp)\n" 1.547 + "popa\n" 1.548 + "ret\n" 1.549 +#if !defined(XP_MACOSX) 1.550 + ".previous\n" 1.551 +#endif 1.552 +); 1.553 + 1.554 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.555 + const uint8* u_buf, 1.556 + const uint8* v_buf, 1.557 + uint8* rgb_buf, 1.558 + int width, 1.559 + int source_dx) 1.560 +{ 1.561 + if (mozilla::supports_sse()) { 1.562 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 1.563 + width, source_dx); 1.564 + } 1.565 + 1.566 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 1.567 + width, source_dx); 1.568 +} 1.569 + 1.570 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) 1.571 + 1.572 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.573 + const uint8* u_buf, 1.574 + const uint8* v_buf, 1.575 + uint8* rgb_buf, 1.576 + int width, 1.577 + int16 *kCoefficientsRgbY); 1.578 + 1.579 + asm( 1.580 + ".text\n" 1.581 +#if defined(XP_MACOSX) 1.582 +"_PICConvertYUVToRGB32Row_SSE:\n" 1.583 +#else 1.584 +"PICConvertYUVToRGB32Row_SSE:\n" 1.585 +#endif 1.586 + "pusha\n" 1.587 + "mov 0x24(%esp),%edx\n" 1.588 + "mov 0x28(%esp),%edi\n" 1.589 + "mov 0x2c(%esp),%esi\n" 1.590 + "mov 0x30(%esp),%ebp\n" 1.591 + "mov 0x38(%esp),%ecx\n" 1.592 + 1.593 + "jmp 1f\n" 1.594 + 1.595 +"0:" 1.596 + "movzbl (%edi),%eax\n" 1.597 + "add $0x1,%edi\n" 1.598 + "movzbl (%esi),%ebx\n" 1.599 + "add $0x1,%esi\n" 1.600 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.601 + "movzbl (%edx),%eax\n" 1.602 + "paddsw 4096(%ecx,%ebx,8),%mm0\n" 1.603 + "movzbl 0x1(%edx),%ebx\n" 1.604 + "movq 0(%ecx,%eax,8),%mm1\n" 1.605 + "add $0x2,%edx\n" 1.606 + "movq 0(%ecx,%ebx,8),%mm2\n" 1.607 + "paddsw %mm0,%mm1\n" 1.608 + "paddsw %mm0,%mm2\n" 1.609 + "psraw $0x6,%mm1\n" 1.610 + "psraw $0x6,%mm2\n" 1.611 + "packuswb %mm2,%mm1\n" 1.612 + "movntq %mm1,0x0(%ebp)\n" 1.613 + "add $0x8,%ebp\n" 1.614 +"1:" 1.615 + "subl $0x2,0x34(%esp)\n" 1.616 + "jns 0b\n" 1.617 + 1.618 + "andl $0x1,0x34(%esp)\n" 1.619 + "je 2f\n" 1.620 + 1.621 + "movzbl (%edi),%eax\n" 1.622 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.623 + "movzbl (%esi),%eax\n" 1.624 + "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.625 + "movzbl (%edx),%eax\n" 1.626 + "movq 0(%ecx,%eax,8),%mm1\n" 1.627 + "paddsw %mm0,%mm1\n" 1.628 + "psraw $0x6,%mm1\n" 1.629 + "packuswb %mm1,%mm1\n" 1.630 + "movd %mm1,0x0(%ebp)\n" 1.631 +"2:" 1.632 + "popa\n" 1.633 + "ret\n" 1.634 +#if !defined(XP_MACOSX) 1.635 + ".previous\n" 1.636 +#endif 1.637 +); 1.638 + 1.639 +void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.640 + const uint8* u_buf, 1.641 + const uint8* v_buf, 1.642 + uint8* rgb_buf, 1.643 + int width) 1.644 +{ 1.645 + if (mozilla::supports_sse()) { 1.646 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.647 + &kCoefficientsRgbY[0][0]); 1.648 + return; 1.649 + } 1.650 + 1.651 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.652 +} 1.653 + 1.654 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.655 + const uint8* u_buf, 1.656 + const uint8* v_buf, 1.657 + uint8* rgb_buf, 1.658 + int width, 1.659 + int source_dx, 1.660 + int16 *kCoefficientsRgbY); 1.661 + 1.662 + asm( 1.663 + ".text\n" 1.664 +#if defined(XP_MACOSX) 1.665 +"_PICScaleYUVToRGB32Row_SSE:\n" 1.666 +#else 1.667 +"PICScaleYUVToRGB32Row_SSE:\n" 1.668 +#endif 1.669 + "pusha\n" 1.670 + "mov 0x24(%esp),%edx\n" 1.671 + "mov 0x28(%esp),%edi\n" 1.672 + "mov 0x2c(%esp),%esi\n" 1.673 + "mov 0x30(%esp),%ebp\n" 1.674 + "mov 0x3c(%esp),%ecx\n" 1.675 + "xor %ebx,%ebx\n" 1.676 + "jmp 1f\n" 1.677 + 1.678 +"0:" 1.679 + "mov %ebx,%eax\n" 1.680 + "sar $0x11,%eax\n" 1.681 + "movzbl (%edi,%eax,1),%eax\n" 1.682 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.683 + "mov %ebx,%eax\n" 1.684 + "sar $0x11,%eax\n" 1.685 + "movzbl (%esi,%eax,1),%eax\n" 1.686 + "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.687 + "mov %ebx,%eax\n" 1.688 + "add 0x38(%esp),%ebx\n" 1.689 + "sar $0x10,%eax\n" 1.690 + "movzbl (%edx,%eax,1),%eax\n" 1.691 + "movq 0(%ecx,%eax,8),%mm1\n" 1.692 + "mov %ebx,%eax\n" 1.693 + "add 0x38(%esp),%ebx\n" 1.694 + "sar $0x10,%eax\n" 1.695 + "movzbl (%edx,%eax,1),%eax\n" 1.696 + "movq 0(%ecx,%eax,8),%mm2\n" 1.697 + "paddsw %mm0,%mm1\n" 1.698 + "paddsw %mm0,%mm2\n" 1.699 + "psraw $0x6,%mm1\n" 1.700 + "psraw $0x6,%mm2\n" 1.701 + "packuswb %mm2,%mm1\n" 1.702 + "movntq %mm1,0x0(%ebp)\n" 1.703 + "add $0x8,%ebp\n" 1.704 +"1:" 1.705 + "subl $0x2,0x34(%esp)\n" 1.706 + "jns 0b\n" 1.707 + 1.708 + "andl $0x1,0x34(%esp)\n" 1.709 + "je 2f\n" 1.710 + 1.711 + "mov %ebx,%eax\n" 1.712 + "sar $0x11,%eax\n" 1.713 + "movzbl (%edi,%eax,1),%eax\n" 1.714 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.715 + "mov %ebx,%eax\n" 1.716 + "sar $0x11,%eax\n" 1.717 + "movzbl (%esi,%eax,1),%eax\n" 1.718 + "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.719 + "mov %ebx,%eax\n" 1.720 + "sar $0x10,%eax\n" 1.721 + "movzbl (%edx,%eax,1),%eax\n" 1.722 + "movq 0(%ecx,%eax,8),%mm1\n" 1.723 + "paddsw %mm0,%mm1\n" 1.724 + "psraw $0x6,%mm1\n" 1.725 + "packuswb %mm1,%mm1\n" 1.726 + "movd %mm1,0x0(%ebp)\n" 1.727 + 1.728 +"2:" 1.729 + "popa\n" 1.730 + "ret\n" 1.731 +#if !defined(XP_MACOSX) 1.732 + ".previous\n" 1.733 +#endif 1.734 +); 1.735 + 1.736 +void ScaleYUVToRGB32Row(const uint8* y_buf, 1.737 + const uint8* u_buf, 1.738 + const uint8* v_buf, 1.739 + uint8* rgb_buf, 1.740 + int width, 1.741 + int source_dx) 1.742 +{ 1.743 + if (mozilla::supports_sse()) { 1.744 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.745 + &kCoefficientsRgbY[0][0]); 1.746 + return; 1.747 + } 1.748 + 1.749 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.750 +} 1.751 + 1.752 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.753 + const uint8* u_buf, 1.754 + const uint8* v_buf, 1.755 + uint8* rgb_buf, 1.756 + int width, 1.757 + int source_dx, 1.758 + int16 *kCoefficientsRgbY); 1.759 + 1.760 + asm( 1.761 + ".text\n" 1.762 +#if defined(XP_MACOSX) 1.763 +"_PICLinearScaleYUVToRGB32Row_SSE:\n" 1.764 +#else 1.765 +"PICLinearScaleYUVToRGB32Row_SSE:\n" 1.766 +#endif 1.767 + "pusha\n" 1.768 + "mov 0x24(%esp),%edx\n" 1.769 + "mov 0x30(%esp),%ebp\n" 1.770 + "mov 0x34(%esp),%ecx\n" 1.771 + "mov 0x3c(%esp),%edi\n" 1.772 + "xor %ebx,%ebx\n" 1.773 + 1.774 + // source_width = width * source_dx + ebx 1.775 + "mov 0x34(%esp), %ecx\n" 1.776 + "imull 0x38(%esp), %ecx\n" 1.777 + "mov %ecx, 0x34(%esp)\n" 1.778 + 1.779 + "mov 0x38(%esp), %ecx\n" 1.780 + "xor %ebx,%ebx\n" // x = 0 1.781 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.782 + "jl 1f\n" 1.783 + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.784 + "jmp 1f\n" 1.785 + 1.786 +"0:" 1.787 + "mov 0x28(%esp),%esi\n" 1.788 + "mov %ebx,%eax\n" 1.789 + "sar $0x11,%eax\n" 1.790 + 1.791 + "movzbl (%esi,%eax,1),%ecx\n" 1.792 + "movzbl 1(%esi,%eax,1),%esi\n" 1.793 + "mov %ebx,%eax\n" 1.794 + "andl $0x1fffe, %eax \n" 1.795 + "imul %eax, %esi \n" 1.796 + "xorl $0x1fffe, %eax \n" 1.797 + "imul %eax, %ecx \n" 1.798 + "addl %esi, %ecx \n" 1.799 + "shrl $17, %ecx \n" 1.800 + "movq 2048(%edi,%ecx,8),%mm0\n" 1.801 + 1.802 + "mov 0x2c(%esp),%esi\n" 1.803 + "mov %ebx,%eax\n" 1.804 + "sar $0x11,%eax\n" 1.805 + 1.806 + "movzbl (%esi,%eax,1),%ecx\n" 1.807 + "movzbl 1(%esi,%eax,1),%esi\n" 1.808 + "mov %ebx,%eax\n" 1.809 + "andl $0x1fffe, %eax \n" 1.810 + "imul %eax, %esi \n" 1.811 + "xorl $0x1fffe, %eax \n" 1.812 + "imul %eax, %ecx \n" 1.813 + "addl %esi, %ecx \n" 1.814 + "shrl $17, %ecx \n" 1.815 + "paddsw 4096(%edi,%ecx,8),%mm0\n" 1.816 + 1.817 + "mov %ebx,%eax\n" 1.818 + "sar $0x10,%eax\n" 1.819 + "movzbl (%edx,%eax,1),%ecx\n" 1.820 + "movzbl 1(%edx,%eax,1),%esi\n" 1.821 + "mov %ebx,%eax\n" 1.822 + "add 0x38(%esp),%ebx\n" 1.823 + "andl $0xffff, %eax \n" 1.824 + "imul %eax, %esi \n" 1.825 + "xorl $0xffff, %eax \n" 1.826 + "imul %eax, %ecx \n" 1.827 + "addl %esi, %ecx \n" 1.828 + "shrl $16, %ecx \n" 1.829 + "movq (%edi,%ecx,8),%mm1\n" 1.830 + 1.831 + "cmp 0x34(%esp), %ebx\n" 1.832 + "jge 2f\n" 1.833 + 1.834 + "mov %ebx,%eax\n" 1.835 + "sar $0x10,%eax\n" 1.836 + "movzbl (%edx,%eax,1),%ecx\n" 1.837 + "movzbl 1(%edx,%eax,1),%esi\n" 1.838 + "mov %ebx,%eax\n" 1.839 + "add 0x38(%esp),%ebx\n" 1.840 + "andl $0xffff, %eax \n" 1.841 + "imul %eax, %esi \n" 1.842 + "xorl $0xffff, %eax \n" 1.843 + "imul %eax, %ecx \n" 1.844 + "addl %esi, %ecx \n" 1.845 + "shrl $16, %ecx \n" 1.846 + "movq (%edi,%ecx,8),%mm2\n" 1.847 + 1.848 + "paddsw %mm0,%mm1\n" 1.849 + "paddsw %mm0,%mm2\n" 1.850 + "psraw $0x6,%mm1\n" 1.851 + "psraw $0x6,%mm2\n" 1.852 + "packuswb %mm2,%mm1\n" 1.853 + "movntq %mm1,0x0(%ebp)\n" 1.854 + "add $0x8,%ebp\n" 1.855 + 1.856 +"1:" 1.857 + "cmp %ebx, 0x34(%esp)\n" 1.858 + "jg 0b\n" 1.859 + "popa\n" 1.860 + "ret\n" 1.861 + 1.862 +"2:" 1.863 + "paddsw %mm0, %mm1\n" 1.864 + "psraw $6, %mm1\n" 1.865 + "packuswb %mm1, %mm1\n" 1.866 + "movd %mm1, (%ebp)\n" 1.867 + "popa\n" 1.868 + "ret\n" 1.869 +#if !defined(XP_MACOSX) 1.870 + ".previous\n" 1.871 +#endif 1.872 +); 1.873 + 1.874 + 1.875 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.876 + const uint8* u_buf, 1.877 + const uint8* v_buf, 1.878 + uint8* rgb_buf, 1.879 + int width, 1.880 + int source_dx) 1.881 +{ 1.882 + if (mozilla::supports_sse()) { 1.883 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.884 + source_dx, &kCoefficientsRgbY[0][0]); 1.885 + return; 1.886 + } 1.887 + 1.888 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.889 +} 1.890 +#else 1.891 +void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.892 + const uint8* u_buf, 1.893 + const uint8* v_buf, 1.894 + uint8* rgb_buf, 1.895 + int width) { 1.896 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.897 +} 1.898 + 1.899 +void ScaleYUVToRGB32Row(const uint8* y_buf, 1.900 + const uint8* u_buf, 1.901 + const uint8* v_buf, 1.902 + uint8* rgb_buf, 1.903 + int width, 1.904 + int source_dx) { 1.905 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.906 +} 1.907 + 1.908 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.909 + const uint8* u_buf, 1.910 + const uint8* v_buf, 1.911 + uint8* rgb_buf, 1.912 + int width, 1.913 + int source_dx) { 1.914 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.915 +} 1.916 +#endif 1.917 + 1.918 +}