gfx/ycbcr/yuv_row_win.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/ycbcr/yuv_row_win.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,498 @@
     1.4 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
     1.5 +// Use of this source code is governed by a BSD-style license that can be
     1.6 +// found in the LICENSE file.
     1.7 +
     1.8 +#include "yuv_row.h"
     1.9 +#include "mozilla/SSE.h"
    1.10 +
    1.11 +#define kCoefficientsRgbU kCoefficientsRgbY + 2048  // used below only inside __asm address expressions, where it adds a displacement of 2048 (= 256 entries * 8 bytes, matching the `8 * index` addressing); unparenthesized, so not safe in general C++ expressions
    1.12 +#define kCoefficientsRgbV kCoefficientsRgbY + 4096  // same idea, displacement 4096; presumably the V table placed right after the U table - confirm against the table definition
    1.13 +
    1.14 +extern "C" {
    1.15 +
    1.16 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    1.17 +__declspec(naked)  // Table-driven YUV -> 32-bit pixel row conversion in MMX registers, two pixels per loop pass. Naked: the asm saves/restores registers and returns by itself.
    1.18 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples: one byte read per output pixel
    1.19 +                                  const uint8* u_buf,  // U samples: one byte read per pair of output pixels
    1.20 +                                  const uint8* v_buf,  // V samples: one byte read per pair of output pixels
    1.21 +                                  uint8* rgb_buf,  // destination: 4 bytes written per output pixel
    1.22 +                                  int width) {  // number of output pixels
    1.23 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued in this routine; presumably the caller clears MMX state - confirm
    1.24 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
    1.25 +    mov       edx, [esp + 32 + 4]   // edx = y_buf (the +4 skips the return address)
    1.26 +    mov       edi, [esp + 32 + 8]   // edi = u_buf
    1.27 +    mov       esi, [esp + 32 + 12]  // esi = v_buf
    1.28 +    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
    1.29 +    mov       ecx, [esp + 32 + 20]  // ecx = width (remaining pixel count)
    1.30 +    jmp       convertend  // enter at the loop test so that width < 2 skips the two-pixel body
    1.31 +
    1.32 + convertloop :
    1.33 +    movzx     eax, byte ptr [edi]  // eax = current U sample
    1.34 +    add       edi, 1  // one U byte consumed per pixel pair
    1.35 +    movzx     ebx, byte ptr [esi]  // ebx = current V sample
    1.36 +    add       esi, 1  // one V byte consumed per pixel pair
    1.37 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit lanes)
    1.38 +    movzx     eax, byte ptr [edx]  // eax = first Y sample of the pair
    1.39 +    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 += V table entry (signed saturating add per lane): chroma term shared by both pixels
    1.40 +    movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample of the pair
    1.41 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for the first pixel
    1.42 +    add       edx, 2  // two Y bytes consumed
    1.43 +    movq      mm2, [kCoefficientsRgbY + 8 * ebx]  // mm2 = Y table entry for the second pixel
    1.44 +    paddsw    mm1, mm0  // first pixel = luma + chroma
    1.45 +    paddsw    mm2, mm0  // second pixel = luma + chroma
    1.46 +    psraw     mm1, 6  // arithmetic >> 6 in each 16-bit lane
    1.47 +    psraw     mm2, 6  // same scaling for the second pixel
    1.48 +    packuswb  mm1, mm2  // saturate the 8 words to unsigned bytes: low 4 bytes = first pixel, high 4 bytes = second pixel
    1.49 +    movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
    1.50 +    add       ebp, 8  // advance the destination by two pixels
    1.51 + convertend :
    1.52 +    sub       ecx, 2  // reserve two more pixels
    1.53 +    jns       convertloop  // loop while the remaining count is still >= 0
    1.54 +
    1.55 +    and       ecx, 1  // odd number of pixels? ecx is -1 (one pixel left) or -2 (none); bit 0 distinguishes them
    1.56 +    jz        convertdone  // even width: nothing left to do
    1.57 +
    1.58 +    movzx     eax, byte ptr [edi]  // tail: U sample for the single remaining pixel
    1.59 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U table entry
    1.60 +    movzx     eax, byte ptr [esi]  // V sample
    1.61 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = combined chroma term
    1.62 +    movzx     eax, byte ptr [edx]  // Y sample
    1.63 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry
    1.64 +    paddsw    mm1, mm0  // pixel = luma + chroma
    1.65 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
    1.66 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
    1.67 +    movd      [ebp], mm1  // store only the low 4 bytes (one pixel)
    1.68 + convertdone :
    1.69 +
    1.70 +    popad  // restore the registers saved by pushad
    1.71 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
    1.72 +  }
    1.73 +}
    1.74 +
    1.75 +__declspec(naked)  // Same table-driven conversion as the loop above, but every source pointer advances by a caller-supplied step. Naked: the asm saves/restores registers and returns by itself.
    1.76 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples: advanced by step after every pixel
    1.77 +                              const uint8* u_buf,  // U samples: advanced by step after every pixel pair
    1.78 +                              const uint8* v_buf,  // V samples: advanced by step after every pixel pair
    1.79 +                              uint8* rgb_buf,  // destination: 4 bytes written per output pixel
    1.80 +                              int width,  // number of output pixels
    1.81 +                              int step) {  // byte increment applied to the source pointers
    1.82 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued in this routine; presumably the caller clears MMX state - confirm
    1.83 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
    1.84 +    mov       edx, [esp + 32 + 4]   // edx = y_buf (the +4 skips the return address)
    1.85 +    mov       edi, [esp + 32 + 8]   // edi = u_buf
    1.86 +    mov       esi, [esp + 32 + 12]  // esi = v_buf
    1.87 +    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
    1.88 +    mov       ecx, [esp + 32 + 20]  // ecx = width (remaining pixel count)
    1.89 +    mov       ebx, [esp + 32 + 24]  // ebx = step, kept in a register for the whole loop
    1.90 +    jmp       wend  // enter at the loop test so that width < 2 skips the two-pixel body
    1.91 +
    1.92 + wloop :
    1.93 +    movzx     eax, byte ptr [edi]  // eax = current U sample
    1.94 +    add       edi, ebx  // U pointer += step (once per pixel pair)
    1.95 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit lanes)
    1.96 +    movzx     eax, byte ptr [esi]  // eax = current V sample
    1.97 +    add       esi, ebx  // V pointer += step (once per pixel pair)
    1.98 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V table entry (signed saturating add per lane): shared chroma term
    1.99 +    movzx     eax, byte ptr [edx]  // eax = first Y sample
   1.100 +    add       edx, ebx  // Y pointer += step
   1.101 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for the first pixel
   1.102 +    movzx     eax, byte ptr [edx]  // eax = second Y sample
   1.103 +    add       edx, ebx  // Y pointer += step again (Y advances twice per pair)
   1.104 +    movq      mm2, [kCoefficientsRgbY + 8 * eax]  // mm2 = Y table entry for the second pixel
   1.105 +    paddsw    mm1, mm0  // first pixel = luma + chroma
   1.106 +    paddsw    mm2, mm0  // second pixel = luma + chroma
   1.107 +    psraw     mm1, 6  // arithmetic >> 6 in each 16-bit lane
   1.108 +    psraw     mm2, 6  // same scaling for the second pixel
   1.109 +    packuswb  mm1, mm2  // saturate the 8 words to unsigned bytes: low 4 bytes = first pixel, high 4 bytes = second pixel
   1.110 +    movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
   1.111 +    add       ebp, 8  // advance the destination by two pixels
   1.112 + wend :
   1.113 +    sub       ecx, 2  // reserve two more pixels
   1.114 +    jns       wloop  // loop while the remaining count is still >= 0
   1.115 +
   1.116 +    and       ecx, 1  // odd number of pixels? ecx is -1 (one pixel left) or -2 (none); bit 0 distinguishes them
   1.117 +    jz        wdone  // even width: nothing left to do
   1.118 +
   1.119 +    movzx     eax, byte ptr [edi]  // tail: U sample for the single remaining pixel
   1.120 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U table entry
   1.121 +    movzx     eax, byte ptr [esi]  // V sample
   1.122 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = combined chroma term
   1.123 +    movzx     eax, byte ptr [edx]  // Y sample
   1.124 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry
   1.125 +    paddsw    mm1, mm0  // pixel = luma + chroma
   1.126 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
   1.127 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.128 +    movd      [ebp], mm1  // store only the low 4 bytes (one pixel)
   1.129 + wdone :
   1.130 +
   1.131 +    popad  // restore the registers saved by pushad
   1.132 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
   1.133 +  }
   1.134 +}
   1.135 +
   1.136 +__declspec(naked)  // Table-driven conversion with independent strides for luma and chroma sources. Naked: the asm saves/restores registers and returns by itself.
   1.137 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples: advanced by ystep after every pixel
   1.138 +                                    const uint8* u_buf,  // U samples: advanced by uvstep after every pixel pair
   1.139 +                                    const uint8* v_buf,  // V samples: advanced by uvstep after every pixel pair
   1.140 +                                    uint8* rgb_buf,  // destination: 4 bytes written per output pixel
   1.141 +                                    int width,  // number of output pixels
   1.142 +                                    int ystep,  // byte increment for y_buf
   1.143 +                                    int uvstep) {  // byte increment for u_buf and v_buf
   1.144 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued in this routine; presumably the caller clears MMX state - confirm
   1.145 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
   1.146 +    mov       edx, [esp + 32 + 4]   // edx = y_buf (the +4 skips the return address)
   1.147 +    mov       edi, [esp + 32 + 8]   // edi = u_buf
   1.148 +    mov       esi, [esp + 32 + 12]  // esi = v_buf
   1.149 +    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
   1.150 +    mov       ecx, [esp + 32 + 20]  // ecx = width (remaining pixel count)
   1.151 +    jmp       wend  // enter at the loop test so that width < 2 skips the two-pixel body
   1.152 +
   1.153 + wloop :
   1.154 +    movzx     eax, byte ptr [edi]  // eax = current U sample
   1.155 +    mov       ebx, [esp + 32 + 28]  // ebx = uvstep (reloaded every pass because ebx is shared with ystep)
   1.156 +    add       edi, ebx  // U pointer += uvstep
   1.157 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit lanes)
   1.158 +    movzx     eax, byte ptr [esi]  // eax = current V sample
   1.159 +    add       esi, ebx  // V pointer += uvstep
   1.160 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V table entry (signed saturating add per lane): shared chroma term
   1.161 +    movzx     eax, byte ptr [edx]  // eax = first Y sample
   1.162 +    mov       ebx, [esp + 32 + 24]  // ebx = ystep
   1.163 +    add       edx, ebx  // Y pointer += ystep
   1.164 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for the first pixel
   1.165 +    movzx     eax, byte ptr [edx]  // eax = second Y sample
   1.166 +    add       edx, ebx  // Y pointer += ystep again (Y advances twice per pair)
   1.167 +    movq      mm2, [kCoefficientsRgbY + 8 * eax]  // mm2 = Y table entry for the second pixel
   1.168 +    paddsw    mm1, mm0  // first pixel = luma + chroma
   1.169 +    paddsw    mm2, mm0  // second pixel = luma + chroma
   1.170 +    psraw     mm1, 6  // arithmetic >> 6 in each 16-bit lane
   1.171 +    psraw     mm2, 6  // same scaling for the second pixel
   1.172 +    packuswb  mm1, mm2  // saturate the 8 words to unsigned bytes: low 4 bytes = first pixel, high 4 bytes = second pixel
   1.173 +    movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
   1.174 +    add       ebp, 8  // advance the destination by two pixels
   1.175 + wend :
   1.176 +    sub       ecx, 2  // reserve two more pixels
   1.177 +    jns       wloop  // loop while the remaining count is still >= 0
   1.178 +
   1.179 +    and       ecx, 1  // odd number of pixels? ecx is -1 (one pixel left) or -2 (none); bit 0 distinguishes them
   1.180 +    jz        wdone  // even width: nothing left to do
   1.181 +
   1.182 +    movzx     eax, byte ptr [edi]  // tail: U sample for the single remaining pixel
   1.183 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U table entry
   1.184 +    movzx     eax, byte ptr [esi]  // V sample
   1.185 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = combined chroma term
   1.186 +    movzx     eax, byte ptr [edx]  // Y sample
   1.187 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry
   1.188 +    paddsw    mm1, mm0  // pixel = luma + chroma
   1.189 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
   1.190 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.191 +    movd      [ebp], mm1  // store only the low 4 bytes (one pixel)
   1.192 + wdone :
   1.193 +
   1.194 +    popad  // restore the registers saved by pushad
   1.195 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
   1.196 +  }
   1.197 +}
   1.198 +
   1.199 +__declspec(naked)  // Table-driven conversion that writes every converted source pixel twice (horizontal doubling). Naked: the asm saves/restores registers and returns by itself.
   1.200 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples: two bytes read per loop pass
   1.201 +                             const uint8* u_buf,  // U samples: one byte read per loop pass
   1.202 +                             const uint8* v_buf,  // V samples: one byte read per loop pass
   1.203 +                             uint8* rgb_buf,  // destination: 16 bytes (4 pixels) written per loop pass
   1.204 +                             int width) {  // number of OUTPUT pixels (the loop counts down by 4 per pass)
   1.205 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued in this routine; presumably the caller clears MMX state - confirm
   1.206 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
   1.207 +    mov       edx, [esp + 32 + 4]   // edx = y_buf (the +4 skips the return address)
   1.208 +    mov       edi, [esp + 32 + 8]   // edi = u_buf
   1.209 +    mov       esi, [esp + 32 + 12]  // esi = v_buf
   1.210 +    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
   1.211 +    mov       ecx, [esp + 32 + 20]  // ecx = width (remaining output pixel count)
   1.212 +    jmp       wend  // enter at the loop test so that width < 4 skips the main body
   1.213 +
   1.214 + wloop :
   1.215 +    movzx     eax, byte ptr [edi]  // eax = current U sample
   1.216 +    add       edi, 1  // one U byte consumed per pass
   1.217 +    movzx     ebx, byte ptr [esi]  // ebx = current V sample
   1.218 +    add       esi, 1  // one V byte consumed per pass
   1.219 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit lanes)
   1.220 +    movzx     eax, byte ptr [edx]  // eax = first Y sample
   1.221 +    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 += V table entry (signed saturating add per lane): shared chroma term
   1.222 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for the first source pixel
   1.223 +    paddsw    mm1, mm0  // first source pixel = luma + chroma
   1.224 +    psraw     mm1, 6  // arithmetic >> 6 in each 16-bit lane
   1.225 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.226 +    punpckldq mm1, mm1  // copy the low dword into the high dword: the same pixel twice
   1.227 +    movntq    [ebp], mm1  // non-temporal store of the doubled first pixel (8 bytes)
   1.228 +
   1.229 +    movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample
   1.230 +    add       edx, 2  // two Y bytes consumed per pass
   1.231 +    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // add the second pixel's Y entry into the chroma term in place (mm0 is recomputed before its next use)
   1.232 +    psraw     mm0, 6  // arithmetic >> 6 in each 16-bit lane
   1.233 +    packuswb  mm0, mm0  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.234 +    punpckldq mm0, mm0  // duplicate the second pixel into both dwords
   1.235 +    movntq    [ebp+8], mm0  // non-temporal store of the doubled second pixel (next 8 bytes)
   1.236 +    add       ebp, 16  // advance the destination by four pixels
   1.237 + wend :
   1.238 +    sub       ecx, 4  // reserve four more output pixels
   1.239 +    jns       wloop  // loop while the remaining count is still >= 0
   1.240 +
   1.241 +    add       ecx, 4  // undo the last subtraction: ecx = leftover output pixels, fewer than 4
   1.242 +    jz        wdone  // nothing left over
   1.243 +
   1.244 +    movzx     eax, byte ptr [edi]  // tail: convert one source pixel from the current U ...
   1.245 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U table entry
   1.246 +    movzx     eax, byte ptr [esi]  // ... V ...
   1.247 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = combined chroma term
   1.248 +    movzx     eax, byte ptr [edx]  // ... and Y samples
   1.249 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry
   1.250 +    paddsw    mm1, mm0  // pixel = luma + chroma
   1.251 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
   1.252 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.253 +    jmp       wend1  // enter the replication loop at its test
   1.254 +
   1.255 + wloop1 :
   1.256 +    movd      [ebp], mm1  // write that one converted pixel (4 bytes)
   1.257 +    add       ebp, 4  // advance the destination by one pixel
   1.258 + wend1 :
   1.259 +    sub       ecx, 1  // one leftover pixel accounted for
   1.260 +    jns       wloop1  // repeat once per leftover output pixel; every leftover pixel is a copy of the same source pixel
   1.261 + wdone :
   1.262 +    popad  // restore the registers saved by pushad
   1.263 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
   1.264 +  }
   1.265 +}
   1.266 +
   1.267 +// This version does general purpose scaling by any amount, up or down.
   1.268 +// The only thing it cannot do is rotation by 90 or 270.
   1.269 +// For performance the chroma is under-sampled, reducing cost of a 3x
   1.270 +// 1080p scale from 8.4 ms to 5.4 ms.
   1.271 +__declspec(naked)  // Point-sampled scaling conversion: source position x is stepped in 16.16 fixed point. Naked: the asm saves/restores registers and returns by itself.
   1.272 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples, indexed by x >> 16
   1.273 +                            const uint8* u_buf,  // U samples, indexed by x >> 17
   1.274 +                            const uint8* v_buf,  // V samples, indexed by x >> 17
   1.275 +                            uint8* rgb_buf,  // destination: 4 bytes written per output pixel
   1.276 +                            int width,  // number of output pixels
   1.277 +                            int source_dx) {  // increment added to x per output pixel (16.16 fixed point, given the >> 16 indexing)
   1.278 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued in this routine; presumably the caller clears MMX state - confirm
   1.279 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
   1.280 +    mov       edx, [esp + 32 + 4]   // edx = y_buf (the +4 skips the return address)
   1.281 +    mov       edi, [esp + 32 + 8]   // edi = u_buf
   1.282 +    mov       esi, [esp + 32 + 12]  // esi = v_buf
   1.283 +    mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
   1.284 +    mov       ecx, [esp + 32 + 20]  // ecx = width (remaining pixel count)
   1.285 +    xor       ebx, ebx              // ebx = x = 0, the fixed-point source position
   1.286 +    jmp       scaleend  // enter at the loop test so that width < 2 skips the two-pixel body
   1.287 +
   1.288 + scaleloop :
   1.289 +    mov       eax, ebx  // eax = x
   1.290 +    sar       eax, 17  // chroma index = x >> 17 (half of the luma index)
   1.291 +    movzx     eax, byte ptr [edi + eax]  // eax = U sample at that index
   1.292 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit lanes)
   1.293 +    mov       eax, ebx  // eax = x again
   1.294 +    sar       eax, 17  // same chroma index for V
   1.295 +    movzx     eax, byte ptr [esi + eax]  // eax = V sample
   1.296 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V table entry (signed saturating add per lane): chroma shared by both pixels of the pair
   1.297 +    mov       eax, ebx  // eax = x for the first pixel
   1.298 +    add       ebx, [esp + 32 + 24]  // x += source_dx
   1.299 +    sar       eax, 16  // luma index = x >> 16
   1.300 +    movzx     eax, byte ptr [edx + eax]  // eax = first Y sample
   1.301 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for the first pixel
   1.302 +    mov       eax, ebx  // eax = x for the second pixel
   1.303 +    add       ebx, [esp + 32 + 24]  // x += source_dx
   1.304 +    sar       eax, 16  // luma index = x >> 16
   1.305 +    movzx     eax, byte ptr [edx + eax]  // eax = second Y sample
   1.306 +    movq      mm2, [kCoefficientsRgbY + 8 * eax]  // mm2 = Y table entry for the second pixel
   1.307 +    paddsw    mm1, mm0  // first pixel = luma + chroma
   1.308 +    paddsw    mm2, mm0  // second pixel = luma + chroma
   1.309 +    psraw     mm1, 6  // arithmetic >> 6 in each 16-bit lane
   1.310 +    psraw     mm2, 6  // same scaling for the second pixel
   1.311 +    packuswb  mm1, mm2  // saturate the 8 words to unsigned bytes: low 4 bytes = first pixel, high 4 bytes = second pixel
   1.312 +    movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
   1.313 +    add       ebp, 8  // advance the destination by two pixels
   1.314 + scaleend :
   1.315 +    sub       ecx, 2  // reserve two more pixels
   1.316 +    jns       scaleloop  // loop while the remaining count is still >= 0
   1.317 +
   1.318 +    and       ecx, 1  // odd number of pixels? ecx is -1 (one pixel left) or -2 (none); bit 0 distinguishes them
   1.319 +    jz        scaledone  // even width: nothing left to do
   1.320 +
   1.321 +    mov       eax, ebx  // tail: eax = x
   1.322 +    sar       eax, 17  // chroma index = x >> 17
   1.323 +    movzx     eax, byte ptr [edi + eax]  // U sample
   1.324 +    movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = U table entry
   1.325 +    mov       eax, ebx  // eax = x
   1.326 +    sar       eax, 17  // chroma index for V
   1.327 +    movzx     eax, byte ptr [esi + eax]  // V sample
   1.328 +    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = combined chroma term
   1.329 +    mov       eax, ebx  // eax = x
   1.330 +    sar       eax, 16  // luma index = x >> 16
   1.331 +    movzx     eax, byte ptr [edx + eax]  // Y sample
   1.332 +    movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry
   1.333 +    paddsw    mm1, mm0  // pixel = luma + chroma
   1.334 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
   1.335 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.336 +    movd      [ebp], mm1  // store only the low 4 bytes (one pixel)
   1.337 +
   1.338 + scaledone :
   1.339 +    popad  // restore the registers saved by pushad
   1.340 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
   1.341 +  }
   1.342 +}
   1.343 +
   1.344 +__declspec(naked)  // Scaling conversion that linearly interpolates each Y, U and V sample between neighbouring source bytes before the table lookup. Naked: the asm saves/restores registers and returns by itself.
   1.345 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples: bytes at (x >> 16) and (x >> 16) + 1 are read
   1.346 +                                  const uint8* u_buf,  // U samples: bytes at (x >> 17) and (x >> 17) + 1 are read
   1.347 +                                  const uint8* v_buf,  // V samples: bytes at (x >> 17) and (x >> 17) + 1 are read
   1.348 +                                  uint8* rgb_buf,  // destination: 4 bytes written per output pixel
   1.349 +                                  int width,  // number of output pixels; its stack slot is overwritten below with width * source_dx
   1.350 +                                  int source_dx) {  // increment added to x per output pixel (16.16 fixed point, given the >> 16 indexing)
   1.351 +  __asm {  // NOTE(review): MMX registers are used and no emms is issued here; presumably the caller clears MMX state - confirm. Also, every lookup reads the byte at index + 1, so the planes presumably need a readable byte past the last sampled index - confirm with callers
   1.352 +    pushad  // save all 8 general registers (32 bytes); this is the "32 +" in the argument offsets below
   1.353 +    mov       edx, [esp + 32 + 4]  // edx = y_buf (the +4 skips the return address)
   1.354 +    mov       edi, [esp + 32 + 8]  // edi = u_buf
   1.355 +                // [esp + 32 + 12] is v_buf: it is not kept in a register; esi is scratch and gets v_buf reloaded inside the loop
   1.356 +    mov       ebp, [esp + 32 + 16] // ebp = rgb_buf
   1.357 +    mov       ecx, [esp + 32 + 20] // ecx = width
   1.358 +    imul      ecx, [esp + 32 + 24] // ecx = width * source_dx
   1.359 +    mov       [esp + 32 + 20], ecx // source_width = width * source_dx, stored over the width argument slot; it is the end value for x
   1.360 +    mov       ecx, [esp + 32 + 24] // ecx = source_dx
   1.361 +    xor       ebx, ebx             // x = 0
   1.362 +    cmp       ecx, 0x20000  // is source_dx below 2.0 in 16.16?
   1.363 +    jl        lscaleend  // yes: keep x = 0 and go to the loop test
   1.364 +    mov       ebx, 0x8000          // x = 0.5 for 1/2 or less (i.e. source_dx >= 2.0)
   1.365 +    jmp       lscaleend  // go to the loop test
   1.366 +lscaleloop:
   1.367 +    mov       eax, ebx  // eax = x
   1.368 +    sar       eax, 0x11  // chroma index = x >> 17
   1.369 +
   1.370 +    movzx     ecx, byte ptr [edi + eax]  // ecx = U sample at the index
   1.371 +    movzx     esi, byte ptr [edi + eax + 1]  // esi = next U sample
   1.372 +    mov       eax, ebx  // eax = x
   1.373 +    and       eax, 0x1fffe  // 17-bit fraction of the chroma position, low bit cleared
   1.374 +    imul      esi, eax  // next sample * fraction
   1.375 +    xor       eax, 0x1fffe  // eax = 0x1fffe - fraction (complementary weight)
   1.376 +    imul      ecx, eax  // current sample * complementary weight
   1.377 +    add       ecx, esi  // weighted sum
   1.378 +    shr       ecx, 17  // back to an 8-bit interpolated U value
   1.379 +    movq      mm0, [kCoefficientsRgbU + 8 * ecx]  // mm0 = 8-byte U table entry (four 16-bit lanes)
   1.380 +
   1.381 +    mov       esi, [esp + 32 + 12]  // esi = v_buf (reloaded because esi was used as scratch above)
   1.382 +    mov       eax, ebx  // eax = x
   1.383 +    sar       eax, 0x11  // chroma index = x >> 17
   1.384 +
   1.385 +    movzx     ecx, byte ptr [esi + eax]  // ecx = V sample at the index
   1.386 +    movzx     esi, byte ptr [esi + eax + 1]  // esi = next V sample (overwrites the v_buf pointer)
   1.387 +    mov       eax, ebx  // eax = x
   1.388 +    and       eax, 0x1fffe  // 17-bit fraction of the chroma position, low bit cleared
   1.389 +    imul      esi, eax  // next sample * fraction
   1.390 +    xor       eax, 0x1fffe  // eax = 0x1fffe - fraction (complementary weight)
   1.391 +    imul      ecx, eax  // current sample * complementary weight
   1.392 +    add       ecx, esi  // weighted sum
   1.393 +    shr       ecx, 17  // back to an 8-bit interpolated V value
   1.394 +    paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 += V table entry (signed saturating add per lane): chroma shared by both pixels of the pair
   1.395 +
   1.396 +    mov       eax, ebx  // eax = x for the first pixel
   1.397 +    sar       eax, 0x10  // luma index = x >> 16
   1.398 +    movzx     ecx, byte ptr [edx + eax]  // ecx = Y sample at the index
   1.399 +    movzx     esi, byte ptr [1 + edx + eax]  // esi = next Y sample
   1.400 +    mov       eax, ebx  // eax = x (copied before x is advanced)
   1.401 +    add       ebx, [esp + 32 + 24]  // x += source_dx
   1.402 +    and       eax, 0xffff  // 16-bit fraction of the luma position
   1.403 +    imul      esi, eax  // next sample * fraction
   1.404 +    xor       eax, 0xffff  // eax = 0xffff - fraction (complementary weight)
   1.405 +    imul      ecx, eax  // current sample * complementary weight
   1.406 +    add       ecx, esi  // weighted sum
   1.407 +    shr       ecx, 16  // back to an 8-bit interpolated Y value
   1.408 +    movq      mm1, [kCoefficientsRgbY + 8 * ecx]  // mm1 = Y table entry for the first pixel
   1.409 +
   1.410 +    cmp       ebx, [esp + 32 + 20]  // has x reached source_width after the first pixel?
   1.411 +    jge       lscalelastpixel  // yes: emit just this one pixel and return
   1.412 +
   1.413 +    mov       eax, ebx  // eax = x for the second pixel
   1.414 +    sar       eax, 0x10  // luma index = x >> 16
   1.415 +    movzx     ecx, byte ptr [edx + eax]  // ecx = Y sample at the index
   1.416 +    movzx     esi, byte ptr [edx + eax + 1]  // esi = next Y sample
   1.417 +    mov       eax, ebx  // eax = x (copied before x is advanced)
   1.418 +    add       ebx, [esp + 32 + 24]  // x += source_dx
   1.419 +    and       eax, 0xffff  // 16-bit fraction of the luma position
   1.420 +    imul      esi, eax  // next sample * fraction
   1.421 +    xor       eax, 0xffff  // eax = 0xffff - fraction (complementary weight)
   1.422 +    imul      ecx, eax  // current sample * complementary weight
   1.423 +    add       ecx, esi  // weighted sum
   1.424 +    shr       ecx, 16  // back to an 8-bit interpolated Y value
   1.425 +    movq      mm2, [kCoefficientsRgbY + 8 * ecx]  // mm2 = Y table entry for the second pixel
   1.426 +
   1.427 +    paddsw    mm1, mm0  // first pixel = luma + chroma
   1.428 +    paddsw    mm2, mm0  // second pixel = luma + chroma
   1.429 +    psraw     mm1, 0x6  // arithmetic >> 6 in each 16-bit lane
   1.430 +    psraw     mm2, 0x6  // same scaling for the second pixel
   1.431 +    packuswb  mm1, mm2  // saturate the 8 words to unsigned bytes: low 4 bytes = first pixel, high 4 bytes = second pixel
   1.432 +    movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
   1.433 +    add       ebp, 0x8  // advance the destination by two pixels
   1.434 +
   1.435 +lscaleend:
   1.436 +    cmp       ebx, [esp + 32 + 20]  // compare x with source_width
   1.437 +    jl        lscaleloop  // keep going while x < source_width
   1.438 +    popad  // restore the registers saved by pushad
   1.439 +    ret  // explicit return, since a naked function has no compiler-generated epilogue
   1.440 +
   1.441 +lscalelastpixel:
   1.442 +    paddsw    mm1, mm0  // single trailing pixel = luma + chroma
   1.443 +    psraw     mm1, 6  // arithmetic >> 6 in each lane
   1.444 +    packuswb  mm1, mm1  // saturate to bytes; the pixel ends up in the low 4 bytes
   1.445 +    movd      [ebp], mm1  // store only the low 4 bytes (one pixel)
   1.446 +    popad  // restore the registers saved by pushad
   1.447 +    ret  // second exit point of the naked function
   1.448 +  };
   1.449 +}
   1.450 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   1.451 +
   1.452 +void FastConvertYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: uses the _SSE routine when it is compiled in and mozilla::supports_sse() is true, otherwise the _C routine.
   1.453 +                              const uint8* u_buf,  // forwarded unchanged
   1.454 +                              const uint8* v_buf,  // forwarded unchanged
   1.455 +                              uint8* rgb_buf,  // forwarded unchanged
   1.456 +                              int width) {  // forwarded unchanged
   1.457 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   1.458 +  if (mozilla::supports_sse()) {  // the asm path exists only under the same #if that guards its definition
   1.459 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
   1.460 +    return;  // skip the C fallback
   1.461 +  }
   1.462 +#endif
   1.463 +
   1.464 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // fallback declared elsewhere; it takes one extra argument (passed as 1) whose meaning is not visible in this file
   1.465 +}
   1.466 +
   1.467 +void ScaleYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: uses the _SSE routine when it is compiled in and mozilla::supports_sse() is true, otherwise the _C routine.
   1.468 +                        const uint8* u_buf,  // forwarded unchanged
   1.469 +                        const uint8* v_buf,  // forwarded unchanged
   1.470 +                        uint8* rgb_buf,  // forwarded unchanged
   1.471 +                        int width,  // forwarded unchanged
   1.472 +                        int source_dx) {  // forwarded unchanged
   1.473 +
   1.474 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   1.475 +  if (mozilla::supports_sse()) {  // the asm path exists only under the same #if that guards its definition
   1.476 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   1.477 +    return;  // skip the C fallback
   1.478 +  }
   1.479 +#endif
   1.480 +
   1.481 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // fallback declared elsewhere, same argument list
   1.482 +}
   1.483 +
   1.484 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,  // Dispatcher: uses the _SSE routine when it is compiled in and mozilla::supports_sse() is true, otherwise the _C routine.
   1.485 +                              const uint8* u_buf,  // forwarded unchanged
   1.486 +                              const uint8* v_buf,  // forwarded unchanged
   1.487 +                              uint8* rgb_buf,  // forwarded unchanged
   1.488 +                              int width,  // forwarded unchanged
   1.489 +                              int source_dx) {  // forwarded unchanged
   1.490 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   1.491 +  if (mozilla::supports_sse()) {  // the asm path exists only under the same #if that guards its definition
   1.492 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
   1.493 +                                 source_dx);
   1.494 +    return;  // skip the C fallback
   1.495 +  }
   1.496 +#endif
   1.497 +
   1.498 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // fallback declared elsewhere, same argument list
   1.499 +}
   1.500 +
   1.501 +} // extern "C"

mercurial