gfx/ycbcr/yuv_row_win.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
     2 // Use of this source code is governed by a BSD-style license that can be
     3 // found in the LICENSE file.
     5 #include "yuv_row.h"
     6 #include "mozilla/SSE.h"
     // Addresses of the U and V lookup tables, expressed relative to the Y table.
     // In the inline assembly below these names appear only inside address
     // expressions of the form [kCoefficientsRgbX + 8 * index], where index is a
     // zero-extended byte; 256 entries * 8 bytes = 2048 bytes per table, which
     // matches the 2048 / 4096 offsets used here.
     // NOTE: the expansions are not parenthesized, so they are only safe in
     // additive contexts such as those address expressions.
     8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
     9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
    11 extern "C" {
    13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    // Converts one row of Y/U/V samples to 4-byte-per-pixel output at 1:1 scale
    // using MMX table lookups.
    //
    // Each loop iteration consumes one U byte, one V byte and two Y bytes and
    // writes two pixels (8 bytes). An odd trailing pixel is handled separately
    // after the loop.
    //
    //   y_buf   - source luma bytes, one per output pixel.
    //   u_buf   - source U bytes, one per pair of output pixels.
    //   v_buf   - source V bytes, one per pair of output pixels.
    //   rgb_buf - destination, 4 bytes written per pixel.
    //   width   - number of output pixels.
    //
    // The function is naked: there is no compiler-generated prologue, so the
    // arguments are read straight off the stack. pushad pushes 8 registers
    // (32 bytes), and the return address occupies another 4, hence the
    // [esp + 32 + N] operands.
    // NOTE(review): no emms is executed here, so the MMX state is left active on
    // return; presumably the caller is responsible for clearing it - confirm.
    14 __declspec(naked)
    15 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
    16                                   const uint8* u_buf,
    17                                   const uint8* v_buf,
    18                                   uint8* rgb_buf,
    19                                   int width) {
    20   __asm {
    21     pushad                          // save all general-purpose registers
    22     mov       edx, [esp + 32 + 4]   // Y
    23     mov       edi, [esp + 32 + 8]   // U
    24     mov       esi, [esp + 32 + 12]  // V
    25     mov       ebp, [esp + 32 + 16]  // rgb
    26     mov       ecx, [esp + 32 + 20]  // width
    27     jmp       convertend            // test the count before the first pass
    29  convertloop :
    30     movzx     eax, byte ptr [edi]   // eax = U sample
    31     add       edi, 1
    32     movzx     ebx, byte ptr [esi]   // ebx = V sample
    33     add       esi, 1
    34     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // 8-byte U table entry
    35     movzx     eax, byte ptr [edx]   // eax = first Y sample
    36     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V terms (saturating words)
    37     movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample
    38     movq      mm1, [kCoefficientsRgbY + 8 * eax]
    39     add       edx, 2
    40     movq      mm2, [kCoefficientsRgbY + 8 * ebx]
    41     paddsw    mm1, mm0              // pixel 0 = Y0 term + shared chroma term
    42     paddsw    mm2, mm0              // pixel 1 = Y1 term + shared chroma term
    43     psraw     mm1, 6                // arithmetic shift each word right by 6
    44     psraw     mm2, 6
    45     packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
    46     movntq    [ebp], mm1            // non-temporal 8-byte store of both pixels
    47     add       ebp, 8
    48  convertend :
    49     sub       ecx, 2                // two pixels per pass
    50     jns       convertloop           // loop while the count did not go negative
           // Here ecx is -1 (one pixel left) or -2 (none left).
    52     and       ecx, 1  // odd number of pixels?
    53     jz        convertdone
           // Tail: convert the single remaining pixel.
    55     movzx     eax, byte ptr [edi]
    56     movq      mm0, [kCoefficientsRgbU + 8 * eax]
    57     movzx     eax, byte ptr [esi]
    58     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    59     movzx     eax, byte ptr [edx]
    60     movq      mm1, [kCoefficientsRgbY + 8 * eax]
    61     paddsw    mm1, mm0
    62     psraw     mm1, 6
    63     packuswb  mm1, mm1
    64     movd      [ebp], mm1            // store only the low 4 bytes (one pixel)
    65  convertdone :
    67     popad                           // restore registers saved by pushad
    68     ret
    69   }
    70 }
    // Converts one row of Y/U/V samples to 4-byte-per-pixel output, advancing
    // the source pointers by a caller-supplied byte step instead of by 1.
    //
    // Per loop iteration: one U and one V byte are read and both pointers move
    // by `step`; two Y bytes are read, the Y pointer moving by `step` after each.
    // Two pixels (8 bytes) are written. An odd trailing pixel is handled after
    // the loop.
    //
    //   y_buf, u_buf, v_buf - source sample pointers.
    //   rgb_buf             - destination, 4 bytes written per pixel.
    //   width               - number of output pixels.
    //   step                - amount added to each source pointer per sample read.
    //
    // Naked function: arguments are read from the stack at [esp + 32 + N], where
    // 32 is the size of the pushad frame and the return address takes 4 bytes.
    // NOTE(review): no emms is executed here; presumably the caller clears the
    // MMX state - confirm.
    72 __declspec(naked)
    73 void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
    74                               const uint8* u_buf,
    75                               const uint8* v_buf,
    76                               uint8* rgb_buf,
    77                               int width,
    78                               int step) {
    79   __asm {
    80     pushad                          // save all general-purpose registers
    81     mov       edx, [esp + 32 + 4]   // Y
    82     mov       edi, [esp + 32 + 8]   // U
    83     mov       esi, [esp + 32 + 12]  // V
    84     mov       ebp, [esp + 32 + 16]  // rgb
    85     mov       ecx, [esp + 32 + 20]  // width
    86     mov       ebx, [esp + 32 + 24]  // step
    87     jmp       wend                  // test the count before the first pass
    89  wloop :
    90     movzx     eax, byte ptr [edi]   // U sample
    91     add       edi, ebx
    92     movq      mm0, [kCoefficientsRgbU + 8 * eax]
    93     movzx     eax, byte ptr [esi]   // V sample
    94     add       esi, ebx
    95     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms (saturating words)
    96     movzx     eax, byte ptr [edx]   // first Y sample
    97     add       edx, ebx
    98     movq      mm1, [kCoefficientsRgbY + 8 * eax]
    99     movzx     eax, byte ptr [edx]   // second Y sample
   100     add       edx, ebx
   101     movq      mm2, [kCoefficientsRgbY + 8 * eax]
   102     paddsw    mm1, mm0              // both pixels share the same chroma term
   103     paddsw    mm2, mm0
   104     psraw     mm1, 6                // arithmetic shift each word right by 6
   105     psraw     mm2, 6
   106     packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
   107     movntq    [ebp], mm1            // non-temporal 8-byte store of both pixels
   108     add       ebp, 8
   109  wend :
   110     sub       ecx, 2                // two pixels per pass
   111     jns       wloop                 // loop while the count did not go negative
           // Here ecx is -1 (one pixel left) or -2 (none left).
   113     and       ecx, 1  // odd number of pixels?
   114     jz        wdone
           // Tail: convert the single remaining pixel.
   116     movzx     eax, byte ptr [edi]
   117     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   118     movzx     eax, byte ptr [esi]
   119     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
   120     movzx     eax, byte ptr [edx]
   121     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   122     paddsw    mm1, mm0
   123     psraw     mm1, 6
   124     packuswb  mm1, mm1
   125     movd      [ebp], mm1            // store only the low 4 bytes (one pixel)
   126  wdone :
   128     popad                           // restore registers saved by pushad
   129     ret
   130   }
   131 }
   // Converts one row of Y/U/V samples to 4-byte-per-pixel output using separate
   // pointer steps for luma and chroma.
   //
   // Per loop iteration: one U and one V byte are read and both pointers move
   // by `uvstep`; two Y bytes are read, the Y pointer moving by `ystep` after
   // each. Two pixels (8 bytes) are written. An odd trailing pixel is handled
   // after the loop.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination, 4 bytes written per pixel.
   //   width               - number of output pixels.
   //   ystep               - amount added to the Y pointer per Y sample read.
   //   uvstep              - amount added to the U and V pointers per pair.
   //
   // Naked function: arguments are read from the stack at [esp + 32 + N], where
   // 32 is the size of the pushad frame and the return address takes 4 bytes.
   // Both steps are reloaded into ebx from the stack inside the loop because the
   // other general-purpose registers are already in use.
   // NOTE(review): no emms is executed here; presumably the caller clears the
   // MMX state - confirm.
   133 __declspec(naked)
   134 void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   135                                     const uint8* u_buf,
   136                                     const uint8* v_buf,
   137                                     uint8* rgb_buf,
   138                                     int width,
   139                                     int ystep,
   140                                     int uvstep) {
   141   __asm {
   142     pushad                          // save all general-purpose registers
   143     mov       edx, [esp + 32 + 4]   // Y
   144     mov       edi, [esp + 32 + 8]   // U
   145     mov       esi, [esp + 32 + 12]  // V
   146     mov       ebp, [esp + 32 + 16]  // rgb
   147     mov       ecx, [esp + 32 + 20]  // width
   148     jmp       wend                  // test the count before the first pass
   150  wloop :
   151     movzx     eax, byte ptr [edi]   // U sample
   152     mov       ebx, [esp + 32 + 28]  // uvstep
   153     add       edi, ebx
   154     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   155     movzx     eax, byte ptr [esi]   // V sample
   156     add       esi, ebx
   157     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms (saturating words)
   158     movzx     eax, byte ptr [edx]   // first Y sample
   159     mov       ebx, [esp + 32 + 24]  // ystep
   160     add       edx, ebx
   161     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   162     movzx     eax, byte ptr [edx]   // second Y sample
   163     add       edx, ebx
   164     movq      mm2, [kCoefficientsRgbY + 8 * eax]
   165     paddsw    mm1, mm0              // both pixels share the same chroma term
   166     paddsw    mm2, mm0
   167     psraw     mm1, 6                // arithmetic shift each word right by 6
   168     psraw     mm2, 6
   169     packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
   170     movntq    [ebp], mm1            // non-temporal 8-byte store of both pixels
   171     add       ebp, 8
   172  wend :
   173     sub       ecx, 2                // two pixels per pass
   174     jns       wloop                 // loop while the count did not go negative
           // Here ecx is -1 (one pixel left) or -2 (none left).
   176     and       ecx, 1  // odd number of pixels?
   177     jz        wdone
           // Tail: convert the single remaining pixel.
   179     movzx     eax, byte ptr [edi]
   180     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   181     movzx     eax, byte ptr [esi]
   182     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
   183     movzx     eax, byte ptr [edx]
   184     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   185     paddsw    mm1, mm0
   186     psraw     mm1, 6
   187     packuswb  mm1, mm1
   188     movd      [ebp], mm1            // store only the low 4 bytes (one pixel)
   189  wdone :
   191     popad                           // restore registers saved by pushad
   192     ret
   193   }
   194 }
   // Converts one row of Y/U/V samples to 4-byte-per-pixel output while
   // doubling it horizontally: every converted pixel is written twice.
   //
   // Per main-loop iteration: one U byte, one V byte and two Y bytes are read
   // and four pixels (16 bytes) are written - the pixel for the first Y sample
   // twice, then the pixel for the second Y sample twice. `width` is therefore
   // counted in output pixels and decremented by 4 per pass.
   //
   // Tail (1 to 3 output pixels left): a single pixel is converted from the
   // current U, V and Y samples and stored once per remaining output pixel.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination, 4 bytes written per pixel.
   //   width               - number of output pixels.
   //
   // Naked function: arguments are read from the stack at [esp + 32 + N], where
   // 32 is the size of the pushad frame and the return address takes 4 bytes.
   // NOTE(review): no emms is executed here; presumably the caller clears the
   // MMX state - confirm.
   196 __declspec(naked)
   197 void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
   198                              const uint8* u_buf,
   199                              const uint8* v_buf,
   200                              uint8* rgb_buf,
   201                              int width) {
   202   __asm {
   203     pushad                          // save all general-purpose registers
   204     mov       edx, [esp + 32 + 4]   // Y
   205     mov       edi, [esp + 32 + 8]   // U
   206     mov       esi, [esp + 32 + 12]  // V
   207     mov       ebp, [esp + 32 + 16]  // rgb
   208     mov       ecx, [esp + 32 + 20]  // width
   209     jmp       wend                  // test the count before the first pass
   211  wloop :
   212     movzx     eax, byte ptr [edi]   // U sample
   213     add       edi, 1
   214     movzx     ebx, byte ptr [esi]   // V sample
   215     add       esi, 1
   216     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   217     movzx     eax, byte ptr [edx]   // first Y sample
   218     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V terms (saturating words)
   219     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   220     paddsw    mm1, mm0
   221     psraw     mm1, 6                // arithmetic shift each word right by 6
   222     packuswb  mm1, mm1              // saturate words to bytes
   223     punpckldq mm1, mm1              // copy the low dword into the high dword
   224     movntq    [ebp], mm1            // store the first pixel twice
   226     movzx     ebx, byte ptr [edx + 1]  // second Y sample
   227     add       edx, 2
   228     paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // reuse mm0: chroma term + Y1 term
   229     psraw     mm0, 6
   230     packuswb  mm0, mm0
   231     punpckldq mm0, mm0              // copy the low dword into the high dword
   232     movntq    [ebp+8], mm0          // store the second pixel twice
   233     add       ebp, 16
   234  wend :
   235     sub       ecx, 4                // four output pixels per pass
   236     jns       wloop                 // loop while the count did not go negative
   238     add       ecx, 4                // ecx = remaining output pixels (0..3)
   239     jz        wdone
           // Tail: convert one pixel from the current samples...
   241     movzx     eax, byte ptr [edi]
   242     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   243     movzx     eax, byte ptr [esi]
   244     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
   245     movzx     eax, byte ptr [edx]
   246     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   247     paddsw    mm1, mm0
   248     psraw     mm1, 6
   249     packuswb  mm1, mm1
   250     jmp       wend1
           // ...and store that same pixel once per remaining output pixel.
   252  wloop1 :
   253     movd      [ebp], mm1            // store the low 4 bytes (one pixel)
   254     add       ebp, 4
   255  wend1 :
   256     sub       ecx, 1
   257     jns       wloop1
   258  wdone :
   259     popad                           // restore registers saved by pushad
   260     ret
   261   }
   262 }
   264 // This version does general purpose scaling by any amount, up or down.
   265 // The only thing it cannot do is rotation by 90 or 270.
   266 // For performance the chroma is under-sampled, reducing cost of a 3x
   267 // 1080p scale from 8.4 ms to 5.4 ms.
   //
   // Sampling is by truncated position, with no interpolation: the source
   // position x starts at 0 and advances by source_dx after every output pixel.
   // The Y index is x >> 16 and the U/V index is x >> 17, so x is a fixed-point
   // value with 16 fractional bits and chroma is indexed at half the luma rate.
   // U and V are looked up once per pair of output pixels, using the position
   // of the first pixel of the pair.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination, 4 bytes written per pixel.
   //   width               - number of output pixels.
   //   source_dx           - fixed-point source step per output pixel.
   //
   // Naked function: arguments are read from the stack at [esp + 32 + N], where
   // 32 is the size of the pushad frame and the return address takes 4 bytes.
   // NOTE(review): no emms is executed here; presumably the caller clears the
   // MMX state - confirm.
   268 __declspec(naked)
   269 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   270                             const uint8* u_buf,
   271                             const uint8* v_buf,
   272                             uint8* rgb_buf,
   273                             int width,
   274                             int source_dx) {
   275   __asm {
   276     pushad                          // save all general-purpose registers
   277     mov       edx, [esp + 32 + 4]   // Y
   278     mov       edi, [esp + 32 + 8]   // U
   279     mov       esi, [esp + 32 + 12]  // V
   280     mov       ebp, [esp + 32 + 16]  // rgb
   281     mov       ecx, [esp + 32 + 20]  // width
   282     xor       ebx, ebx              // x
   283     jmp       scaleend              // test the count before the first pass
   285  scaleloop :
   286     mov       eax, ebx
   287     sar       eax, 17               // chroma index = x >> 17
   288     movzx     eax, byte ptr [edi + eax]  // U sample
   289     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   290     mov       eax, ebx
   291     sar       eax, 17
   292     movzx     eax, byte ptr [esi + eax]  // V sample
   293     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms (saturating words)
   294     mov       eax, ebx
   295     add       ebx, [esp + 32 + 24]  // x += source_dx
   296     sar       eax, 16               // luma index = x >> 16 (x before the add)
   297     movzx     eax, byte ptr [edx + eax]  // first Y sample
   298     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   299     mov       eax, ebx
   300     add       ebx, [esp + 32 + 24]  // x += source_dx
   301     sar       eax, 16
   302     movzx     eax, byte ptr [edx + eax]  // second Y sample
   303     movq      mm2, [kCoefficientsRgbY + 8 * eax]
   304     paddsw    mm1, mm0              // both pixels share the same chroma term
   305     paddsw    mm2, mm0
   306     psraw     mm1, 6                // arithmetic shift each word right by 6
   307     psraw     mm2, 6
   308     packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
   309     movntq    [ebp], mm1            // non-temporal 8-byte store of both pixels
   310     add       ebp, 8
   311  scaleend :
   312     sub       ecx, 2                // two pixels per pass
   313     jns       scaleloop             // loop while the count did not go negative
           // Here ecx is -1 (one pixel left) or -2 (none left).
   315     and       ecx, 1  // odd number of pixels?
   316     jz        scaledone
           // Tail: convert the single remaining pixel at the current x.
   318     mov       eax, ebx
   319     sar       eax, 17
   320     movzx     eax, byte ptr [edi + eax]
   321     movq      mm0, [kCoefficientsRgbU + 8 * eax]
   322     mov       eax, ebx
   323     sar       eax, 17
   324     movzx     eax, byte ptr [esi + eax]
   325     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
   326     mov       eax, ebx
   327     sar       eax, 16
   328     movzx     eax, byte ptr [edx + eax]
   329     movq      mm1, [kCoefficientsRgbY + 8 * eax]
   330     paddsw    mm1, mm0
   331     psraw     mm1, 6
   332     packuswb  mm1, mm1
   333     movd      [ebp], mm1            // store only the low 4 bytes (one pixel)
   335  scaledone :
   336     popad                           // restore registers saved by pushad
   337     ret
   338   }
   339 }
   // Scales and converts one row of Y/U/V samples to 4-byte-per-pixel output,
   // linearly interpolating between adjacent source samples.
   //
   // The source position x (ebx) is fixed-point with 16 fractional bits and
   // advances by source_dx per output pixel. The loop runs while
   // x < width * source_dx; that product is written back into the stack slot
   // that held `width`. x starts at 0, or at 0x8000 (0.5) when
   // source_dx >= 0x20000.
   //
   // For each pair of output pixels:
   //   U, V: index = x >> 17, weight = x & 0x1fffe,
   //         value = (s[i] * (weight ^ 0x1fffe) + s[i + 1] * weight) >> 17
   //   Y:    index = x >> 16, weight = x & 0xffff,
   //         value = (s[i] * (weight ^ 0xffff) + s[i + 1] * weight) >> 16
   // The interpolated values index the coefficient tables. If x reaches the end
   // after the first Y sample of a pair, only one pixel is written.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination, 4 bytes written per pixel.
   //   width               - number of output pixels.
   //   source_dx           - fixed-point source step per output pixel.
   //
   // Naked function: arguments are read from the stack at [esp + 32 + N], where
   // 32 is the size of the pushad frame and the return address takes 4 bytes.
   // esi is used as scratch, so the V pointer is reloaded from the stack each
   // time it is needed.
   // NOTE(review): each interpolation reads the byte at index + 1; presumably
   // callers guarantee one readable byte past the last sampled position - confirm.
   // NOTE(review): width * source_dx is a 32-bit product; presumably callers
   // keep it within int range - confirm.
   // NOTE(review): no emms is executed here; presumably the caller clears the
   // MMX state - confirm.
   341 __declspec(naked)
   342 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   343                                   const uint8* u_buf,
   344                                   const uint8* v_buf,
   345                                   uint8* rgb_buf,
   346                                   int width,
   347                                   int source_dx) {
   348   __asm {
   349     pushad                         // save all general-purpose registers
   350     mov       edx, [esp + 32 + 4]  // Y
   351     mov       edi, [esp + 32 + 8]  // U
   352                 // [esp + 32 + 12] // V
   353     mov       ebp, [esp + 32 + 16] // rgb
   354     mov       ecx, [esp + 32 + 20] // width
   355     imul      ecx, [esp + 32 + 24] // source_dx
   356     mov       [esp + 32 + 20], ecx // source_width = width * source_dx
   357     mov       ecx, [esp + 32 + 24] // source_dx
   358     xor       ebx, ebx             // x = 0
   359     cmp       ecx, 0x20000
   360     jl        lscaleend            // source_dx < 0x20000: keep x = 0
   361     mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
   362     jmp       lscaleend
   363 lscaleloop:
           // Interpolated U.
   364     mov       eax, ebx
   365     sar       eax, 0x11            // chroma index = x >> 17
   367     movzx     ecx, byte ptr [edi + eax]      // u[i]
   368     movzx     esi, byte ptr [edi + eax + 1]  // u[i + 1]
   369     mov       eax, ebx
   370     and       eax, 0x1fffe         // weight of u[i + 1]
   371     imul      esi, eax
   372     xor       eax, 0x1fffe         // complementary weight for u[i]
   373     imul      ecx, eax
   374     add       ecx, esi
   375     shr       ecx, 17              // drop the fractional bits
   376     movq      mm0, [kCoefficientsRgbU + 8 * ecx]
           // Interpolated V (esi was clobbered above, so reload the pointer).
   378     mov       esi, [esp + 32 + 12]
   379     mov       eax, ebx
   380     sar       eax, 0x11
   382     movzx     ecx, byte ptr [esi + eax]      // v[i]
   383     movzx     esi, byte ptr [esi + eax + 1]  // v[i + 1]
   384     mov       eax, ebx
   385     and       eax, 0x1fffe
   386     imul      esi, eax
   387     xor       eax, 0x1fffe
   388     imul      ecx, eax
   389     add       ecx, esi
   390     shr       ecx, 17
   391     paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 = U + V terms (saturating words)
           // Interpolated Y for the first pixel of the pair.
   393     mov       eax, ebx
   394     sar       eax, 0x10            // luma index = x >> 16
   395     movzx     ecx, byte ptr [edx + eax]      // y[i]
   396     movzx     esi, byte ptr [1 + edx + eax]  // y[i + 1]
   397     mov       eax, ebx
   398     add       ebx, [esp + 32 + 24] // x += source_dx
   399     and       eax, 0xffff          // weight of y[i + 1]
   400     imul      esi, eax
   401     xor       eax, 0xffff          // complementary weight for y[i]
   402     imul      ecx, eax
   403     add       ecx, esi
   404     shr       ecx, 16
   405     movq      mm1, [kCoefficientsRgbY + 8 * ecx]
   407     cmp       ebx, [esp + 32 + 20] // reached source_width?
   408     jge       lscalelastpixel      // yes: emit only this one pixel
           // Interpolated Y for the second pixel of the pair.
   410     mov       eax, ebx
   411     sar       eax, 0x10
   412     movzx     ecx, byte ptr [edx + eax]
   413     movzx     esi, byte ptr [edx + eax + 1]
   414     mov       eax, ebx
   415     add       ebx, [esp + 32 + 24] // x += source_dx
   416     and       eax, 0xffff
   417     imul      esi, eax
   418     xor       eax, 0xffff
   419     imul      ecx, eax
   420     add       ecx, esi
   421     shr       ecx, 16
   422     movq      mm2, [kCoefficientsRgbY + 8 * ecx]
   424     paddsw    mm1, mm0             // both pixels share the same chroma term
   425     paddsw    mm2, mm0
   426     psraw     mm1, 0x6             // arithmetic shift each word right by 6
   427     psraw     mm2, 0x6
   428     packuswb  mm1, mm2             // saturate words to bytes: pixel 0 low, pixel 1 high
   429     movntq    [ebp], mm1           // non-temporal 8-byte store of both pixels
   430     add       ebp, 0x8
   432 lscaleend:
   433     cmp       ebx, [esp + 32 + 20] // loop while x < source_width
   434     jl        lscaleloop
   435     popad                          // restore registers saved by pushad
   436     ret
   438 lscalelastpixel:
           // Odd final pixel: finish and store the single pending pixel.
   439     paddsw    mm1, mm0
   440     psraw     mm1, 6
   441     packuswb  mm1, mm1
   442     movd      [ebp], mm1           // store only the low 4 bytes (one pixel)
   443     popad
   444     ret
   445   };
   446 }
   447 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   // Converts one row of Y/U/V samples to 4-byte-per-pixel output at 1:1 scale.
   //
   // On builds where MOZILLA_MAY_SUPPORT_SSE and _M_IX86 are both defined, the
   // SSE path is chosen at run time when mozilla::supports_sse() returns true.
   // Otherwise the call is forwarded to FastConvertYUVToRGB32Row_C.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination pixel buffer.
   //   width               - number of output pixels.
   449 void FastConvertYUVToRGB32Row(const uint8* y_buf,
   450                               const uint8* u_buf,
   451                               const uint8* v_buf,
   452                               uint8* rgb_buf,
   453                               int width) {
   454 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   455   if (mozilla::supports_sse()) {
   456     FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
   457     return;
   458   }
   459 #endif
          // Fallback. NOTE(review): the meaning of the trailing argument 1 is
          // defined by FastConvertYUVToRGB32Row_C, which is declared elsewhere.
   461   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
   462 }
   // Scales and converts one row of Y/U/V samples to 4-byte-per-pixel output.
   //
   // On builds where MOZILLA_MAY_SUPPORT_SSE and _M_IX86 are both defined, the
   // SSE path is chosen at run time when mozilla::supports_sse() returns true.
   // Otherwise the call is forwarded to ScaleYUVToRGB32Row_C with the same
   // arguments.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination pixel buffer.
   //   width               - number of output pixels.
   //   source_dx           - fixed-point source step per output pixel.
   464 void ScaleYUVToRGB32Row(const uint8* y_buf,
   465                         const uint8* u_buf,
   466                         const uint8* v_buf,
   467                         uint8* rgb_buf,
   468                         int width,
   469                         int source_dx) {
   471 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   472   if (mozilla::supports_sse()) {
   473     ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   474     return;
   475   }
   476 #endif
          // Fallback when the SSE path is compiled out or unavailable at run time.
   478   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   479 }
   // Scales and converts one row of Y/U/V samples to 4-byte-per-pixel output
   // through the linear-scaling implementations.
   //
   // On builds where MOZILLA_MAY_SUPPORT_SSE and _M_IX86 are both defined, the
   // SSE path is chosen at run time when mozilla::supports_sse() returns true.
   // Otherwise the call is forwarded to LinearScaleYUVToRGB32Row_C with the same
   // arguments.
   //
   //   y_buf, u_buf, v_buf - source sample pointers.
   //   rgb_buf             - destination pixel buffer.
   //   width               - number of output pixels.
   //   source_dx           - fixed-point source step per output pixel.
   481 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   482                               const uint8* u_buf,
   483                               const uint8* v_buf,
   484                               uint8* rgb_buf,
   485                               int width,
   486                               int source_dx) {
   487 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
   488   if (mozilla::supports_sse()) {
   489     LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
   490                                  source_dx);
   491     return;
   492   }
   493 #endif
          // Fallback when the SSE path is compiled out or unavailable at run time.
   495   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   496 }
   498 } // extern "C"

mercurial