gfx/ycbcr/yuv_row_posix.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
     2 // Use of this source code is governed by a BSD-style license that can be
     3 // found in the LICENSE file.
     5 #include "yuv_row.h"
     6 #include "mozilla/SSE.h"
     8 #define DCHECK(a)
    10 extern "C" {
    12 #if defined(ARCH_CPU_X86_64)
    14 // We don't need CPUID guards here, since x86-64 implies SSE2.
    16 // AMD64 ABI uses register parameters.
    // Converts `width` pixels of one row from planar Y/U/V bytes to 32-bit
    // output pixels using the kCoefficientsRgbY lookup table (x86-64 inline asm).
    //
    // Each pass of the main loop reads two Y bytes plus one U and one V byte
    // (the chroma pair is shared by both pixels) and stores 8 output bytes.
    // The 8-byte table entries are fetched from table + y*8,
    // table + 2048 + u*8 and table + 4096 + v*8, summed with signed saturation
    // (paddsw), shifted right by 6 and packed to unsigned bytes (packuswb).
    // When one pixel is left over, the tail code converts it and stores 4 bytes.
    //
    // Operands %0-%4 are declared read-write ("+r") because the assembly
    // advances the pointers and counts `width` down; GCC requires input-only
    // operands to be left unmodified. The statement is marked volatile so that
    // it is not discarded now that it has (otherwise unused) output operands.
    17 void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
    18                               const uint8* u_buf,  // rsi
    19                               const uint8* v_buf,  // rdx
    20                               uint8* rgb_buf,      // rcx
    21                               int width) {         // r8
    22   asm volatile(
    23   "jmp    1f\n"
         // Main loop: two pixels per iteration.
    24 "0:"
    25   "movzb  (%1),%%r10\n"
    26   "add    $0x1,%1\n"
    27   "movzb  (%2),%%r11\n"
    28   "add    $0x1,%2\n"
    29   "movq   2048(%5,%%r10,8),%%xmm0\n"
    30   "movzb  (%0),%%r10\n"
    31   "movq   4096(%5,%%r11,8),%%xmm1\n"
    32   "movzb  0x1(%0),%%r11\n"
    33   "paddsw %%xmm1,%%xmm0\n"
    34   "movq   (%5,%%r10,8),%%xmm2\n"
    35   "add    $0x2,%0\n"
    36   "movq   (%5,%%r11,8),%%xmm3\n"
    37   "paddsw %%xmm0,%%xmm2\n"
    38   "paddsw %%xmm0,%%xmm3\n"
    39   "shufps $0x44,%%xmm3,%%xmm2\n"
    40   "psraw  $0x6,%%xmm2\n"
    41   "packuswb %%xmm2,%%xmm2\n"
    42   "movq   %%xmm2,0x0(%3)\n"
    43   "add    $0x8,%3\n"
    44 "1:"
    45   "sub    $0x2,%4\n"
    46   "jns    0b\n"
         // Tail: convert one remaining pixel when width was odd.
    48 "2:"
    49   "add    $0x1,%4\n"
    50   "js     3f\n"
    52   "movzb  (%1),%%r10\n"
    53   "movq   2048(%5,%%r10,8),%%xmm0\n"
    54   "movzb  (%2),%%r10\n"
    55   "movq   4096(%5,%%r10,8),%%xmm1\n"
    56   "paddsw %%xmm1,%%xmm0\n"
    57   "movzb  (%0),%%r10\n"
    58   "movq   (%5,%%r10,8),%%xmm1\n"
    59   "paddsw %%xmm0,%%xmm1\n"
    60   "psraw  $0x6,%%xmm1\n"
    61   "packuswb %%xmm1,%%xmm1\n"
    62   "movd   %%xmm1,0x0(%3)\n"
    63 "3:"
    64   // Read-write operands: all five are modified by the code above.
    65   : "+r"(y_buf),  // %0
    66     "+r"(u_buf),  // %1
    67     "+r"(v_buf),  // %2
    68     "+r"(rgb_buf),  // %3
    69     "+r"(width)  // %4
    70   : "r" (kCoefficientsRgbY)  // %5
    71   : "memory", "cc", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
    72 );
    73 }
    // Converts `width` output pixels from one planar Y/U/V row while stepping
    // through the source with a fixed-point position (x86-64 inline asm).
    //
    // r11 holds the source position x, starting at 0 and advanced by source_dx
    // for every output pixel. The Y sample is read at index x >> 16 and the
    // U/V samples at index x >> 17; no interpolation is done. Table lookups,
    // saturating adds, the >> 6 shift and the byte packing follow the same
    // scheme as the unscaled converter: entries come from table + y*8,
    // table + 2048 + u*8 and table + 4096 + v*8. Two pixels (8 bytes) are
    // stored per loop pass; a single leftover pixel is stored as 4 bytes.
    //
    // rgb_buf (%3) and width (%4) are modified by the assembly, so the first
    // five operands are declared read-write ("+r"); keeping all five as
    // outputs preserves the original %0-%6 numbering. GCC requires input-only
    // operands to be left unmodified. volatile keeps the statement from being
    // discarded now that it has (otherwise unused) output operands.
    75 void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
    76                         const uint8* u_buf,  // rsi
    77                         const uint8* v_buf,  // rdx
    78                         uint8* rgb_buf,      // rcx
    79                         int width,           // r8
    80                         int source_dx) {     // r9
    81   asm volatile(
    82   "xor    %%r11,%%r11\n"
    83   "sub    $0x2,%4\n"
    84   "js     1f\n"
         // Main loop: two output pixels per iteration.
    86 "0:"
    87   "mov    %%r11,%%r10\n"
    88   "sar    $0x11,%%r10\n"
    89   "movzb  (%1,%%r10,1),%%rax\n"
    90   "movq   2048(%5,%%rax,8),%%xmm0\n"
    91   "movzb  (%2,%%r10,1),%%rax\n"
    92   "movq   4096(%5,%%rax,8),%%xmm1\n"
    93   "lea    (%%r11,%6),%%r10\n"
    94   "sar    $0x10,%%r11\n"
    95   "movzb  (%0,%%r11,1),%%rax\n"
    96   "paddsw %%xmm1,%%xmm0\n"
    97   "movq   (%5,%%rax,8),%%xmm1\n"
    98   "lea    (%%r10,%6),%%r11\n"
    99   "sar    $0x10,%%r10\n"
   100   "movzb  (%0,%%r10,1),%%rax\n"
   101   "movq   (%5,%%rax,8),%%xmm2\n"
   102   "paddsw %%xmm0,%%xmm1\n"
   103   "paddsw %%xmm0,%%xmm2\n"
   104   "shufps $0x44,%%xmm2,%%xmm1\n"
   105   "psraw  $0x6,%%xmm1\n"
   106   "packuswb %%xmm1,%%xmm1\n"
   107   "movq   %%xmm1,0x0(%3)\n"
   108   "add    $0x8,%3\n"
   109   "sub    $0x2,%4\n"
   110   "jns    0b\n"
         // Tail: convert one remaining pixel when width was odd.
   112 "1:"
   113   "add    $0x1,%4\n"
   114   "js     2f\n"
   116   "mov    %%r11,%%r10\n"
   117   "sar    $0x11,%%r10\n"
   118   "movzb  (%1,%%r10,1),%%rax\n"
   119   "movq   2048(%5,%%rax,8),%%xmm0\n"
   120   "movzb  (%2,%%r10,1),%%rax\n"
   121   "movq   4096(%5,%%rax,8),%%xmm1\n"
   122   "paddsw %%xmm1,%%xmm0\n"
   123   "sar    $0x10,%%r11\n"
   124   "movzb  (%0,%%r11,1),%%rax\n"
   125   "movq   (%5,%%rax,8),%%xmm1\n"
   126   "paddsw %%xmm0,%%xmm1\n"
   127   "psraw  $0x6,%%xmm1\n"
   128   "packuswb %%xmm1,%%xmm1\n"
   129   "movd   %%xmm1,0x0(%3)\n"
   131 "2:"
   132   // Read-write operands (%3 and %4 are modified by the code above).
   133   : "+r"(y_buf),  // %0
   134     "+r"(u_buf),  // %1
   135     "+r"(v_buf),  // %2
   136     "+r"(rgb_buf),  // %3
   137     "+r"(width)  // %4
   138   : "r" (kCoefficientsRgbY),  // %5
   139     "r"(static_cast<long>(source_dx))  // %6
   140   : "memory", "cc", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
   141 );
   142 }
   // Converts `width` output pixels from one planar Y/U/V row, stepping through
   // the source with a fixed-point position and linearly interpolating between
   // neighbouring source samples (x86-64 inline asm).
   //
   // r11 holds the source position x, advanced by source_dx per output pixel.
   // x starts at 0, or at 0x8000 when source_dx >= 0x20000. In the main loop
   // each sample is blended from the bytes at index i and i + 1:
   //   Y:    i = x >> 16, weight = x & 0xffff,  result >> 16
   //   U, V: i = x >> 17, weight = x & 0x1fffe, result >> 17
   // The blended values index kCoefficientsRgbY (Y at +0, U at +2048, V at
   // +4096, 8 bytes per entry); the entries are summed with signed saturation,
   // shifted right by 6 and packed to unsigned bytes. Two pixels (8 bytes) are
   // stored per pass. A single leftover pixel is converted by the tail code
   // without interpolation and stored as 4 bytes.
   //
   // rgb_buf (%3) and width (%4) are modified by the assembly, so the first
   // five operands are declared read-write ("+r"); keeping all five as
   // outputs preserves the original %0-%6 numbering. GCC requires input-only
   // operands to be left unmodified. volatile keeps the statement from being
   // discarded now that it has (otherwise unused) output operands.
   144 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   145                               const uint8* u_buf,
   146                               const uint8* v_buf,
   147                               uint8* rgb_buf,
   148                               int width,
   149                               int source_dx) {
   150   asm volatile(
   151   "xor    %%r11,%%r11\n"   // x = 0
   152   "sub    $0x2,%4\n"
   153   "js     2f\n"
   154   "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
   155   "jl     0f\n"
   156   "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
   157 "0:"
         // Main loop: two output pixels per iteration.
   159 "1:"
   160   "mov    %%r11,%%r10\n"
   161   "sar    $0x11,%%r10\n"
         // Interpolated U sample -> table entry in xmm0.
   163   "movzb  (%1, %%r10, 1), %%r13 \n"
   164   "movzb  1(%1, %%r10, 1), %%r14 \n"
   165   "mov    %%r11, %%rax \n"
   166   "and    $0x1fffe, %%rax \n"
   167   "imul   %%rax, %%r14 \n"
   168   "xor    $0x1fffe, %%rax \n"
   169   "imul   %%rax, %%r13 \n"
   170   "add    %%r14, %%r13 \n"
   171   "shr    $17, %%r13 \n"
   172   "movq   2048(%5,%%r13,8), %%xmm0\n"
         // Interpolated V sample -> table entry in xmm1.
   174   "movzb  (%2, %%r10, 1), %%r13 \n"
   175   "movzb  1(%2, %%r10, 1), %%r14 \n"
   176   "mov    %%r11, %%rax \n"
   177   "and    $0x1fffe, %%rax \n"
   178   "imul   %%rax, %%r14 \n"
   179   "xor    $0x1fffe, %%rax \n"
   180   "imul   %%rax, %%r13 \n"
   181   "add    %%r14, %%r13 \n"
   182   "shr    $17, %%r13 \n"
   183   "movq   4096(%5,%%r13,8), %%xmm1\n"
   185   "mov    %%r11, %%rax \n"
   186   "lea    (%%r11,%6),%%r10\n"
   187   "sar    $0x10,%%r11\n"
   188   "paddsw %%xmm1,%%xmm0\n"
         // Interpolated Y sample for the first pixel -> xmm1.
   190   "movzb  (%0, %%r11, 1), %%r13 \n"
   191   "movzb  1(%0, %%r11, 1), %%r14 \n"
   192   "and    $0xffff, %%rax \n"
   193   "imul   %%rax, %%r14 \n"
   194   "xor    $0xffff, %%rax \n"
   195   "imul   %%rax, %%r13 \n"
   196   "add    %%r14, %%r13 \n"
   197   "shr    $16, %%r13 \n"
   198   "movq   (%5,%%r13,8),%%xmm1\n"
   200   "mov    %%r10, %%rax \n"
   201   "lea    (%%r10,%6),%%r11\n"
   202   "sar    $0x10,%%r10\n"
         // Interpolated Y sample for the second pixel -> xmm2.
   204   "movzb  (%0,%%r10,1), %%r13 \n"
   205   "movzb  1(%0,%%r10,1), %%r14 \n"
   206   "and    $0xffff, %%rax \n"
   207   "imul   %%rax, %%r14 \n"
   208   "xor    $0xffff, %%rax \n"
   209   "imul   %%rax, %%r13 \n"
   210   "add    %%r14, %%r13 \n"
   211   "shr    $16, %%r13 \n"
   212   "movq   (%5,%%r13,8),%%xmm2\n"
   214   "paddsw %%xmm0,%%xmm1\n"
   215   "paddsw %%xmm0,%%xmm2\n"
   216   "shufps $0x44,%%xmm2,%%xmm1\n"
   217   "psraw  $0x6,%%xmm1\n"
   218   "packuswb %%xmm1,%%xmm1\n"
   219   "movq   %%xmm1,0x0(%3)\n"
   220   "add    $0x8,%3\n"
   221   "sub    $0x2,%4\n"
   222   "jns    1b\n"
         // Tail: one remaining pixel, sampled without interpolation.
   224 "2:"
   225   "add    $0x1,%4\n"
   226   "js     3f\n"
   228   "mov    %%r11,%%r10\n"
   229   "sar    $0x11,%%r10\n"
   231   "movzb  (%1,%%r10,1), %%r13 \n"
   232   "movq   2048(%5,%%r13,8),%%xmm0\n"
   234   "movzb  (%2,%%r10,1), %%r13 \n"
   235   "movq   4096(%5,%%r13,8),%%xmm1\n"
   237   "paddsw %%xmm1,%%xmm0\n"
   238   "sar    $0x10,%%r11\n"
   240   "movzb  (%0,%%r11,1), %%r13 \n"
   241   "movq   (%5,%%r13,8),%%xmm1\n"
   243   "paddsw %%xmm0,%%xmm1\n"
   244   "psraw  $0x6,%%xmm1\n"
   245   "packuswb %%xmm1,%%xmm1\n"
   246   "movd   %%xmm1,0x0(%3)\n"
   248 "3:"
   249   // Read-write operands (%3 and %4 are modified by the code above).
   250   : "+r"(y_buf),  // %0
   251     "+r"(u_buf),  // %1
   252     "+r"(v_buf),  // %2
   253     "+r"(rgb_buf),  // %3
   254     "+r"(width)  // %4
   255   : "r" (kCoefficientsRgbY),  // %5
   256     "r"(static_cast<long>(source_dx))  // %6
   257   : "memory", "cc", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
   258 );
   259 }
   261 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
   263 // PIC version is slower because less registers are available, so
   264 // non-PIC is used on platforms where it is possible.
   // Non-PIC 32-bit x86 row converter. The prototype is declared here and the
   // body is supplied by the file-scope asm block that follows.
   //
   // After "pusha" the stack arguments are loaded as: y_buf -> edx,
   // u_buf -> edi, v_buf -> esi, rgb_buf -> ebp, width -> ecx. The lookup table
   // is addressed directly through the kCoefficientsRgbY symbol (Y entries at
   // +0, U at +2048, V at +4096, 8 bytes each). Each loop pass converts two
   // pixels that share one U/V pair and writes 8 bytes with movntq; an odd
   // trailing pixel is written as 4 bytes with movd.
   // NOTE(review): the routine uses MMX registers and does not execute emms;
   // presumably the caller is responsible for that - confirm.
   265 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   266                                   const uint8* u_buf,
   267                                   const uint8* v_buf,
   268                                   uint8* rgb_buf,
   269                                   int width);
   270   asm(
   271   ".text\n"
   272   ".global FastConvertYUVToRGB32Row_SSE\n"
   273   ".type FastConvertYUVToRGB32Row_SSE, @function\n"
   274 "FastConvertYUVToRGB32Row_SSE:\n"
   275   "pusha\n"
   276   "mov    0x24(%esp),%edx\n"
   277   "mov    0x28(%esp),%edi\n"
   278   "mov    0x2c(%esp),%esi\n"
   279   "mov    0x30(%esp),%ebp\n"
   280   "mov    0x34(%esp),%ecx\n"
   281   "jmp    1f\n"
         // Main loop: two pixels per iteration.
   283 "0:"
   284   "movzbl (%edi),%eax\n"
   285   "add    $0x1,%edi\n"
   286   "movzbl (%esi),%ebx\n"
   287   "add    $0x1,%esi\n"
   288   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   289   "movzbl (%edx),%eax\n"
   290   "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
   291   "movzbl 0x1(%edx),%ebx\n"
   292   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   293   "add    $0x2,%edx\n"
   294   "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
   295   "paddsw %mm0,%mm1\n"
   296   "paddsw %mm0,%mm2\n"
   297   "psraw  $0x6,%mm1\n"
   298   "psraw  $0x6,%mm2\n"
   299   "packuswb %mm2,%mm1\n"
   300   "movntq %mm1,0x0(%ebp)\n"
   301   "add    $0x8,%ebp\n"
   302 "1:"
   303   "sub    $0x2,%ecx\n"
   304   "jns    0b\n"
         // ecx is now -1 (one pixel left) or -2 (none); bit 0 tells them apart.
   306   "and    $0x1,%ecx\n"
   307   "je     2f\n"
   309   "movzbl (%edi),%eax\n"
   310   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   311   "movzbl (%esi),%eax\n"
   312   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
   313   "movzbl (%edx),%eax\n"
   314   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   315   "paddsw %mm0,%mm1\n"
   316   "psraw  $0x6,%mm1\n"
   317   "packuswb %mm1,%mm1\n"
   318   "movd   %mm1,0x0(%ebp)\n"
   319 "2:"
   320   "popa\n"
   321   "ret\n"
   322 #if !defined(XP_MACOSX)
   323   ".previous\n"
   324 #endif
   325 );
   // Unscaled row conversion entry point for non-PIC 32-bit x86 builds.
   // Uses the assembly routine when mozilla::supports_sse() reports SSE and
   // returns immediately; otherwise falls back to FastConvertYUVToRGB32Row_C,
   // passing 1 as its final argument.
   327 void FastConvertYUVToRGB32Row(const uint8* y_buf,
   328                               const uint8* u_buf,
   329                               const uint8* v_buf,
   330                               uint8* rgb_buf,
   331                               int width)
   332 {
   333   if (mozilla::supports_sse()) {
   334     FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
   335     return;
   336   }
   338   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
   339 }
   // Non-PIC 32-bit x86 point-sampling scaler. The prototype is declared here
   // and the body is supplied by the file-scope asm block that follows.
   //
   // After "pusha": y_buf -> edx, u_buf -> edi, v_buf -> esi, rgb_buf -> ebp,
   // width -> ecx; source_dx stays on the stack at 0x38(%esp). ebx is the
   // fixed-point source position x, starting at 0 and advanced by source_dx
   // per output pixel. Y is read at index x >> 16 and U/V at index x >> 17.
   // Table entries come from kCoefficientsRgbY (+0 / +2048 / +4096). Two pixels
   // are written per loop pass with movntq; an odd trailing pixel with movd.
   // NOTE(review): uses MMX registers without emms - presumably handled by the
   // caller; confirm.
   342 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   343                             const uint8* u_buf,
   344                             const uint8* v_buf,
   345                             uint8* rgb_buf,
   346                             int width,
   347                             int source_dx);
   348   asm(
   349   ".text\n"
   350   ".global ScaleYUVToRGB32Row_SSE\n"
   351   ".type ScaleYUVToRGB32Row_SSE, @function\n"
   352 "ScaleYUVToRGB32Row_SSE:\n"
   353   "pusha\n"
   354   "mov    0x24(%esp),%edx\n"
   355   "mov    0x28(%esp),%edi\n"
   356   "mov    0x2c(%esp),%esi\n"
   357   "mov    0x30(%esp),%ebp\n"
   358   "mov    0x34(%esp),%ecx\n"
   359   "xor    %ebx,%ebx\n"
   360   "jmp    1f\n"
         // Main loop: two output pixels per iteration.
   362 "0:"
   363   "mov    %ebx,%eax\n"
   364   "sar    $0x11,%eax\n"
   365   "movzbl (%edi,%eax,1),%eax\n"
   366   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   367   "mov    %ebx,%eax\n"
   368   "sar    $0x11,%eax\n"
   369   "movzbl (%esi,%eax,1),%eax\n"
   370   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
   371   "mov    %ebx,%eax\n"
   372   "add    0x38(%esp),%ebx\n"
   373   "sar    $0x10,%eax\n"
   374   "movzbl (%edx,%eax,1),%eax\n"
   375   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   376   "mov    %ebx,%eax\n"
   377   "add    0x38(%esp),%ebx\n"
   378   "sar    $0x10,%eax\n"
   379   "movzbl (%edx,%eax,1),%eax\n"
   380   "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
   381   "paddsw %mm0,%mm1\n"
   382   "paddsw %mm0,%mm2\n"
   383   "psraw  $0x6,%mm1\n"
   384   "psraw  $0x6,%mm2\n"
   385   "packuswb %mm2,%mm1\n"
   386   "movntq %mm1,0x0(%ebp)\n"
   387   "add    $0x8,%ebp\n"
   388 "1:"
   389   "sub    $0x2,%ecx\n"
   390   "jns    0b\n"
         // ecx is now -1 (one pixel left) or -2 (none); bit 0 tells them apart.
   392   "and    $0x1,%ecx\n"
   393   "je     2f\n"
   395   "mov    %ebx,%eax\n"
   396   "sar    $0x11,%eax\n"
   397   "movzbl (%edi,%eax,1),%eax\n"
   398   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   399   "mov    %ebx,%eax\n"
   400   "sar    $0x11,%eax\n"
   401   "movzbl (%esi,%eax,1),%eax\n"
   402   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
   403   "mov    %ebx,%eax\n"
   404   "sar    $0x10,%eax\n"
   405   "movzbl (%edx,%eax,1),%eax\n"
   406   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
   407   "paddsw %mm0,%mm1\n"
   408   "psraw  $0x6,%mm1\n"
   409   "packuswb %mm1,%mm1\n"
   410   "movd   %mm1,0x0(%ebp)\n"
   412 "2:"
   413   "popa\n"
   414   "ret\n"
   415 #if !defined(XP_MACOSX)
   416   ".previous\n"
   417 #endif
   418 );
   // Point-sampling scaled row conversion entry point for non-PIC 32-bit x86
   // builds. Uses the assembly routine when mozilla::supports_sse() reports
   // SSE; otherwise falls back to ScaleYUVToRGB32Row_C.
   //
   // The early return matters: without it the C fallback would run after the
   // SSE routine and convert the same row a second time.
   420 void ScaleYUVToRGB32Row(const uint8* y_buf,
   421                         const uint8* u_buf,
   422                         const uint8* v_buf,
   423                         uint8* rgb_buf,
   424                         int width,
   425                         int source_dx)
   426 {
   427   if (mozilla::supports_sse()) {
   428     ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
   429                            width, source_dx);
   430     return;
   431   }
   432   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
   433                        width, source_dx);
   434 }
   // Non-PIC 32-bit x86 linearly interpolating scaler. The prototype is
   // declared here and the body is supplied by the file-scope asm block below.
   //
   // After "pusha": y_buf -> edx, u_buf -> edi, rgb_buf -> ebp. v_buf is
   // reloaded from 0x2c(%esp) into esi on every iteration because esi is also
   // used as scratch. The width slot at 0x34(%esp) is overwritten with
   // width * source_dx and serves as the loop bound for the fixed-point source
   // position kept in ebx. Samples are blended from the bytes at index i and
   // i + 1 (Y: i = x >> 16, weight x & 0xffff; U/V: i = x >> 17, weight
   // x & 0x1fffe) before the kCoefficientsRgbY lookup.
   // NOTE(review): uses MMX registers without emms - presumably handled by the
   // caller; confirm.
   436 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   437                                   const uint8* u_buf,
   438                                   const uint8* v_buf,
   439                                   uint8* rgb_buf,
   440                                   int width,
   441                                   int source_dx);
   442   asm(
   443   ".text\n"
   444   ".global LinearScaleYUVToRGB32Row_SSE\n"
   445   ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
   446 "LinearScaleYUVToRGB32Row_SSE:\n"
   447   "pusha\n"
   448   "mov    0x24(%esp),%edx\n"
   449   "mov    0x28(%esp),%edi\n"
   450   "mov    0x30(%esp),%ebp\n"
   452   // source_width = width * source_dx + ebx
         // NOTE(review): the instructions below store only width * source_dx;
         // ebx is not added.
   453   "mov    0x34(%esp), %ecx\n"
   454   "imull  0x38(%esp), %ecx\n"
   455   "mov    %ecx, 0x34(%esp)\n"
   457   "mov    0x38(%esp), %ecx\n"
   458   "xor    %ebx,%ebx\n"     // x = 0
   459   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
   460   "jl     1f\n"
   461   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
   462   "jmp    1f\n"
         // Main loop: up to two output pixels per iteration.
   464 "0:"
   465   "mov    %ebx,%eax\n"
   466   "sar    $0x11,%eax\n"
         // Interpolated U sample -> table entry in mm0.
   468   "movzbl (%edi,%eax,1),%ecx\n"
   469   "movzbl 1(%edi,%eax,1),%esi\n"
   470   "mov    %ebx,%eax\n"
   471   "andl   $0x1fffe, %eax \n"
   472   "imul   %eax, %esi \n"
   473   "xorl   $0x1fffe, %eax \n"
   474   "imul   %eax, %ecx \n"
   475   "addl   %esi, %ecx \n"
   476   "shrl   $17, %ecx \n"
   477   "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
         // Interpolated V sample, added into mm0.
   479   "mov    0x2c(%esp),%esi\n"
   480   "mov    %ebx,%eax\n"
   481   "sar    $0x11,%eax\n"
   483   "movzbl (%esi,%eax,1),%ecx\n"
   484   "movzbl 1(%esi,%eax,1),%esi\n"
   485   "mov    %ebx,%eax\n"
   486   "andl   $0x1fffe, %eax \n"
   487   "imul   %eax, %esi \n"
   488   "xorl   $0x1fffe, %eax \n"
   489   "imul   %eax, %ecx \n"
   490   "addl   %esi, %ecx \n"
   491   "shrl   $17, %ecx \n"
   492   "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
         // Interpolated Y sample for the first pixel -> mm1.
   494   "mov    %ebx,%eax\n"
   495   "sar    $0x10,%eax\n"
   496   "movzbl (%edx,%eax,1),%ecx\n"
   497   "movzbl 1(%edx,%eax,1),%esi\n"
   498   "mov    %ebx,%eax\n"
   499   "add    0x38(%esp),%ebx\n"
   500   "andl   $0xffff, %eax \n"
   501   "imul   %eax, %esi \n"
   502   "xorl   $0xffff, %eax \n"
   503   "imul   %eax, %ecx \n"
   504   "addl   %esi, %ecx \n"
   505   "shrl   $16, %ecx \n"
   506   "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
         // If the bound has been reached, finish with a single pixel at 2:.
   508   "cmp    0x34(%esp), %ebx\n"
   509   "jge    2f\n"
         // Interpolated Y sample for the second pixel -> mm2.
   511   "mov    %ebx,%eax\n"
   512   "sar    $0x10,%eax\n"
   513   "movzbl (%edx,%eax,1),%ecx\n"
   514   "movzbl 1(%edx,%eax,1),%esi\n"
   515   "mov    %ebx,%eax\n"
   516   "add    0x38(%esp),%ebx\n"
   517   "andl   $0xffff, %eax \n"
   518   "imul   %eax, %esi \n"
   519   "xorl   $0xffff, %eax \n"
   520   "imul   %eax, %ecx \n"
   521   "addl   %esi, %ecx \n"
   522   "shrl   $16, %ecx \n"
   523   "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
   525   "paddsw %mm0,%mm1\n"
   526   "paddsw %mm0,%mm2\n"
   527   "psraw  $0x6,%mm1\n"
   528   "psraw  $0x6,%mm2\n"
   529   "packuswb %mm2,%mm1\n"
   530   "movntq %mm1,0x0(%ebp)\n"
   531   "add    $0x8,%ebp\n"
         // Continue while x < width * source_dx.
   533 "1:"
   534   "cmp    0x34(%esp), %ebx\n"
   535   "jl     0b\n"
   536   "popa\n"
   537   "ret\n"
         // Final single pixel: mm0 holds the chroma sum, mm1 the Y entry.
   539 "2:"
   540   "paddsw %mm0, %mm1\n"
   541   "psraw $6, %mm1\n"
   542   "packuswb %mm1, %mm1\n"
   543   "movd %mm1, (%ebp)\n"
   544   "popa\n"
   545   "ret\n"
   546 #if !defined(XP_MACOSX)
   547   ".previous\n"
   548 #endif
   549 );
   // Linearly interpolating scaled row conversion entry point for non-PIC
   // 32-bit x86 builds. Uses the assembly routine when mozilla::supports_sse()
   // reports SSE; otherwise falls back to LinearScaleYUVToRGB32Row_C.
   //
   // The early return matters: without it the C fallback would run after the
   // SSE routine and convert the same row a second time.
   551 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   552                               const uint8* u_buf,
   553                               const uint8* v_buf,
   554                               uint8* rgb_buf,
   555                               int width,
   556                               int source_dx)
   557 {
   558   if (mozilla::supports_sse()) {
   559     LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
   560                                  width, source_dx);
   561     return;
   562   }
   563   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
   564                              width, source_dx);
   565 }
   567 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
   // PIC 32-bit x86 row converter. The prototype is declared here and the body
   // is supplied by the file-scope asm block that follows.
   //
   // The lookup table is received as an explicit argument (0x38(%esp) -> ecx)
   // instead of being addressed through the kCoefficientsRgbY symbol. Because
   // ecx holds the table, width stays in its stack slot at 0x34(%esp) and is
   // counted down there. Other registers after "pusha": y_buf -> edx,
   // u_buf -> edi, v_buf -> esi, rgb_buf -> ebp. Two pixels are written per
   // loop pass with movntq; an odd trailing pixel with movd.
   // NOTE(review): uses MMX registers without emms - presumably handled by the
   // caller; confirm.
   569 void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   570                                  const uint8* u_buf,
   571                                  const uint8* v_buf,
   572                                  uint8* rgb_buf,
   573                                  int width,
   574                                  int16 *kCoefficientsRgbY);
   576   asm(
   577   ".text\n"
   578 #if defined(XP_MACOSX)
   579 "_PICConvertYUVToRGB32Row_SSE:\n"
   580 #else
   581 "PICConvertYUVToRGB32Row_SSE:\n"
   582 #endif
   583   "pusha\n"
   584   "mov    0x24(%esp),%edx\n"
   585   "mov    0x28(%esp),%edi\n"
   586   "mov    0x2c(%esp),%esi\n"
   587   "mov    0x30(%esp),%ebp\n"
   588   "mov    0x38(%esp),%ecx\n"
   590   "jmp    1f\n"
         // Main loop: two pixels per iteration.
   592 "0:"
   593   "movzbl (%edi),%eax\n"
   594   "add    $0x1,%edi\n"
   595   "movzbl (%esi),%ebx\n"
   596   "add    $0x1,%esi\n"
   597   "movq   2048(%ecx,%eax,8),%mm0\n"
   598   "movzbl (%edx),%eax\n"
   599   "paddsw 4096(%ecx,%ebx,8),%mm0\n"
   600   "movzbl 0x1(%edx),%ebx\n"
   601   "movq   0(%ecx,%eax,8),%mm1\n"
   602   "add    $0x2,%edx\n"
   603   "movq   0(%ecx,%ebx,8),%mm2\n"
   604   "paddsw %mm0,%mm1\n"
   605   "paddsw %mm0,%mm2\n"
   606   "psraw  $0x6,%mm1\n"
   607   "psraw  $0x6,%mm2\n"
   608   "packuswb %mm2,%mm1\n"
   609   "movntq %mm1,0x0(%ebp)\n"
   610   "add    $0x8,%ebp\n"
   611 "1:"
   612   "subl   $0x2,0x34(%esp)\n"
   613   "jns    0b\n"
         // The stacked width is now -1 (one pixel left) or -2 (none).
   615   "andl   $0x1,0x34(%esp)\n"
   616   "je     2f\n"
   618   "movzbl (%edi),%eax\n"
   619   "movq   2048(%ecx,%eax,8),%mm0\n"
   620   "movzbl (%esi),%eax\n"
   621   "paddsw 4096(%ecx,%eax,8),%mm0\n"
   622   "movzbl (%edx),%eax\n"
   623   "movq   0(%ecx,%eax,8),%mm1\n"
   624   "paddsw %mm0,%mm1\n"
   625   "psraw  $0x6,%mm1\n"
   626   "packuswb %mm1,%mm1\n"
   627   "movd   %mm1,0x0(%ebp)\n"
   628 "2:"
   629   "popa\n"
   630   "ret\n"
   631 #if !defined(XP_MACOSX)
   632   ".previous\n"
   633 #endif
   634 );
   // Unscaled row conversion entry point for PIC 32-bit x86 builds.
   // When mozilla::supports_sse() reports SSE, calls the PIC assembly routine
   // with the address of the first kCoefficientsRgbY element as an extra
   // argument and returns; otherwise falls back to FastConvertYUVToRGB32Row_C,
   // passing 1 as its final argument.
   636 void FastConvertYUVToRGB32Row(const uint8* y_buf,
   637                               const uint8* u_buf,
   638                               const uint8* v_buf,
   639                               uint8* rgb_buf,
   640                               int width)
   641 {
   642   if (mozilla::supports_sse()) {
   643     PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
   644                                 &kCoefficientsRgbY[0][0]);
   645     return;
   646   }
   648   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
   649 }
   // PIC 32-bit x86 point-sampling scaler. The prototype is declared here and
   // the body is supplied by the file-scope asm block that follows.
   //
   // The lookup table is received as an explicit argument (0x3c(%esp) -> ecx).
   // width stays in its stack slot at 0x34(%esp) and is counted down there;
   // source_dx is read from 0x38(%esp). Other registers after "pusha":
   // y_buf -> edx, u_buf -> edi, v_buf -> esi, rgb_buf -> ebp. ebx is the
   // fixed-point source position x, starting at 0. Y is read at index x >> 16
   // and U/V at index x >> 17, without interpolation.
   // NOTE(review): uses MMX registers without emms - presumably handled by the
   // caller; confirm.
   651 void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   652                                const uint8* u_buf,
   653                                const uint8* v_buf,
   654                                uint8* rgb_buf,
   655                                int width,
   656                                int source_dx,
   657                                int16 *kCoefficientsRgbY);
   659   asm(
   660   ".text\n"
   661 #if defined(XP_MACOSX)
   662 "_PICScaleYUVToRGB32Row_SSE:\n"
   663 #else
   664 "PICScaleYUVToRGB32Row_SSE:\n"
   665 #endif
   666   "pusha\n"
   667   "mov    0x24(%esp),%edx\n"
   668   "mov    0x28(%esp),%edi\n"
   669   "mov    0x2c(%esp),%esi\n"
   670   "mov    0x30(%esp),%ebp\n"
   671   "mov    0x3c(%esp),%ecx\n"
   672   "xor    %ebx,%ebx\n"
   673   "jmp    1f\n"
         // Main loop: two output pixels per iteration.
   675 "0:"
   676   "mov    %ebx,%eax\n"
   677   "sar    $0x11,%eax\n"
   678   "movzbl (%edi,%eax,1),%eax\n"
   679   "movq   2048(%ecx,%eax,8),%mm0\n"
   680   "mov    %ebx,%eax\n"
   681   "sar    $0x11,%eax\n"
   682   "movzbl (%esi,%eax,1),%eax\n"
   683   "paddsw 4096(%ecx,%eax,8),%mm0\n"
   684   "mov    %ebx,%eax\n"
   685   "add    0x38(%esp),%ebx\n"
   686   "sar    $0x10,%eax\n"
   687   "movzbl (%edx,%eax,1),%eax\n"
   688   "movq   0(%ecx,%eax,8),%mm1\n"
   689   "mov    %ebx,%eax\n"
   690   "add    0x38(%esp),%ebx\n"
   691   "sar    $0x10,%eax\n"
   692   "movzbl (%edx,%eax,1),%eax\n"
   693   "movq   0(%ecx,%eax,8),%mm2\n"
   694   "paddsw %mm0,%mm1\n"
   695   "paddsw %mm0,%mm2\n"
   696   "psraw  $0x6,%mm1\n"
   697   "psraw  $0x6,%mm2\n"
   698   "packuswb %mm2,%mm1\n"
   699   "movntq %mm1,0x0(%ebp)\n"
   700   "add    $0x8,%ebp\n"
   701 "1:"
   702   "subl   $0x2,0x34(%esp)\n"
   703   "jns    0b\n"
         // The stacked width is now -1 (one pixel left) or -2 (none).
   705   "andl   $0x1,0x34(%esp)\n"
   706   "je     2f\n"
   708   "mov    %ebx,%eax\n"
   709   "sar    $0x11,%eax\n"
   710   "movzbl (%edi,%eax,1),%eax\n"
   711   "movq   2048(%ecx,%eax,8),%mm0\n"
   712   "mov    %ebx,%eax\n"
   713   "sar    $0x11,%eax\n"
   714   "movzbl (%esi,%eax,1),%eax\n"
   715   "paddsw 4096(%ecx,%eax,8),%mm0\n"
   716   "mov    %ebx,%eax\n"
   717   "sar    $0x10,%eax\n"
   718   "movzbl (%edx,%eax,1),%eax\n"
   719   "movq   0(%ecx,%eax,8),%mm1\n"
   720   "paddsw %mm0,%mm1\n"
   721   "psraw  $0x6,%mm1\n"
   722   "packuswb %mm1,%mm1\n"
   723   "movd   %mm1,0x0(%ebp)\n"
   725 "2:"
   726   "popa\n"
   727   "ret\n"
   728 #if !defined(XP_MACOSX)
   729   ".previous\n"
   730 #endif
   731 );
   // Point-sampling scaled row conversion entry point for PIC 32-bit x86
   // builds. When mozilla::supports_sse() reports SSE, calls the PIC assembly
   // routine with the address of the first kCoefficientsRgbY element as an
   // extra argument and returns; otherwise falls back to ScaleYUVToRGB32Row_C.
   733 void ScaleYUVToRGB32Row(const uint8* y_buf,
   734                         const uint8* u_buf,
   735                         const uint8* v_buf,
   736                         uint8* rgb_buf,
   737                         int width,
   738                         int source_dx)
   739 {
   740   if (mozilla::supports_sse()) {
   741     PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
   742                               &kCoefficientsRgbY[0][0]);
   743     return;
   744   }
   746   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   747 }
   // PIC 32-bit x86 linearly interpolating scaler. The prototype is declared
   // here and the body is supplied by the file-scope asm block that follows.
   //
   // The lookup table is received as an explicit argument (0x3c(%esp) -> edi).
   // After "pusha": y_buf -> edx, rgb_buf -> ebp. u_buf and v_buf are reloaded
   // from 0x28(%esp) / 0x2c(%esp) into esi on every iteration. The width slot
   // at 0x34(%esp) is overwritten with width * source_dx and serves as the
   // loop bound for the fixed-point source position kept in ebx. Samples are
   // blended from the bytes at index i and i + 1 (Y: i = x >> 16, weight
   // x & 0xffff; U/V: i = x >> 17, weight x & 0x1fffe) before the lookup.
   // NOTE(review): uses MMX registers without emms - presumably handled by the
   // caller; confirm.
   749 void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
   750                                      const uint8* u_buf,
   751                                      const uint8* v_buf,
   752                                      uint8* rgb_buf,
   753                                      int width,
   754                                      int source_dx,
   755                                      int16 *kCoefficientsRgbY);
   757   asm(
   758   ".text\n"
   759 #if defined(XP_MACOSX)
   760 "_PICLinearScaleYUVToRGB32Row_SSE:\n"
   761 #else
   762 "PICLinearScaleYUVToRGB32Row_SSE:\n"
   763 #endif
   764   "pusha\n"
   765   "mov    0x24(%esp),%edx\n"
   766   "mov    0x30(%esp),%ebp\n"
   767   "mov    0x34(%esp),%ecx\n"
   768   "mov    0x3c(%esp),%edi\n"
   769   "xor    %ebx,%ebx\n"
   771   // source_width = width * source_dx + ebx
         // NOTE(review): the instructions below store only width * source_dx;
         // ebx is not added. The ecx load and the ebx clear just above are
         // repeated by the following lines.
   772   "mov    0x34(%esp), %ecx\n"
   773   "imull  0x38(%esp), %ecx\n"
   774   "mov    %ecx, 0x34(%esp)\n"
   776   "mov    0x38(%esp), %ecx\n"
   777   "xor    %ebx,%ebx\n"     // x = 0
   778   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
   779   "jl     1f\n"
   780   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
   781   "jmp    1f\n"
         // Main loop: up to two output pixels per iteration.
   783 "0:"
         // Interpolated U sample -> table entry in mm0.
   784   "mov    0x28(%esp),%esi\n"
   785   "mov    %ebx,%eax\n"
   786   "sar    $0x11,%eax\n"
   788   "movzbl (%esi,%eax,1),%ecx\n"
   789   "movzbl 1(%esi,%eax,1),%esi\n"
   790   "mov    %ebx,%eax\n"
   791   "andl   $0x1fffe, %eax \n"
   792   "imul   %eax, %esi \n"
   793   "xorl   $0x1fffe, %eax \n"
   794   "imul   %eax, %ecx \n"
   795   "addl   %esi, %ecx \n"
   796   "shrl   $17, %ecx \n"
   797   "movq   2048(%edi,%ecx,8),%mm0\n"
         // Interpolated V sample, added into mm0.
   799   "mov    0x2c(%esp),%esi\n"
   800   "mov    %ebx,%eax\n"
   801   "sar    $0x11,%eax\n"
   803   "movzbl (%esi,%eax,1),%ecx\n"
   804   "movzbl 1(%esi,%eax,1),%esi\n"
   805   "mov    %ebx,%eax\n"
   806   "andl   $0x1fffe, %eax \n"
   807   "imul   %eax, %esi \n"
   808   "xorl   $0x1fffe, %eax \n"
   809   "imul   %eax, %ecx \n"
   810   "addl   %esi, %ecx \n"
   811   "shrl   $17, %ecx \n"
   812   "paddsw 4096(%edi,%ecx,8),%mm0\n"
         // Interpolated Y sample for the first pixel -> mm1.
   814   "mov    %ebx,%eax\n"
   815   "sar    $0x10,%eax\n"
   816   "movzbl (%edx,%eax,1),%ecx\n"
   817   "movzbl 1(%edx,%eax,1),%esi\n"
   818   "mov    %ebx,%eax\n"
   819   "add    0x38(%esp),%ebx\n"
   820   "andl   $0xffff, %eax \n"
   821   "imul   %eax, %esi \n"
   822   "xorl   $0xffff, %eax \n"
   823   "imul   %eax, %ecx \n"
   824   "addl   %esi, %ecx \n"
   825   "shrl   $16, %ecx \n"
   826   "movq   (%edi,%ecx,8),%mm1\n"
         // If the bound has been reached, finish with a single pixel at 2:.
   828   "cmp    0x34(%esp), %ebx\n"
   829   "jge    2f\n"
         // Interpolated Y sample for the second pixel -> mm2.
   831   "mov    %ebx,%eax\n"
   832   "sar    $0x10,%eax\n"
   833   "movzbl (%edx,%eax,1),%ecx\n"
   834   "movzbl 1(%edx,%eax,1),%esi\n"
   835   "mov    %ebx,%eax\n"
   836   "add    0x38(%esp),%ebx\n"
   837   "andl   $0xffff, %eax \n"
   838   "imul   %eax, %esi \n"
   839   "xorl   $0xffff, %eax \n"
   840   "imul   %eax, %ecx \n"
   841   "addl   %esi, %ecx \n"
   842   "shrl   $16, %ecx \n"
   843   "movq   (%edi,%ecx,8),%mm2\n"
   845   "paddsw %mm0,%mm1\n"
   846   "paddsw %mm0,%mm2\n"
   847   "psraw  $0x6,%mm1\n"
   848   "psraw  $0x6,%mm2\n"
   849   "packuswb %mm2,%mm1\n"
   850   "movntq %mm1,0x0(%ebp)\n"
   851   "add    $0x8,%ebp\n"
         // Continue while width * source_dx > x.
   853 "1:"
   854   "cmp    %ebx, 0x34(%esp)\n"
   855   "jg     0b\n"
   856   "popa\n"
   857   "ret\n"
         // Final single pixel: mm0 holds the chroma sum, mm1 the Y entry.
   859 "2:"
   860   "paddsw %mm0, %mm1\n"
   861   "psraw $6, %mm1\n"
   862   "packuswb %mm1, %mm1\n"
   863   "movd %mm1, (%ebp)\n"
   864   "popa\n"
   865   "ret\n"
   866 #if !defined(XP_MACOSX)
   867   ".previous\n"
   868 #endif
   869 );
   // Linearly interpolating scaled row conversion entry point for PIC 32-bit
   // x86 builds. When mozilla::supports_sse() reports SSE, calls the PIC
   // assembly routine with the address of the first kCoefficientsRgbY element
   // as an extra argument and returns; otherwise falls back to
   // LinearScaleYUVToRGB32Row_C.
   872 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   873                               const uint8* u_buf,
   874                               const uint8* v_buf,
   875                               uint8* rgb_buf,
   876                               int width,
   877                               int source_dx)
   878 {
   879   if (mozilla::supports_sse()) {
   880     PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
   881                                     source_dx, &kCoefficientsRgbY[0][0]);
   882     return;
   883   }
   885   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   886 }
   887 #else
   // Portable fallback used when none of the assembly variants above is
   // compiled in: forwards to FastConvertYUVToRGB32Row_C, passing 1 as its
   // final argument.
   888 void FastConvertYUVToRGB32Row(const uint8* y_buf,
   889                               const uint8* u_buf,
   890                               const uint8* v_buf,
   891                               uint8* rgb_buf,
   892                               int width) {
   893   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
   894 }
   // Portable fallback used when none of the assembly variants above is
   // compiled in: forwards all arguments to ScaleYUVToRGB32Row_C.
   896 void ScaleYUVToRGB32Row(const uint8* y_buf,
   897                         const uint8* u_buf,
   898                         const uint8* v_buf,
   899                         uint8* rgb_buf,
   900                         int width,
   901                         int source_dx) {
   902   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   903 }
   // Portable fallback used when none of the assembly variants above is
   // compiled in: forwards all arguments to LinearScaleYUVToRGB32Row_C.
   905 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   906                               const uint8* u_buf,
   907                               const uint8* v_buf,
   908                               uint8* rgb_buf,
   909                               int width,
   910                               int source_dx) {
   911   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
   912 }
   913 #endif
   915 }

mercurial