media/libyuv/source/scale_posix.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for development purposes.

     1 /*
     2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/row.h"
    13 #ifdef __cplusplus
    14 namespace libyuv {
    15 extern "C" {
    16 #endif
    18 // This module is for GCC x86 and x64.
    19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
    21 // Offsets for source bytes 0 to 9
    22 static uvec8 kShuf0 =
    23   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
    25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
    26 static uvec8 kShuf1 =
    27   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
    29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    30 static uvec8 kShuf2 =
    31   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
    33 // Offsets for source bytes 0 to 10
    34 static uvec8 kShuf01 =
    35   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
    37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
    38 static uvec8 kShuf11 =
    39   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
    41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    42 static uvec8 kShuf21 =
    43   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
    45 // Coefficients for source bytes 0 to 10
    46 static uvec8 kMadd01 =
    47   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
    49 // Coefficients for source bytes 10 to 21
    50 static uvec8 kMadd11 =
    51   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
    53 // Coefficients for source bytes 21 to 31
    54 static uvec8 kMadd21 =
    55   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
    57 // Coefficients for source bytes 21 to 31
    58 static vec16 kRound34 =
    59   { 2, 2, 2, 2, 2, 2, 2, 2 };
    61 static uvec8 kShuf38a =
    62   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    64 static uvec8 kShuf38b =
    65   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
    67 // Arrange words 0,3,6 into 0,1,2
    68 static uvec8 kShufAc =
    69   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    71 // Arrange words 0,3,6 into 3,4,5
    72 static uvec8 kShufAc3 =
    73   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
    75 // Scaling values for boxes of 3x3 and 2x3
    76 static uvec16 kScaleAc33 =
    77   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
    79 // Arrange first value for pixels 0,1,2,3,4,5
    80 static uvec8 kShufAb0 =
    81   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
    83 // Arrange second value for pixels 0,1,2,3,4,5
    84 static uvec8 kShufAb1 =
    85   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
    87 // Arrange third value for pixels 0,1,2,3,4,5
    88 static uvec8 kShufAb2 =
    89   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
    91 // Scaling values for boxes of 3x2 and 2x2
    92 static uvec16 kScaleAb2 =
    93   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
    95 // GCC versions of row functions are verbatim conversions from Visual C.
    96 // Generated using gcc disassembly on Visual C object file:
    97 // objdump -D yuvscaler.obj >yuvscaler.txt
    99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   100                         uint8* dst_ptr, int dst_width) {
   101   asm volatile (
   102     LABELALIGN
   103   "1:                                          \n"
   104     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   105     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   106     "lea       " MEMLEA(0x20,0) ",%0           \n"
   107     "psrlw     $0x8,%%xmm0                     \n"
   108     "psrlw     $0x8,%%xmm1                     \n"
   109     "packuswb  %%xmm1,%%xmm0                   \n"
   110     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   111     "lea       " MEMLEA(0x10,1) ",%1           \n"
   112     "sub       $0x10,%2                        \n"
   113     "jg        1b                              \n"
   114   : "+r"(src_ptr),    // %0
   115     "+r"(dst_ptr),    // %1
   116     "+r"(dst_width)   // %2
   117   :
   118   : "memory", "cc"
   119 #if defined(__SSE2__)
   120     , "xmm0", "xmm1"
   121 #endif
   122   );
   123 }
   125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   126                               uint8* dst_ptr, int dst_width) {
   127   asm volatile (
   128     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   129     "psrlw     $0x8,%%xmm5                     \n"
   131     LABELALIGN
   132   "1:                                          \n"
   133     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   134     "movdqa    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
   135     "lea       " MEMLEA(0x20,0) ",%0           \n"
   136     "movdqa    %%xmm0,%%xmm2                   \n"
   137     "psrlw     $0x8,%%xmm0                     \n"
   138     "movdqa    %%xmm1,%%xmm3                   \n"
   139     "psrlw     $0x8,%%xmm1                     \n"
   140     "pand      %%xmm5,%%xmm2                   \n"
   141     "pand      %%xmm5,%%xmm3                   \n"
   142     "pavgw     %%xmm2,%%xmm0                   \n"
   143     "pavgw     %%xmm3,%%xmm1                   \n"
   144     "packuswb  %%xmm1,%%xmm0                   \n"
   145     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   146     "lea       " MEMLEA(0x10,1) ",%1           \n"
   147     "sub       $0x10,%2                        \n"
   148     "jg        1b                              \n"
   149   : "+r"(src_ptr),    // %0
   150     "+r"(dst_ptr),    // %1
   151     "+r"(dst_width)   // %2
   152   :
   153   : "memory", "cc"
   154 #if defined(__SSE2__)
   155     , "xmm0", "xmm1", "xmm5"
   156 #endif
   157   );
   158 }
   160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   161                            uint8* dst_ptr, int dst_width) {
   162   asm volatile (
   163     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   164     "psrlw     $0x8,%%xmm5                     \n"
   166     LABELALIGN
   167   "1:                                          \n"
   168     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   169     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   170     MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
   171     BUNDLEALIGN
   172     MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
   173     "lea       " MEMLEA(0x20,0) ",%0           \n"
   174     "pavgb     %%xmm2,%%xmm0                   \n"
   175     "pavgb     %%xmm3,%%xmm1                   \n"
   176     "movdqa    %%xmm0,%%xmm2                   \n"
   177     "psrlw     $0x8,%%xmm0                     \n"
   178     "movdqa    %%xmm1,%%xmm3                   \n"
   179     "psrlw     $0x8,%%xmm1                     \n"
   180     "pand      %%xmm5,%%xmm2                   \n"
   181     "pand      %%xmm5,%%xmm3                   \n"
   182     "pavgw     %%xmm2,%%xmm0                   \n"
   183     "pavgw     %%xmm3,%%xmm1                   \n"
   184     "packuswb  %%xmm1,%%xmm0                   \n"
   185     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   186     "lea       " MEMLEA(0x10,1) ",%1           \n"
   187     "sub       $0x10,%2                        \n"
   188     "jg        1b                              \n"
   189   : "+r"(src_ptr),    // %0
   190     "+r"(dst_ptr),    // %1
   191     "+r"(dst_width)   // %2
   192   : "r"((intptr_t)(src_stride))   // %3
   193   : "memory", "cc"
   194 #if defined(__native_client__) && defined(__x86_64__)
   195     , "r14"
   196 #endif
   197 #if defined(__SSE2__)
   198     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   199 #endif
   200   );
   201 }
   203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   204                                   uint8* dst_ptr, int dst_width) {
   205   asm volatile (
   206     LABELALIGN
   207   "1:                                          \n"
   208     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   209     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   210     "lea       " MEMLEA(0x20,0) ",%0           \n"
   211     "psrlw     $0x8,%%xmm0                     \n"
   212     "psrlw     $0x8,%%xmm1                     \n"
   213     "packuswb  %%xmm1,%%xmm0                   \n"
   214     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   215     "lea       " MEMLEA(0x10,1) ",%1           \n"
   216     "sub       $0x10,%2                        \n"
   217     "jg        1b                              \n"
   218   : "+r"(src_ptr),    // %0
   219     "+r"(dst_ptr),    // %1
   220     "+r"(dst_width)   // %2
   221   :
   222   : "memory", "cc"
   223 #if defined(__SSE2__)
   224     , "xmm0", "xmm1"
   225 #endif
   226   );
   227 }
   229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
   230                                         ptrdiff_t src_stride,
   231                                         uint8* dst_ptr, int dst_width) {
   232   asm volatile (
   233     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   234     "psrlw     $0x8,%%xmm5                     \n"
   236     LABELALIGN
   237   "1:                                          \n"
   238     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   239     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   240     "lea       " MEMLEA(0x20,0) ",%0           \n"
   241     "movdqa    %%xmm0,%%xmm2                   \n"
   242     "psrlw     $0x8,%%xmm0                     \n"
   243     "movdqa    %%xmm1,%%xmm3                   \n"
   244     "psrlw     $0x8,%%xmm1                     \n"
   245     "pand      %%xmm5,%%xmm2                   \n"
   246     "pand      %%xmm5,%%xmm3                   \n"
   247     "pavgw     %%xmm2,%%xmm0                   \n"
   248     "pavgw     %%xmm3,%%xmm1                   \n"
   249     "packuswb  %%xmm1,%%xmm0                   \n"
   250     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   251     "lea       " MEMLEA(0x10,1) ",%1           \n"
   252     "sub       $0x10,%2                        \n"
   253     "jg        1b                              \n"
   254   : "+r"(src_ptr),    // %0
   255     "+r"(dst_ptr),    // %1
   256     "+r"(dst_width)   // %2
   257   :
   258   : "memory", "cc"
   259 #if defined(__SSE2__)
   260     , "xmm0", "xmm1", "xmm5"
   261 #endif
   262   );
   263 }
   265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
   266                                      ptrdiff_t src_stride,
   267                                      uint8* dst_ptr, int dst_width) {
   268   asm volatile (
   269     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   270     "psrlw     $0x8,%%xmm5                     \n"
   272     LABELALIGN
   273   "1:                                          \n"
   274     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   275     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   276     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
   277     BUNDLEALIGN
   278     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
   279     "lea       " MEMLEA(0x20,0) ",%0           \n"
   280     "pavgb     %%xmm2,%%xmm0                   \n"
   281     "pavgb     %%xmm3,%%xmm1                   \n"
   282     "movdqa    %%xmm0,%%xmm2                   \n"
   283     "psrlw     $0x8,%%xmm0                     \n"
   284     "movdqa    %%xmm1,%%xmm3                   \n"
   285     "psrlw     $0x8,%%xmm1                     \n"
   286     "pand      %%xmm5,%%xmm2                   \n"
   287     "pand      %%xmm5,%%xmm3                   \n"
   288     "pavgw     %%xmm2,%%xmm0                   \n"
   289     "pavgw     %%xmm3,%%xmm1                   \n"
   290     "packuswb  %%xmm1,%%xmm0                   \n"
   291     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   292     "lea       " MEMLEA(0x10,1) ",%1           \n"
   293     "sub       $0x10,%2                        \n"
   294     "jg        1b                              \n"
   295   : "+r"(src_ptr),    // %0
   296     "+r"(dst_ptr),    // %1
   297     "+r"(dst_width)   // %2
   298   : "r"((intptr_t)(src_stride))   // %3
   299   : "memory", "cc"
   300 #if defined(__native_client__) && defined(__x86_64__)
   301     , "r14"
   302 #endif
   303 #if defined(__SSE2__)
   304     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   305 #endif
   306   );
   307 }
   309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   310                         uint8* dst_ptr, int dst_width) {
   311   asm volatile (
   312     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   313     "psrld     $0x18,%%xmm5                    \n"
   314     "pslld     $0x10,%%xmm5                    \n"
   316     LABELALIGN
   317   "1:                                          \n"
   318     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   319     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   320     "lea       " MEMLEA(0x20,0) ",%0           \n"
   321     "pand      %%xmm5,%%xmm0                   \n"
   322     "pand      %%xmm5,%%xmm1                   \n"
   323     "packuswb  %%xmm1,%%xmm0                   \n"
   324     "psrlw     $0x8,%%xmm0                     \n"
   325     "packuswb  %%xmm0,%%xmm0                   \n"
   326     "movq      %%xmm0," MEMACCESS(1) "         \n"
   327     "lea       " MEMLEA(0x8,1) ",%1            \n"
   328     "sub       $0x8,%2                         \n"
   329     "jg        1b                              \n"
   330   : "+r"(src_ptr),    // %0
   331     "+r"(dst_ptr),    // %1
   332     "+r"(dst_width)   // %2
   333   :
   334   : "memory", "cc"
   335 #if defined(__SSE2__)
   336     , "xmm0", "xmm1", "xmm5"
   337 #endif
   338   );
   339 }
   341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   342                            uint8* dst_ptr, int dst_width) {
   343   intptr_t stridex3 = 0;
   344   asm volatile (
   345     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   346     "psrlw     $0x8,%%xmm7                     \n"
   347     "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
   349     LABELALIGN
   350   "1:                                          \n"
   351     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   352     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   353     MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
   354     BUNDLEALIGN
   355     MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
   356     "pavgb     %%xmm2,%%xmm0                   \n"
   357     "pavgb     %%xmm3,%%xmm1                   \n"
   358     MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
   359     BUNDLEALIGN
   360     MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
   361     MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
   362     MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
   363     "lea       " MEMLEA(0x20,0) ",%0           \n"
   364     "pavgb     %%xmm4,%%xmm2                   \n"
   365     "pavgb     %%xmm2,%%xmm0                   \n"
   366     "pavgb     %%xmm5,%%xmm3                   \n"
   367     "pavgb     %%xmm3,%%xmm1                   \n"
   368     "movdqa    %%xmm0,%%xmm2                   \n"
   369     "psrlw     $0x8,%%xmm0                     \n"
   370     "movdqa    %%xmm1,%%xmm3                   \n"
   371     "psrlw     $0x8,%%xmm1                     \n"
   372     "pand      %%xmm7,%%xmm2                   \n"
   373     "pand      %%xmm7,%%xmm3                   \n"
   374     "pavgw     %%xmm2,%%xmm0                   \n"
   375     "pavgw     %%xmm3,%%xmm1                   \n"
   376     "packuswb  %%xmm1,%%xmm0                   \n"
   377     "movdqa    %%xmm0,%%xmm2                   \n"
   378     "psrlw     $0x8,%%xmm0                     \n"
   379     "pand      %%xmm7,%%xmm2                   \n"
   380     "pavgw     %%xmm2,%%xmm0                   \n"
   381     "packuswb  %%xmm0,%%xmm0                   \n"
   382     "movq      %%xmm0," MEMACCESS(1) "         \n"
   383     "lea       " MEMLEA(0x8,1) ",%1            \n"
   384     "sub       $0x8,%2                         \n"
   385     "jg        1b                              \n"
   386   : "+r"(src_ptr),     // %0
   387     "+r"(dst_ptr),     // %1
   388     "+r"(dst_width),   // %2
   389     "+r"(stridex3)     // %3
   390   : "r"((intptr_t)(src_stride))    // %4
   391   : "memory", "cc"
   392 #if defined(__native_client__) && defined(__x86_64__)
   393     , "r14"
   394 #endif
   395 #if defined(__SSE2__)
   396     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
   397 #endif
   398   );
   399 }
   401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   402                           uint8* dst_ptr, int dst_width) {
   403   asm volatile (
   404     "movdqa    %0,%%xmm3                       \n"
   405     "movdqa    %1,%%xmm4                       \n"
   406     "movdqa    %2,%%xmm5                       \n"
   407   :
   408   : "m"(kShuf0),  // %0
   409     "m"(kShuf1),  // %1
   410     "m"(kShuf2)   // %2
   411   );
   412   asm volatile (
   413     LABELALIGN
   414   "1:                                          \n"
   415     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   416     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
   417     "lea       " MEMLEA(0x20,0) ",%0           \n"
   418     "movdqa    %%xmm2,%%xmm1                   \n"
   419     "palignr   $0x8,%%xmm0,%%xmm1              \n"
   420     "pshufb    %%xmm3,%%xmm0                   \n"
   421     "pshufb    %%xmm4,%%xmm1                   \n"
   422     "pshufb    %%xmm5,%%xmm2                   \n"
   423     "movq      %%xmm0," MEMACCESS(1) "         \n"
   424     "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
   425     "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
   426     "lea       " MEMLEA(0x18,1) ",%1           \n"
   427     "sub       $0x18,%2                        \n"
   428     "jg        1b                              \n"
   429   : "+r"(src_ptr),   // %0
   430     "+r"(dst_ptr),   // %1
   431     "+r"(dst_width)  // %2
   432   :
   433   : "memory", "cc"
   434 #if defined(__SSE2__)
   435     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   436 #endif
   437   );
   438 }
   440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
   441                                 ptrdiff_t src_stride,
   442                                 uint8* dst_ptr, int dst_width) {
   443   asm volatile (
   444     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   445     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   446     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   447   :
   448   : "m"(kShuf01),  // %0
   449     "m"(kShuf11),  // %1
   450     "m"(kShuf21)   // %2
   451   );
   452   asm volatile (
   453     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   454     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   455     "movdqa    %2,%%xmm1                       \n"  // kRound34
   456   :
   457   : "m"(kMadd01),  // %0
   458     "m"(kMadd11),  // %1
   459     "m"(kRound34)  // %2
   460   );
   461   asm volatile (
   462     LABELALIGN
   463   "1:                                          \n"
   464     "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
   465     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
   466     "pavgb     %%xmm7,%%xmm6                   \n"
   467     "pshufb    %%xmm2,%%xmm6                   \n"
   468     "pmaddubsw %%xmm5,%%xmm6                   \n"
   469     "paddsw    %%xmm1,%%xmm6                   \n"
   470     "psrlw     $0x2,%%xmm6                     \n"
   471     "packuswb  %%xmm6,%%xmm6                   \n"
   472     "movq      %%xmm6," MEMACCESS(1) "         \n"
   473     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
   474     MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
   475     "pavgb     %%xmm7,%%xmm6                   \n"
   476     "pshufb    %%xmm3,%%xmm6                   \n"
   477     "pmaddubsw %%xmm0,%%xmm6                   \n"
   478     "paddsw    %%xmm1,%%xmm6                   \n"
   479     "psrlw     $0x2,%%xmm6                     \n"
   480     "packuswb  %%xmm6,%%xmm6                   \n"
   481     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
   482     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
   483     BUNDLEALIGN
   484     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
   485     "lea       " MEMLEA(0x20,0) ",%0           \n"
   486     "pavgb     %%xmm7,%%xmm6                   \n"
   487     "pshufb    %%xmm4,%%xmm6                   \n"
   488     "pmaddubsw %4,%%xmm6                       \n"
   489     "paddsw    %%xmm1,%%xmm6                   \n"
   490     "psrlw     $0x2,%%xmm6                     \n"
   491     "packuswb  %%xmm6,%%xmm6                   \n"
   492     "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
   493     "lea       " MEMLEA(0x18,1) ",%1           \n"
   494     "sub       $0x18,%2                        \n"
   495     "jg        1b                              \n"
   496   : "+r"(src_ptr),   // %0
   497     "+r"(dst_ptr),   // %1
   498     "+r"(dst_width)  // %2
   499   : "r"((intptr_t)(src_stride)),  // %3
   500     "m"(kMadd21)     // %4
   501   : "memory", "cc"
   502 #if defined(__native_client__) && defined(__x86_64__)
   503     , "r14"
   504 #endif
   505 #if defined(__SSE2__)
   506     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   507 #endif
   508   );
   509 }
   511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
   512                                 ptrdiff_t src_stride,
   513                                 uint8* dst_ptr, int dst_width) {
   514   asm volatile (
   515     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   516     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   517     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   518   :
   519   : "m"(kShuf01),  // %0
   520     "m"(kShuf11),  // %1
   521     "m"(kShuf21)   // %2
   522   );
   523   asm volatile (
   524     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   525     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   526     "movdqa    %2,%%xmm1                       \n"  // kRound34
   527   :
   528   : "m"(kMadd01),  // %0
   529     "m"(kMadd11),  // %1
   530     "m"(kRound34)  // %2
   531   );
   533   asm volatile (
   534     LABELALIGN
   535   "1:                                          \n"
   536     "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
   537     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
   538     "pavgb     %%xmm6,%%xmm7                   \n"
   539     "pavgb     %%xmm7,%%xmm6                   \n"
   540     "pshufb    %%xmm2,%%xmm6                   \n"
   541     "pmaddubsw %%xmm5,%%xmm6                   \n"
   542     "paddsw    %%xmm1,%%xmm6                   \n"
   543     "psrlw     $0x2,%%xmm6                     \n"
   544     "packuswb  %%xmm6,%%xmm6                   \n"
   545     "movq      %%xmm6," MEMACCESS(1) "         \n"
   546     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
   547     MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
   548     "pavgb     %%xmm6,%%xmm7                   \n"
   549     "pavgb     %%xmm7,%%xmm6                   \n"
   550     "pshufb    %%xmm3,%%xmm6                   \n"
   551     "pmaddubsw %%xmm0,%%xmm6                   \n"
   552     "paddsw    %%xmm1,%%xmm6                   \n"
   553     "psrlw     $0x2,%%xmm6                     \n"
   554     "packuswb  %%xmm6,%%xmm6                   \n"
   555     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
   556     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
   557     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
   558     "lea       " MEMLEA(0x20,0) ",%0           \n"
   559     "pavgb     %%xmm6,%%xmm7                   \n"
   560     "pavgb     %%xmm7,%%xmm6                   \n"
   561     "pshufb    %%xmm4,%%xmm6                   \n"
   562     "pmaddubsw %4,%%xmm6                       \n"
   563     "paddsw    %%xmm1,%%xmm6                   \n"
   564     "psrlw     $0x2,%%xmm6                     \n"
   565     "packuswb  %%xmm6,%%xmm6                   \n"
   566     "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
   567     "lea       " MEMLEA(0x18,1) ",%1           \n"
   568     "sub       $0x18,%2                        \n"
   569     "jg        1b                              \n"
   570     : "+r"(src_ptr),   // %0
   571       "+r"(dst_ptr),   // %1
   572       "+r"(dst_width)  // %2
   573     : "r"((intptr_t)(src_stride)),  // %3
   574       "m"(kMadd21)     // %4
   575     : "memory", "cc"
   576 #if defined(__native_client__) && defined(__x86_64__)
   577     , "r14"
   578 #endif
   579 #if defined(__SSE2__)
   580     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   581 #endif
   582   );
   583 }
   585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   586                           uint8* dst_ptr, int dst_width) {
   587   asm volatile (
   588     "movdqa    %3,%%xmm4                       \n"
   589     "movdqa    %4,%%xmm5                       \n"
   591     LABELALIGN
   592   "1:                                          \n"
   593     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   594     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   595     "lea       " MEMLEA(0x20,0) ",%0           \n"
   596     "pshufb    %%xmm4,%%xmm0                   \n"
   597     "pshufb    %%xmm5,%%xmm1                   \n"
   598     "paddusb   %%xmm1,%%xmm0                   \n"
   599     "movq      %%xmm0," MEMACCESS(1) "         \n"
   600     "movhlps   %%xmm0,%%xmm1                   \n"
   601     "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
   602     "lea       " MEMLEA(0xc,1) ",%1            \n"
   603     "sub       $0xc,%2                         \n"
   604     "jg        1b                              \n"
   605   : "+r"(src_ptr),   // %0
   606     "+r"(dst_ptr),   // %1
   607     "+r"(dst_width)  // %2
   608   : "m"(kShuf38a),   // %3
   609     "m"(kShuf38b)    // %4
   610   : "memory", "cc"
   611 #if defined(__SSE2__)
   612       , "xmm0", "xmm1", "xmm4", "xmm5"
   613 #endif
   614   );
   615 }
   617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   618                                 ptrdiff_t src_stride,
   619                                 uint8* dst_ptr, int dst_width) {
   620   asm volatile (
   621     "movdqa    %0,%%xmm2                       \n"
   622     "movdqa    %1,%%xmm3                       \n"
   623     "movdqa    %2,%%xmm4                       \n"
   624     "movdqa    %3,%%xmm5                       \n"
   625   :
   626   : "m"(kShufAb0),   // %0
   627     "m"(kShufAb1),   // %1
   628     "m"(kShufAb2),   // %2
   629     "m"(kScaleAb2)   // %3
   630   );
   631   asm volatile (
   632     LABELALIGN
   633   "1:                                          \n"
   634     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   635     MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
   636     "lea       " MEMLEA(0x10,0) ",%0           \n"
   637     "movdqa    %%xmm0,%%xmm1                   \n"
   638     "pshufb    %%xmm2,%%xmm1                   \n"
   639     "movdqa    %%xmm0,%%xmm6                   \n"
   640     "pshufb    %%xmm3,%%xmm6                   \n"
   641     "paddusw   %%xmm6,%%xmm1                   \n"
   642     "pshufb    %%xmm4,%%xmm0                   \n"
   643     "paddusw   %%xmm0,%%xmm1                   \n"
   644     "pmulhuw   %%xmm5,%%xmm1                   \n"
   645     "packuswb  %%xmm1,%%xmm1                   \n"
   646     "sub       $0x6,%2                         \n"
   647     "movd      %%xmm1," MEMACCESS(1) "         \n"
   648     "psrlq     $0x10,%%xmm1                    \n"
   649     "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
   650     "lea       " MEMLEA(0x6,1) ",%1            \n"
   651     "jg        1b                              \n"
   652   : "+r"(src_ptr),     // %0
   653     "+r"(dst_ptr),     // %1
   654     "+r"(dst_width)    // %2
   655   : "r"((intptr_t)(src_stride))  // %3
   656   : "memory", "cc"
   657 #if defined(__native_client__) && defined(__x86_64__)
   658     , "r14"
   659 #endif
   660 #if defined(__SSE2__)
   661     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   662 #endif
   663   );
   664 }
   666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
   667                                 ptrdiff_t src_stride,
   668                                 uint8* dst_ptr, int dst_width) {
   669   asm volatile (
   670     "movdqa    %0,%%xmm2                       \n"
   671     "movdqa    %1,%%xmm3                       \n"
   672     "movdqa    %2,%%xmm4                       \n"
   673     "pxor      %%xmm5,%%xmm5                   \n"
   674   :
   675   : "m"(kShufAc),    // %0
   676     "m"(kShufAc3),   // %1
   677     "m"(kScaleAc33)  // %2
   678   );
   679   asm volatile (
   680     LABELALIGN
   681   "1:                                          \n"
   682     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   683     MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
   684     "movhlps   %%xmm0,%%xmm1                   \n"
   685     "movhlps   %%xmm6,%%xmm7                   \n"
   686     "punpcklbw %%xmm5,%%xmm0                   \n"
   687     "punpcklbw %%xmm5,%%xmm1                   \n"
   688     "punpcklbw %%xmm5,%%xmm6                   \n"
   689     "punpcklbw %%xmm5,%%xmm7                   \n"
   690     "paddusw   %%xmm6,%%xmm0                   \n"
   691     "paddusw   %%xmm7,%%xmm1                   \n"
   692     MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
   693     "lea       " MEMLEA(0x10,0) ",%0           \n"
   694     "movhlps   %%xmm6,%%xmm7                   \n"
   695     "punpcklbw %%xmm5,%%xmm6                   \n"
   696     "punpcklbw %%xmm5,%%xmm7                   \n"
   697     "paddusw   %%xmm6,%%xmm0                   \n"
   698     "paddusw   %%xmm7,%%xmm1                   \n"
   699     "movdqa    %%xmm0,%%xmm6                   \n"
   700     "psrldq    $0x2,%%xmm0                     \n"
   701     "paddusw   %%xmm0,%%xmm6                   \n"
   702     "psrldq    $0x2,%%xmm0                     \n"
   703     "paddusw   %%xmm0,%%xmm6                   \n"
   704     "pshufb    %%xmm2,%%xmm6                   \n"
   705     "movdqa    %%xmm1,%%xmm7                   \n"
   706     "psrldq    $0x2,%%xmm1                     \n"
   707     "paddusw   %%xmm1,%%xmm7                   \n"
   708     "psrldq    $0x2,%%xmm1                     \n"
   709     "paddusw   %%xmm1,%%xmm7                   \n"
   710     "pshufb    %%xmm3,%%xmm7                   \n"
   711     "paddusw   %%xmm7,%%xmm6                   \n"
   712     "pmulhuw   %%xmm4,%%xmm6                   \n"
   713     "packuswb  %%xmm6,%%xmm6                   \n"
   714     "sub       $0x6,%2                         \n"
   715     "movd      %%xmm6," MEMACCESS(1) "         \n"
   716     "psrlq     $0x10,%%xmm6                    \n"
   717     "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
   718     "lea       " MEMLEA(0x6,1) ",%1            \n"
   719     "jg        1b                              \n"
   720   : "+r"(src_ptr),    // %0
   721     "+r"(dst_ptr),    // %1
   722     "+r"(dst_width)   // %2
   723   : "r"((intptr_t)(src_stride))   // %3
   724   : "memory", "cc"
   725 #if defined(__native_client__) && defined(__x86_64__)
   726     , "r14"
   727 #endif
   728 #if defined(__SSE2__)
   729     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   730 #endif
   731   );
   732 }
// Sums src_height rows of bytes into 16-bit column sums, 16 columns per
// outer-loop iteration: dst_ptr[i] = sum over rows of src_ptr[i].  Used to
// accumulate rows for the box filter before scaling.
// xmm4 stays zero for byte->word unpacking.  Label 1 walks 16-byte column
// groups; label 2 adds the remaining src_height - 1 rows for the group.
// NOTE(review): paddusw saturates, so column sums clamp at 65535 —
// presumably src_height is bounded by the caller; confirm upstream.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;   // inner-loop row counter
  intptr_t tmp_src = 0; // saved column start so %0 can be rewound per group
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"
    "sub       $0x1,%5                         \n"  // first row handled by label 1

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "mov       %0,%3                           \n"  // remember column start
    "add       %6,%0                           \n"  // advance to next row
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"  // src_height == 1: store

    LABELALIGN
  "2:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"  // next 16 columns
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
// Bilinear column filtering. SSSE3 version.
// Horizontally scales one row of bytes: for each destination pixel the two
// neighboring source bytes at fixed-point position x (stepped by dx; the
// pixel index is the word at bits 16..31, the blend fraction comes from the
// low word shifted right by 9) are blended with pmaddubsw.  Two destination
// pixels per main-loop iteration (label 2) plus a one-pixel tail (label 29).
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // xmm2 = x
    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"  // fraction shuffle mask
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // xmm6 = 0x007f per word
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = integer part of x
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2*dx, 2*dx}
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = int part of x + dx

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"  // weights: f, 128-f pairs
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 bytes
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store final byte
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),           // %6
    "rm"(dx)           // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
// 2x horizontal upsample by pixel doubling: reads 16 source bytes and
// writes 32 destination bytes per iteration (the loop subtracts 0x20 from
// dst_width).  x and dx are unused — the step is a fixed 1:2.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate low 8 bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // duplicate high 8 bytes
    "sub       $0x20,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
// 1/2x ARGB downsample, point sampled: of every horizontal pair of pixels
// the second (odd) one is kept — shufps $0xdd selects the odd 32-bit lanes
// of the two loaded registers.  8 source pixels in, 4 out per iteration.
// src_stride is unused: this variant reads a single row.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep odd pixels
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
   920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
   921                                   ptrdiff_t src_stride,
   922                                   uint8* dst_argb, int dst_width) {
   923   asm volatile (
   924     LABELALIGN
   925   "1:                                          \n"
   926     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   927     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   928     "lea       " MEMLEA(0x20,0) ",%0           \n"
   929     "movdqa    %%xmm0,%%xmm2                   \n"
   930     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   931     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
   932     "pavgb     %%xmm2,%%xmm0                   \n"
   933     "sub       $0x4,%2                         \n"
   934     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   935     "lea       " MEMLEA(0x10,1) ",%1           \n"
   936     "jg        1b                              \n"
   937   : "+r"(src_argb),  // %0
   938     "+r"(dst_argb),  // %1
   939     "+r"(dst_width)  // %2
   940   :
   941   : "memory", "cc"
   942 #if defined(__SSE2__)
   943     , "xmm0", "xmm1"
   944 #endif
   945   );
   946 }
// 1/2x ARGB downsample with 2x2 box filter: averages the current row with
// the row at src_argb + src_stride (pavgb), then averages horizontal pixel
// pairs.  8 source pixels per row in, 4 destination pixels out per
// iteration.  NOTE(review): pavgb rounds up, so the result is a rounded
// approximation of the 4-pixel mean, not an exact average.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
   984 // Reads 4 pixels at a time.
   985 // Alignment requirement: dst_argb 16 byte aligned.
   986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
   987                                int src_stepx,
   988                                uint8* dst_argb, int dst_width) {
   989   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   990   intptr_t src_stepx_x12 = 0;
   991   asm volatile (
   992     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
   993     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
   994     LABELALIGN
   995   "1:                                          \n"
   996     "movd      " MEMACCESS(0) ",%%xmm0         \n"
   997     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
   998     "punpckldq %%xmm1,%%xmm0                   \n"
   999     BUNDLEALIGN
  1000     MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
  1001     MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
  1002     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
  1003     "punpckldq %%xmm3,%%xmm2                   \n"
  1004     "punpcklqdq %%xmm2,%%xmm0                  \n"
  1005     "sub       $0x4,%3                         \n"
  1006     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1007     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1008     "jg        1b                              \n"
  1009   : "+r"(src_argb),      // %0
  1010     "+r"(src_stepx_x4),  // %1
  1011     "+r"(dst_argb),      // %2
  1012     "+r"(dst_width),     // %3
  1013     "+r"(src_stepx_x12)  // %4
  1015   : "memory", "cc"
  1016 #if defined(__native_client__) && defined(__x86_64__)
  1017     , "r14"
  1018 #endif
  1019 #if defined(__SSE2__)
  1020     , "xmm0", "xmm1", "xmm2", "xmm3"
  1021 #endif
  1022   );
  1025 // Blends four 2x2 to 4x1.
  1026 // Alignment requirement: dst_argb 16 byte aligned.
  1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
  1028                                   ptrdiff_t src_stride, int src_stepx,
  1029                                   uint8* dst_argb, int dst_width) {
  1030   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  1031   intptr_t src_stepx_x12 = 0;
  1032   intptr_t row1 = (intptr_t)(src_stride);
  1033   asm volatile (
  1034     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
  1035     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
  1036     "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
  1038     LABELALIGN
  1039   "1:                                          \n"
  1040     "movq      " MEMACCESS(0) ",%%xmm0         \n"
  1041     MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
  1042     MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
  1043     BUNDLEALIGN
  1044     MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
  1045     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
  1046     "movq      " MEMACCESS(5) ",%%xmm2         \n"
  1047     BUNDLEALIGN
  1048     MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
  1049     MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
  1050     MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
  1051     "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
  1052     "pavgb     %%xmm2,%%xmm0                   \n"
  1053     "pavgb     %%xmm3,%%xmm1                   \n"
  1054     "movdqa    %%xmm0,%%xmm2                   \n"
  1055     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1056     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
  1057     "pavgb     %%xmm2,%%xmm0                   \n"
  1058     "sub       $0x4,%3                         \n"
  1059     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1060     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1061     "jg        1b                              \n"
  1062   : "+r"(src_argb),       // %0
  1063     "+r"(src_stepx_x4),   // %1
  1064     "+r"(dst_argb),       // %2
  1065     "+rm"(dst_width),     // %3
  1066     "+r"(src_stepx_x12),  // %4
  1067     "+r"(row1)            // %5
  1069   : "memory", "cc"
  1070 #if defined(__native_client__) && defined(__x86_64__)
  1071     , "r14"
  1072 #endif
  1073 #if defined(__SSE2__)
  1074     , "xmm0", "xmm1", "xmm2", "xmm3"
  1075 #endif
  1076   );
  1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
  1080                         int dst_width, int x, int dx) {
  1081   intptr_t x0 = 0, x1 = 0;
  1082   asm volatile (
  1083     "movd      %5,%%xmm2                       \n"
  1084     "movd      %6,%%xmm3                       \n"
  1085     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
  1086     "pshufd    $0x11,%%xmm3,%%xmm0             \n"
  1087     "paddd     %%xmm0,%%xmm2                   \n"
  1088     "paddd     %%xmm3,%%xmm3                   \n"
  1089     "pshufd    $0x5,%%xmm3,%%xmm0              \n"
  1090     "paddd     %%xmm0,%%xmm2                   \n"
  1091     "paddd     %%xmm3,%%xmm3                   \n"
  1092     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  1093     "pextrw    $0x1,%%xmm2,%k0                 \n"
  1094     "pextrw    $0x3,%%xmm2,%k1                 \n"
  1095     "cmp       $0x0,%4                         \n"
  1096     "jl        99f                             \n"
  1097     "sub       $0x4,%4                         \n"
  1098     "jl        49f                             \n"
  1100     LABELALIGN
  1101   "40:                                         \n"
  1102     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1103     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
  1104     "pextrw    $0x5,%%xmm2,%k0                 \n"
  1105     "pextrw    $0x7,%%xmm2,%k1                 \n"
  1106     "paddd     %%xmm3,%%xmm2                   \n"
  1107     "punpckldq %%xmm1,%%xmm0                   \n"
  1108     MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
  1109     MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
  1110     "pextrw    $0x1,%%xmm2,%k0                 \n"
  1111     "pextrw    $0x3,%%xmm2,%k1                 \n"
  1112     "punpckldq %%xmm4,%%xmm1                   \n"
  1113     "punpcklqdq %%xmm1,%%xmm0                  \n"
  1114     "sub       $0x4,%4                         \n"
  1115     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  1116     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1117     "jge       40b                             \n"
  1119   "49:                                         \n"
  1120     "test      $0x2,%4                         \n"
  1121     "je        29f                             \n"
  1122     BUNDLEALIGN
  1123     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1124     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
  1125     "pextrw    $0x5,%%xmm2,%k0                 \n"
  1126     "punpckldq %%xmm1,%%xmm0                   \n"
  1127     "movq      %%xmm0," MEMACCESS(2) "         \n"
  1128     "lea       " MEMLEA(0x8,2) ",%2            \n"
  1129   "29:                                         \n"
  1130     "test      $0x1,%4                         \n"
  1131     "je        99f                             \n"
  1132     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1133     "movd      %%xmm0," MEMACCESS(2) "         \n"
  1134   "99:                                         \n"
  1135   : "+a"(x0),          // %0
  1136     "+d"(x1),          // %1
  1137     "+r"(dst_argb),    // %2
  1138     "+r"(src_argb),    // %3
  1139     "+r"(dst_width)    // %4
  1140   : "rm"(x),           // %5
  1141     "rm"(dx)           // %6
  1142   : "memory", "cc"
  1143 #if defined(__native_client__) && defined(__x86_64__)
  1144     , "r14"
  1145 #endif
  1146 #if defined(__SSE2__)
  1147     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  1148 #endif
  1149   );
  1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
  1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
  1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
  1155                            int dst_width, int x, int dx) {
  1156   asm volatile (
  1157     LABELALIGN
  1158   "1:                                          \n"
  1159     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  1160     "lea       " MEMLEA(0x10,1) ",%1           \n"
  1161     "movdqa    %%xmm0,%%xmm1                   \n"
  1162     "punpckldq %%xmm0,%%xmm0                   \n"
  1163     "punpckhdq %%xmm1,%%xmm1                   \n"
  1164     "sub       $0x8,%2                         \n"
  1165     "movdqa    %%xmm0," MEMACCESS(0) "         \n"
  1166     "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
  1167     "lea       " MEMLEA(0x20,0) ",%0           \n"
  1168     "jg        1b                              \n"
  1170   : "+r"(dst_argb),    // %0
  1171     "+r"(src_argb),    // %1
  1172     "+r"(dst_width)    // %2
  1174   : "memory", "cc"
  1175 #if defined(__native_client__) && defined(__x86_64__)
  1176     , "r14"
  1177 #endif
  1178 #if defined(__SSE2__)
  1179     , "xmm0", "xmm1"
  1180 #endif
  1181   );
// Shuffle table for arranging 2 ARGB pixels into per-channel byte pairs
// (B0B1 G0G1 R0R1 A0A1 ...) so pmaddubsw can blend two pixels per channel.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
  1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
  1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
  1197                                int dst_width, int x, int dx) {
  1198   intptr_t x0 = 0, x1 = 0;
  1199   asm volatile (
  1200     "movdqa    %0,%%xmm4                       \n"
  1201     "movdqa    %1,%%xmm5                       \n"
  1203   : "m"(kShuffleColARGB),  // %0
  1204     "m"(kShuffleFractions)  // %1
  1205   );
  1207   asm volatile (
  1208     "movd      %5,%%xmm2                       \n"
  1209     "movd      %6,%%xmm3                       \n"
  1210     "pcmpeqb   %%xmm6,%%xmm6                   \n"
  1211     "psrlw     $0x9,%%xmm6                     \n"
  1212     "pextrw    $0x1,%%xmm2,%k3                 \n"
  1213     "sub       $0x2,%2                         \n"
  1214     "jl        29f                             \n"
  1215     "movdqa    %%xmm2,%%xmm0                   \n"
  1216     "paddd     %%xmm3,%%xmm0                   \n"
  1217     "punpckldq %%xmm0,%%xmm2                   \n"
  1218     "punpckldq %%xmm3,%%xmm3                   \n"
  1219     "paddd     %%xmm3,%%xmm3                   \n"
  1220     "pextrw    $0x3,%%xmm2,%k4                 \n"
  1222     LABELALIGN
  1223   "2:                                          \n"
  1224     "movdqa    %%xmm2,%%xmm1                   \n"
  1225     "paddd     %%xmm3,%%xmm2                   \n"
  1226     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
  1227     "psrlw     $0x9,%%xmm1                     \n"
  1228     BUNDLEALIGN
  1229     MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
  1230     "pshufb    %%xmm5,%%xmm1                   \n"
  1231     "pshufb    %%xmm4,%%xmm0                   \n"
  1232     "pxor      %%xmm6,%%xmm1                   \n"
  1233     "pmaddubsw %%xmm1,%%xmm0                   \n"
  1234     "psrlw     $0x7,%%xmm0                     \n"
  1235     "pextrw    $0x1,%%xmm2,%k3                 \n"
  1236     "pextrw    $0x3,%%xmm2,%k4                 \n"
  1237     "packuswb  %%xmm0,%%xmm0                   \n"
  1238     "movq      %%xmm0," MEMACCESS(0) "         \n"
  1239     "lea       " MEMLEA(0x8,0) ",%0            \n"
  1240     "sub       $0x2,%2                         \n"
  1241     "jge       2b                              \n"
  1243     LABELALIGN
  1244   "29:                                         \n"
  1245     "add       $0x1,%2                         \n"
  1246     "jl        99f                             \n"
  1247     "psrlw     $0x9,%%xmm2                     \n"
  1248     BUNDLEALIGN
  1249     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
  1250     "pshufb    %%xmm5,%%xmm2                   \n"
  1251     "pshufb    %%xmm4,%%xmm0                   \n"
  1252     "pxor      %%xmm6,%%xmm2                   \n"
  1253     "pmaddubsw %%xmm2,%%xmm0                   \n"
  1254     "psrlw     $0x7,%%xmm0                     \n"
  1255     "packuswb  %%xmm0,%%xmm0                   \n"
  1256     "movd      %%xmm0," MEMACCESS(0) "         \n"
  1258     LABELALIGN
  1259   "99:                                         \n"
  1260   : "+r"(dst_argb),    // %0
  1261     "+r"(src_argb),    // %1
  1262     "+rm"(dst_width),  // %2
  1263     "+r"(x0),          // %3
  1264     "+r"(x1)           // %4
  1265   : "rm"(x),           // %5
  1266     "rm"(dx)           // %6
  1267   : "memory", "cc"
  1268 #if defined(__native_client__) && defined(__x86_64__)
  1269     , "r14"
  1270 #endif
  1271 #if defined(__SSE2__)
  1272     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1273 #endif
  1274   );
  1277 // Divide num by div and return as 16.16 fixed point result.
  1278 int FixedDiv_X86(int num, int div) {
  1279   asm volatile (
  1280     "cdq                                       \n"
  1281     "shld      $0x10,%%eax,%%edx               \n"
  1282     "shl       $0x10,%%eax                     \n"
  1283     "idiv      %1                              \n"
  1284     "mov       %0, %%eax                       \n"
  1285     : "+a"(num)  // %0
  1286     : "c"(div)   // %1
  1287     : "memory", "cc", "edx"
  1288   );
  1289   return num;
  1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
  1293 int FixedDiv1_X86(int num, int div) {
  1294   asm volatile (
  1295     "cdq                                       \n"
  1296     "shld      $0x10,%%eax,%%edx               \n"
  1297     "shl       $0x10,%%eax                     \n"
  1298     "sub       $0x10001,%%eax                  \n"
  1299     "sbb       $0x0,%%edx                      \n"
  1300     "sub       $0x1,%1                         \n"
  1301     "idiv      %1                              \n"
  1302     "mov       %0, %%eax                       \n"
  1303     : "+a"(num)  // %0
  1304     : "c"(div)   // %1
  1305     : "memory", "cc", "edx"
  1306   );
  1307   return num;
  1310 #endif  // defined(__x86_64__) || defined(__i386__)
  1312 #ifdef __cplusplus
  1313 }  // extern "C"
  1314 }  // namespace libyuv
  1315 #endif

mercurial