media/libyuv/source/scale_posix.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for development purposes.

     1 /*
     2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/row.h"
    13 #ifdef __cplusplus
    14 namespace libyuv {
    15 extern "C" {
    16 #endif
    18 // This module is for GCC x86 and x64.
    19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
    21 // Offsets for source bytes 0 to 9
    22 static uvec8 kShuf0 =
    23   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
    25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
    26 static uvec8 kShuf1 =
    27   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
    29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    30 static uvec8 kShuf2 =
    31   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
    33 // Offsets for source bytes 0 to 10
    34 static uvec8 kShuf01 =
    35   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
    37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
    38 static uvec8 kShuf11 =
    39   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
    41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
    42 static uvec8 kShuf21 =
    43   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
    45 // Coefficients for source bytes 0 to 10
    46 static uvec8 kMadd01 =
    47   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
    49 // Coefficients for source bytes 10 to 21
    50 static uvec8 kMadd11 =
    51   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
    53 // Coefficients for source bytes 21 to 31
    54 static uvec8 kMadd21 =
    55   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
    57 // Coefficients for source bytes 21 to 31
    58 static vec16 kRound34 =
    59   { 2, 2, 2, 2, 2, 2, 2, 2 };
    61 static uvec8 kShuf38a =
    62   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    64 static uvec8 kShuf38b =
    65   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
    67 // Arrange words 0,3,6 into 0,1,2
    68 static uvec8 kShufAc =
    69   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    71 // Arrange words 0,3,6 into 3,4,5
    72 static uvec8 kShufAc3 =
    73   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
    75 // Scaling values for boxes of 3x3 and 2x3
    76 static uvec16 kScaleAc33 =
    77   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
    79 // Arrange first value for pixels 0,1,2,3,4,5
    80 static uvec8 kShufAb0 =
    81   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
    83 // Arrange second value for pixels 0,1,2,3,4,5
    84 static uvec8 kShufAb1 =
    85   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
    87 // Arrange third value for pixels 0,1,2,3,4,5
    88 static uvec8 kShufAb2 =
    89   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
    91 // Scaling values for boxes of 3x2 and 2x2
    92 static uvec16 kScaleAb2 =
    93   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
    95 // GCC versions of row functions are verbatim conversions from Visual C.
    96 // Generated using gcc disassembly on Visual C object file:
    97 // objdump -D yuvscaler.obj >yuvscaler.txt
    99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   100                         uint8* dst_ptr, int dst_width) {
   101   asm volatile (
   102     LABELALIGN
   103   "1:                                          \n"
   104     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   105     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   106     "lea       " MEMLEA(0x20,0) ",%0           \n"
   107     "psrlw     $0x8,%%xmm0                     \n"
   108     "psrlw     $0x8,%%xmm1                     \n"
   109     "packuswb  %%xmm1,%%xmm0                   \n"
   110     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   111     "lea       " MEMLEA(0x10,1) ",%1           \n"
   112     "sub       $0x10,%2                        \n"
   113     "jg        1b                              \n"
   114   : "+r"(src_ptr),    // %0
   115     "+r"(dst_ptr),    // %1
   116     "+r"(dst_width)   // %2
   117   :
   118   : "memory", "cc"
   119 #if defined(__SSE2__)
   120     , "xmm0", "xmm1"
   121 #endif
   122   );
   123 }
   125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   126                               uint8* dst_ptr, int dst_width) {
   127   asm volatile (
   128     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   129     "psrlw     $0x8,%%xmm5                     \n"
   131     LABELALIGN
   132   "1:                                          \n"
   133     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   134     "movdqa    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
   135     "lea       " MEMLEA(0x20,0) ",%0           \n"
   136     "movdqa    %%xmm0,%%xmm2                   \n"
   137     "psrlw     $0x8,%%xmm0                     \n"
   138     "movdqa    %%xmm1,%%xmm3                   \n"
   139     "psrlw     $0x8,%%xmm1                     \n"
   140     "pand      %%xmm5,%%xmm2                   \n"
   141     "pand      %%xmm5,%%xmm3                   \n"
   142     "pavgw     %%xmm2,%%xmm0                   \n"
   143     "pavgw     %%xmm3,%%xmm1                   \n"
   144     "packuswb  %%xmm1,%%xmm0                   \n"
   145     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   146     "lea       " MEMLEA(0x10,1) ",%1           \n"
   147     "sub       $0x10,%2                        \n"
   148     "jg        1b                              \n"
   149   : "+r"(src_ptr),    // %0
   150     "+r"(dst_ptr),    // %1
   151     "+r"(dst_width)   // %2
   152   :
   153   : "memory", "cc"
   154 #if defined(__SSE2__)
   155     , "xmm0", "xmm1", "xmm5"
   156 #endif
   157   );
   158 }
   160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   161                            uint8* dst_ptr, int dst_width) {
   162   asm volatile (
   163     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   164     "psrlw     $0x8,%%xmm5                     \n"
   166     LABELALIGN
   167   "1:                                          \n"
   168     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   169     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   170     MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
   171     BUNDLEALIGN
   172     MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
   173     "lea       " MEMLEA(0x20,0) ",%0           \n"
   174     "pavgb     %%xmm2,%%xmm0                   \n"
   175     "pavgb     %%xmm3,%%xmm1                   \n"
   176     "movdqa    %%xmm0,%%xmm2                   \n"
   177     "psrlw     $0x8,%%xmm0                     \n"
   178     "movdqa    %%xmm1,%%xmm3                   \n"
   179     "psrlw     $0x8,%%xmm1                     \n"
   180     "pand      %%xmm5,%%xmm2                   \n"
   181     "pand      %%xmm5,%%xmm3                   \n"
   182     "pavgw     %%xmm2,%%xmm0                   \n"
   183     "pavgw     %%xmm3,%%xmm1                   \n"
   184     "packuswb  %%xmm1,%%xmm0                   \n"
   185     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   186     "lea       " MEMLEA(0x10,1) ",%1           \n"
   187     "sub       $0x10,%2                        \n"
   188     "jg        1b                              \n"
   189   : "+r"(src_ptr),    // %0
   190     "+r"(dst_ptr),    // %1
   191     "+r"(dst_width)   // %2
   192   : "r"((intptr_t)(src_stride))   // %3
   193   : "memory", "cc"
   194 #if defined(__native_client__) && defined(__x86_64__)
   195     , "r14"
   196 #endif
   197 #if defined(__SSE2__)
   198     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   199 #endif
   200   );
   201 }
   203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   204                                   uint8* dst_ptr, int dst_width) {
   205   asm volatile (
   206     LABELALIGN
   207   "1:                                          \n"
   208     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   209     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   210     "lea       " MEMLEA(0x20,0) ",%0           \n"
   211     "psrlw     $0x8,%%xmm0                     \n"
   212     "psrlw     $0x8,%%xmm1                     \n"
   213     "packuswb  %%xmm1,%%xmm0                   \n"
   214     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   215     "lea       " MEMLEA(0x10,1) ",%1           \n"
   216     "sub       $0x10,%2                        \n"
   217     "jg        1b                              \n"
   218   : "+r"(src_ptr),    // %0
   219     "+r"(dst_ptr),    // %1
   220     "+r"(dst_width)   // %2
   221   :
   222   : "memory", "cc"
   223 #if defined(__SSE2__)
   224     , "xmm0", "xmm1"
   225 #endif
   226   );
   227 }
   229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
   230                                         ptrdiff_t src_stride,
   231                                         uint8* dst_ptr, int dst_width) {
   232   asm volatile (
   233     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   234     "psrlw     $0x8,%%xmm5                     \n"
   236     LABELALIGN
   237   "1:                                          \n"
   238     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   239     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   240     "lea       " MEMLEA(0x20,0) ",%0           \n"
   241     "movdqa    %%xmm0,%%xmm2                   \n"
   242     "psrlw     $0x8,%%xmm0                     \n"
   243     "movdqa    %%xmm1,%%xmm3                   \n"
   244     "psrlw     $0x8,%%xmm1                     \n"
   245     "pand      %%xmm5,%%xmm2                   \n"
   246     "pand      %%xmm5,%%xmm3                   \n"
   247     "pavgw     %%xmm2,%%xmm0                   \n"
   248     "pavgw     %%xmm3,%%xmm1                   \n"
   249     "packuswb  %%xmm1,%%xmm0                   \n"
   250     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   251     "lea       " MEMLEA(0x10,1) ",%1           \n"
   252     "sub       $0x10,%2                        \n"
   253     "jg        1b                              \n"
   254   : "+r"(src_ptr),    // %0
   255     "+r"(dst_ptr),    // %1
   256     "+r"(dst_width)   // %2
   257   :
   258   : "memory", "cc"
   259 #if defined(__SSE2__)
   260     , "xmm0", "xmm1", "xmm5"
   261 #endif
   262   );
   263 }
   265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
   266                                      ptrdiff_t src_stride,
   267                                      uint8* dst_ptr, int dst_width) {
   268   asm volatile (
   269     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   270     "psrlw     $0x8,%%xmm5                     \n"
   272     LABELALIGN
   273   "1:                                          \n"
   274     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   275     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   276     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
   277     BUNDLEALIGN
   278     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
   279     "lea       " MEMLEA(0x20,0) ",%0           \n"
   280     "pavgb     %%xmm2,%%xmm0                   \n"
   281     "pavgb     %%xmm3,%%xmm1                   \n"
   282     "movdqa    %%xmm0,%%xmm2                   \n"
   283     "psrlw     $0x8,%%xmm0                     \n"
   284     "movdqa    %%xmm1,%%xmm3                   \n"
   285     "psrlw     $0x8,%%xmm1                     \n"
   286     "pand      %%xmm5,%%xmm2                   \n"
   287     "pand      %%xmm5,%%xmm3                   \n"
   288     "pavgw     %%xmm2,%%xmm0                   \n"
   289     "pavgw     %%xmm3,%%xmm1                   \n"
   290     "packuswb  %%xmm1,%%xmm0                   \n"
   291     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   292     "lea       " MEMLEA(0x10,1) ",%1           \n"
   293     "sub       $0x10,%2                        \n"
   294     "jg        1b                              \n"
   295   : "+r"(src_ptr),    // %0
   296     "+r"(dst_ptr),    // %1
   297     "+r"(dst_width)   // %2
   298   : "r"((intptr_t)(src_stride))   // %3
   299   : "memory", "cc"
   300 #if defined(__native_client__) && defined(__x86_64__)
   301     , "r14"
   302 #endif
   303 #if defined(__SSE2__)
   304     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   305 #endif
   306   );
   307 }
   309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   310                         uint8* dst_ptr, int dst_width) {
   311   asm volatile (
   312     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   313     "psrld     $0x18,%%xmm5                    \n"
   314     "pslld     $0x10,%%xmm5                    \n"
   316     LABELALIGN
   317   "1:                                          \n"
   318     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   319     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   320     "lea       " MEMLEA(0x20,0) ",%0           \n"
   321     "pand      %%xmm5,%%xmm0                   \n"
   322     "pand      %%xmm5,%%xmm1                   \n"
   323     "packuswb  %%xmm1,%%xmm0                   \n"
   324     "psrlw     $0x8,%%xmm0                     \n"
   325     "packuswb  %%xmm0,%%xmm0                   \n"
   326     "movq      %%xmm0," MEMACCESS(1) "         \n"
   327     "lea       " MEMLEA(0x8,1) ",%1            \n"
   328     "sub       $0x8,%2                         \n"
   329     "jg        1b                              \n"
   330   : "+r"(src_ptr),    // %0
   331     "+r"(dst_ptr),    // %1
   332     "+r"(dst_width)   // %2
   333   :
   334   : "memory", "cc"
   335 #if defined(__SSE2__)
   336     , "xmm0", "xmm1", "xmm5"
   337 #endif
   338   );
   339 }
   341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   342                            uint8* dst_ptr, int dst_width) {
   343   intptr_t stridex3 = 0;
   344   asm volatile (
   345     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   346     "psrlw     $0x8,%%xmm7                     \n"
   347     "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
   349     LABELALIGN
   350   "1:                                          \n"
   351     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   352     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   353     MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
   354     BUNDLEALIGN
   355     MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
   356     "pavgb     %%xmm2,%%xmm0                   \n"
   357     "pavgb     %%xmm3,%%xmm1                   \n"
   358     MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
   359     BUNDLEALIGN
   360     MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
   361     MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
   362     MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
   363     "lea       " MEMLEA(0x20,0) ",%0           \n"
   364     "pavgb     %%xmm4,%%xmm2                   \n"
   365     "pavgb     %%xmm2,%%xmm0                   \n"
   366     "pavgb     %%xmm5,%%xmm3                   \n"
   367     "pavgb     %%xmm3,%%xmm1                   \n"
   368     "movdqa    %%xmm0,%%xmm2                   \n"
   369     "psrlw     $0x8,%%xmm0                     \n"
   370     "movdqa    %%xmm1,%%xmm3                   \n"
   371     "psrlw     $0x8,%%xmm1                     \n"
   372     "pand      %%xmm7,%%xmm2                   \n"
   373     "pand      %%xmm7,%%xmm3                   \n"
   374     "pavgw     %%xmm2,%%xmm0                   \n"
   375     "pavgw     %%xmm3,%%xmm1                   \n"
   376     "packuswb  %%xmm1,%%xmm0                   \n"
   377     "movdqa    %%xmm0,%%xmm2                   \n"
   378     "psrlw     $0x8,%%xmm0                     \n"
   379     "pand      %%xmm7,%%xmm2                   \n"
   380     "pavgw     %%xmm2,%%xmm0                   \n"
   381     "packuswb  %%xmm0,%%xmm0                   \n"
   382     "movq      %%xmm0," MEMACCESS(1) "         \n"
   383     "lea       " MEMLEA(0x8,1) ",%1            \n"
   384     "sub       $0x8,%2                         \n"
   385     "jg        1b                              \n"
   386   : "+r"(src_ptr),     // %0
   387     "+r"(dst_ptr),     // %1
   388     "+r"(dst_width),   // %2
   389     "+r"(stridex3)     // %3
   390   : "r"((intptr_t)(src_stride))    // %4
   391   : "memory", "cc"
   392 #if defined(__native_client__) && defined(__x86_64__)
   393     , "r14"
   394 #endif
   395 #if defined(__SSE2__)
   396     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
   397 #endif
   398   );
   399 }
   401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   402                           uint8* dst_ptr, int dst_width) {
   403   asm volatile (
   404     "movdqa    %0,%%xmm3                       \n"
   405     "movdqa    %1,%%xmm4                       \n"
   406     "movdqa    %2,%%xmm5                       \n"
   407   :
   408   : "m"(kShuf0),  // %0
   409     "m"(kShuf1),  // %1
   410     "m"(kShuf2)   // %2
   411   );
   412   asm volatile (
   413     LABELALIGN
   414   "1:                                          \n"
   415     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   416     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
   417     "lea       " MEMLEA(0x20,0) ",%0           \n"
   418     "movdqa    %%xmm2,%%xmm1                   \n"
   419     "palignr   $0x8,%%xmm0,%%xmm1              \n"
   420     "pshufb    %%xmm3,%%xmm0                   \n"
   421     "pshufb    %%xmm4,%%xmm1                   \n"
   422     "pshufb    %%xmm5,%%xmm2                   \n"
   423     "movq      %%xmm0," MEMACCESS(1) "         \n"
   424     "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
   425     "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
   426     "lea       " MEMLEA(0x18,1) ",%1           \n"
   427     "sub       $0x18,%2                        \n"
   428     "jg        1b                              \n"
   429   : "+r"(src_ptr),   // %0
   430     "+r"(dst_ptr),   // %1
   431     "+r"(dst_width)  // %2
   432   :
   433   : "memory", "cc"
   434 #if defined(__SSE2__)
   435     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   436 #endif
   437   );
   438 }
   440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
   441                                 ptrdiff_t src_stride,
   442                                 uint8* dst_ptr, int dst_width) {
   443   asm volatile (
   444     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   445     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   446     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   447   :
   448   : "m"(kShuf01),  // %0
   449     "m"(kShuf11),  // %1
   450     "m"(kShuf21)   // %2
   451   );
   452   asm volatile (
   453     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   454     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   455     "movdqa    %2,%%xmm1                       \n"  // kRound34
   456   :
   457   : "m"(kMadd01),  // %0
   458     "m"(kMadd11),  // %1
   459     "m"(kRound34)  // %2
   460   );
   461   asm volatile (
   462     LABELALIGN
   463   "1:                                          \n"
   464     "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
   465     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
   466     "pavgb     %%xmm7,%%xmm6                   \n"
   467     "pshufb    %%xmm2,%%xmm6                   \n"
   468     "pmaddubsw %%xmm5,%%xmm6                   \n"
   469     "paddsw    %%xmm1,%%xmm6                   \n"
   470     "psrlw     $0x2,%%xmm6                     \n"
   471     "packuswb  %%xmm6,%%xmm6                   \n"
   472     "movq      %%xmm6," MEMACCESS(1) "         \n"
   473     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
   474     MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
   475     "pavgb     %%xmm7,%%xmm6                   \n"
   476     "pshufb    %%xmm3,%%xmm6                   \n"
   477     "pmaddubsw %%xmm0,%%xmm6                   \n"
   478     "paddsw    %%xmm1,%%xmm6                   \n"
   479     "psrlw     $0x2,%%xmm6                     \n"
   480     "packuswb  %%xmm6,%%xmm6                   \n"
   481     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
   482     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
   483     BUNDLEALIGN
   484     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
   485     "lea       " MEMLEA(0x20,0) ",%0           \n"
   486     "pavgb     %%xmm7,%%xmm6                   \n"
   487     "pshufb    %%xmm4,%%xmm6                   \n"
   488     "pmaddubsw %4,%%xmm6                       \n"
   489     "paddsw    %%xmm1,%%xmm6                   \n"
   490     "psrlw     $0x2,%%xmm6                     \n"
   491     "packuswb  %%xmm6,%%xmm6                   \n"
   492     "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
   493     "lea       " MEMLEA(0x18,1) ",%1           \n"
   494     "sub       $0x18,%2                        \n"
   495     "jg        1b                              \n"
   496   : "+r"(src_ptr),   // %0
   497     "+r"(dst_ptr),   // %1
   498     "+r"(dst_width)  // %2
   499   : "r"((intptr_t)(src_stride)),  // %3
   500     "m"(kMadd21)     // %4
   501   : "memory", "cc"
   502 #if defined(__native_client__) && defined(__x86_64__)
   503     , "r14"
   504 #endif
   505 #if defined(__SSE2__)
   506     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   507 #endif
   508   );
   509 }
   511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
   512                                 ptrdiff_t src_stride,
   513                                 uint8* dst_ptr, int dst_width) {
   514   asm volatile (
   515     "movdqa    %0,%%xmm2                       \n"  // kShuf01
   516     "movdqa    %1,%%xmm3                       \n"  // kShuf11
   517     "movdqa    %2,%%xmm4                       \n"  // kShuf21
   518   :
   519   : "m"(kShuf01),  // %0
   520     "m"(kShuf11),  // %1
   521     "m"(kShuf21)   // %2
   522   );
   523   asm volatile (
   524     "movdqa    %0,%%xmm5                       \n"  // kMadd01
   525     "movdqa    %1,%%xmm0                       \n"  // kMadd11
   526     "movdqa    %2,%%xmm1                       \n"  // kRound34
   527   :
   528   : "m"(kMadd01),  // %0
   529     "m"(kMadd11),  // %1
   530     "m"(kRound34)  // %2
   531   );
   533   asm volatile (
   534     LABELALIGN
   535   "1:                                          \n"
   536     "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
   537     MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
   538     "pavgb     %%xmm6,%%xmm7                   \n"
   539     "pavgb     %%xmm7,%%xmm6                   \n"
   540     "pshufb    %%xmm2,%%xmm6                   \n"
   541     "pmaddubsw %%xmm5,%%xmm6                   \n"
   542     "paddsw    %%xmm1,%%xmm6                   \n"
   543     "psrlw     $0x2,%%xmm6                     \n"
   544     "packuswb  %%xmm6,%%xmm6                   \n"
   545     "movq      %%xmm6," MEMACCESS(1) "         \n"
   546     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
   547     MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
   548     "pavgb     %%xmm6,%%xmm7                   \n"
   549     "pavgb     %%xmm7,%%xmm6                   \n"
   550     "pshufb    %%xmm3,%%xmm6                   \n"
   551     "pmaddubsw %%xmm0,%%xmm6                   \n"
   552     "paddsw    %%xmm1,%%xmm6                   \n"
   553     "psrlw     $0x2,%%xmm6                     \n"
   554     "packuswb  %%xmm6,%%xmm6                   \n"
   555     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
   556     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
   557     MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
   558     "lea       " MEMLEA(0x20,0) ",%0           \n"
   559     "pavgb     %%xmm6,%%xmm7                   \n"
   560     "pavgb     %%xmm7,%%xmm6                   \n"
   561     "pshufb    %%xmm4,%%xmm6                   \n"
   562     "pmaddubsw %4,%%xmm6                       \n"
   563     "paddsw    %%xmm1,%%xmm6                   \n"
   564     "psrlw     $0x2,%%xmm6                     \n"
   565     "packuswb  %%xmm6,%%xmm6                   \n"
   566     "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
   567     "lea       " MEMLEA(0x18,1) ",%1           \n"
   568     "sub       $0x18,%2                        \n"
   569     "jg        1b                              \n"
   570     : "+r"(src_ptr),   // %0
   571       "+r"(dst_ptr),   // %1
   572       "+r"(dst_width)  // %2
   573     : "r"((intptr_t)(src_stride)),  // %3
   574       "m"(kMadd21)     // %4
   575     : "memory", "cc"
   576 #if defined(__native_client__) && defined(__x86_64__)
   577     , "r14"
   578 #endif
   579 #if defined(__SSE2__)
   580     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   581 #endif
   582   );
   583 }
   585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   586                           uint8* dst_ptr, int dst_width) {
   587   asm volatile (
   588     "movdqa    %3,%%xmm4                       \n"
   589     "movdqa    %4,%%xmm5                       \n"
   591     LABELALIGN
   592   "1:                                          \n"
   593     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   594     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   595     "lea       " MEMLEA(0x20,0) ",%0           \n"
   596     "pshufb    %%xmm4,%%xmm0                   \n"
   597     "pshufb    %%xmm5,%%xmm1                   \n"
   598     "paddusb   %%xmm1,%%xmm0                   \n"
   599     "movq      %%xmm0," MEMACCESS(1) "         \n"
   600     "movhlps   %%xmm0,%%xmm1                   \n"
   601     "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
   602     "lea       " MEMLEA(0xc,1) ",%1            \n"
   603     "sub       $0xc,%2                         \n"
   604     "jg        1b                              \n"
   605   : "+r"(src_ptr),   // %0
   606     "+r"(dst_ptr),   // %1
   607     "+r"(dst_width)  // %2
   608   : "m"(kShuf38a),   // %3
   609     "m"(kShuf38b)    // %4
   610   : "memory", "cc"
   611 #if defined(__SSE2__)
   612       , "xmm0", "xmm1", "xmm4", "xmm5"
   613 #endif
   614   );
   615 }
   617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   618                                 ptrdiff_t src_stride,
   619                                 uint8* dst_ptr, int dst_width) {
   620   asm volatile (
   621     "movdqa    %0,%%xmm2                       \n"
   622     "movdqa    %1,%%xmm3                       \n"
   623     "movdqa    %2,%%xmm4                       \n"
   624     "movdqa    %3,%%xmm5                       \n"
   625   :
   626   : "m"(kShufAb0),   // %0
   627     "m"(kShufAb1),   // %1
   628     "m"(kShufAb2),   // %2
   629     "m"(kScaleAb2)   // %3
   630   );
   631   asm volatile (
   632     LABELALIGN
   633   "1:                                          \n"
   634     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   635     MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
   636     "lea       " MEMLEA(0x10,0) ",%0           \n"
   637     "movdqa    %%xmm0,%%xmm1                   \n"
   638     "pshufb    %%xmm2,%%xmm1                   \n"
   639     "movdqa    %%xmm0,%%xmm6                   \n"
   640     "pshufb    %%xmm3,%%xmm6                   \n"
   641     "paddusw   %%xmm6,%%xmm1                   \n"
   642     "pshufb    %%xmm4,%%xmm0                   \n"
   643     "paddusw   %%xmm0,%%xmm1                   \n"
   644     "pmulhuw   %%xmm5,%%xmm1                   \n"
   645     "packuswb  %%xmm1,%%xmm1                   \n"
   646     "sub       $0x6,%2                         \n"
   647     "movd      %%xmm1," MEMACCESS(1) "         \n"
   648     "psrlq     $0x10,%%xmm1                    \n"
   649     "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
   650     "lea       " MEMLEA(0x6,1) ",%1            \n"
   651     "jg        1b                              \n"
   652   : "+r"(src_ptr),     // %0
   653     "+r"(dst_ptr),     // %1
   654     "+r"(dst_width)    // %2
   655   : "r"((intptr_t)(src_stride))  // %3
   656   : "memory", "cc"
   657 #if defined(__native_client__) && defined(__x86_64__)
   658     , "r14"
   659 #endif
   660 #if defined(__SSE2__)
   661     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   662 #endif
   663   );
   664 }
   666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
   667                                 ptrdiff_t src_stride,
   668                                 uint8* dst_ptr, int dst_width) {
   669   asm volatile (
   670     "movdqa    %0,%%xmm2                       \n"
   671     "movdqa    %1,%%xmm3                       \n"
   672     "movdqa    %2,%%xmm4                       \n"
   673     "pxor      %%xmm5,%%xmm5                   \n"
   674   :
   675   : "m"(kShufAc),    // %0
   676     "m"(kShufAc3),   // %1
   677     "m"(kScaleAc33)  // %2
   678   );
   679   asm volatile (
   680     LABELALIGN
   681   "1:                                          \n"
   682     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   683     MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
   684     "movhlps   %%xmm0,%%xmm1                   \n"
   685     "movhlps   %%xmm6,%%xmm7                   \n"
   686     "punpcklbw %%xmm5,%%xmm0                   \n"
   687     "punpcklbw %%xmm5,%%xmm1                   \n"
   688     "punpcklbw %%xmm5,%%xmm6                   \n"
   689     "punpcklbw %%xmm5,%%xmm7                   \n"
   690     "paddusw   %%xmm6,%%xmm0                   \n"
   691     "paddusw   %%xmm7,%%xmm1                   \n"
   692     MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
   693     "lea       " MEMLEA(0x10,0) ",%0           \n"
   694     "movhlps   %%xmm6,%%xmm7                   \n"
   695     "punpcklbw %%xmm5,%%xmm6                   \n"
   696     "punpcklbw %%xmm5,%%xmm7                   \n"
   697     "paddusw   %%xmm6,%%xmm0                   \n"
   698     "paddusw   %%xmm7,%%xmm1                   \n"
   699     "movdqa    %%xmm0,%%xmm6                   \n"
   700     "psrldq    $0x2,%%xmm0                     \n"
   701     "paddusw   %%xmm0,%%xmm6                   \n"
   702     "psrldq    $0x2,%%xmm0                     \n"
   703     "paddusw   %%xmm0,%%xmm6                   \n"
   704     "pshufb    %%xmm2,%%xmm6                   \n"
   705     "movdqa    %%xmm1,%%xmm7                   \n"
   706     "psrldq    $0x2,%%xmm1                     \n"
   707     "paddusw   %%xmm1,%%xmm7                   \n"
   708     "psrldq    $0x2,%%xmm1                     \n"
   709     "paddusw   %%xmm1,%%xmm7                   \n"
   710     "pshufb    %%xmm3,%%xmm7                   \n"
   711     "paddusw   %%xmm7,%%xmm6                   \n"
   712     "pmulhuw   %%xmm4,%%xmm6                   \n"
   713     "packuswb  %%xmm6,%%xmm6                   \n"
   714     "sub       $0x6,%2                         \n"
   715     "movd      %%xmm6," MEMACCESS(1) "         \n"
   716     "psrlq     $0x10,%%xmm6                    \n"
   717     "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
   718     "lea       " MEMLEA(0x6,1) ",%1            \n"
   719     "jg        1b                              \n"
   720   : "+r"(src_ptr),    // %0
   721     "+r"(dst_ptr),    // %1
   722     "+r"(dst_width)   // %2
   723   : "r"((intptr_t)(src_stride))   // %3
   724   : "memory", "cc"
   725 #if defined(__native_client__) && defined(__x86_64__)
   726     , "r14"
   727 #endif
   728 #if defined(__SSE2__)
   729     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   730 #endif
   731   );
   732 }
// Sums src_height rows of bytes into 16-bit column sums, 16 columns per
// outer-loop iteration: dst_ptr[i] = sum over rows of src_ptr[i].  Used to
// accumulate rows for the box filter before scaling.
// xmm4 stays zero for byte->word unpacking.  Label 1 walks 16-byte column
// groups; label 2 adds the remaining src_height - 1 rows for the group.
// NOTE(review): paddusw saturates, so column sums clamp at 65535 —
// presumably src_height is bounded by the caller; confirm upstream.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;   // inner-loop row counter
  intptr_t tmp_src = 0; // saved column start so %0 can be rewound per group
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"
    "sub       $0x1,%5                         \n"  // first row handled by label 1

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "mov       %0,%3                           \n"  // remember column start
    "add       %6,%0                           \n"  // advance to next row
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"  // src_height == 1: store

    LABELALIGN
  "2:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"  // next 16 columns
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
// Bilinear column filtering. SSSE3 version.
// Horizontally scales one row of bytes: for each destination pixel the two
// neighboring source bytes at fixed-point position x (stepped by dx; the
// pixel index is the word at bits 16..31, the blend fraction comes from the
// low word shifted right by 9) are blended with pmaddubsw.  Two destination
// pixels per main-loop iteration (label 2) plus a one-pixel tail (label 29).
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // xmm2 = x
    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"  // fraction shuffle mask
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // xmm6 = 0x007f per word
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = integer part of x
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2*dx, 2*dx}
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = int part of x + dx

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"  // weights: f, 128-f pairs
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 bytes
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store final byte
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),           // %6
    "rm"(dx)           // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
// 2x horizontal upsample by pixel doubling: reads 16 source bytes and
// writes 32 destination bytes per iteration (the loop subtracts 0x20 from
// dst_width).  x and dx are unused — the step is a fixed 1:2.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate low 8 bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // duplicate high 8 bytes
    "sub       $0x20,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
// 1/2x ARGB downsample, point sampled: of every horizontal pair of pixels
// the second (odd) one is kept — shufps $0xdd selects the odd 32-bit lanes
// of the two loaded registers.  8 source pixels in, 4 out per iteration.
// src_stride is unused: this variant reads a single row.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep odd pixels
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
   920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
   921                                   ptrdiff_t src_stride,
   922                                   uint8* dst_argb, int dst_width) {
   923   asm volatile (
   924     LABELALIGN
   925   "1:                                          \n"
   926     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   927     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   928     "lea       " MEMLEA(0x20,0) ",%0           \n"
   929     "movdqa    %%xmm0,%%xmm2                   \n"
   930     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   931     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
   932     "pavgb     %%xmm2,%%xmm0                   \n"
   933     "sub       $0x4,%2                         \n"
   934     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   935     "lea       " MEMLEA(0x10,1) ",%1           \n"
   936     "jg        1b                              \n"
   937   : "+r"(src_argb),  // %0
   938     "+r"(dst_argb),  // %1
   939     "+r"(dst_width)  // %2
   940   :
   941   : "memory", "cc"
   942 #if defined(__SSE2__)
   943     , "xmm0", "xmm1"
   944 #endif
   945   );
   946 }
// 1/2x ARGB downsample with 2x2 box filter: averages the current row with
// the row at src_argb + src_stride (pavgb), then averages horizontal pixel
// pairs.  8 source pixels per row in, 4 destination pixels out per
// iteration.  NOTE(review): pavgb rounds up, so the result is a rounded
// approximation of the 4-pixel mean, not an exact average.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
   984 // Reads 4 pixels at a time.
   985 // Alignment requirement: dst_argb 16 byte aligned.
   986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
   987                                int src_stepx,
   988                                uint8* dst_argb, int dst_width) {
   989   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   990   intptr_t src_stepx_x12 = 0;
   991   asm volatile (
   992     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
   993     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
   994     LABELALIGN
   995   "1:                                          \n"
   996     "movd      " MEMACCESS(0) ",%%xmm0         \n"
   997     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
   998     "punpckldq %%xmm1,%%xmm0                   \n"
   999     BUNDLEALIGN
  1000     MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
  1001     MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
  1002     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
  1003     "punpckldq %%xmm3,%%xmm2                   \n"
  1004     "punpcklqdq %%xmm2,%%xmm0                  \n"
  1005     "sub       $0x4,%3                         \n"
  1006     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1007     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1008     "jg        1b                              \n"
  1009   : "+r"(src_argb),      // %0
  1010     "+r"(src_stepx_x4),  // %1
  1011     "+r"(dst_argb),      // %2
  1012     "+r"(dst_width),     // %3
  1013     "+r"(src_stepx_x12)  // %4
  1015   : "memory", "cc"
  1016 #if defined(__native_client__) && defined(__x86_64__)
  1017     , "r14"
  1018 #endif
  1019 #if defined(__SSE2__)
  1020     , "xmm0", "xmm1", "xmm2", "xmm3"
  1021 #endif
  1022   );
  1025 // Blends four 2x2 to 4x1.
  1026 // Alignment requirement: dst_argb 16 byte aligned.
  1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
  1028                                   ptrdiff_t src_stride, int src_stepx,
  1029                                   uint8* dst_argb, int dst_width) {
  1030   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  1031   intptr_t src_stepx_x12 = 0;
  1032   intptr_t row1 = (intptr_t)(src_stride);
  1033   asm volatile (
  1034     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
  1035     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
  1036     "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
  1038     LABELALIGN
  1039   "1:                                          \n"
  1040     "movq      " MEMACCESS(0) ",%%xmm0         \n"
  1041     MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
  1042     MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
  1043     BUNDLEALIGN
  1044     MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
  1045     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
  1046     "movq      " MEMACCESS(5) ",%%xmm2         \n"
  1047     BUNDLEALIGN
  1048     MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
  1049     MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
  1050     MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
  1051     "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
  1052     "pavgb     %%xmm2,%%xmm0                   \n"
  1053     "pavgb     %%xmm3,%%xmm1                   \n"
  1054     "movdqa    %%xmm0,%%xmm2                   \n"
  1055     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1056     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
  1057     "pavgb     %%xmm2,%%xmm0                   \n"
  1058     "sub       $0x4,%3                         \n"
  1059     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1060     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1061     "jg        1b                              \n"
  1062   : "+r"(src_argb),       // %0
  1063     "+r"(src_stepx_x4),   // %1
  1064     "+r"(dst_argb),       // %2
  1065     "+rm"(dst_width),     // %3
  1066     "+r"(src_stepx_x12),  // %4
  1067     "+r"(row1)            // %5
  1069   : "memory", "cc"
  1070 #if defined(__native_client__) && defined(__x86_64__)
  1071     , "r14"
  1072 #endif
  1073 #if defined(__SSE2__)
  1074     , "xmm0", "xmm1", "xmm2", "xmm3"
  1075 #endif
  1076   );
  1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
  1080                         int dst_width, int x, int dx) {
  1081   intptr_t x0 = 0, x1 = 0;
  1082   asm volatile (
  1083     "movd      %5,%%xmm2                       \n"
  1084     "movd      %6,%%xmm3                       \n"
  1085     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
  1086     "pshufd    $0x11,%%xmm3,%%xmm0             \n"
  1087     "paddd     %%xmm0,%%xmm2                   \n"
  1088     "paddd     %%xmm3,%%xmm3                   \n"
  1089     "pshufd    $0x5,%%xmm3,%%xmm0              \n"
  1090     "paddd     %%xmm0,%%xmm2                   \n"
  1091     "paddd     %%xmm3,%%xmm3                   \n"
  1092     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  1093     "pextrw    $0x1,%%xmm2,%k0                 \n"
  1094     "pextrw    $0x3,%%xmm2,%k1                 \n"
  1095     "cmp       $0x0,%4                         \n"
  1096     "jl        99f                             \n"
  1097     "sub       $0x4,%4                         \n"
  1098     "jl        49f                             \n"
  1100     LABELALIGN
  1101   "40:                                         \n"
  1102     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1103     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
  1104     "pextrw    $0x5,%%xmm2,%k0                 \n"
  1105     "pextrw    $0x7,%%xmm2,%k1                 \n"
  1106     "paddd     %%xmm3,%%xmm2                   \n"
  1107     "punpckldq %%xmm1,%%xmm0                   \n"
  1108     MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
  1109     MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
  1110     "pextrw    $0x1,%%xmm2,%k0                 \n"
  1111     "pextrw    $0x3,%%xmm2,%k1                 \n"
  1112     "punpckldq %%xmm4,%%xmm1                   \n"
  1113     "punpcklqdq %%xmm1,%%xmm0                  \n"
  1114     "sub       $0x4,%4                         \n"
  1115     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  1116     "lea       " MEMLEA(0x10,2) ",%2           \n"
  1117     "jge       40b                             \n"
  1119   "49:                                         \n"
  1120     "test      $0x2,%4                         \n"
  1121     "je        29f                             \n"
  1122     BUNDLEALIGN
  1123     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1124     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
  1125     "pextrw    $0x5,%%xmm2,%k0                 \n"
  1126     "punpckldq %%xmm1,%%xmm0                   \n"
  1127     "movq      %%xmm0," MEMACCESS(2) "         \n"
  1128     "lea       " MEMLEA(0x8,2) ",%2            \n"
  1129   "29:                                         \n"
  1130     "test      $0x1,%4                         \n"
  1131     "je        99f                             \n"
  1132     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
  1133     "movd      %%xmm0," MEMACCESS(2) "         \n"
  1134   "99:                                         \n"
  1135   : "+a"(x0),          // %0
  1136     "+d"(x1),          // %1
  1137     "+r"(dst_argb),    // %2
  1138     "+r"(src_argb),    // %3
  1139     "+r"(dst_width)    // %4
  1140   : "rm"(x),           // %5
  1141     "rm"(dx)           // %6
  1142   : "memory", "cc"
  1143 #if defined(__native_client__) && defined(__x86_64__)
  1144     , "r14"
  1145 #endif
  1146 #if defined(__SSE2__)
  1147     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  1148 #endif
  1149   );
  1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
  1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
  1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
  1155                            int dst_width, int x, int dx) {
  1156   asm volatile (
  1157     LABELALIGN
  1158   "1:                                          \n"
  1159     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  1160     "lea       " MEMLEA(0x10,1) ",%1           \n"
  1161     "movdqa    %%xmm0,%%xmm1                   \n"
  1162     "punpckldq %%xmm0,%%xmm0                   \n"
  1163     "punpckhdq %%xmm1,%%xmm1                   \n"
  1164     "sub       $0x8,%2                         \n"
  1165     "movdqa    %%xmm0," MEMACCESS(0) "         \n"
  1166     "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
  1167     "lea       " MEMLEA(0x20,0) ",%0           \n"
  1168     "jg        1b                              \n"
  1170   : "+r"(dst_argb),    // %0
  1171     "+r"(src_argb),    // %1
  1172     "+r"(dst_width)    // %2
  1174   : "memory", "cc"
  1175 #if defined(__native_client__) && defined(__x86_64__)
  1176     , "r14"
  1177 #endif
  1178 #if defined(__SSE2__)
  1179     , "xmm0", "xmm1"
  1180 #endif
  1181   );
// Shuffle table for arranging 2 ARGB pixels into per-channel byte pairs
// (B0B1 G0G1 R0R1 A0A1 ...) so pmaddubsw can blend two pixels per channel.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
  1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
  1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
  1197                                int dst_width, int x, int dx) {
  1198   intptr_t x0 = 0, x1 = 0;
  1199   asm volatile (
  1200     "movdqa    %0,%%xmm4                       \n"
  1201     "movdqa    %1,%%xmm5                       \n"
  1203   : "m"(kShuffleColARGB),  // %0
  1204     "m"(kShuffleFractions)  // %1
  1205   );
  1207   asm volatile (
  1208     "movd      %5,%%xmm2                       \n"
  1209     "movd      %6,%%xmm3                       \n"
  1210     "pcmpeqb   %%xmm6,%%xmm6                   \n"
  1211     "psrlw     $0x9,%%xmm6                     \n"
  1212     "pextrw    $0x1,%%xmm2,%k3                 \n"
  1213     "sub       $0x2,%2                         \n"
  1214     "jl        29f                             \n"
  1215     "movdqa    %%xmm2,%%xmm0                   \n"
  1216     "paddd     %%xmm3,%%xmm0                   \n"
  1217     "punpckldq %%xmm0,%%xmm2                   \n"
  1218     "punpckldq %%xmm3,%%xmm3                   \n"
  1219     "paddd     %%xmm3,%%xmm3                   \n"
  1220     "pextrw    $0x3,%%xmm2,%k4                 \n"
  1222     LABELALIGN
  1223   "2:                                          \n"
  1224     "movdqa    %%xmm2,%%xmm1                   \n"
  1225     "paddd     %%xmm3,%%xmm2                   \n"
  1226     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
  1227     "psrlw     $0x9,%%xmm1                     \n"
  1228     BUNDLEALIGN
  1229     MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
  1230     "pshufb    %%xmm5,%%xmm1                   \n"
  1231     "pshufb    %%xmm4,%%xmm0                   \n"
  1232     "pxor      %%xmm6,%%xmm1                   \n"
  1233     "pmaddubsw %%xmm1,%%xmm0                   \n"
  1234     "psrlw     $0x7,%%xmm0                     \n"
  1235     "pextrw    $0x1,%%xmm2,%k3                 \n"
  1236     "pextrw    $0x3,%%xmm2,%k4                 \n"
  1237     "packuswb  %%xmm0,%%xmm0                   \n"
  1238     "movq      %%xmm0," MEMACCESS(0) "         \n"
  1239     "lea       " MEMLEA(0x8,0) ",%0            \n"
  1240     "sub       $0x2,%2                         \n"
  1241     "jge       2b                              \n"
  1243     LABELALIGN
  1244   "29:                                         \n"
  1245     "add       $0x1,%2                         \n"
  1246     "jl        99f                             \n"
  1247     "psrlw     $0x9,%%xmm2                     \n"
  1248     BUNDLEALIGN
  1249     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
  1250     "pshufb    %%xmm5,%%xmm2                   \n"
  1251     "pshufb    %%xmm4,%%xmm0                   \n"
  1252     "pxor      %%xmm6,%%xmm2                   \n"
  1253     "pmaddubsw %%xmm2,%%xmm0                   \n"
  1254     "psrlw     $0x7,%%xmm0                     \n"
  1255     "packuswb  %%xmm0,%%xmm0                   \n"
  1256     "movd      %%xmm0," MEMACCESS(0) "         \n"
  1258     LABELALIGN
  1259   "99:                                         \n"
  1260   : "+r"(dst_argb),    // %0
  1261     "+r"(src_argb),    // %1
  1262     "+rm"(dst_width),  // %2
  1263     "+r"(x0),          // %3
  1264     "+r"(x1)           // %4
  1265   : "rm"(x),           // %5
  1266     "rm"(dx)           // %6
  1267   : "memory", "cc"
  1268 #if defined(__native_client__) && defined(__x86_64__)
  1269     , "r14"
  1270 #endif
  1271 #if defined(__SSE2__)
  1272     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1273 #endif
  1274   );
  1277 // Divide num by div and return as 16.16 fixed point result.
  1278 int FixedDiv_X86(int num, int div) {
  1279   asm volatile (
  1280     "cdq                                       \n"
  1281     "shld      $0x10,%%eax,%%edx               \n"
  1282     "shl       $0x10,%%eax                     \n"
  1283     "idiv      %1                              \n"
  1284     "mov       %0, %%eax                       \n"
  1285     : "+a"(num)  // %0
  1286     : "c"(div)   // %1
  1287     : "memory", "cc", "edx"
  1288   );
  1289   return num;
  1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
  1293 int FixedDiv1_X86(int num, int div) {
  1294   asm volatile (
  1295     "cdq                                       \n"
  1296     "shld      $0x10,%%eax,%%edx               \n"
  1297     "shl       $0x10,%%eax                     \n"
  1298     "sub       $0x10001,%%eax                  \n"
  1299     "sbb       $0x0,%%edx                      \n"
  1300     "sub       $0x1,%1                         \n"
  1301     "idiv      %1                              \n"
  1302     "mov       %0, %%eax                       \n"
  1303     : "+a"(num)  // %0
  1304     : "c"(div)   // %1
  1305     : "memory", "cc", "edx"
  1306   );
  1307   return num;
  1310 #endif  // defined(__x86_64__) || defined(__i386__)
  1312 #ifdef __cplusplus
  1313 }  // extern "C"
  1314 }  // namespace libyuv
  1315 #endif

mercurial