media/libyuv/source/row_posix.cc

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/row.h"
    13 #ifdef __cplusplus
    14 namespace libyuv {
    15 extern "C" {
    16 #endif
    18 // This module is for GCC x86 and x64.
    19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
    21 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Per-channel RGB->luma weight tables.  Each table holds one (B,G,R,0)
// weight quad replicated across all 16 byte lanes so a single multiply-add
// pass covers four 32-bit pixels at once.  The byte order of each quad
// matches the corresponding pixel format's in-memory channel order.
// NOTE(review): 13/65/33 (and the J variants 15/75/38) look like scaled
// BT.601-style coefficients -- confirm the exact fixed-point scale against
// the row functions that consume these tables together with kAddY16/kAddYJ64.
    23 // Constants for ARGB
    24 static vec8 kARGBToY = {
    25   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
    26 };
    28 // JPeg full range.
    29 static vec8 kARGBToYJ = {
    30   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
    31 };
    32 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
    34 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
// Chroma (U/V) weight tables for ARGB; the J variants are the JPEG
// full-range versions (larger magnitudes, e.g. 127 vs 112).
    36 static vec8 kARGBToU = {
    37   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
    38 };
    40 static vec8 kARGBToUJ = {
    41   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
    42 };
    44 static vec8 kARGBToV = {
    45   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
    46 };
    48 static vec8 kARGBToVJ = {
    49   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
    50 };
// BGRA uses the same weights as ARGB but shifted one lane to match the
// in-memory A,R,G,B byte order.
    52 // Constants for BGRA
    53 static vec8 kBGRAToY = {
    54   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
    55 };
    57 static vec8 kBGRAToU = {
    58   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
    59 };
    61 static vec8 kBGRAToV = {
    62   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
    63 };
    65 // Constants for ABGR
    66 static vec8 kABGRToY = {
    67   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
    68 };
    70 static vec8 kABGRToU = {
    71   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
    72 };
    74 static vec8 kABGRToV = {
    75   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
    76 };
    78 // Constants for RGBA.
    79 static vec8 kRGBAToY = {
    80   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
    81 };
    83 static vec8 kRGBAToU = {
    84   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
    85 };
    87 static vec8 kRGBAToV = {
    88   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
    89 };
// Bias terms added after the weighted sums: +16 for video-range luma,
// +64 (word) rounding for the JPEG luma path, +128 per chroma byte and
// +0x8080 per chroma word pair for the U/V offsets.
    91 static uvec8 kAddY16 = {
    92   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
    93 };
    95 static vec16 kAddYJ64 = {
    96   64, 64, 64, 64, 64, 64, 64, 64
    97 };
    99 static uvec8 kAddUV128 = {
   100   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   101   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
   102 };
   104 static uvec16 kAddUVJ128 = {
   105   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
   106 };
   107 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
   109 #ifdef HAS_RGB24TOARGBROW_SSSE3
// pshufb control masks.  Each byte selects a source lane; the value 128u
// (bit 7 set) makes pshufb write zero into that destination byte.
   111 // Shuffle table for converting RGB24 to ARGB.
   112 static uvec8 kShuffleMaskRGB24ToARGB = {
   113   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
   114 };
   116 // Shuffle table for converting RAW to ARGB.
   117 static uvec8 kShuffleMaskRAWToARGB = {
   118   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
   119 };
   121 // Shuffle table for converting ARGB to RGB24.
   122 static uvec8 kShuffleMaskARGBToRGB24 = {
   123   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
   124 };
   126 // Shuffle table for converting ARGB to RAW.
   127 static uvec8 kShuffleMaskARGBToRAW = {
   128   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
   129 };
   131 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
   132 static uvec8 kShuffleMaskARGBToRGB24_0 = {
   133   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
   134 };
   136 // Shuffle table for converting ARGB to RAW (split variant: first 8 + next 4,
   136 // mirroring kShuffleMaskARGBToRGB24_0 above).
   137 static uvec8 kShuffleMaskARGBToRAW_0 = {
   138   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
   139 };
   140 #endif  // HAS_RGB24TOARGBROW_SSSE3
   142 #if defined(TESTING) && defined(__x86_64__)
// Test/scaffolding routine, only compiled when TESTING is defined on x86-64.
// The long preamble of self-moves ("mov %reg,%reg"), self-leas and adds is
// deliberate padding/encoding-exercise code, not real work; the functional
// part is the small copy loop at label 1 (8 bytes in, 16 bytes out per
// iteration via movq load / movdqa store).
// NOTE(review): the "add 0x10,%%eax" group lacks the '$' immediate prefix,
// so in AT&T syntax these encode a load from absolute address 0x10 rather
// than an add of the constant 0x10 -- harmless only because this block is
// test-only; confirm intent before enabling TESTING.
// NOTE(review): xmm1/xmm5 are listed as clobbers but only xmm0 is used.
   143 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   144   asm volatile (
   145     ".p2align  5                               \n"
   146     "mov       %%eax,%%eax                     \n"
   147     "mov       %%ebx,%%ebx                     \n"
   148     "mov       %%ecx,%%ecx                     \n"
   149     "mov       %%edx,%%edx                     \n"
   150     "mov       %%esi,%%esi                     \n"
   151     "mov       %%edi,%%edi                     \n"
   152     "mov       %%ebp,%%ebp                     \n"
   153     "mov       %%esp,%%esp                     \n"
   154     ".p2align  5                               \n"
   155     "mov       %%r8d,%%r8d                     \n"
   156     "mov       %%r9d,%%r9d                     \n"
   157     "mov       %%r10d,%%r10d                   \n"
   158     "mov       %%r11d,%%r11d                   \n"
   159     "mov       %%r12d,%%r12d                   \n"
   160     "mov       %%r13d,%%r13d                   \n"
   161     "mov       %%r14d,%%r14d                   \n"
   162     "mov       %%r15d,%%r15d                   \n"
   163     ".p2align  5                               \n"
   164     "lea       (%%rax),%%eax                   \n"
   165     "lea       (%%rbx),%%ebx                   \n"
   166     "lea       (%%rcx),%%ecx                   \n"
   167     "lea       (%%rdx),%%edx                   \n"
   168     "lea       (%%rsi),%%esi                   \n"
   169     "lea       (%%rdi),%%edi                   \n"
   170     "lea       (%%rbp),%%ebp                   \n"
   171     "lea       (%%rsp),%%esp                   \n"
   172     ".p2align  5                               \n"
   173     "lea       (%%r8),%%r8d                    \n"
   174     "lea       (%%r9),%%r9d                    \n"
   175     "lea       (%%r10),%%r10d                  \n"
   176     "lea       (%%r11),%%r11d                  \n"
   177     "lea       (%%r12),%%r12d                  \n"
   178     "lea       (%%r13),%%r13d                  \n"
   179     "lea       (%%r14),%%r14d                  \n"
   180     "lea       (%%r15),%%r15d                  \n"
   182     ".p2align  5                               \n"
   183     "lea       0x10(%%rax),%%eax               \n"
   184     "lea       0x10(%%rbx),%%ebx               \n"
   185     "lea       0x10(%%rcx),%%ecx               \n"
   186     "lea       0x10(%%rdx),%%edx               \n"
   187     "lea       0x10(%%rsi),%%esi               \n"
   188     "lea       0x10(%%rdi),%%edi               \n"
   189     "lea       0x10(%%rbp),%%ebp               \n"
   190     "lea       0x10(%%rsp),%%esp               \n"
   191     ".p2align  5                               \n"
   192     "lea       0x10(%%r8),%%r8d                \n"
   193     "lea       0x10(%%r9),%%r9d                \n"
   194     "lea       0x10(%%r10),%%r10d              \n"
   195     "lea       0x10(%%r11),%%r11d              \n"
   196     "lea       0x10(%%r12),%%r12d              \n"
   197     "lea       0x10(%%r13),%%r13d              \n"
   198     "lea       0x10(%%r14),%%r14d              \n"
   199     "lea       0x10(%%r15),%%r15d              \n"
   201     ".p2align  5                               \n"
   202     "add       0x10,%%eax                      \n"
   203     "add       0x10,%%ebx                      \n"
   204     "add       0x10,%%ecx                      \n"
   205     "add       0x10,%%edx                      \n"
   206     "add       0x10,%%esi                      \n"
   207     "add       0x10,%%edi                      \n"
   208     "add       0x10,%%ebp                      \n"
   209     "add       0x10,%%esp                      \n"
   210     ".p2align  5                               \n"
   211     "add       0x10,%%r8d                      \n"
   212     "add       0x10,%%r9d                      \n"
   213     "add       0x10,%%r10d                     \n"
   214     "add       0x10,%%r11d                     \n"
   215     "add       0x10,%%r12d                     \n"
   216     "add       0x10,%%r13d                     \n"
   217     "add       0x10,%%r14d                     \n"
   218     "add       0x10,%%r15d                     \n"
   220     ".p2align  2                               \n"
   221   "1:                                          \n"
   222     "movq      " MEMACCESS(0) ",%%xmm0         \n"
   223     "lea       " MEMLEA(0x8,0) ",%0            \n"
   224     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   225     "lea       " MEMLEA(0x20,1) ",%1           \n"
   226     "sub       $0x8,%2                         \n"
   227     "jg        1b                              \n"
   228   : "+r"(src_y),     // %0
   229     "+r"(dst_argb),  // %1
   230     "+r"(pix)        // %2
   231   :
   232   : "memory", "cc"
   233 #if defined(__SSE2__)
   234     , "xmm0", "xmm1", "xmm5"
   235 #endif
   236   );
   237 }
   238 #endif  // TESTING
   240 #ifdef HAS_I400TOARGBROW_SSE2
// I400 (grey-scale Y plane) to ARGB.  Each Y byte is replicated into the
// B, G and R bytes via punpcklbw/punpcklwd/punpckhwd, and the alpha byte is
// forced to 0xff by OR-ing with the 0xff000000 mask built in xmm5.
// Processes 8 pixels (8 bytes in, 32 bytes out) per loop iteration and uses
// aligned stores (movdqa), so dst_argb must be 16-byte aligned.
// NOTE(review): pix appears to be assumed a positive multiple of 8 --
// confirm with callers.
   241 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   242   asm volatile (
   243     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all-ones
   244     "pslld     $0x18,%%xmm5                    \n"  // -> 0xff000000 alpha mask
   245     LABELALIGN
   246   "1:                                          \n"
   247     "movq      " MEMACCESS(0) ",%%xmm0         \n"
   248     "lea       " MEMLEA(0x8,0) ",%0            \n"
   249     "punpcklbw %%xmm0,%%xmm0                   \n"
   250     "movdqa    %%xmm0,%%xmm1                   \n"
   251     "punpcklwd %%xmm0,%%xmm0                   \n"
   252     "punpckhwd %%xmm1,%%xmm1                   \n"
   253     "por       %%xmm5,%%xmm0                   \n"
   254     "por       %%xmm5,%%xmm1                   \n"
   255     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   256     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   257     "lea       " MEMLEA(0x20,1) ",%1           \n"
   258     "sub       $0x8,%2                         \n"
   259     "jg        1b                              \n"
   260   : "+r"(src_y),     // %0
   261     "+r"(dst_argb),  // %1
   262     "+r"(pix)        // %2
   263   :
   264   : "memory", "cc"
   265 #if defined(__SSE2__)
   266     , "xmm0", "xmm1", "xmm5"
   267 #endif
   268   );
   269 }
// Unaligned variant of I400ToARGBRow_SSE2: identical Y-replication and
// alpha-fill logic, but stores with movdqu so dst_argb need not be 16-byte
// aligned.  8 pixels per loop iteration.
   271 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
   272                                   int pix) {
   273   asm volatile (
   274     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all-ones
   275     "pslld     $0x18,%%xmm5                    \n"  // -> 0xff000000 alpha mask
   276     LABELALIGN
   277   "1:                                          \n"
   278     "movq      " MEMACCESS(0) ",%%xmm0         \n"
   279     "lea       " MEMLEA(0x8,0) ",%0            \n"
   280     "punpcklbw %%xmm0,%%xmm0                   \n"
   281     "movdqa    %%xmm0,%%xmm1                   \n"
   282     "punpcklwd %%xmm0,%%xmm0                   \n"
   283     "punpckhwd %%xmm1,%%xmm1                   \n"
   284     "por       %%xmm5,%%xmm0                   \n"
   285     "por       %%xmm5,%%xmm1                   \n"
   286     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   287     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   288     "lea       " MEMLEA(0x20,1) ",%1           \n"
   289     "sub       $0x8,%2                         \n"
   290     "jg        1b                              \n"
   291   : "+r"(src_y),     // %0
   292     "+r"(dst_argb),  // %1
   293     "+r"(pix)        // %2
   294   :
   295   : "memory", "cc"
   296 #if defined(__SSE2__)
   297     , "xmm0", "xmm1", "xmm5"
   298 #endif
   299   );
   300 }
   301 #endif  // HAS_I400TOARGBROW_SSE2
   303 #ifdef HAS_RGB24TOARGBROW_SSSE3
// RGB24 (3 bytes/pixel) to ARGB (4 bytes/pixel).  Reads 48 bytes with
// unaligned loads, realigns the packed triples across registers with
// palignr, expands each to 4 bytes via pshufb with kShuffleMaskRGB24ToARGB,
// and ORs in the 0xff000000 alpha mask from xmm5.  16 pixels per iteration;
// stores are movdqa, so dst_argb must be 16-byte aligned.
   304 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   305   asm volatile (
   306     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
   307     "pslld     $0x18,%%xmm5                    \n"
   308     "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
   309     LABELALIGN
   310   "1:                                          \n"
   311     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   312     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   313     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
   314     "lea       " MEMLEA(0x30,0) ",%0           \n"
   315     "movdqa    %%xmm3,%%xmm2                   \n"
   316     "palignr   $0x8,%%xmm1,%%xmm2              \n"
   317     "pshufb    %%xmm4,%%xmm2                   \n"
   318     "por       %%xmm5,%%xmm2                   \n"
   319     "palignr   $0xc,%%xmm0,%%xmm1              \n"
   320     "pshufb    %%xmm4,%%xmm0                   \n"
   321     "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
   322     "por       %%xmm5,%%xmm0                   \n"
   323     "pshufb    %%xmm4,%%xmm1                   \n"
   324     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   325     "por       %%xmm5,%%xmm1                   \n"
   326     "palignr   $0x4,%%xmm3,%%xmm3              \n"
   327     "pshufb    %%xmm4,%%xmm3                   \n"
   328     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   329     "por       %%xmm5,%%xmm3                   \n"
   330     "sub       $0x10,%2                        \n"
   331     "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
   332     "lea       " MEMLEA(0x40,1) ",%1           \n"
   333     "jg        1b                              \n"
   334   : "+r"(src_rgb24),  // %0
   335     "+r"(dst_argb),  // %1
   336     "+r"(pix)        // %2
   337   : "m"(kShuffleMaskRGB24ToARGB)  // %3
   338   : "memory", "cc"
   339 #if defined(__SSE2__)
   340     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   341 #endif
   342   );
   343 }
// RAW (3 bytes/pixel, R/B swapped vs RGB24) to ARGB.  Identical structure
// to RGB24ToARGBRow_SSSE3; only the pshufb control (kShuffleMaskRAWToARGB)
// differs, swapping the channel order during expansion.  16 pixels per
// iteration; aligned movdqa stores require 16-byte-aligned dst_argb.
   345 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
   346   asm volatile (
   347     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
   348     "pslld     $0x18,%%xmm5                    \n"
   349     "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
   350     LABELALIGN
   351   "1:                                          \n"
   352     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   353     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   354     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
   355     "lea       " MEMLEA(0x30,0) ",%0           \n"
   356     "movdqa    %%xmm3,%%xmm2                   \n"
   357     "palignr   $0x8,%%xmm1,%%xmm2              \n"
   358     "pshufb    %%xmm4,%%xmm2                   \n"
   359     "por       %%xmm5,%%xmm2                   \n"
   360     "palignr   $0xc,%%xmm0,%%xmm1              \n"
   361     "pshufb    %%xmm4,%%xmm0                   \n"
   362     "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
   363     "por       %%xmm5,%%xmm0                   \n"
   364     "pshufb    %%xmm4,%%xmm1                   \n"
   365     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
   366     "por       %%xmm5,%%xmm1                   \n"
   367     "palignr   $0x4,%%xmm3,%%xmm3              \n"
   368     "pshufb    %%xmm4,%%xmm3                   \n"
   369     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   370     "por       %%xmm5,%%xmm3                   \n"
   371     "sub       $0x10,%2                        \n"
   372     "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
   373     "lea       " MEMLEA(0x40,1) ",%1           \n"
   374     "jg        1b                              \n"
   375   : "+r"(src_raw),   // %0
   376     "+r"(dst_argb),  // %1
   377     "+r"(pix)        // %2
   378   : "m"(kShuffleMaskRAWToARGB)  // %3
   379   : "memory", "cc"
   380 #if defined(__SSE2__)
   381     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   382 #endif
   383   );
   384 }
// RGB565 (16 bpp) to ARGB.  The constants built in the prologue expand the
// 5/6-bit fields to 8 bits by multiply (pmulhuw with 0x0108/0x2080 acts as
// the classic (x << 3) | (x >> 2) / (x << 2) | (x >> 4) replication) and
// xmm7 supplies the opaque-alpha bytes.  'dst' is pre-biased by subtracting
// src twice so the stores can address (%1,%0,2) as src advances; 8 pixels
// per iteration, aligned movdqa stores (dst must be 16-byte aligned).
   386 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
   387   asm volatile (
   388     "mov       $0x1080108,%%eax                \n"
   389     "movd      %%eax,%%xmm5                    \n"
   390     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
   391     "mov       $0x20802080,%%eax               \n"
   392     "movd      %%eax,%%xmm6                    \n"
   393     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
   394     "pcmpeqb   %%xmm3,%%xmm3                   \n"
   395     "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 red mask
   396     "pcmpeqb   %%xmm4,%%xmm4                   \n"
   397     "psllw     $0xa,%%xmm4                     \n"
   398     "psrlw     $0x5,%%xmm4                     \n"  // xmm4 = 0x07e0 green mask
   399     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   400     "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 alpha bytes
   401     "sub       %0,%1                           \n"
   402     "sub       %0,%1                           \n"  // dst -= 2*src (see header)
   403     LABELALIGN
   404   "1:                                          \n"
   405     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   406     "movdqa    %%xmm0,%%xmm1                   \n"
   407     "movdqa    %%xmm0,%%xmm2                   \n"
   408     "pand      %%xmm3,%%xmm1                   \n"
   409     "psllw     $0xb,%%xmm2                     \n"
   410     "pmulhuw   %%xmm5,%%xmm1                   \n"
   411     "pmulhuw   %%xmm5,%%xmm2                   \n"
   412     "psllw     $0x8,%%xmm1                     \n"
   413     "por       %%xmm2,%%xmm1                   \n"
   414     "pand      %%xmm4,%%xmm0                   \n"
   415     "pmulhuw   %%xmm6,%%xmm0                   \n"
   416     "por       %%xmm7,%%xmm0                   \n"
   417     "movdqa    %%xmm1,%%xmm2                   \n"
   418     "punpcklbw %%xmm0,%%xmm1                   \n"
   419     "punpckhbw %%xmm0,%%xmm2                   \n"
   420     BUNDLEALIGN
   421     MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
   422     MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
   423     "lea       " MEMLEA(0x10,0) ",%0           \n"
   424     "sub       $0x8,%2                         \n"
   425     "jg        1b                              \n"
   426   : "+r"(src),  // %0
   427     "+r"(dst),  // %1
   428     "+r"(pix)   // %2
   429   :
   430   : "memory", "cc", "eax"
   431 #if defined(__native_client__) && defined(__x86_64__)
   432     , "r14"
   433 #endif
   434 #if defined(__SSE2__)
   435     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   436 #endif
   437   );
   438 }
// ARGB1555 (16 bpp, 1-bit alpha) to ARGB.  Same overall structure as
// RGB565ToARGBRow_SSE2: 5-bit channels are expanded to 8 bits by multiply,
// and the single alpha bit is sign-extended to a full byte via psraw $0x8
// masked with xmm7.  dst is pre-biased by -2*src so stores use (%1,%0,2);
// 8 pixels per iteration, aligned movdqa stores (dst 16-byte aligned).
   440 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
   441   asm volatile (
   442     "mov       $0x1080108,%%eax                \n"
   443     "movd      %%eax,%%xmm5                    \n"
   444     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
   445     "mov       $0x42004200,%%eax               \n"
   446     "movd      %%eax,%%xmm6                    \n"
   447     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
   448     "pcmpeqb   %%xmm3,%%xmm3                   \n"
   449     "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800
   450     "movdqa    %%xmm3,%%xmm4                   \n"
   451     "psrlw     $0x6,%%xmm4                     \n"  // xmm4 = 0x03e0
   452     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   453     "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00
   454     "sub       %0,%1                           \n"
   455     "sub       %0,%1                           \n"  // dst -= 2*src (see header)
   456     LABELALIGN
   457   "1:                                          \n"
   458     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   459     "movdqa    %%xmm0,%%xmm1                   \n"
   460     "movdqa    %%xmm0,%%xmm2                   \n"
   461     "psllw     $0x1,%%xmm1                     \n"
   462     "psllw     $0xb,%%xmm2                     \n"
   463     "pand      %%xmm3,%%xmm1                   \n"
   464     "pmulhuw   %%xmm5,%%xmm2                   \n"
   465     "pmulhuw   %%xmm5,%%xmm1                   \n"
   466     "psllw     $0x8,%%xmm1                     \n"
   467     "por       %%xmm2,%%xmm1                   \n"
   468     "movdqa    %%xmm0,%%xmm2                   \n"
   469     "pand      %%xmm4,%%xmm0                   \n"
   470     "psraw     $0x8,%%xmm2                     \n"  // replicate alpha bit
   471     "pmulhuw   %%xmm6,%%xmm0                   \n"
   472     "pand      %%xmm7,%%xmm2                   \n"
   473     "por       %%xmm2,%%xmm0                   \n"
   474     "movdqa    %%xmm1,%%xmm2                   \n"
   475     "punpcklbw %%xmm0,%%xmm1                   \n"
   476     "punpckhbw %%xmm0,%%xmm2                   \n"
   477     BUNDLEALIGN
   478     MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
   479     MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
   480     "lea       " MEMLEA(0x10,0) ",%0           \n"
   481     "sub       $0x8,%2                         \n"
   482     "jg        1b                              \n"
   483   : "+r"(src),  // %0
   484     "+r"(dst),  // %1
   485     "+r"(pix)   // %2
   486   :
   487   : "memory", "cc", "eax"
   488 #if defined(__native_client__) && defined(__x86_64__)
   489     , "r14"
   490 #endif
   491 #if defined(__SSE2__)
   492     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   493 #endif
   494   );
   495 }
// ARGB4444 (16 bpp, 4 bits per channel) to ARGB.  Splits low/high nibbles
// with the 0x0f0f0f0f masks in xmm4/xmm5, then replicates each nibble into
// a full byte via (x << 4) | x and (x >> 4) | x before interleaving.
// dst is pre-biased by -2*src so stores use (%1,%0,2); 8 pixels per
// iteration, aligned movdqa stores (dst must be 16-byte aligned).
   497 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
   498   asm volatile (
   499     "mov       $0xf0f0f0f,%%eax                \n"
   500     "movd      %%eax,%%xmm4                    \n"
   501     "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // xmm4 = low-nibble mask
   502     "movdqa    %%xmm4,%%xmm5                   \n"
   503     "pslld     $0x4,%%xmm5                     \n"  // xmm5 = high-nibble mask
   504     "sub       %0,%1                           \n"
   505     "sub       %0,%1                           \n"  // dst -= 2*src (see header)
   506     LABELALIGN
   507   "1:                                          \n"
   508     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   509     "movdqa    %%xmm0,%%xmm2                   \n"
   510     "pand      %%xmm4,%%xmm0                   \n"
   511     "pand      %%xmm5,%%xmm2                   \n"
   512     "movdqa    %%xmm0,%%xmm1                   \n"
   513     "movdqa    %%xmm2,%%xmm3                   \n"
   514     "psllw     $0x4,%%xmm1                     \n"
   515     "psrlw     $0x4,%%xmm3                     \n"
   516     "por       %%xmm1,%%xmm0                   \n"
   517     "por       %%xmm3,%%xmm2                   \n"
   518     "movdqa    %%xmm0,%%xmm1                   \n"
   519     "punpcklbw %%xmm2,%%xmm0                   \n"
   520     "punpckhbw %%xmm2,%%xmm1                   \n"
   521     BUNDLEALIGN
   522     MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
   523     MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
   524     "lea       " MEMLEA(0x10,0) ",%0           \n"
   525     "sub       $0x8,%2                         \n"
   526     "jg        1b                              \n"
   527   : "+r"(src),  // %0
   528     "+r"(dst),  // %1
   529     "+r"(pix)   // %2
   530   :
   531   : "memory", "cc", "eax"
   532 #if defined(__native_client__) && defined(__x86_64__)
   533     , "r14"
   534 #endif
   535 #if defined(__SSE2__)
   536     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   537 #endif
   538   );
   539 }
// ARGB (4 bytes/pixel) to RGB24 (3 bytes/pixel).  pshufb with
// kShuffleMaskARGBToRGB24 compacts each 16-byte register to 12 payload
// bytes (top 4 zeroed), then psrldq/pslldq/por splice the four 12-byte
// fragments into three contiguous 16-byte outputs.  16 pixels (64 bytes in,
// 48 bytes out) per iteration; unaligned loads and stores throughout.
   541 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
   542   asm volatile (
   543     "movdqa    %3,%%xmm6                       \n"  // xmm6 = pack shuffle mask
   544     LABELALIGN
   545   "1:                                          \n"
   546     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   547     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   548     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   549     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
   550     "lea       " MEMLEA(0x40,0) ",%0           \n"
   551     "pshufb    %%xmm6,%%xmm0                   \n"
   552     "pshufb    %%xmm6,%%xmm1                   \n"
   553     "pshufb    %%xmm6,%%xmm2                   \n"
   554     "pshufb    %%xmm6,%%xmm3                   \n"
   555     "movdqa    %%xmm1,%%xmm4                   \n"
   556     "psrldq    $0x4,%%xmm1                     \n"
   557     "pslldq    $0xc,%%xmm4                     \n"
   558     "movdqa    %%xmm2,%%xmm5                   \n"
   559     "por       %%xmm4,%%xmm0                   \n"
   560     "pslldq    $0x8,%%xmm5                     \n"
   561     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   562     "por       %%xmm5,%%xmm1                   \n"
   563     "psrldq    $0x8,%%xmm2                     \n"
   564     "pslldq    $0x4,%%xmm3                     \n"
   565     "por       %%xmm3,%%xmm2                   \n"
   566     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   567     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
   568     "lea       " MEMLEA(0x30,1) ",%1           \n"
   569     "sub       $0x10,%2                        \n"
   570     "jg        1b                              \n"
   571   : "+r"(src),  // %0
   572     "+r"(dst),  // %1
   573     "+r"(pix)   // %2
   574   : "m"(kShuffleMaskARGBToRGB24)  // %3
   575   : "memory", "cc"
   576 #if defined(__SSE2__)
   577     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   578 #endif
   579   );
   580 }
// ARGB to RAW (3 bytes/pixel, R/B swapped vs RGB24).  Identical splicing
// structure to ARGBToRGB24Row_SSSE3; only the pshufb control
// (kShuffleMaskARGBToRAW) differs, swapping the channel order while
// dropping alpha.  16 pixels per iteration; unaligned loads and stores.
   582 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
   583   asm volatile (
   584     "movdqa    %3,%%xmm6                       \n"  // xmm6 = pack shuffle mask
   585     LABELALIGN
   586   "1:                                          \n"
   587     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
   588     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   589     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   590     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
   591     "lea       " MEMLEA(0x40,0) ",%0           \n"
   592     "pshufb    %%xmm6,%%xmm0                   \n"
   593     "pshufb    %%xmm6,%%xmm1                   \n"
   594     "pshufb    %%xmm6,%%xmm2                   \n"
   595     "pshufb    %%xmm6,%%xmm3                   \n"
   596     "movdqa    %%xmm1,%%xmm4                   \n"
   597     "psrldq    $0x4,%%xmm1                     \n"
   598     "pslldq    $0xc,%%xmm4                     \n"
   599     "movdqa    %%xmm2,%%xmm5                   \n"
   600     "por       %%xmm4,%%xmm0                   \n"
   601     "pslldq    $0x8,%%xmm5                     \n"
   602     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
   603     "por       %%xmm5,%%xmm1                   \n"
   604     "psrldq    $0x8,%%xmm2                     \n"
   605     "pslldq    $0x4,%%xmm3                     \n"
   606     "por       %%xmm3,%%xmm2                   \n"
   607     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
   608     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
   609     "lea       " MEMLEA(0x30,1) ",%1           \n"
   610     "sub       $0x10,%2                        \n"
   611     "jg        1b                              \n"
   612   : "+r"(src),  // %0
   613     "+r"(dst),  // %1
   614     "+r"(pix)   // %2
   615   : "m"(kShuffleMaskARGBToRAW)  // %3
   616   : "memory", "cc"
   617 #if defined(__SSE2__)
   618     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   619 #endif
   620   );
   621 }
// ARGB to RGB565 (16 bpp).  Per dword pixel: B>>3, G>>5 and the
// shifted/sign-propagated R are masked with the 0x1f / 0x7e0 / 0xf800
// field masks in xmm3/xmm4/xmm5, OR-merged, then packssdw narrows four
// dwords to four 16-bit pixels written with movq.  4 pixels per iteration.
// Note the load is an aligned movdqa, so src must be 16-byte aligned.
   623 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
   624   asm volatile (
   625     "pcmpeqb   %%xmm3,%%xmm3                   \n"
   626     "psrld     $0x1b,%%xmm3                    \n"  // xmm3 = 0x0000001f
   627     "pcmpeqb   %%xmm4,%%xmm4                   \n"
   628     "psrld     $0x1a,%%xmm4                    \n"
   629     "pslld     $0x5,%%xmm4                     \n"  // xmm4 = 0x000007e0
   630     "pcmpeqb   %%xmm5,%%xmm5                   \n"
   631     "pslld     $0xb,%%xmm5                     \n"  // xmm5 = 0xfffff800 (top bits cut by pand order)
   632     LABELALIGN
   633   "1:                                          \n"
   634     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   635     "movdqa    %%xmm0,%%xmm1                   \n"
   636     "movdqa    %%xmm0,%%xmm2                   \n"
   637     "pslld     $0x8,%%xmm0                     \n"
   638     "psrld     $0x3,%%xmm1                     \n"
   639     "psrld     $0x5,%%xmm2                     \n"
   640     "psrad     $0x10,%%xmm0                    \n"
   641     "pand      %%xmm3,%%xmm1                   \n"
   642     "pand      %%xmm4,%%xmm2                   \n"
   643     "pand      %%xmm5,%%xmm0                   \n"
   644     "por       %%xmm2,%%xmm1                   \n"
   645     "por       %%xmm1,%%xmm0                   \n"
   646     "packssdw  %%xmm0,%%xmm0                   \n"
   647     "lea       " MEMLEA(0x10,0) ",%0           \n"
   648     "movq      %%xmm0," MEMACCESS(1) "         \n"
   649     "lea       " MEMLEA(0x8,1) ",%1            \n"
   650     "sub       $0x4,%2                         \n"
   651     "jg        1b                              \n"
   652   : "+r"(src),  // %0
   653     "+r"(dst),  // %1
   654     "+r"(pix)   // %2
   655   :
   656   : "memory", "cc"
   657 #if defined(__SSE2__)
   658     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   659 #endif
   660   );
   661 }
   663 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
   664   asm volatile (
   665     "pcmpeqb   %%xmm4,%%xmm4                   \n"
   666     "psrld     $0x1b,%%xmm4                    \n"
   667     "movdqa    %%xmm4,%%xmm5                   \n"
   668     "pslld     $0x5,%%xmm5                     \n"
   669     "movdqa    %%xmm4,%%xmm6                   \n"
   670     "pslld     $0xa,%%xmm6                     \n"
   671     "pcmpeqb   %%xmm7,%%xmm7                   \n"
   672     "pslld     $0xf,%%xmm7                     \n"
   673     LABELALIGN
   674   "1:                                          \n"
   675     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   676     "movdqa    %%xmm0,%%xmm1                   \n"
   677     "movdqa    %%xmm0,%%xmm2                   \n"
   678     "movdqa    %%xmm0,%%xmm3                   \n"
   679     "psrad     $0x10,%%xmm0                    \n"
   680     "psrld     $0x3,%%xmm1                     \n"
   681     "psrld     $0x6,%%xmm2                     \n"
   682     "psrld     $0x9,%%xmm3                     \n"
   683     "pand      %%xmm7,%%xmm0                   \n"
   684     "pand      %%xmm4,%%xmm1                   \n"
   685     "pand      %%xmm5,%%xmm2                   \n"
   686     "pand      %%xmm6,%%xmm3                   \n"
   687     "por       %%xmm1,%%xmm0                   \n"
   688     "por       %%xmm3,%%xmm2                   \n"
   689     "por       %%xmm2,%%xmm0                   \n"
   690     "packssdw  %%xmm0,%%xmm0                   \n"
   691     "lea       " MEMLEA(0x10,0) ",%0           \n"
   692     "movq      %%xmm0," MEMACCESS(1) "         \n"
   693     "lea       " MEMACCESS2(0x8,1) ",%1        \n"
   694     "sub       $0x4,%2                         \n"
   695     "jg        1b                              \n"
   696   : "+r"(src),  // %0
   697     "+r"(dst),  // %1
   698     "+r"(pix)   // %2
   699   :
   700   : "memory", "cc"
   701 #if defined(__SSE2__)
   702     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   703 #endif
   704   );
   705 }
// Convert ARGB8888 pixels to ARGB4444 (4 bits per channel, 16 bits per
// pixel). Processes 4 pixels per loop iteration; src must be 16-byte
// aligned (movdqa load) and pix is assumed a multiple of 4.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    // Build nibble-selection masks once, before the loop.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // all ones
    "psllw     $0xc,%%xmm4                     \n"  // 0xf000 per word
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"  // 0x00f0 per word
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // keep high nibble of even bytes
    "pand      %%xmm4,%%xmm1                   \n"  // keep high nibble of odd bytes
    // Shift the selected nibbles into adjacent positions and merge, so each
    // source byte contributes its top 4 bits to the packed 4444 result.
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"  // narrow to 8 result bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB4444 pixels (8 bytes)
    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst += 8
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
   738 #endif  // HAS_RGB24TOARGBROW_SSSE3
   740 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels per iteration to Y (luma) using the kARGBToY
// coefficients (13,65,33 per B,G,R — scaled by 128) and the kAddY16 offset.
// NOTE(review): kAddY16 is defined in an earlier section; assumed to be the
// +16 video-range offset — confirm against its definition.
// src_argb and dst_y must be 16-byte aligned; pix assumed a multiple of 16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels (64 bytes)
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs -> words
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> one word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale (coefficients are x128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"  // pix -= 16
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
// Same as ARGBToYRow_SSSE3 but tolerates unaligned src/dst pointers by
// using movdqu for the pixel loads and the Y store. pix assumed a multiple
// of 16.
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // unaligned load, 16 pixels total
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs -> words
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> one word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale (coefficients are x128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"  // pix -= 16
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // unaligned store
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   814 #endif  // HAS_ARGBTOYROW_SSSE3
   816 #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels per iteration to full-range (JPeg) Y using the
// kARGBToYJ coefficients (15,75,38 per B,G,R — scaled by 128). Unlike the
// video-range variant, kAddYJ64 is added as words *before* the >>7 descale
// (rounding) and no +16 offset is applied afterwards.
// src_argb and dst_y must be 16-byte aligned; pix assumed a multiple of 16.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ coefficients
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels (64 bytes)
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs -> words
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> one word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // rounding before the shift
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale (coefficients are x128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "sub       $0x10,%2                        \n"  // pix -= 16
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
// Same as ARGBToYJRow_SSSE3 (full-range JPeg luma) but tolerates unaligned
// src/dst pointers by using movdqu loads and store. pix assumed a multiple
// of 16.
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ coefficients
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // unaligned load, 16 pixels total
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs -> words
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> one word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // rounding before the shift
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale (coefficients are x128)
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "sub       $0x10,%2                        \n"  // pix -= 16
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // unaligned store
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   892 #endif  // HAS_ARGBTOYJROW_SSSE3
   894 #ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using 2 assembly blocks is a
// workaround, and is considered unsafe.
// Convert two rows of 16 ARGB pixels (this row and the row at
// src_argb0 + src_stride_argb) to 8 U and 8 V samples: 2x2 box subsample,
// then pmaddubsw with kARGBToU / kARGBToV and a +128 bias (kAddUV128).
// NOTE(review): the coefficients are loaded into xmm3/xmm4/xmm5 by the
// first asm block and must survive into the second — the deliberately
// fragile two-block workaround described in the comment above; those
// registers are intentionally absent from the second block's clobbers.
// Both rows must be 16-byte aligned; width assumed a multiple of 16.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Vertical average with the next row.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb0 += 64
    // Horizontal average: gather even/odd pixels with shufps, then pavgb.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words
    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words
    "psraw     $0x8,%%xmm0                     \n"  // descale
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // U low half, V high half
    "paddb     %%xmm5,%%xmm0                   \n"  // bias +128
    "sub       $0x10,%3                        \n"  // width -= 16
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8; dst_v via offset %2
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
   966 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
   967 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   968                         uint8* dst_u, uint8* dst_v, int width) {
   969   asm volatile (
   970     "movdqa    %0,%%xmm4                       \n"
   971     "movdqa    %1,%%xmm3                       \n"
   972     "movdqa    %2,%%xmm5                       \n"
   973   :
   974   : "m"(kARGBToUJ),  // %0
   975     "m"(kARGBToVJ),  // %1
   976     "m"(kAddUVJ128)  // %2
   977   );
   978   asm volatile (
   979     "sub       %1,%2                           \n"
   980     LABELALIGN
   981   "1:                                          \n"
   982     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   983     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
   984     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
   985     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
   986     BUNDLEALIGN
   987     MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
   988     MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
   989     MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
   990     MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
   991     "lea       " MEMLEA(0x40,0) ",%0           \n"
   992     "movdqa    %%xmm0,%%xmm7                   \n"
   993     "shufps    $0x88,%%xmm1,%%xmm0             \n"
   994     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
   995     "pavgb     %%xmm7,%%xmm0                   \n"
   996     "movdqa    %%xmm2,%%xmm7                   \n"
   997     "shufps    $0x88,%%xmm6,%%xmm2             \n"
   998     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
   999     "pavgb     %%xmm7,%%xmm2                   \n"
  1000     "movdqa    %%xmm0,%%xmm1                   \n"
  1001     "movdqa    %%xmm2,%%xmm6                   \n"
  1002     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1003     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1004     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1005     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1006     "phaddw    %%xmm2,%%xmm0                   \n"
  1007     "phaddw    %%xmm6,%%xmm1                   \n"
  1008     "paddw     %%xmm5,%%xmm0                   \n"
  1009     "paddw     %%xmm5,%%xmm1                   \n"
  1010     "psraw     $0x8,%%xmm0                     \n"
  1011     "psraw     $0x8,%%xmm1                     \n"
  1012     "packsswb  %%xmm1,%%xmm0                   \n"
  1013     "sub       $0x10,%3                        \n"
  1014     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1015     BUNDLEALIGN
  1016     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1017     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1018     "jg        1b                              \n"
  1019   : "+r"(src_argb0),       // %0
  1020     "+r"(dst_u),           // %1
  1021     "+r"(dst_v),           // %2
  1022     "+rm"(width)           // %3
  1023   : "r"((intptr_t)(src_stride_argb)) // %4
  1024   : "memory", "cc"
  1025 #if defined(__native_client__) && defined(__x86_64__)
  1026     , "r14"
  1027 #endif
  1028 #if defined(__SSE2__)
  1029     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1030 #endif
  1031   );
  1034 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1035                                  uint8* dst_u, uint8* dst_v, int width) {
  1036   asm volatile (
  1037     "movdqa    %0,%%xmm4                       \n"
  1038     "movdqa    %1,%%xmm3                       \n"
  1039     "movdqa    %2,%%xmm5                       \n"
  1041   : "m"(kARGBToU),         // %0
  1042     "m"(kARGBToV),         // %1
  1043     "m"(kAddUV128)         // %2
  1044   );
  1045   asm volatile (
  1046     "sub       %1,%2                           \n"
  1047     LABELALIGN
  1048   "1:                                          \n"
  1049     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1050     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1051     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1052     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1053     BUNDLEALIGN
  1054     MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
  1055     "pavgb     %%xmm7,%%xmm0                   \n"
  1056     MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
  1057     "pavgb     %%xmm7,%%xmm1                   \n"
  1058     MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
  1059     "pavgb     %%xmm7,%%xmm2                   \n"
  1060     MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
  1061     "pavgb     %%xmm7,%%xmm6                   \n"
  1062     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1063     "movdqa    %%xmm0,%%xmm7                   \n"
  1064     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1065     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1066     "pavgb     %%xmm7,%%xmm0                   \n"
  1067     "movdqa    %%xmm2,%%xmm7                   \n"
  1068     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1069     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1070     "pavgb     %%xmm7,%%xmm2                   \n"
  1071     "movdqa    %%xmm0,%%xmm1                   \n"
  1072     "movdqa    %%xmm2,%%xmm6                   \n"
  1073     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1074     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1075     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1076     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1077     "phaddw    %%xmm2,%%xmm0                   \n"
  1078     "phaddw    %%xmm6,%%xmm1                   \n"
  1079     "psraw     $0x8,%%xmm0                     \n"
  1080     "psraw     $0x8,%%xmm1                     \n"
  1081     "packsswb  %%xmm1,%%xmm0                   \n"
  1082     "paddb     %%xmm5,%%xmm0                   \n"
  1083     "sub       $0x10,%3                        \n"
  1084     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1085     BUNDLEALIGN
  1086     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1087     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1088     "jg        1b                              \n"
  1089   : "+r"(src_argb0),       // %0
  1090     "+r"(dst_u),           // %1
  1091     "+r"(dst_v),           // %2
  1092     "+rm"(width)           // %3
  1093   : "r"((intptr_t)(src_stride_argb)) // %4
  1094   : "memory", "cc"
  1095 #if defined(__native_client__) && defined(__x86_64__)
  1096     , "r14"
  1097 #endif
  1098 #if defined(__SSE2__)
  1099     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1100 #endif
  1101   );
  1104 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1105                                   uint8* dst_u, uint8* dst_v, int width) {
  1106   asm volatile (
  1107     "movdqa    %0,%%xmm4                       \n"
  1108     "movdqa    %1,%%xmm3                       \n"
  1109     "movdqa    %2,%%xmm5                       \n"
  1111   : "m"(kARGBToUJ),         // %0
  1112     "m"(kARGBToVJ),         // %1
  1113     "m"(kAddUVJ128)         // %2
  1114   );
  1115   asm volatile (
  1116     "sub       %1,%2                           \n"
  1117     LABELALIGN
  1118   "1:                                          \n"
  1119     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1120     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1121     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1122     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1123     BUNDLEALIGN
  1124     MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
  1125     "pavgb     %%xmm7,%%xmm0                   \n"
  1126     MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
  1127     "pavgb     %%xmm7,%%xmm1                   \n"
  1128     MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
  1129     "pavgb     %%xmm7,%%xmm2                   \n"
  1130     MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
  1131     "pavgb     %%xmm7,%%xmm6                   \n"
  1132     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1133     "movdqa    %%xmm0,%%xmm7                   \n"
  1134     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1135     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1136     "pavgb     %%xmm7,%%xmm0                   \n"
  1137     "movdqa    %%xmm2,%%xmm7                   \n"
  1138     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1139     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1140     "pavgb     %%xmm7,%%xmm2                   \n"
  1141     "movdqa    %%xmm0,%%xmm1                   \n"
  1142     "movdqa    %%xmm2,%%xmm6                   \n"
  1143     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1144     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1145     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1146     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1147     "phaddw    %%xmm2,%%xmm0                   \n"
  1148     "phaddw    %%xmm6,%%xmm1                   \n"
  1149     "paddw     %%xmm5,%%xmm0                   \n"
  1150     "paddw     %%xmm5,%%xmm1                   \n"
  1151     "psraw     $0x8,%%xmm0                     \n"
  1152     "psraw     $0x8,%%xmm1                     \n"
  1153     "packsswb  %%xmm1,%%xmm0                   \n"
  1154     "sub       $0x10,%3                        \n"
  1155     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1156     BUNDLEALIGN
  1157     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1158     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1159     "jg        1b                              \n"
  1160   : "+r"(src_argb0),       // %0
  1161     "+r"(dst_u),           // %1
  1162     "+r"(dst_v),           // %2
  1163     "+rm"(width)           // %3
  1164   : "r"((intptr_t)(src_stride_argb))
  1165   : "memory", "cc"
  1166 #if defined(__native_client__) && defined(__x86_64__)
  1167     , "r14"
  1168 #endif
  1169 #if defined(__SSE2__)
  1170     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1171 #endif
  1172   );
  1175 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
  1176                           int width) {
  1177   asm volatile (
  1178     "movdqa    %0,%%xmm4                       \n"
  1179     "movdqa    %1,%%xmm3                       \n"
  1180     "movdqa    %2,%%xmm5                       \n"
  1182   : "m"(kARGBToU),  // %0
  1183     "m"(kARGBToV),  // %1
  1184     "m"(kAddUV128)  // %2
  1185   );
  1186   asm volatile (
  1187     "sub       %1,%2                           \n"
  1188     LABELALIGN
  1189   "1:                                          \n"
  1190     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1191     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1192     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1193     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1194     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1195     "pmaddubsw %%xmm4,%%xmm1                   \n"
  1196     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1197     "pmaddubsw %%xmm4,%%xmm6                   \n"
  1198     "phaddw    %%xmm1,%%xmm0                   \n"
  1199     "phaddw    %%xmm6,%%xmm2                   \n"
  1200     "psraw     $0x8,%%xmm0                     \n"
  1201     "psraw     $0x8,%%xmm2                     \n"
  1202     "packsswb  %%xmm2,%%xmm0                   \n"
  1203     "paddb     %%xmm5,%%xmm0                   \n"
  1204     "sub       $0x10,%3                        \n"
  1205     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  1206     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1207     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1208     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1209     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1210     "pmaddubsw %%xmm3,%%xmm0                   \n"
  1211     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1212     "pmaddubsw %%xmm3,%%xmm2                   \n"
  1213     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1214     "phaddw    %%xmm1,%%xmm0                   \n"
  1215     "phaddw    %%xmm6,%%xmm2                   \n"
  1216     "psraw     $0x8,%%xmm0                     \n"
  1217     "psraw     $0x8,%%xmm2                     \n"
  1218     "packsswb  %%xmm2,%%xmm0                   \n"
  1219     "paddb     %%xmm5,%%xmm0                   \n"
  1220     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1221     BUNDLEALIGN
  1222     MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
  1223     "lea       " MEMLEA(0x10,1) ",%1           \n"
  1224     "jg        1b                              \n"
  1225   : "+r"(src_argb),        // %0
  1226     "+r"(dst_u),           // %1
  1227     "+r"(dst_v),           // %2
  1228     "+rm"(width)           // %3
  1230   : "memory", "cc"
  1231 #if defined(__native_client__) && defined(__x86_64__)
  1232     , "r14"
  1233 #endif
  1234 #if defined(__SSE2__)
  1235     , "xmm0", "xmm1", "xmm2", "xmm6"
  1236 #endif
  1237   );
  1240 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
  1241                                     uint8* dst_v, int width) {
  1242   asm volatile (
  1243     "movdqa    %0,%%xmm4                       \n"
  1244     "movdqa    %1,%%xmm3                       \n"
  1245     "movdqa    %2,%%xmm5                       \n"
  1247   : "m"(kARGBToU),  // %0
  1248     "m"(kARGBToV),  // %1
  1249     "m"(kAddUV128)  // %2
  1250   );
  1251   asm volatile (
  1252     "sub       %1,%2                           \n"
  1253     LABELALIGN
  1254   "1:                                          \n"
  1255     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1256     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1257     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1258     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1259     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1260     "pmaddubsw %%xmm4,%%xmm1                   \n"
  1261     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1262     "pmaddubsw %%xmm4,%%xmm6                   \n"
  1263     "phaddw    %%xmm1,%%xmm0                   \n"
  1264     "phaddw    %%xmm6,%%xmm2                   \n"
  1265     "psraw     $0x8,%%xmm0                     \n"
  1266     "psraw     $0x8,%%xmm2                     \n"
  1267     "packsswb  %%xmm2,%%xmm0                   \n"
  1268     "paddb     %%xmm5,%%xmm0                   \n"
  1269     "sub       $0x10,%3                        \n"
  1270     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  1271     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1272     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1273     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1274     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1275     "pmaddubsw %%xmm3,%%xmm0                   \n"
  1276     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1277     "pmaddubsw %%xmm3,%%xmm2                   \n"
  1278     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1279     "phaddw    %%xmm1,%%xmm0                   \n"
  1280     "phaddw    %%xmm6,%%xmm2                   \n"
  1281     "psraw     $0x8,%%xmm0                     \n"
  1282     "psraw     $0x8,%%xmm2                     \n"
  1283     "packsswb  %%xmm2,%%xmm0                   \n"
  1284     "paddb     %%xmm5,%%xmm0                   \n"
  1285     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1286     BUNDLEALIGN
  1287     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
  1288     "lea       " MEMLEA(0x10,1) ",%1           \n"
  1289     "jg        1b                              \n"
  1290   : "+r"(src_argb),        // %0
  1291     "+r"(dst_u),           // %1
  1292     "+r"(dst_v),           // %2
  1293     "+rm"(width)           // %3
  1295   : "memory", "cc"
  1296 #if defined(__native_client__) && defined(__x86_64__)
  1297     , "r14"
  1298 #endif
  1299 #if defined(__SSE2__)
  1300     , "xmm0", "xmm1", "xmm2", "xmm6"
  1301 #endif
  1302   );
  1305 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
  1306                           uint8* dst_u, uint8* dst_v, int width) {
  1307   asm volatile (
  1308     "movdqa    %0,%%xmm4                       \n"
  1309     "movdqa    %1,%%xmm3                       \n"
  1310     "movdqa    %2,%%xmm5                       \n"
  1312   : "m"(kARGBToU),  // %0
  1313     "m"(kARGBToV),  // %1
  1314     "m"(kAddUV128)  // %2
  1315   );
  1316   asm volatile (
  1317     "sub       %1,%2                           \n"
  1318     LABELALIGN
  1319   "1:                                          \n"
  1320     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1321     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1322     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1323     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1324     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1325     "movdqa    %%xmm0,%%xmm7                   \n"
  1326     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1327     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1328     "pavgb     %%xmm7,%%xmm0                   \n"
  1329     "movdqa    %%xmm2,%%xmm7                   \n"
  1330     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1331     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1332     "pavgb     %%xmm7,%%xmm2                   \n"
  1333     "movdqa    %%xmm0,%%xmm1                   \n"
  1334     "movdqa    %%xmm2,%%xmm6                   \n"
  1335     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1336     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1337     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1338     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1339     "phaddw    %%xmm2,%%xmm0                   \n"
  1340     "phaddw    %%xmm6,%%xmm1                   \n"
  1341     "psraw     $0x8,%%xmm0                     \n"
  1342     "psraw     $0x8,%%xmm1                     \n"
  1343     "packsswb  %%xmm1,%%xmm0                   \n"
  1344     "paddb     %%xmm5,%%xmm0                   \n"
  1345     "sub       $0x10,%3                        \n"
  1346     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1347     BUNDLEALIGN
  1348     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1349     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1350     "jg        1b                              \n"
  1351   : "+r"(src_argb0),       // %0
  1352     "+r"(dst_u),           // %1
  1353     "+r"(dst_v),           // %2
  1354     "+rm"(width)           // %3
  1356   : "memory", "cc"
  1357 #if defined(__native_client__) && defined(__x86_64__)
  1358     , "r14"
  1359 #endif
  1360 #if defined(__SSE2__)
  1361     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1362 #endif
  1363   );
  1366 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
  1367                                     uint8* dst_u, uint8* dst_v, int width) {
  1368   asm volatile (
  1369     "movdqa    %0,%%xmm4                       \n"
  1370     "movdqa    %1,%%xmm3                       \n"
  1371     "movdqa    %2,%%xmm5                       \n"
  1373   : "m"(kARGBToU),  // %0
  1374     "m"(kARGBToV),  // %1
  1375     "m"(kAddUV128)  // %2
  1376   );
  1377   asm volatile (
  1378     "sub       %1,%2                           \n"
  1379     LABELALIGN
  1380   "1:                                          \n"
  1381     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1382     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1383     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1384     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1385     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1386     "movdqa    %%xmm0,%%xmm7                   \n"
  1387     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1388     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1389     "pavgb     %%xmm7,%%xmm0                   \n"
  1390     "movdqa    %%xmm2,%%xmm7                   \n"
  1391     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1392     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1393     "pavgb     %%xmm7,%%xmm2                   \n"
  1394     "movdqa    %%xmm0,%%xmm1                   \n"
  1395     "movdqa    %%xmm2,%%xmm6                   \n"
  1396     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1397     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1398     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1399     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1400     "phaddw    %%xmm2,%%xmm0                   \n"
  1401     "phaddw    %%xmm6,%%xmm1                   \n"
  1402     "psraw     $0x8,%%xmm0                     \n"
  1403     "psraw     $0x8,%%xmm1                     \n"
  1404     "packsswb  %%xmm1,%%xmm0                   \n"
  1405     "paddb     %%xmm5,%%xmm0                   \n"
  1406     "sub       $0x10,%3                        \n"
  1407     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1408     BUNDLEALIGN
  1409     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1410     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1411     "jg        1b                              \n"
  1412   : "+r"(src_argb0),       // %0
  1413     "+r"(dst_u),           // %1
  1414     "+r"(dst_v),           // %2
  1415     "+rm"(width)           // %3
  1417   : "memory", "cc"
  1418 #if defined(__native_client__) && defined(__x86_64__)
  1419     , "r14"
  1420 #endif
  1421 #if defined(__SSE2__)
  1422     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1423 #endif
  1424   );
// Convert 16 BGRA pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Aligned variant: movdqa requires src_bgra and dst_y
// to be 16-byte aligned. 'pix' is assumed to be a positive multiple of
// 16 -- TODO(review): confirm caller contract.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kBGRAToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
// Convert 16 BGRA pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Unaligned variant: movdqu, so no pointer alignment
// is required. 'pix' is assumed to be a positive multiple of 16 --
// TODO(review): confirm caller contract.
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kBGRAToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
  1501 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
  1502                        uint8* dst_u, uint8* dst_v, int width) {
  1503   asm volatile (
  1504     "movdqa    %0,%%xmm4                       \n"
  1505     "movdqa    %1,%%xmm3                       \n"
  1506     "movdqa    %2,%%xmm5                       \n"
  1508   : "m"(kBGRAToU),         // %0
  1509     "m"(kBGRAToV),         // %1
  1510     "m"(kAddUV128)         // %2
  1511   );
  1512   asm volatile (
  1513     "sub       %1,%2                           \n"
  1514     LABELALIGN
  1515   "1:                                          \n"
  1516     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1517     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1518     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1519     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1520     BUNDLEALIGN
  1521     MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
  1522     MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
  1523     MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
  1524     MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
  1525     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1526     "movdqa    %%xmm0,%%xmm7                   \n"
  1527     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1528     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1529     "pavgb     %%xmm7,%%xmm0                   \n"
  1530     "movdqa    %%xmm2,%%xmm7                   \n"
  1531     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1532     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1533     "pavgb     %%xmm7,%%xmm2                   \n"
  1534     "movdqa    %%xmm0,%%xmm1                   \n"
  1535     "movdqa    %%xmm2,%%xmm6                   \n"
  1536     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1537     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1538     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1539     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1540     "phaddw    %%xmm2,%%xmm0                   \n"
  1541     "phaddw    %%xmm6,%%xmm1                   \n"
  1542     "psraw     $0x8,%%xmm0                     \n"
  1543     "psraw     $0x8,%%xmm1                     \n"
  1544     "packsswb  %%xmm1,%%xmm0                   \n"
  1545     "paddb     %%xmm5,%%xmm0                   \n"
  1546     "sub       $0x10,%3                        \n"
  1547     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1548     BUNDLEALIGN
  1549     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1550     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1551     "jg        1b                              \n"
  1552   : "+r"(src_bgra0),       // %0
  1553     "+r"(dst_u),           // %1
  1554     "+r"(dst_v),           // %2
  1555     "+rm"(width)           // %3
  1556   : "r"((intptr_t)(src_stride_bgra)) // %4
  1557   : "memory", "cc"
  1558 #if defined(__native_client__) && defined(__x86_64__)
  1559     , "r14"
  1560 #endif
  1561 #if defined(__SSE2__)
  1562     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1563 #endif
  1564   );
  1567 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
  1568                                  uint8* dst_u, uint8* dst_v, int width) {
  1569   asm volatile (
  1570     "movdqa    %0,%%xmm4                       \n"
  1571     "movdqa    %1,%%xmm3                       \n"
  1572     "movdqa    %2,%%xmm5                       \n"
  1574   : "m"(kBGRAToU),         // %0
  1575     "m"(kBGRAToV),         // %1
  1576     "m"(kAddUV128)         // %2
  1577   );
  1578   asm volatile (
  1579     "sub       %1,%2                           \n"
  1580     LABELALIGN
  1581   "1:                                          \n"
  1582     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1583     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1584     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1585     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1586     BUNDLEALIGN
  1587     MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
  1588     "pavgb     %%xmm7,%%xmm0                   \n"
  1589     MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
  1590     "pavgb     %%xmm7,%%xmm1                   \n"
  1591     MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
  1592     "pavgb     %%xmm7,%%xmm2                   \n"
  1593     MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
  1594     "pavgb     %%xmm7,%%xmm6                   \n"
  1595     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1596     "movdqa    %%xmm0,%%xmm7                   \n"
  1597     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1598     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1599     "pavgb     %%xmm7,%%xmm0                   \n"
  1600     "movdqa    %%xmm2,%%xmm7                   \n"
  1601     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1602     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1603     "pavgb     %%xmm7,%%xmm2                   \n"
  1604     "movdqa    %%xmm0,%%xmm1                   \n"
  1605     "movdqa    %%xmm2,%%xmm6                   \n"
  1606     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1607     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1608     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1609     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1610     "phaddw    %%xmm2,%%xmm0                   \n"
  1611     "phaddw    %%xmm6,%%xmm1                   \n"
  1612     "psraw     $0x8,%%xmm0                     \n"
  1613     "psraw     $0x8,%%xmm1                     \n"
  1614     "packsswb  %%xmm1,%%xmm0                   \n"
  1615     "paddb     %%xmm5,%%xmm0                   \n"
  1616     "sub       $0x10,%3                        \n"
  1617     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1618     BUNDLEALIGN
  1619     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1620     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1621     "jg        1b                              \n"
  1622   : "+r"(src_bgra0),       // %0
  1623     "+r"(dst_u),           // %1
  1624     "+r"(dst_v),           // %2
  1625     "+rm"(width)           // %3
  1626   : "r"((intptr_t)(src_stride_bgra)) // %4
  1627   : "memory", "cc"
  1628 #if defined(__native_client__) && defined(__x86_64__)
  1629     , "r14"
  1630 #endif
  1631 #if defined(__SSE2__)
  1632     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1633 #endif
  1634   );
// Convert 16 ABGR pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Aligned variant: movdqa requires src_abgr and dst_y
// to be 16-byte aligned. 'pix' is assumed to be a positive multiple of
// 16 -- TODO(review): confirm caller contract.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kABGRToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
// Convert 16 ABGR pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Unaligned variant: movdqu, so no pointer alignment
// is required. 'pix' is assumed to be a positive multiple of 16 --
// TODO(review): confirm caller contract.
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kABGRToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
// Convert 16 RGBA pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Aligned variant: movdqa requires src_rgba and dst_y
// to be 16-byte aligned. 'pix' is assumed to be a positive multiple of
// 16 -- TODO(review): confirm caller contract.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kRGBAToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
// Convert 16 RGBA pixels (64 bytes) per loop iteration to 16 Y (luma)
// bytes using SSSE3. Unaligned variant: movdqu, so no pointer alignment
// is required. 'pix' is assumed to be a positive multiple of 16 --
// TODO(review): confirm caller contract.
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = kAddY16 (Y offset), xmm4 = kRGBAToY (per-channel weights).
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    // Load 16 pixels, 4 per register.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted per-channel sums -> 16-bit partial luma values.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Shift down by the coefficient scale, pack to unsigned bytes, and
    // add the Y offset from kAddY16.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
  1785 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
  1786                        uint8* dst_u, uint8* dst_v, int width) {
  1787   asm volatile (
  1788     "movdqa    %0,%%xmm4                       \n"
  1789     "movdqa    %1,%%xmm3                       \n"
  1790     "movdqa    %2,%%xmm5                       \n"
  1792   : "m"(kABGRToU),         // %0
  1793     "m"(kABGRToV),         // %1
  1794     "m"(kAddUV128)         // %2
  1795   );
  1796   asm volatile (
  1797     "sub       %1,%2                           \n"
  1798     LABELALIGN
  1799   "1:                                          \n"
  1800     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1801     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1802     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1803     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1804     BUNDLEALIGN
  1805     MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
  1806     MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
  1807     MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
  1808     MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
  1809     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1810     "movdqa    %%xmm0,%%xmm7                   \n"
  1811     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1812     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1813     "pavgb     %%xmm7,%%xmm0                   \n"
  1814     "movdqa    %%xmm2,%%xmm7                   \n"
  1815     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1816     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1817     "pavgb     %%xmm7,%%xmm2                   \n"
  1818     "movdqa    %%xmm0,%%xmm1                   \n"
  1819     "movdqa    %%xmm2,%%xmm6                   \n"
  1820     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1821     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1822     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1823     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1824     "phaddw    %%xmm2,%%xmm0                   \n"
  1825     "phaddw    %%xmm6,%%xmm1                   \n"
  1826     "psraw     $0x8,%%xmm0                     \n"
  1827     "psraw     $0x8,%%xmm1                     \n"
  1828     "packsswb  %%xmm1,%%xmm0                   \n"
  1829     "paddb     %%xmm5,%%xmm0                   \n"
  1830     "sub       $0x10,%3                        \n"
  1831     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1832     BUNDLEALIGN
  1833     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1834     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1835     "jg        1b                              \n"
  1836   : "+r"(src_abgr0),       // %0
  1837     "+r"(dst_u),           // %1
  1838     "+r"(dst_v),           // %2
  1839     "+rm"(width)           // %3
  1840   : "r"((intptr_t)(src_stride_abgr)) // %4
  1841   : "memory", "cc"
  1842 #if defined(__native_client__) && defined(__x86_64__)
  1843     , "r14"
  1844 #endif
  1845 #if defined(__SSE2__)
  1846     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1847 #endif
  1848   );
  1851 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
  1852                                  uint8* dst_u, uint8* dst_v, int width) {
  1853   asm volatile (
  1854     "movdqa    %0,%%xmm4                       \n"
  1855     "movdqa    %1,%%xmm3                       \n"
  1856     "movdqa    %2,%%xmm5                       \n"
  1858   : "m"(kABGRToU),         // %0
  1859     "m"(kABGRToV),         // %1
  1860     "m"(kAddUV128)         // %2
  1861   );
  1862   asm volatile (
  1863     "sub       %1,%2                           \n"
  1864     LABELALIGN
  1865   "1:                                          \n"
  1866     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1867     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1868     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1869     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1870     BUNDLEALIGN
  1871     MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
  1872     "pavgb     %%xmm7,%%xmm0                   \n"
  1873     MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
  1874     "pavgb     %%xmm7,%%xmm1                   \n"
  1875     MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
  1876     "pavgb     %%xmm7,%%xmm2                   \n"
  1877     MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
  1878     "pavgb     %%xmm7,%%xmm6                   \n"
  1879     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1880     "movdqa    %%xmm0,%%xmm7                   \n"
  1881     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1882     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1883     "pavgb     %%xmm7,%%xmm0                   \n"
  1884     "movdqa    %%xmm2,%%xmm7                   \n"
  1885     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1886     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1887     "pavgb     %%xmm7,%%xmm2                   \n"
  1888     "movdqa    %%xmm0,%%xmm1                   \n"
  1889     "movdqa    %%xmm2,%%xmm6                   \n"
  1890     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1891     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1892     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1893     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1894     "phaddw    %%xmm2,%%xmm0                   \n"
  1895     "phaddw    %%xmm6,%%xmm1                   \n"
  1896     "psraw     $0x8,%%xmm0                     \n"
  1897     "psraw     $0x8,%%xmm1                     \n"
  1898     "packsswb  %%xmm1,%%xmm0                   \n"
  1899     "paddb     %%xmm5,%%xmm0                   \n"
  1900     "sub       $0x10,%3                        \n"
  1901     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1902     BUNDLEALIGN
  1903     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1904     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1905     "jg        1b                              \n"
  1906   : "+r"(src_abgr0),       // %0
  1907     "+r"(dst_u),           // %1
  1908     "+r"(dst_v),           // %2
  1909     "+rm"(width)           // %3
  1910   : "r"((intptr_t)(src_stride_abgr)) // %4
  1911   : "memory", "cc"
  1912 #if defined(__native_client__) && defined(__x86_64__)
  1913     , "r14"
  1914 #endif
  1915 #if defined(__SSE2__)
  1916     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1917 #endif
  1918   );
  1921 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
  1922                        uint8* dst_u, uint8* dst_v, int width) {
  1923   asm volatile (
  1924     "movdqa    %0,%%xmm4                       \n"
  1925     "movdqa    %1,%%xmm3                       \n"
  1926     "movdqa    %2,%%xmm5                       \n"
  1928   : "m"(kRGBAToU),         // %0
  1929     "m"(kRGBAToV),         // %1
  1930     "m"(kAddUV128)         // %2
  1931   );
  1932   asm volatile (
  1933     "sub       %1,%2                           \n"
  1934     LABELALIGN
  1935   "1:                                          \n"
  1936     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1937     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1938     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1939     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  1940     BUNDLEALIGN
  1941     MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
  1942     MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
  1943     MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
  1944     MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
  1945     "lea       " MEMLEA(0x40,0) ",%0           \n"
  1946     "movdqa    %%xmm0,%%xmm7                   \n"
  1947     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  1948     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  1949     "pavgb     %%xmm7,%%xmm0                   \n"
  1950     "movdqa    %%xmm2,%%xmm7                   \n"
  1951     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  1952     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  1953     "pavgb     %%xmm7,%%xmm2                   \n"
  1954     "movdqa    %%xmm0,%%xmm1                   \n"
  1955     "movdqa    %%xmm2,%%xmm6                   \n"
  1956     "pmaddubsw %%xmm4,%%xmm0                   \n"
  1957     "pmaddubsw %%xmm4,%%xmm2                   \n"
  1958     "pmaddubsw %%xmm3,%%xmm1                   \n"
  1959     "pmaddubsw %%xmm3,%%xmm6                   \n"
  1960     "phaddw    %%xmm2,%%xmm0                   \n"
  1961     "phaddw    %%xmm6,%%xmm1                   \n"
  1962     "psraw     $0x8,%%xmm0                     \n"
  1963     "psraw     $0x8,%%xmm1                     \n"
  1964     "packsswb  %%xmm1,%%xmm0                   \n"
  1965     "paddb     %%xmm5,%%xmm0                   \n"
  1966     "sub       $0x10,%3                        \n"
  1967     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  1968     BUNDLEALIGN
  1969     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  1970     "lea       " MEMLEA(0x8,1) ",%1            \n"
  1971     "jg        1b                              \n"
  1972   : "+r"(src_rgba0),       // %0
  1973     "+r"(dst_u),           // %1
  1974     "+r"(dst_v),           // %2
  1975     "+rm"(width)           // %3
  1976   : "r"((intptr_t)(src_stride_rgba))
  1977   : "memory", "cc"
  1978 #if defined(__native_client__) && defined(__x86_64__)
  1979     , "r14"
  1980 #endif
  1981 #if defined(__SSE2__)
  1982     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1983 #endif
  1984   );
  1987 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
  1988                                  uint8* dst_u, uint8* dst_v, int width) {
  1989   asm volatile (
  1990     "movdqa    %0,%%xmm4                       \n"
  1991     "movdqa    %1,%%xmm3                       \n"
  1992     "movdqa    %2,%%xmm5                       \n"
  1994   : "m"(kRGBAToU),         // %0
  1995     "m"(kRGBAToV),         // %1
  1996     "m"(kAddUV128)         // %2
  1997   );
  1998   asm volatile (
  1999     "sub       %1,%2                           \n"
  2000     LABELALIGN
  2001   "1:                                          \n"
  2002     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  2003     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  2004     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  2005     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
  2006     BUNDLEALIGN
  2007     MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
  2008     "pavgb     %%xmm7,%%xmm0                   \n"
  2009     MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
  2010     "pavgb     %%xmm7,%%xmm1                   \n"
  2011     MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
  2012     "pavgb     %%xmm7,%%xmm2                   \n"
  2013     MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
  2014     "pavgb     %%xmm7,%%xmm6                   \n"
  2015     "lea       " MEMLEA(0x40,0) ",%0           \n"
  2016     "movdqa    %%xmm0,%%xmm7                   \n"
  2017     "shufps    $0x88,%%xmm1,%%xmm0             \n"
  2018     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
  2019     "pavgb     %%xmm7,%%xmm0                   \n"
  2020     "movdqa    %%xmm2,%%xmm7                   \n"
  2021     "shufps    $0x88,%%xmm6,%%xmm2             \n"
  2022     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
  2023     "pavgb     %%xmm7,%%xmm2                   \n"
  2024     "movdqa    %%xmm0,%%xmm1                   \n"
  2025     "movdqa    %%xmm2,%%xmm6                   \n"
  2026     "pmaddubsw %%xmm4,%%xmm0                   \n"
  2027     "pmaddubsw %%xmm4,%%xmm2                   \n"
  2028     "pmaddubsw %%xmm3,%%xmm1                   \n"
  2029     "pmaddubsw %%xmm3,%%xmm6                   \n"
  2030     "phaddw    %%xmm2,%%xmm0                   \n"
  2031     "phaddw    %%xmm6,%%xmm1                   \n"
  2032     "psraw     $0x8,%%xmm0                     \n"
  2033     "psraw     $0x8,%%xmm1                     \n"
  2034     "packsswb  %%xmm1,%%xmm0                   \n"
  2035     "paddb     %%xmm5,%%xmm0                   \n"
  2036     "sub       $0x10,%3                        \n"
  2037     "movlps    %%xmm0," MEMACCESS(1) "         \n"
  2038     BUNDLEALIGN
  2039     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
  2040     "lea       " MEMLEA(0x8,1) ",%1            \n"
  2041     "jg        1b                              \n"
  2042   : "+r"(src_rgba0),       // %0
  2043     "+r"(dst_u),           // %1
  2044     "+r"(dst_v),           // %2
  2045     "+rm"(width)           // %3
  2046   : "r"((intptr_t)(src_stride_rgba)) // %4
  2047   : "memory", "cc"
  2048 #if defined(__native_client__) && defined(__x86_64__)
  2049     , "r14"
  2050 #endif
  2051 #if defined(__SSE2__)
  2052     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  2053 #endif
  2054   );
  2056 #endif  // HAS_ARGBTOUVROW_SSSE3
  2058 #ifdef HAS_I422TOARGBROW_SSSE3
// BT.601 YUV->RGB coefficients, scaled by 64 so they fit the signed-byte
// lanes consumed by pmaddubsw in YUVTORGB below.  U contributes to blue and
// green; V contributes to green and red; zero entries mean no contribution.
  2059 #define UB 127 /* min(127, (int8)(2.018 * 64)) -- 129 saturates to int8 max */
  2060 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
  2061 #define UR 0
  2063 #define VB 0
  2064 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
  2065 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
// Bias: subtracted after pmaddubsw to cancel the +128 offset of U and V
// (chroma samples are stored unsigned, centered at 128).
  2067 // Bias
  2068 #define BB UB * 128 + VB * 128
  2069 #define BG UG * 128 + VG * 128
  2070 #define BR UR * 128 + VR * 128
// Y gain, also scaled by 64; applied after subtracting kYSub16 (see table).
  2072 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
// Table of SIMD constants used by the YUVTORGB / YVUTORGB macros.  The
// trailing comment on each member is its byte offset from kUVToB; the asm
// addresses the table as MEMACCESS2(offset, [kYuvConstants]) relative to
// &kYuvConstants.kUVToB, so member order and sizes must not change.
  2074 struct {
  2075   vec8 kUVToB;  // 0
  2076   vec8 kUVToG;  // 16
  2077   vec8 kUVToR;  // 32
  2078   vec16 kUVBiasB;  // 48
  2079   vec16 kUVBiasG;  // 64
  2080   vec16 kUVBiasR;  // 80
  2081   vec16 kYSub16;  // 96
  2082   vec16 kYToRgb;  // 112
// The kVU* members repeat the same coefficients in V,U order for NV21 input.
  2083   vec8 kVUToB;  // 128
  2084   vec8 kVUToG;  // 144
  2085   vec8 kVUToR;  // 160
  2086 } static SIMD_ALIGNED(kYuvConstants) = {
  2087   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  2088   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  2089   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  2090   { BB, BB, BB, BB, BB, BB, BB, BB },
  2091   { BG, BG, BG, BG, BG, BG, BG, BG },
  2092   { BR, BR, BR, BR, BR, BR, BR, BR },
  2093   { 16, 16, 16, 16, 16, 16, 16, 16 },
  2094   { YG, YG, YG, YG, YG, YG, YG, YG },
  2095   { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  2096   { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  2097   { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
  2098 };
  2101 // Read 8 UV from 444.
// (The old comment said "411"; this macro loads 8 U and 8 V bytes and is
// used only by the I444 row functions.)  Callers rebase v_buf with
// "sub %[u_buf],%[v_buf]" so one index register reaches both planes; the
// U and V bytes end up interleaved U0V0U1V1... in xmm0.
  2102 #define READYUV444                                                             \
  2103     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
  2104     BUNDLEALIGN                                                                \
  2105     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
  2106     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
  2107     "punpcklbw  %%xmm1,%%xmm0                                   \n"
  2109 // Read 4 UV from 422, upsample to 8 UV
// v_buf is pre-rebased relative to u_buf by the callers.  After interleaving
// U and V, punpcklwd duplicates each 16-bit UV pair so 4 pairs cover
// 8 pixels (2x horizontal upsample by replication).
  2110 #define READYUV422                                                             \
  2111     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
  2112     BUNDLEALIGN                                                                \
  2113     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
  2114     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
  2115     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
  2116     "punpcklwd  %%xmm0,%%xmm0                                   \n"
  2118 // Read 2 UV from 411, upsample to 8 UV
// punpcklwd followed by punpckldq replicates each UV pair 4x so 2 pairs
// cover 8 pixels.  u_buf advances by only 2 bytes per 8 pixels.
  2119 #define READYUV411                                                             \
  2120     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
  2121     BUNDLEALIGN                                                                \
  2122     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
  2123     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
  2124     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
  2125     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
  2126     "punpckldq  %%xmm0,%%xmm0                                   \n"
  2128 // Read 4 UV from NV12, upsample to 8 UV
// NV12 stores UV already interleaved in a single plane, so a plain movq
// suffices (no MEMOPREG / second base register); punpcklwd then duplicates
// each UV pair to cover 8 pixels.
  2129 #define READNV12                                                               \
  2130     "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
  2131     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
  2132     "punpcklwd  %%xmm0,%%xmm0                                   \n"
  2134 // Convert 8 pixels: 8 UV and 8 Y
// Input: interleaved UV in xmm0 (from a READ* macro), 8 Y bytes at [y_buf].
// Requires xmm4 == 0 on entry (all callers pxor it) for the punpcklbw
// widening of Y.  Output: packed bytes B in xmm0, G in xmm1, R in xmm2.
// Coefficients are fixed-point *64, hence the final psraw $0x6.
// Uses the U,V-ordered tables at offsets 0/16/32 plus the shared bias and
// Y tables at 48..112 of kYuvConstants.
  2135 #define YUVTORGB                                                               \
  2136     "movdqa     %%xmm0,%%xmm1                                   \n"            \
  2137     "movdqa     %%xmm0,%%xmm2                                   \n"            \
  2138     "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
  2139     "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
  2140     "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
  2141     "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
  2142     "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
  2143     "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
  2144     "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
  2145     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
  2146     "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
  2147     "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
  2148     "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
  2149     "paddsw     %%xmm3,%%xmm0                                   \n"            \
  2150     "paddsw     %%xmm3,%%xmm1                                   \n"            \
  2151     "paddsw     %%xmm3,%%xmm2                                   \n"            \
  2152     "psraw      $0x6,%%xmm0                                     \n"            \
  2153     "psraw      $0x6,%%xmm1                                     \n"            \
  2154     "psraw      $0x6,%%xmm2                                     \n"            \
  2155     "packuswb   %%xmm0,%%xmm0                                   \n"            \
  2156     "packuswb   %%xmm1,%%xmm1                                   \n"            \
  2157     "packuswb   %%xmm2,%%xmm2                                   \n"
  2159 // Convert 8 pixels: 8 VU and 8 Y
// Identical to YUVTORGB except the pmaddubsw coefficient tables come from
// offsets 128/144/160 (kVUToB/G/R), which are in V,U order for NV21 input.
// Same register contract: xmm4 must be zero; B/G/R bytes out in xmm0/1/2.
  2160 #define YVUTORGB                                                               \
  2161     "movdqa     %%xmm0,%%xmm1                                   \n"            \
  2162     "movdqa     %%xmm0,%%xmm2                                   \n"            \
  2163     "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
  2164     "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
  2165     "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
  2166     "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
  2167     "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
  2168     "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
  2169     "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
  2170     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
  2171     "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
  2172     "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
  2173     "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
  2174     "paddsw     %%xmm3,%%xmm0                                   \n"            \
  2175     "paddsw     %%xmm3,%%xmm1                                   \n"            \
  2176     "paddsw     %%xmm3,%%xmm2                                   \n"            \
  2177     "psraw      $0x6,%%xmm0                                     \n"            \
  2178     "psraw      $0x6,%%xmm1                                     \n"            \
  2179     "psraw      $0x6,%%xmm2                                     \n"            \
  2180     "packuswb   %%xmm0,%%xmm0                                   \n"            \
  2181     "packuswb   %%xmm1,%%xmm1                                   \n"            \
  2182     "packuswb   %%xmm2,%%xmm2                                   \n"
// Convert a row of I444 (one U,V per pixel) plus Y to ARGB, 8 pixels per
// iteration.  Stores with movdqa, so dst_argb must be 16-byte aligned (the
// _Unaligned variant uses movdqu).  xmm5 is set to all-ones bytes and
// supplies the 0xFF alpha channel.  The loop always writes whole groups of
// 8 pixels -- presumably width is a multiple of 8; TODO(review): confirm
// with callers.
  2184 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
  2185                                 const uint8* u_buf,
  2186                                 const uint8* v_buf,
  2187                                 uint8* dst_argb,
  2188                                 int width) {
  2189   asm volatile (
  2190     "sub       %[u_buf],%[v_buf]               \n"
  2191     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2192     "pxor      %%xmm4,%%xmm4                   \n"
  2193     LABELALIGN
  2194   "1:                                          \n"
  2195     READYUV444
  2196     YUVTORGB
  2197     "punpcklbw %%xmm1,%%xmm0                   \n"
  2198     "punpcklbw %%xmm5,%%xmm2                   \n"
  2199     "movdqa    %%xmm0,%%xmm1                   \n"
  2200     "punpcklwd %%xmm2,%%xmm0                   \n"
  2201     "punpckhwd %%xmm2,%%xmm1                   \n"
  2202     "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
  2203     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
  2204     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
  2205     "sub       $0x8,%[width]                   \n"
  2206     "jg        1b                              \n"
  2207   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2208     [u_buf]"+r"(u_buf),    // %[u_buf]
  2209     [v_buf]"+r"(v_buf),    // %[v_buf]
  2210     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2211     [width]"+rm"(width)    // %[width]
  2212   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2213   : "memory", "cc"
  2214 #if defined(__native_client__) && defined(__x86_64__)
  2215     , "r14"
  2216 #endif
  2217 #if defined(__SSE2__)
  2218     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2219 #endif
  2220   );
// Convert a row of I422 YUV to packed 24-bit RGB (no alpha), 8 pixels
// (24 bytes) per iteration.  ARGB words are built first, then pshufb with
// the two kShuffleMaskARGBToRGB24* masks drops the alpha bytes and palignr
// splices the halves.  NOTE(review): on __i386__ the masks are loaded into
// xmm5/xmm6 in a *separate* asm statement to free up operand slots; this
// relies on the compiler not touching xmm5/xmm6 between the two asm blocks,
// which is fragile but deliberate (see the fpic comment below).
  2223 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
  2224                                  const uint8* u_buf,
  2225                                  const uint8* v_buf,
  2226                                  uint8* dst_rgb24,
  2227                                  int width) {
  2228 // fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
  2229 #if defined(__i386__)
  2230   asm volatile (
  2231     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  2232     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
  2233   :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  2234     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
  2235 #endif
  2237   asm volatile (
  2238 #if !defined(__i386__)
  2239     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  2240     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
  2241 #endif
  2242     "sub       %[u_buf],%[v_buf]               \n"
  2243     "pxor      %%xmm4,%%xmm4                   \n"
  2244     LABELALIGN
  2245   "1:                                          \n"
  2246     READYUV422
  2247     YUVTORGB
  2248     "punpcklbw %%xmm1,%%xmm0                   \n"
  2249     "punpcklbw %%xmm2,%%xmm2                   \n"
  2250     "movdqa    %%xmm0,%%xmm1                   \n"
  2251     "punpcklwd %%xmm2,%%xmm0                   \n"
  2252     "punpckhwd %%xmm2,%%xmm1                   \n"
  2253     "pshufb    %%xmm5,%%xmm0                   \n"
  2254     "pshufb    %%xmm6,%%xmm1                   \n"
  2255     "palignr   $0xc,%%xmm0,%%xmm1              \n"
  2256     "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
  2257     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
  2258     "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
  2259     "sub       $0x8,%[width]                   \n"
  2260     "jg        1b                              \n"
  2261   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2262     [u_buf]"+r"(u_buf),    // %[u_buf]
  2263     [v_buf]"+r"(v_buf),    // %[v_buf]
  2264     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
  2265     [width]"+rm"(width)    // %[width]
  2266   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
  2267 #if !defined(__i386__)
  2268     , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  2269     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  2270 #endif
  2271   : "memory", "cc"
  2272 #if defined(__native_client__) && defined(__x86_64__)
  2273     , "r14"
  2274 #endif
  2275 #if defined(__SSE2__)
  2276     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  2277 #endif
  2278   );
// Convert a row of I422 YUV to packed RAW (24-bit, R/B swapped relative to
// RGB24), 8 pixels (24 bytes) per iteration.  Same structure as
// I422ToRGB24Row_SSSE3 but with the kShuffleMaskARGBToRAW* masks.
// NOTE(review): on __i386__ the masks are loaded into xmm5/xmm6 in a
// separate asm statement; this assumes the compiler leaves xmm5/xmm6
// untouched between the two asm blocks (fragile but deliberate -- see the
// fpic comment below).
  2281 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
  2282                                const uint8* u_buf,
  2283                                const uint8* v_buf,
  2284                                uint8* dst_raw,
  2285                                int width) {
  2286 // fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
  2287 #if defined(__i386__)
  2288   asm volatile (
  2289     "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
  2290     "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
  2291   :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
  2292     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
  2293 #endif
  2295   asm volatile (
  2296 #if !defined(__i386__)
  2297     "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
  2298     "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
  2299 #endif
  2300     "sub       %[u_buf],%[v_buf]               \n"
  2301     "pxor      %%xmm4,%%xmm4                   \n"
  2302     LABELALIGN
  2303   "1:                                          \n"
  2304     READYUV422
  2305     YUVTORGB
  2306     "punpcklbw %%xmm1,%%xmm0                   \n"
  2307     "punpcklbw %%xmm2,%%xmm2                   \n"
  2308     "movdqa    %%xmm0,%%xmm1                   \n"
  2309     "punpcklwd %%xmm2,%%xmm0                   \n"
  2310     "punpckhwd %%xmm2,%%xmm1                   \n"
  2311     "pshufb    %%xmm5,%%xmm0                   \n"
  2312     "pshufb    %%xmm6,%%xmm1                   \n"
  2313     "palignr   $0xc,%%xmm0,%%xmm1              \n"
  2314     "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
  2315     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
  2316     "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
  2317     "sub       $0x8,%[width]                   \n"
  2318     "jg        1b                              \n"
  2319   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2320     [u_buf]"+r"(u_buf),    // %[u_buf]
  2321     [v_buf]"+r"(v_buf),    // %[v_buf]
  2322     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
  2323     [width]"+rm"(width)    // %[width]
  2324   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
  2325 #if !defined(__i386__)
  2326     , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
  2327     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
  2328 #endif
  2329   : "memory", "cc"
  2330 #if defined(__native_client__) && defined(__x86_64__)
  2331     , "r14"
  2332 #endif
  2333 #if defined(__SSE2__)
  2334     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  2335 #endif
  2336   );
// Convert a row of I422 (one U,V per 2 pixels) plus Y to ARGB, 8 pixels per
// iteration.  Aligned movdqa stores: dst_argb must be 16-byte aligned.
// xmm5 = all-ones bytes provides the 0xFF alpha channel.
  2339 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
  2340                                 const uint8* u_buf,
  2341                                 const uint8* v_buf,
  2342                                 uint8* dst_argb,
  2343                                 int width) {
  2344   asm volatile (
  2345     "sub       %[u_buf],%[v_buf]               \n"
  2346     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2347     "pxor      %%xmm4,%%xmm4                   \n"
  2348     LABELALIGN
  2349   "1:                                          \n"
  2350     READYUV422
  2351     YUVTORGB
  2352     "punpcklbw %%xmm1,%%xmm0                   \n"
  2353     "punpcklbw %%xmm5,%%xmm2                   \n"
  2354     "movdqa    %%xmm0,%%xmm1                   \n"
  2355     "punpcklwd %%xmm2,%%xmm0                   \n"
  2356     "punpckhwd %%xmm2,%%xmm1                   \n"
  2357     "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2358     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2359     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2360     "sub       $0x8,%[width]                   \n"
  2361     "jg        1b                              \n"
  2362   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2363     [u_buf]"+r"(u_buf),    // %[u_buf]
  2364     [v_buf]"+r"(v_buf),    // %[v_buf]
  2365     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2366     [width]"+rm"(width)    // %[width]
  2367   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2368   : "memory", "cc"
  2369 #if defined(__native_client__) && defined(__x86_64__)
  2370     , "r14"
  2371 #endif
  2372 #if defined(__SSE2__)
  2373     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2374 #endif
  2375   );
// Convert a row of I411 (one U,V per 4 pixels) plus Y to ARGB, 8 pixels per
// iteration.  Aligned movdqa stores: dst_argb must be 16-byte aligned.
// Only the READYUV411 upsample differs from the I422/I444 variants.
  2378 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
  2379                                 const uint8* u_buf,
  2380                                 const uint8* v_buf,
  2381                                 uint8* dst_argb,
  2382                                 int width) {
  2383   asm volatile (
  2384     "sub       %[u_buf],%[v_buf]               \n"
  2385     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2386     "pxor      %%xmm4,%%xmm4                   \n"
  2387     LABELALIGN
  2388   "1:                                          \n"
  2389     READYUV411
  2390     YUVTORGB
  2391     "punpcklbw %%xmm1,%%xmm0                   \n"
  2392     "punpcklbw %%xmm5,%%xmm2                   \n"
  2393     "movdqa    %%xmm0,%%xmm1                   \n"
  2394     "punpcklwd %%xmm2,%%xmm0                   \n"
  2395     "punpckhwd %%xmm2,%%xmm1                   \n"
  2396     "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2397     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2398     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2399     "sub       $0x8,%[width]                   \n"
  2400     "jg        1b                              \n"
  2401   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2402     [u_buf]"+r"(u_buf),    // %[u_buf]
  2403     [v_buf]"+r"(v_buf),    // %[v_buf]
  2404     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2405     [width]"+rm"(width)    // %[width]
  2406   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2407   : "memory", "cc"
  2408 #if defined(__native_client__) && defined(__x86_64__)
  2409     , "r14"
  2410 #endif
  2411 #if defined(__SSE2__)
  2412     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2413 #endif
  2414   );
// Convert a row of NV12 (biplanar: Y plane + interleaved UV plane) to ARGB,
// 8 pixels per iteration.  Aligned movdqa stores: dst_argb must be 16-byte
// aligned.  READNV12 uses no MEMOPREG two-register addressing, hence no
// r14 clobber is needed for Native Client (see comment in clobber list).
  2417 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
  2418                                 const uint8* uv_buf,
  2419                                 uint8* dst_argb,
  2420                                 int width) {
  2421   asm volatile (
  2422     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2423     "pxor      %%xmm4,%%xmm4                   \n"
  2424     LABELALIGN
  2425   "1:                                          \n"
  2426     READNV12
  2427     YUVTORGB
  2428     "punpcklbw %%xmm1,%%xmm0                   \n"
  2429     "punpcklbw %%xmm5,%%xmm2                   \n"
  2430     "movdqa    %%xmm0,%%xmm1                   \n"
  2431     "punpcklwd %%xmm2,%%xmm0                   \n"
  2432     "punpckhwd %%xmm2,%%xmm1                   \n"
  2433     "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2434     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2435     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2436     "sub       $0x8,%[width]                   \n"
  2437     "jg        1b                              \n"
  2438   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2439     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
  2440     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2441     [width]"+rm"(width)    // %[width]
  2442   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2443   : "memory", "cc"
  2444   // Does not use r14.
  2445 #if defined(__SSE2__)
  2446     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2447 #endif
  2448   );
// Convert a row of NV21 (biplanar: Y plane + interleaved VU plane) to ARGB.
// Identical to NV12ToARGBRow_SSSE3 except it uses YVUTORGB, whose
// coefficient tables are in V,U order to match the swapped chroma plane.
  2451 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
  2452                                 const uint8* uv_buf,
  2453                                 uint8* dst_argb,
  2454                                 int width) {
  2455   asm volatile (
  2456     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2457     "pxor      %%xmm4,%%xmm4                   \n"
  2458     LABELALIGN
  2459   "1:                                          \n"
  2460     READNV12
  2461     YVUTORGB
  2462     "punpcklbw %%xmm1,%%xmm0                   \n"
  2463     "punpcklbw %%xmm5,%%xmm2                   \n"
  2464     "movdqa    %%xmm0,%%xmm1                   \n"
  2465     "punpcklwd %%xmm2,%%xmm0                   \n"
  2466     "punpckhwd %%xmm2,%%xmm1                   \n"
  2467     "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2468     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2469     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2470     "sub       $0x8,%[width]                   \n"
  2471     "jg        1b                              \n"
  2472   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2473     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
  2474     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2475     [width]"+rm"(width)    // %[width]
  2476   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2477   : "memory", "cc"
  2478   // Does not use r14.
  2479 #if defined(__SSE2__)
  2480     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2481 #endif
  2482   );
// Unaligned variant of I444ToARGBRow_SSSE3: identical conversion, but
// stores with movdqu so dst_argb may be at any alignment.
  2485 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2486                                           const uint8* u_buf,
  2487                                           const uint8* v_buf,
  2488                                           uint8* dst_argb,
  2489                                           int width) {
  2490   asm volatile (
  2491     "sub       %[u_buf],%[v_buf]               \n"
  2492     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2493     "pxor      %%xmm4,%%xmm4                   \n"
  2494     LABELALIGN
  2495   "1:                                          \n"
  2496     READYUV444
  2497     YUVTORGB
  2498     "punpcklbw %%xmm1,%%xmm0                   \n"
  2499     "punpcklbw %%xmm5,%%xmm2                   \n"
  2500     "movdqa    %%xmm0,%%xmm1                   \n"
  2501     "punpcklwd %%xmm2,%%xmm0                   \n"
  2502     "punpckhwd %%xmm2,%%xmm1                   \n"
  2503     "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2504     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2505     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2506     "sub       $0x8,%[width]                   \n"
  2507     "jg        1b                              \n"
  2508   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2509     [u_buf]"+r"(u_buf),    // %[u_buf]
  2510     [v_buf]"+r"(v_buf),    // %[v_buf]
  2511     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2512     [width]"+rm"(width)    // %[width]
  2513   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2514   : "memory", "cc"
  2515 #if defined(__native_client__) && defined(__x86_64__)
  2516     , "r14"
  2517 #endif
  2518 #if defined(__SSE2__)
  2519     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2520 #endif
  2521   );
// Unaligned variant of I422ToARGBRow_SSSE3: identical conversion, but
// stores with movdqu so dst_argb may be at any alignment.
  2524 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2525                                           const uint8* u_buf,
  2526                                           const uint8* v_buf,
  2527                                           uint8* dst_argb,
  2528                                           int width) {
  2529   asm volatile (
  2530     "sub       %[u_buf],%[v_buf]               \n"
  2531     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2532     "pxor      %%xmm4,%%xmm4                   \n"
  2533     LABELALIGN
  2534   "1:                                          \n"
  2535     READYUV422
  2536     YUVTORGB
  2537     "punpcklbw %%xmm1,%%xmm0                   \n"
  2538     "punpcklbw %%xmm5,%%xmm2                   \n"
  2539     "movdqa    %%xmm0,%%xmm1                   \n"
  2540     "punpcklwd %%xmm2,%%xmm0                   \n"
  2541     "punpckhwd %%xmm2,%%xmm1                   \n"
  2542     "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2543     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2544     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2545     "sub       $0x8,%[width]                   \n"
  2546     "jg        1b                              \n"
  2547   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2548     [u_buf]"+r"(u_buf),    // %[u_buf]
  2549     [v_buf]"+r"(v_buf),    // %[v_buf]
  2550     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2551     [width]"+rm"(width)    // %[width]
  2552   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2553   : "memory", "cc"
  2554 #if defined(__native_client__) && defined(__x86_64__)
  2555     , "r14"
  2556 #endif
  2557 #if defined(__SSE2__)
  2558     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2559 #endif
  2560   );
// Unaligned variant of I411ToARGBRow_SSSE3: identical conversion, but
// stores with movdqu so dst_argb may be at any alignment.
  2563 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2564                                           const uint8* u_buf,
  2565                                           const uint8* v_buf,
  2566                                           uint8* dst_argb,
  2567                                           int width) {
  2568   asm volatile (
  2569     "sub       %[u_buf],%[v_buf]               \n"
  2570     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2571     "pxor      %%xmm4,%%xmm4                   \n"
  2572     LABELALIGN
  2573   "1:                                          \n"
  2574     READYUV411
  2575     YUVTORGB
  2576     "punpcklbw %%xmm1,%%xmm0                   \n"
  2577     "punpcklbw %%xmm5,%%xmm2                   \n"
  2578     "movdqa    %%xmm0,%%xmm1                   \n"
  2579     "punpcklwd %%xmm2,%%xmm0                   \n"
  2580     "punpckhwd %%xmm2,%%xmm1                   \n"
  2581     "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2582     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2583     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2584     "sub       $0x8,%[width]                   \n"
  2585     "jg        1b                              \n"
  2586   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2587     [u_buf]"+r"(u_buf),    // %[u_buf]
  2588     [v_buf]"+r"(v_buf),    // %[v_buf]
  2589     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2590     [width]"+rm"(width)    // %[width]
  2591   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2592   : "memory", "cc"
  2593 #if defined(__native_client__) && defined(__x86_64__)
  2594     , "r14"
  2595 #endif
  2596 #if defined(__SSE2__)
  2597     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2598 #endif
  2599   );
// Unaligned variant of NV12ToARGBRow_SSSE3: identical conversion, but
// stores with movdqu so dst_argb may be at any alignment.
  2602 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2603                                           const uint8* uv_buf,
  2604                                           uint8* dst_argb,
  2605                                           int width) {
  2606   asm volatile (
  2607     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2608     "pxor      %%xmm4,%%xmm4                   \n"
  2609     LABELALIGN
  2610   "1:                                          \n"
  2611     READNV12
  2612     YUVTORGB
  2613     "punpcklbw %%xmm1,%%xmm0                   \n"
  2614     "punpcklbw %%xmm5,%%xmm2                   \n"
  2615     "movdqa    %%xmm0,%%xmm1                   \n"
  2616     "punpcklwd %%xmm2,%%xmm0                   \n"
  2617     "punpckhwd %%xmm2,%%xmm1                   \n"
  2618     "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2619     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2620     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2621     "sub       $0x8,%[width]                   \n"
  2622     "jg        1b                              \n"
  2623   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2624     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
  2625     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2626     [width]"+rm"(width)    // %[width]
  2627   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2628   : "memory", "cc"
  2629   // Does not use r14.
  2630 #if defined(__SSE2__)
  2631     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2632 #endif
  2633   );
// Unaligned variant of NV21ToARGBRow_SSSE3: identical conversion (VU-ordered
// chroma via YVUTORGB), but stores with movdqu so dst_argb may be at any
// alignment.
  2636 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2637                                           const uint8* uv_buf,
  2638                                           uint8* dst_argb,
  2639                                           int width) {
  2640   asm volatile (
  2641     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2642     "pxor      %%xmm4,%%xmm4                   \n"
  2643     LABELALIGN
  2644   "1:                                          \n"
  2645     READNV12
  2646     YVUTORGB
  2647     "punpcklbw %%xmm1,%%xmm0                   \n"
  2648     "punpcklbw %%xmm5,%%xmm2                   \n"
  2649     "movdqa    %%xmm0,%%xmm1                   \n"
  2650     "punpcklwd %%xmm2,%%xmm0                   \n"
  2651     "punpckhwd %%xmm2,%%xmm1                   \n"
  2652     "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
  2653     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
  2654     "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
  2655     "sub       $0x8,%[width]                   \n"
  2656     "jg        1b                              \n"
  2657   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2658     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
  2659     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  2660     [width]"+rm"(width)    // %[width]
  2661   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2662   : "memory", "cc"
  2663   // Does not use r14.
  2664 #if defined(__SSE2__)
  2665     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2666 #endif
  2667   );
// Convert a row of I422 YUV to BGRA (byte order B,G,R,A reversed relative
// to ARGB), 8 pixels per iteration.  Aligned movdqa stores: dst_bgra must
// be 16-byte aligned.  Note xmm5 is re-set to all-ones *inside* the loop
// because the punpcklbw below consumes it as a destination each iteration.
  2670 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
  2671                                 const uint8* u_buf,
  2672                                 const uint8* v_buf,
  2673                                 uint8* dst_bgra,
  2674                                 int width) {
  2675   asm volatile (
  2676     "sub       %[u_buf],%[v_buf]               \n"
  2677     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2678     "pxor      %%xmm4,%%xmm4                   \n"
  2679     LABELALIGN
  2680   "1:                                          \n"
  2681     READYUV422
  2682     YUVTORGB
  2683     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2684     "punpcklbw %%xmm0,%%xmm1                   \n"
  2685     "punpcklbw %%xmm2,%%xmm5                   \n"
  2686     "movdqa    %%xmm5,%%xmm0                   \n"
  2687     "punpcklwd %%xmm1,%%xmm5                   \n"
  2688     "punpckhwd %%xmm1,%%xmm0                   \n"
  2689     "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
  2690     "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
  2691     "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
  2692     "sub       $0x8,%[width]                   \n"
  2693     "jg        1b                              \n"
  2694   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2695     [u_buf]"+r"(u_buf),    // %[u_buf]
  2696     [v_buf]"+r"(v_buf),    // %[v_buf]
  2697     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
  2698     [width]"+rm"(width)    // %[width]
  2699   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2700   : "memory", "cc"
  2701 #if defined(__native_client__) && defined(__x86_64__)
  2702     , "r14"
  2703 #endif
  2704 #if defined(__SSE2__)
  2705     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2706 #endif
  2707   );
// Convert one row of planar I422 to ABGR with SSSE3 (aligned stores).
// Same structure as I422ToBGRARow_SSSE3 but the interleave order places
// R,G,B,A bytes for ABGR; xmm5 (all 0xFF from the setup pcmpeqb) supplies
// alpha. dst_abgr must be 16-byte aligned; width processed 8 pixels/loop.
  2710 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
  2711                                 const uint8* u_buf,
  2712                                 const uint8* v_buf,
  2713                                 uint8* dst_abgr,
  2714                                 int width) {
  2715   asm volatile (
  2716     "sub       %[u_buf],%[v_buf]               \n"
  2717     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2718     "pxor      %%xmm4,%%xmm4                   \n"
  2719     LABELALIGN
  2720   "1:                                          \n"
  2721     READYUV422
  2722     YUVTORGB
  2723     "punpcklbw %%xmm1,%%xmm2                   \n"
  2724     "punpcklbw %%xmm5,%%xmm0                   \n"
  2725     "movdqa    %%xmm2,%%xmm1                   \n"
  2726     "punpcklwd %%xmm0,%%xmm2                   \n"
  2727     "punpckhwd %%xmm0,%%xmm1                   \n"
  2728     "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
  2729     "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
  2730     "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
  2731     "sub       $0x8,%[width]                   \n"
  2732     "jg        1b                              \n"
  2733   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2734     [u_buf]"+r"(u_buf),    // %[u_buf]
  2735     [v_buf]"+r"(v_buf),    // %[v_buf]
  2736     [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
  2737     [width]"+rm"(width)    // %[width]
  2738   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2739   : "memory", "cc"
  2740 #if defined(__native_client__) && defined(__x86_64__)
  2741     , "r14"
  2742 #endif
  2743 #if defined(__SSE2__)
  2744     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2745 #endif
  2746   );
// Convert one row of planar I422 to RGBA with SSSE3 (aligned stores).
// Mirror of the BGRA variant with B and R swapped in the byte interleave;
// xmm5 is refilled with 0xFF each iteration (YUVTORGB clobbers it) to
// provide the alpha byte. dst_rgba must be 16-byte aligned.
  2749 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
  2750                                 const uint8* u_buf,
  2751                                 const uint8* v_buf,
  2752                                 uint8* dst_rgba,
  2753                                 int width) {
  2754   asm volatile (
  2755     "sub       %[u_buf],%[v_buf]               \n"
  2756     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2757     "pxor      %%xmm4,%%xmm4                   \n"
  2758     LABELALIGN
  2759   "1:                                          \n"
  2760     READYUV422
  2761     YUVTORGB
  2762     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2763     "punpcklbw %%xmm2,%%xmm1                   \n"
  2764     "punpcklbw %%xmm0,%%xmm5                   \n"
  2765     "movdqa    %%xmm5,%%xmm0                   \n"
  2766     "punpcklwd %%xmm1,%%xmm5                   \n"
  2767     "punpckhwd %%xmm1,%%xmm0                   \n"
  2768     "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
  2769     "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
  2770     "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
  2771     "sub       $0x8,%[width]                   \n"
  2772     "jg        1b                              \n"
  2773   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2774     [u_buf]"+r"(u_buf),    // %[u_buf]
  2775     [v_buf]"+r"(v_buf),    // %[v_buf]
  2776     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
  2777     [width]"+rm"(width)    // %[width]
  2778   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2779   : "memory", "cc"
  2780 #if defined(__native_client__) && defined(__x86_64__)
  2781     , "r14"
  2782 #endif
  2783 #if defined(__SSE2__)
  2784     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2785 #endif
  2786   );
// Unaligned variant of I422ToBGRARow_SSSE3: identical pixel math, but the
// two 16-byte stores use movdqu so dst_bgra may have any alignment.
  2789 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
  2790                                           const uint8* u_buf,
  2791                                           const uint8* v_buf,
  2792                                           uint8* dst_bgra,
  2793                                           int width) {
  2794   asm volatile (
  2795     "sub       %[u_buf],%[v_buf]               \n"
  2796     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2797     "pxor      %%xmm4,%%xmm4                   \n"
  2798     LABELALIGN
  2799   "1:                                          \n"
  2800     READYUV422
  2801     YUVTORGB
  2802     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2803     "punpcklbw %%xmm0,%%xmm1                   \n"
  2804     "punpcklbw %%xmm2,%%xmm5                   \n"
  2805     "movdqa    %%xmm5,%%xmm0                   \n"
  2806     "punpcklwd %%xmm1,%%xmm5                   \n"
  2807     "punpckhwd %%xmm1,%%xmm0                   \n"
  2808     "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
  2809     "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
  2810     "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
  2811     "sub       $0x8,%[width]                   \n"
  2812     "jg        1b                              \n"
  2813   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2814     [u_buf]"+r"(u_buf),    // %[u_buf]
  2815     [v_buf]"+r"(v_buf),    // %[v_buf]
  2816     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
  2817     [width]"+rm"(width)    // %[width]
  2818   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2819   : "memory", "cc"
  2820 #if defined(__native_client__) && defined(__x86_64__)
  2821     , "r14"
  2822 #endif
  2823 #if defined(__SSE2__)
  2824     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2825 #endif
  2826   );
// Unaligned variant of I422ToABGRRow_SSSE3: same interleave, movdqu stores
// so dst_abgr may have any alignment.
  2829 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
  2830                                           const uint8* u_buf,
  2831                                           const uint8* v_buf,
  2832                                           uint8* dst_abgr,
  2833                                           int width) {
  2834   asm volatile (
  2835     "sub       %[u_buf],%[v_buf]               \n"
  2836     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2837     "pxor      %%xmm4,%%xmm4                   \n"
  2838     LABELALIGN
  2839   "1:                                          \n"
  2840     READYUV422
  2841     YUVTORGB
  2842     "punpcklbw %%xmm1,%%xmm2                   \n"
  2843     "punpcklbw %%xmm5,%%xmm0                   \n"
  2844     "movdqa    %%xmm2,%%xmm1                   \n"
  2845     "punpcklwd %%xmm0,%%xmm2                   \n"
  2846     "punpckhwd %%xmm0,%%xmm1                   \n"
  2847     "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
  2848     "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
  2849     "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
  2850     "sub       $0x8,%[width]                   \n"
  2851     "jg        1b                              \n"
  2852   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2853     [u_buf]"+r"(u_buf),    // %[u_buf]
  2854     [v_buf]"+r"(v_buf),    // %[v_buf]
  2855     [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
  2856     [width]"+rm"(width)    // %[width]
  2857   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2858   : "memory", "cc"
  2859 #if defined(__native_client__) && defined(__x86_64__)
  2860     , "r14"
  2861 #endif
  2862 #if defined(__SSE2__)
  2863     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2864 #endif
  2865   );
// Unaligned variant of I422ToRGBARow_SSSE3: same interleave, movdqu stores
// so dst_rgba may have any alignment.
  2868 void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
  2869                                           const uint8* u_buf,
  2870                                           const uint8* v_buf,
  2871                                           uint8* dst_rgba,
  2872                                           int width) {
  2873   asm volatile (
  2874     "sub       %[u_buf],%[v_buf]               \n"
  2875     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2876     "pxor      %%xmm4,%%xmm4                   \n"
  2877     LABELALIGN
  2878   "1:                                          \n"
  2879     READYUV422
  2880     YUVTORGB
  2881     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  2882     "punpcklbw %%xmm2,%%xmm1                   \n"
  2883     "punpcklbw %%xmm0,%%xmm5                   \n"
  2884     "movdqa    %%xmm5,%%xmm0                   \n"
  2885     "punpcklwd %%xmm1,%%xmm5                   \n"
  2886     "punpckhwd %%xmm1,%%xmm0                   \n"
  2887     "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
  2888     "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
  2889     "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
  2890     "sub       $0x8,%[width]                   \n"
  2891     "jg        1b                              \n"
  2892   : [y_buf]"+r"(y_buf),    // %[y_buf]
  2893     [u_buf]"+r"(u_buf),    // %[u_buf]
  2894     [v_buf]"+r"(v_buf),    // %[v_buf]
  2895     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
  2896     [width]"+rm"(width)    // %[width]
  2897   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  2898   : "memory", "cc"
  2899 #if defined(__native_client__) && defined(__x86_64__)
  2900     , "r14"
  2901 #endif
  2902 #if defined(__SSE2__)
  2903     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2904 #endif
  2905   );
  2908 #endif  // HAS_I422TOARGBROW_SSSE3
  2910 #ifdef HAS_YTOARGBROW_SSE2
  2911 void YToARGBRow_SSE2(const uint8* y_buf,
  2912                      uint8* dst_argb,
  2913                      int width) {
  2914   asm volatile (
  2915     "pxor      %%xmm5,%%xmm5                   \n"
  2916     "pcmpeqb   %%xmm4,%%xmm4                   \n"
  2917     "pslld     $0x18,%%xmm4                    \n"
  2918     "mov       $0x00100010,%%eax               \n"
  2919     "movd      %%eax,%%xmm3                    \n"
  2920     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  2921     "mov       $0x004a004a,%%eax               \n"
  2922     "movd      %%eax,%%xmm2                    \n"
  2923     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
  2924     LABELALIGN
  2925   "1:                                          \n"
  2926     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2927     "movq      " MEMACCESS(0) ",%%xmm0         \n"
  2928     "lea       " MEMLEA(0x8,0) ",%0            \n"
  2929     "punpcklbw %%xmm5,%%xmm0                   \n"
  2930     "psubusw   %%xmm3,%%xmm0                   \n"
  2931     "pmullw    %%xmm2,%%xmm0                   \n"
  2932     "psrlw     $6, %%xmm0                      \n"
  2933     "packuswb  %%xmm0,%%xmm0                   \n"
  2935     // Step 2: Weave into ARGB
  2936     "punpcklbw %%xmm0,%%xmm0                   \n"
  2937     "movdqa    %%xmm0,%%xmm1                   \n"
  2938     "punpcklwd %%xmm0,%%xmm0                   \n"
  2939     "punpckhwd %%xmm1,%%xmm1                   \n"
  2940     "por       %%xmm4,%%xmm0                   \n"
  2941     "por       %%xmm4,%%xmm1                   \n"
  2942     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  2943     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
  2944     "lea       " MEMLEA(0x20,1) ",%1           \n"
  2946     "sub       $0x8,%2                         \n"
  2947     "jg        1b                              \n"
  2948   : "+r"(y_buf),     // %0
  2949     "+r"(dst_argb),  // %1
  2950     "+rm"(width)     // %2
  2952   : "memory", "cc", "eax"
  2953 #if defined(__SSE2__)
  2954     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2955 #endif
  2956   );
  2958 #endif  // HAS_YTOARGBROW_SSE2
  2960 #ifdef HAS_MIRRORROW_SSSE3
  2961 // Shuffle table for reversing the bytes.
// kShuffleMirror: pshufb control reversing all 16 bytes of an xmm register.
  2962 static uvec8 kShuffleMirror = {
  2963   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2964 };
// Reverse a byte row in place order (dst[i] = src[width-1-i]) with SSSE3.
// Reads from the end of src backwards 16 bytes at a time and byte-reverses
// each block with pshufb. Aligned loads/stores: src and dst must be
// 16-byte aligned and width a multiple of 16.
  2966 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  2967   intptr_t temp_width = (intptr_t)(width);
  2968   asm volatile (
  2969     "movdqa    %3,%%xmm5                       \n"
  2970     "lea       " MEMLEA(-0x10,0) ",%0          \n"
  2971     LABELALIGN
  2972   "1:                                          \n"
  2973     MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
  2974     "pshufb    %%xmm5,%%xmm0                   \n"
  2975     "sub       $0x10,%2                        \n"
  2976     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  2977     "lea       " MEMLEA(0x10,1) ",%1           \n"
  2978     "jg        1b                              \n"
  2979   : "+r"(src),  // %0
  2980     "+r"(dst),  // %1
  2981     "+r"(temp_width)  // %2
  2982   : "m"(kShuffleMirror) // %3
  2983   : "memory", "cc"
  2984 #if defined(__native_client__) && defined(__x86_64__)
  2985     , "r14"
  2986 #endif
  2987 #if defined(__SSE2__)
  2988     , "xmm0", "xmm5"
  2989 #endif
  2990   );
  2992 #endif  // HAS_MIRRORROW_SSSE3
  2994 #ifdef HAS_MIRRORROW_SSE2
// Reverse a byte row with plain SSE2 (no pshufb): swap bytes within words
// via shifts+or, reverse words within each half with pshuflw/pshufhw, then
// swap the two 64-bit halves with pshufd. Unaligned loads/stores, so any
// pointer alignment is accepted; width must be a multiple of 16.
  2995 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  2996   intptr_t temp_width = (intptr_t)(width);
  2997   asm volatile (
  2998     "lea       " MEMLEA(-0x10,0) ",%0          \n"
  2999     LABELALIGN
  3000   "1:                                          \n"
  3001     MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
  3002     "movdqa    %%xmm0,%%xmm1                   \n"
  3003     "psllw     $0x8,%%xmm0                     \n"
  3004     "psrlw     $0x8,%%xmm1                     \n"
  3005     "por       %%xmm1,%%xmm0                   \n"
  3006     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
  3007     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
  3008     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
  3009     "sub       $0x10,%2                        \n"
  3010     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  3011     "lea       " MEMLEA(0x10,1)",%1            \n"
  3012     "jg        1b                              \n"
  3013   : "+r"(src),  // %0
  3014     "+r"(dst),  // %1
  3015     "+r"(temp_width)  // %2
  3017   : "memory", "cc"
  3018 #if defined(__native_client__) && defined(__x86_64__)
  3019     , "r14"
  3020 #endif
  3021 #if defined(__SSE2__)
  3022     , "xmm0", "xmm1"
  3023 #endif
  3024   );
  3026 #endif  // HAS_MIRRORROW_SSE2
  3028 #ifdef HAS_MIRRORROW_UV_SSSE3
  3029 // Shuffle table for reversing the bytes of UV channels.
// kShuffleMirrorUV: pshufb control that both reverses and deinterleaves a
// UV block — even source bytes (U) reversed into the low 8 bytes, odd
// source bytes (V) reversed into the high 8 bytes.
  3030 static uvec8 kShuffleMirrorUV = {
  3031   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  3032 };
// Mirror an interleaved UV row into separate, reversed U and V planes.
// 8 UV pairs (16 bytes) per iteration, walking src backwards; movlpd/movhpd
// emit the split halves. dst_v is rebased relative to dst_u so one pointer
// advances both outputs. src must be 16-byte aligned (movdqa load).
  3033 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  3034                        int width) {
  3035   intptr_t temp_width = (intptr_t)(width);
  3036   asm volatile (
  3037     "movdqa    %4,%%xmm1                       \n"
  3038     "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
  3039     "sub       %1,%2                           \n"
  3040     LABELALIGN
  3041   "1:                                          \n"
  3042     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3043     "lea       " MEMLEA(-0x10,0) ",%0            \n"
  3044     "pshufb    %%xmm1,%%xmm0                   \n"
  3045     "sub       $8,%3                           \n"
  3046     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
  3047     BUNDLEALIGN
  3048     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
  3049     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3050     "jg        1b                              \n"
  3051   : "+r"(src),      // %0
  3052     "+r"(dst_u),    // %1
  3053     "+r"(dst_v),    // %2
  3054     "+r"(temp_width)  // %3
  3055   : "m"(kShuffleMirrorUV)  // %4
  3056   : "memory", "cc"
  3057 #if defined(__native_client__) && defined(__x86_64__)
  3058     , "r14"
  3059 #endif
  3060 #if defined(__SSE2__)
  3061     , "xmm0", "xmm1"
  3062 #endif
  3063   );
  3065 #endif  // HAS_MIRRORROW_UV_SSSE3
  3067 #ifdef HAS_ARGBMIRRORROW_SSSE3
  3068 // Shuffle table for reversing the bytes.
// kARGBShuffleMirror: pshufb control reversing the order of the four 32-bit
// pixels in an xmm register while keeping bytes within each pixel intact.
  3069 static uvec8 kARGBShuffleMirror = {
  3070   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
  3071 };
// Mirror a row of ARGB pixels (4 bytes each) with SSSE3: 4 pixels per
// iteration, reading src backwards. Aligned loads/stores — src and dst must
// be 16-byte aligned and width a multiple of 4.
  3073 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  3074   intptr_t temp_width = (intptr_t)(width);
  3075   asm volatile (
  3076     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
  3077     "movdqa    %3,%%xmm5                       \n"
  3078     LABELALIGN
  3079   "1:                                          \n"
  3080     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3081     "pshufb    %%xmm5,%%xmm0                   \n"
  3082     "lea       " MEMLEA(-0x10,0) ",%0          \n"
  3083     "sub       $0x4,%2                         \n"
  3084     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  3085     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3086     "jg        1b                              \n"
  3087   : "+r"(src),  // %0
  3088     "+r"(dst),  // %1
  3089     "+r"(temp_width)  // %2
  3090   : "m"(kARGBShuffleMirror)  // %3
  3091   : "memory", "cc"
  3092 #if defined(__SSE2__)
  3093     , "xmm0", "xmm5"
  3094 #endif
  3095   );
  3097 #endif  // HAS_ARGBMIRRORROW_SSSE3
  3099 #ifdef HAS_SPLITUVROW_SSE2
// De-interleave a UV row into separate U and V planes with SSE2.
// xmm5 = 0x00FF word mask: pand keeps the even (U) bytes, psrlw exposes the
// odd (V) bytes; packuswb compacts each stream. 16 pixels (32 bytes of UV)
// per iteration; aligned loads/stores. dst_v is rebased relative to dst_u
// so a single pointer increment advances both outputs.
  3100 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  3101   asm volatile (
  3102     "pcmpeqb    %%xmm5,%%xmm5                    \n"
  3103     "psrlw      $0x8,%%xmm5                      \n"
  3104     "sub        %1,%2                            \n"
  3105     LABELALIGN
  3106   "1:                                            \n"
  3107     "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
  3108     "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  3109     "lea        " MEMLEA(0x20,0) ",%0            \n"
  3110     "movdqa     %%xmm0,%%xmm2                    \n"
  3111     "movdqa     %%xmm1,%%xmm3                    \n"
  3112     "pand       %%xmm5,%%xmm0                    \n"
  3113     "pand       %%xmm5,%%xmm1                    \n"
  3114     "packuswb   %%xmm1,%%xmm0                    \n"
  3115     "psrlw      $0x8,%%xmm2                      \n"
  3116     "psrlw      $0x8,%%xmm3                      \n"
  3117     "packuswb   %%xmm3,%%xmm2                    \n"
  3118     "movdqa     %%xmm0," MEMACCESS(1) "          \n"
  3119     MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
  3120     "lea        " MEMLEA(0x10,1) ",%1            \n"
  3121     "sub        $0x10,%3                         \n"
  3122     "jg         1b                               \n"
  3123   : "+r"(src_uv),     // %0
  3124     "+r"(dst_u),      // %1
  3125     "+r"(dst_v),      // %2
  3126     "+r"(pix)         // %3
  3128   : "memory", "cc"
  3129 #if defined(__native_client__) && defined(__x86_64__)
  3130     , "r14"
  3131 #endif
  3132 #if defined(__SSE2__)
  3133     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3134 #endif
  3135   );
// Unaligned variant of SplitUVRow_SSE2: identical masking/packing, but
// movdqu loads/stores so the pointers may have any alignment.
  3138 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  3139                                int pix) {
  3140   asm volatile (
  3141     "pcmpeqb    %%xmm5,%%xmm5                    \n"
  3142     "psrlw      $0x8,%%xmm5                      \n"
  3143     "sub        %1,%2                            \n"
  3144     LABELALIGN
  3145   "1:                                            \n"
  3146     "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  3147     "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  3148     "lea        " MEMLEA(0x20,0) ",%0            \n"
  3149     "movdqa     %%xmm0,%%xmm2                    \n"
  3150     "movdqa     %%xmm1,%%xmm3                    \n"
  3151     "pand       %%xmm5,%%xmm0                    \n"
  3152     "pand       %%xmm5,%%xmm1                    \n"
  3153     "packuswb   %%xmm1,%%xmm0                    \n"
  3154     "psrlw      $0x8,%%xmm2                      \n"
  3155     "psrlw      $0x8,%%xmm3                      \n"
  3156     "packuswb   %%xmm3,%%xmm2                    \n"
  3157     "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  3158     MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
  3159     "lea        " MEMLEA(0x10,1) ",%1            \n"
  3160     "sub        $0x10,%3                         \n"
  3161     "jg         1b                               \n"
  3162   : "+r"(src_uv),     // %0
  3163     "+r"(dst_u),      // %1
  3164     "+r"(dst_v),      // %2
  3165     "+r"(pix)         // %3
  3167   : "memory", "cc"
  3168 #if defined(__native_client__) && defined(__x86_64__)
  3169     , "r14"
  3170 #endif
  3171 #if defined(__SSE2__)
  3172     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3173 #endif
  3174   );
  3176 #endif  // HAS_SPLITUVROW_SSE2
  3178 #ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V planes into a UV row with SSE2 (inverse of
// SplitUVRow). punpcklbw/punpckhbw weave 16 U bytes with 16 V bytes into
// 32 bytes of UV per iteration. src_v is rebased relative to src_u so one
// pointer increment reads both planes. Aligned loads/stores.
  3179 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  3180                      int width) {
  3181   asm volatile (
  3182     "sub       %0,%1                             \n"
  3183     LABELALIGN
  3184   "1:                                            \n"
  3185     "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
  3186     MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
  3187     "lea       " MEMLEA(0x10,0) ",%0             \n"
  3188     "movdqa    %%xmm0,%%xmm2                     \n"
  3189     "punpcklbw %%xmm1,%%xmm0                     \n"
  3190     "punpckhbw %%xmm1,%%xmm2                     \n"
  3191     "movdqa    %%xmm0," MEMACCESS(2) "           \n"
  3192     "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
  3193     "lea       " MEMLEA(0x20,2) ",%2             \n"
  3194     "sub       $0x10,%3                          \n"
  3195     "jg        1b                                \n"
  3196   : "+r"(src_u),     // %0
  3197     "+r"(src_v),     // %1
  3198     "+r"(dst_uv),    // %2
  3199     "+r"(width)      // %3
  3201   : "memory", "cc"
  3202 #if defined(__native_client__) && defined(__x86_64__)
  3203     , "r14"
  3204 #endif
  3205 #if defined(__SSE2__)
  3206     , "xmm0", "xmm1", "xmm2"
  3207 #endif
  3208   );
// Unaligned variant of MergeUVRow_SSE2: same interleave, movdqu
// loads/stores so the pointers may have any alignment.
  3211 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
  3212                                uint8* dst_uv, int width) {
  3213   asm volatile (
  3214     "sub       %0,%1                             \n"
  3215     LABELALIGN
  3216   "1:                                            \n"
  3217     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
  3218     MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
  3219     "lea       " MEMLEA(0x10,0) ",%0             \n"
  3220     "movdqa    %%xmm0,%%xmm2                     \n"
  3221     "punpcklbw %%xmm1,%%xmm0                     \n"
  3222     "punpckhbw %%xmm1,%%xmm2                     \n"
  3223     "movdqu    %%xmm0," MEMACCESS(2) "           \n"
  3224     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
  3225     "lea       " MEMLEA(0x20,2) ",%2             \n"
  3226     "sub       $0x10,%3                          \n"
  3227     "jg        1b                                \n"
  3228   : "+r"(src_u),     // %0
  3229     "+r"(src_v),     // %1
  3230     "+r"(dst_uv),    // %2
  3231     "+r"(width)      // %3
  3233   : "memory", "cc"
  3234 #if defined(__native_client__) && defined(__x86_64__)
  3235     , "r14"
  3236 #endif
  3237 #if defined(__SSE2__)
  3238     , "xmm0", "xmm1", "xmm2"
  3239 #endif
  3240   );
  3242 #endif  // HAS_MERGEUVROW_SSE2
  3244 #ifdef HAS_COPYROW_SSE2
// Copy a row of bytes 32 at a time with SSE2. Aligned loads/stores: both
// pointers must be 16-byte aligned and count a multiple of 32.
  3245 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  3246   asm volatile (
  3247     LABELALIGN
  3248   "1:                                          \n"
  3249     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3250     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3251     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3252     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  3253     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
  3254     "lea       " MEMLEA(0x20,1) ",%1           \n"
  3255     "sub       $0x20,%2                        \n"
  3256     "jg        1b                              \n"
  3257   : "+r"(src),   // %0
  3258     "+r"(dst),   // %1
  3259     "+r"(count)  // %2
  3261   : "memory", "cc"
  3262 #if defined(__SSE2__)
  3263     , "xmm0", "xmm1"
  3264 #endif
  3265   );
  3267 #endif  // HAS_COPYROW_SSE2
  3269 #ifdef HAS_COPYROW_X86
// Copy a row using `rep movsl` (32-bit moves). width is divided by 4, so it
// must be a multiple of 4; src/dst are pinned to ESI/EDI as the string
// instructions require.
  3270 void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  3271   size_t width_tmp = (size_t)(width);
  3272   asm volatile (
  3273     "shr       $0x2,%2                         \n"
  3274     "rep movsl " MEMMOVESTRING(0,1) "          \n"
  3275   : "+S"(src),  // %0
  3276     "+D"(dst),  // %1
  3277     "+c"(width_tmp) // %2
  3279   : "memory", "cc"
  3280   );
  3282 #endif  // HAS_COPYROW_X86
  3284 #ifdef HAS_COPYROW_ERMS
  3285 // Unaligned Multiple of 1.
// Copy a row using `rep movsb` — fast on CPUs with Enhanced Rep MovSB
// (ERMS). No alignment or multiple-of-N requirement on width.
  3286 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  3287   size_t width_tmp = (size_t)(width);
  3288   asm volatile (
  3289     "rep movsb " MEMMOVESTRING(0,1) "          \n"
  3290   : "+S"(src),  // %0
  3291     "+D"(dst),  // %1
  3292     "+c"(width_tmp) // %2
  3294   : "memory", "cc"
  3295   );
  3297 #endif  // HAS_COPYROW_ERMS
  3299 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3300 // width in pixels
// Copy only the alpha channel of src ARGB pixels into dst ARGB pixels,
// preserving dst's BGR bytes. xmm0 = 0xFF000000 (alpha mask), xmm1 =
// 0x00FFFFFF (color mask); src alpha and dst color are masked then OR'd.
// 8 pixels per iteration; aligned loads/stores (read-modify-write of dst).
  3301 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3302   asm volatile (
  3303     "pcmpeqb   %%xmm0,%%xmm0                   \n"
  3304     "pslld     $0x18,%%xmm0                    \n"
  3305     "pcmpeqb   %%xmm1,%%xmm1                   \n"
  3306     "psrld     $0x8,%%xmm1                     \n"
  3307     LABELALIGN
  3308   "1:                                          \n"
  3309     "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
  3310     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
  3311     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3312     "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
  3313     "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
  3314     "pand      %%xmm0,%%xmm2                   \n"
  3315     "pand      %%xmm0,%%xmm3                   \n"
  3316     "pand      %%xmm1,%%xmm4                   \n"
  3317     "pand      %%xmm1,%%xmm5                   \n"
  3318     "por       %%xmm4,%%xmm2                   \n"
  3319     "por       %%xmm5,%%xmm3                   \n"
  3320     "movdqa    %%xmm2," MEMACCESS(1) "         \n"
  3321     "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
  3322     "lea       " MEMLEA(0x20,1) ",%1           \n"
  3323     "sub       $0x8,%2                         \n"
  3324     "jg        1b                              \n"
  3325   : "+r"(src),   // %0
  3326     "+r"(dst),   // %1
  3327     "+r"(width)  // %2
  3329   : "memory", "cc"
  3330 #if defined(__SSE2__)
  3331     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3332 #endif
  3333   );
  3335 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
  3337 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3338 // width in pixels
// AVX2 variant of ARGBCopyAlphaRow: ymm0 = 0x00FFFFFF per-pixel mask used
// as a vpblendvb selector to keep dst's BGR bytes and take src's alpha.
// 16 pixels per iteration, unaligned loads/stores; vzeroupper avoids
// AVX->SSE transition penalties on return.
  3339 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3340   asm volatile (
  3341     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
  3342     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
  3343     LABELALIGN
  3344   "1:                                          \n"
  3345     "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
  3346     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
  3347     "lea       " MEMLEA(0x40,0) ",%0           \n"
  3348     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
  3349     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
  3350     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
  3351     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
  3352     "lea       " MEMLEA(0x40,1) ",%1           \n"
  3353     "sub       $0x10,%2                        \n"
  3354     "jg        1b                              \n"
  3355     "vzeroupper                                \n"
  3356   : "+r"(src),   // %0
  3357     "+r"(dst),   // %1
  3358     "+r"(width)  // %2
  3360   : "memory", "cc"
  3361 #if defined(__SSE2__)
  3362     , "xmm0", "xmm1", "xmm2"
  3363 #endif
  3364   );
  3366 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
  3368 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3369 // width in pixels
// Copy 8 Y (luma) bytes into the alpha channel of 8 dst ARGB pixels,
// preserving dst's BGR bytes. Each Y byte is replicated across a dword by
// punpcklbw/punpck*wd so it lands in the alpha position after masking.
// NOTE(review): the punpckhwd reads xmm3's stale contents, but those lanes
// are discarded by the 0xFF000000 mask in xmm0 — only the bytes that came
// from xmm2 survive. Aligned read-modify-write of dst.
  3370 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3371   asm volatile (
  3372     "pcmpeqb   %%xmm0,%%xmm0                   \n"
  3373     "pslld     $0x18,%%xmm0                    \n"
  3374     "pcmpeqb   %%xmm1,%%xmm1                   \n"
  3375     "psrld     $0x8,%%xmm1                     \n"
  3376     LABELALIGN
  3377   "1:                                          \n"
  3378     "movq      " MEMACCESS(0) ",%%xmm2         \n"
  3379     "lea       " MEMLEA(0x8,0) ",%0            \n"
  3380     "punpcklbw %%xmm2,%%xmm2                   \n"
  3381     "punpckhwd %%xmm2,%%xmm3                   \n"
  3382     "punpcklwd %%xmm2,%%xmm2                   \n"
  3383     "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
  3384     "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
  3385     "pand      %%xmm0,%%xmm2                   \n"
  3386     "pand      %%xmm0,%%xmm3                   \n"
  3387     "pand      %%xmm1,%%xmm4                   \n"
  3388     "pand      %%xmm1,%%xmm5                   \n"
  3389     "por       %%xmm4,%%xmm2                   \n"
  3390     "por       %%xmm5,%%xmm3                   \n"
  3391     "movdqa    %%xmm2," MEMACCESS(1) "         \n"
  3392     "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
  3393     "lea       " MEMLEA(0x20,1) ",%1           \n"
  3394     "sub       $0x8,%2                         \n"
  3395     "jg        1b                              \n"
  3396   : "+r"(src),   // %0
  3397     "+r"(dst),   // %1
  3398     "+r"(width)  // %2
  3400   : "memory", "cc"
  3401 #if defined(__SSE2__)
  3402     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3403 #endif
  3404   );
  3406 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3408 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3409 // width in pixels
// AVX2 variant of ARGBCopyYToAlphaRow: vpmovzxbd widens 8 Y bytes to dwords,
// vpslld moves each Y into the alpha byte, then vpblendvb (ymm0 =
// 0x00FFFFFF selector) keeps dst's BGR bytes. 16 pixels per iteration,
// unaligned loads/stores; vzeroupper before returning.
  3410 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3411   asm volatile (
  3412     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
  3413     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
  3414     LABELALIGN
  3415   "1:                                          \n"
  3416     "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
  3417     "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
  3418     "lea       " MEMLEA(0x10,0) ",%0           \n"
  3419     "vpslld    $0x18,%%ymm1,%%ymm1             \n"
  3420     "vpslld    $0x18,%%ymm2,%%ymm2             \n"
  3421     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
  3422     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
  3423     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
  3424     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
  3425     "lea       " MEMLEA(0x40,1) ",%1           \n"
  3426     "sub       $0x10,%2                        \n"
  3427     "jg        1b                              \n"
  3428     "vzeroupper                                \n"
  3429   : "+r"(src),   // %0
  3430     "+r"(dst),   // %1
  3431     "+r"(width)  // %2
  3433   : "memory", "cc"
  3434 #if defined(__SSE2__)
  3435     , "xmm0", "xmm1", "xmm2"
  3436 #endif
  3437   );
  3439 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3441 #ifdef HAS_SETROW_X86
// Fill a row with a 32-bit pattern using `rep stosl`. width is in bytes and
// divided by 4, so it must be a multiple of 4; v32 is pinned to EAX and dst
// to EDI as the string instruction requires.
  3442 void SetRow_X86(uint8* dst, uint32 v32, int width) {
  3443   size_t width_tmp = (size_t)(width);
  3444   asm volatile (
  3445     "shr       $0x2,%1                         \n"
  3446     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
  3447     : "+D"(dst),       // %0
  3448       "+c"(width_tmp)  // %1
  3449     : "a"(v32)         // %2
  3450     : "memory", "cc");
       // ARGBSetRows_X86: store the dword value v32 (one ARGB pixel) 'width'
       // times into each of 'height' rows, advancing dst by dst_stride BYTES
       // per row.  Unlike SetRow_X86, 'width' here counts dwords/pixels:
       // width_tmp is used directly as the rep stosl count, with no shift.
  3453 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
  3454                    int dst_stride, int height) {
  3455   for (int y = 0; y < height; ++y) {
  3456     size_t width_tmp = (size_t)(width);
  3457     uint32* d = (uint32*)(dst);
  3458     asm volatile (
  3459       "rep stosl " MEMSTORESTRING(eax,0) "     \n"
  3460       : "+D"(d),         // %0
  3461         "+c"(width_tmp)  // %1
  3462       : "a"(v32)         // %2
  3463       : "memory", "cc");
  3464     dst += dst_stride;
  3467 #endif  // HAS_SETROW_X86
  3469 #ifdef HAS_YUY2TOYROW_SSE2
       // YUY2ToYRow_SSE2: extract the luma (Y) plane from packed YUY2
       // (byte order Y0 U0 Y1 V0 ... — luma in even bytes).  xmm5 = 0x00ff
       // per word; pand keeps the even (Y) bytes, packuswb compacts 32 source
       // bytes into 16 Y bytes per iteration.
       // Aligned variant: movdqa requires 16-byte aligned src/dst; an
       // _Unaligned_ variant exists below for other cases.
  3470 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  3471   asm volatile (
  3472     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3473     "psrlw     $0x8,%%xmm5                     \n"
  3474     LABELALIGN
  3475   "1:                                          \n"
  3476     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3477     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3478     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3479     "pand      %%xmm5,%%xmm0                   \n"
  3480     "pand      %%xmm5,%%xmm1                   \n"
  3481     "packuswb  %%xmm1,%%xmm0                   \n"
  3482     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  3483     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3484     "sub       $0x10,%2                        \n"
  3485     "jg        1b                              \n"
  3486   : "+r"(src_yuy2),  // %0
  3487     "+r"(dst_y),     // %1
  3488     "+r"(pix)        // %2
  3490   : "memory", "cc"
  3491 #if defined(__SSE2__)
  3492     , "xmm0", "xmm1", "xmm5"
  3493 #endif
  3494   );
       // YUY2ToUVRow_SSE2: produce one row of U and one row of V from TWO
       // rows of YUY2, vertically averaging the chroma of row N and row
       // N+stride_yuy2 (4:2:0 subsampling).  16 source pixels -> 8 U + 8 V
       // bytes per iteration.  Aligned (movdqa) variant.
  3497 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  3498                       uint8* dst_u, uint8* dst_v, int pix) {
  3499   asm volatile (
  3500     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3501     "psrlw     $0x8,%%xmm5                     \n"
           // make %2 the byte offset dst_v - dst_u so one register (%1)
           // indexes both output planes
  3502     "sub       %1,%2                           \n"
    LABELALIGN line follows.
  3503     LABELALIGN
  3504   "1:                                          \n"
  3505     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3506     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3507     BUNDLEALIGN
  3508     MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
  3509     MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
  3510     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // average with the second source row, then keep only the chroma
           // bytes (odd positions) of each 16-bit lane
  3511     "pavgb     %%xmm2,%%xmm0                   \n"
  3512     "pavgb     %%xmm3,%%xmm1                   \n"
  3513     "psrlw     $0x8,%%xmm0                     \n"
  3514     "psrlw     $0x8,%%xmm1                     \n"
  3515     "packuswb  %%xmm1,%%xmm0                   \n"
           // xmm0 now holds interleaved U0 V0 U1 V1 ...; pand keeps U (even
           // bytes) and >>8 keeps V (odd bytes)
  3516     "movdqa    %%xmm0,%%xmm1                   \n"
  3517     "pand      %%xmm5,%%xmm0                   \n"
  3518     "packuswb  %%xmm0,%%xmm0                   \n"
  3519     "psrlw     $0x8,%%xmm1                     \n"
  3520     "packuswb  %%xmm1,%%xmm1                   \n"
  3521     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3522     BUNDLEALIGN
  3523     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3524     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3525     "sub       $0x10,%3                        \n"
  3526     "jg        1b                              \n"
  3527   : "+r"(src_yuy2),    // %0
  3528     "+r"(dst_u),       // %1
  3529     "+r"(dst_v),       // %2
  3530     "+r"(pix)          // %3
  3531   : "r"((intptr_t)(stride_yuy2))  // %4
  3532   : "memory", "cc"
  3533 #if defined(__native_client__) && defined(__x86_64__)
  3534     , "r14"
  3535 #endif
  3536 #if defined(__SSE2__)
  3537     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3538 #endif
  3539   );
       // YUY2ToUV422Row_SSE2: single-row (4:2:2) variant of YUY2ToUVRow_SSE2 —
       // extracts U and V from one row of YUY2 with NO vertical averaging.
       // 16 source pixels -> 8 U + 8 V bytes per iteration.  Aligned (movdqa).
  3542 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  3543                          uint8* dst_u, uint8* dst_v, int pix) {
  3544   asm volatile (
  3545     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3546     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3547     "sub       %1,%2                           \n"
  3548     LABELALIGN
  3549   "1:                                          \n"
  3550     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3551     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3552     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // >>8 keeps the chroma bytes (odd positions) of each word
  3553     "psrlw     $0x8,%%xmm0                     \n"
  3554     "psrlw     $0x8,%%xmm1                     \n"
  3555     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3556     "movdqa    %%xmm0,%%xmm1                   \n"
  3557     "pand      %%xmm5,%%xmm0                   \n"
  3558     "packuswb  %%xmm0,%%xmm0                   \n"
  3559     "psrlw     $0x8,%%xmm1                     \n"
  3560     "packuswb  %%xmm1,%%xmm1                   \n"
  3561     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3562     BUNDLEALIGN
  3563     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3564     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3565     "sub       $0x10,%3                        \n"
  3566     "jg        1b                              \n"
  3567   : "+r"(src_yuy2),    // %0
  3568     "+r"(dst_u),       // %1
  3569     "+r"(dst_v),       // %2
  3570     "+r"(pix)          // %3
  3572   : "memory", "cc"
  3573 #if defined(__native_client__) && defined(__x86_64__)
  3574     , "r14"
  3575 #endif
  3576 #if defined(__SSE2__)
  3577     , "xmm0", "xmm1", "xmm5"
  3578 #endif
  3579   );
       // YUY2ToYRow_Unaligned_SSE2: identical algorithm to YUY2ToYRow_SSE2
       // (mask even/luma bytes, pack), but uses movdqu so src/dst need not be
       // 16-byte aligned.  The "sub" is scheduled before the store so its
       // flags are still valid for the jg at the bottom of the loop.
  3582 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
  3583                                uint8* dst_y, int pix) {
  3584   asm volatile (
  3585     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3586     "psrlw     $0x8,%%xmm5                     \n"
  3587     LABELALIGN
  3588   "1:                                          \n"
  3589     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3590     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3591     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3592     "pand      %%xmm5,%%xmm0                   \n"
  3593     "pand      %%xmm5,%%xmm1                   \n"
  3594     "packuswb  %%xmm1,%%xmm0                   \n"
  3595     "sub       $0x10,%2                        \n"
  3596     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  3597     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3598     "jg        1b                              \n"
  3599   : "+r"(src_yuy2),  // %0
  3600     "+r"(dst_y),     // %1
  3601     "+r"(pix)        // %2
  3603   : "memory", "cc"
  3604 #if defined(__SSE2__)
  3605     , "xmm0", "xmm1", "xmm5"
  3606 #endif
  3607   );
       // YUY2ToUVRow_Unaligned_SSE2: unaligned (movdqu) version of
       // YUY2ToUVRow_SSE2 — averages the chroma of two YUY2 rows (4:2:0) and
       // de-interleaves into separate U and V planes, 16 pixels per pass.
  3610 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
  3611                                 int stride_yuy2,
  3612                                 uint8* dst_u, uint8* dst_v, int pix) {
  3613   asm volatile (
  3614     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3615     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3616     "sub       %1,%2                           \n"
  3617     LABELALIGN
  3618   "1:                                          \n"
  3619     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3620     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3621     BUNDLEALIGN
  3622     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
  3623     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
  3624     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // vertical average with row N+stride, then keep chroma (odd bytes)
  3625     "pavgb     %%xmm2,%%xmm0                   \n"
  3626     "pavgb     %%xmm3,%%xmm1                   \n"
  3627     "psrlw     $0x8,%%xmm0                     \n"
  3628     "psrlw     $0x8,%%xmm1                     \n"
  3629     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3630     "movdqa    %%xmm0,%%xmm1                   \n"
  3631     "pand      %%xmm5,%%xmm0                   \n"
  3632     "packuswb  %%xmm0,%%xmm0                   \n"
  3633     "psrlw     $0x8,%%xmm1                     \n"
  3634     "packuswb  %%xmm1,%%xmm1                   \n"
  3635     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3636     BUNDLEALIGN
  3637     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3638     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3639     "sub       $0x10,%3                        \n"
  3640     "jg        1b                              \n"
  3641   : "+r"(src_yuy2),    // %0
  3642     "+r"(dst_u),       // %1
  3643     "+r"(dst_v),       // %2
  3644     "+r"(pix)          // %3
  3645   : "r"((intptr_t)(stride_yuy2))  // %4
  3646   : "memory", "cc"
  3647 #if defined(__native_client__) && defined(__x86_64__)
  3648     , "r14"
  3649 #endif
  3650 #if defined(__SSE2__)
  3651     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3652 #endif
  3653   );
       // YUY2ToUV422Row_Unaligned_SSE2: unaligned (movdqu) version of
       // YUY2ToUV422Row_SSE2 — extracts U and V from a single YUY2 row
       // (4:2:2, no vertical averaging), 16 pixels per pass.
  3656 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
  3657                                    uint8* dst_u, uint8* dst_v, int pix) {
  3658   asm volatile (
  3659     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3660     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3661     "sub       %1,%2                           \n"
  3662     LABELALIGN
  3663   "1:                                          \n"
  3664     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3665     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3666     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // >>8 keeps the chroma bytes (odd positions) of each word
  3667     "psrlw     $0x8,%%xmm0                     \n"
  3668     "psrlw     $0x8,%%xmm1                     \n"
  3669     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3670     "movdqa    %%xmm0,%%xmm1                   \n"
  3671     "pand      %%xmm5,%%xmm0                   \n"
  3672     "packuswb  %%xmm0,%%xmm0                   \n"
  3673     "psrlw     $0x8,%%xmm1                     \n"
  3674     "packuswb  %%xmm1,%%xmm1                   \n"
  3675     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3676     BUNDLEALIGN
  3677     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3678     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3679     "sub       $0x10,%3                        \n"
  3680     "jg        1b                              \n"
  3681   : "+r"(src_yuy2),    // %0
  3682     "+r"(dst_u),       // %1
  3683     "+r"(dst_v),       // %2
  3684     "+r"(pix)          // %3
  3686   : "memory", "cc"
  3687 #if defined(__native_client__) && defined(__x86_64__)
  3688     , "r14"
  3689 #endif
  3690 #if defined(__SSE2__)
  3691     , "xmm0", "xmm1", "xmm5"
  3692 #endif
  3693   );
       // UYVYToYRow_SSE2: extract the luma (Y) plane from packed UYVY
       // (byte order U0 Y0 V0 Y1 ... — luma in ODD bytes), so psrlw $8 is
       // used instead of the pand used for YUY2.  32 source bytes -> 16 Y
       // bytes per iteration.  Aligned (movdqa) variant.
  3696 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  3697   asm volatile (
  3698     LABELALIGN
  3699   "1:                                          \n"
  3700     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3701     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3702     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3703     "psrlw     $0x8,%%xmm0                     \n"
  3704     "psrlw     $0x8,%%xmm1                     \n"
  3705     "packuswb  %%xmm1,%%xmm0                   \n"
  3706     "sub       $0x10,%2                        \n"
  3707     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  3708     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3709     "jg        1b                              \n"
  3710   : "+r"(src_uyvy),  // %0
  3711     "+r"(dst_y),     // %1
  3712     "+r"(pix)        // %2
  3714   : "memory", "cc"
  3715 #if defined(__SSE2__)
  3716     , "xmm0", "xmm1"
  3717 #endif
  3718   );
       // UYVYToUVRow_SSE2: produce one row of U and one row of V from TWO
       // rows of UYVY (U0 Y0 V0 Y1 ... — chroma in EVEN bytes), vertically
       // averaging row N and row N+stride_uyvy (4:2:0).  Chroma is isolated
       // with pand (instead of the psrlw used for YUY2).
  3721 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  3722                       uint8* dst_u, uint8* dst_v, int pix) {
  3723   asm volatile (
  3724     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3725     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3726     "sub       %1,%2                           \n"
  3727     LABELALIGN
  3728   "1:                                          \n"
  3729     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3730     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3731     BUNDLEALIGN
  3732     MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
  3733     MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
  3734     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // vertical average with row N+stride, then keep chroma (even bytes)
  3735     "pavgb     %%xmm2,%%xmm0                   \n"
  3736     "pavgb     %%xmm3,%%xmm1                   \n"
  3737     "pand      %%xmm5,%%xmm0                   \n"
  3738     "pand      %%xmm5,%%xmm1                   \n"
  3739     "packuswb  %%xmm1,%%xmm0                   \n"
           // xmm0 now holds interleaved U0 V0 U1 V1 ...; pand keeps U,
           // >>8 keeps V
  3740     "movdqa    %%xmm0,%%xmm1                   \n"
  3741     "pand      %%xmm5,%%xmm0                   \n"
  3742     "packuswb  %%xmm0,%%xmm0                   \n"
  3743     "psrlw     $0x8,%%xmm1                     \n"
  3744     "packuswb  %%xmm1,%%xmm1                   \n"
  3745     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3746     BUNDLEALIGN
  3747     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3748     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3749     "sub       $0x10,%3                        \n"
  3750     "jg        1b                              \n"
  3751   : "+r"(src_uyvy),    // %0
  3752     "+r"(dst_u),       // %1
  3753     "+r"(dst_v),       // %2
  3754     "+r"(pix)          // %3
  3755   : "r"((intptr_t)(stride_uyvy))  // %4
  3756   : "memory", "cc"
  3757 #if defined(__native_client__) && defined(__x86_64__)
  3758     , "r14"
  3759 #endif
  3760 #if defined(__SSE2__)
  3761     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3762 #endif
  3763   );
       // UYVYToUV422Row_SSE2: single-row (4:2:2) variant of UYVYToUVRow_SSE2 —
       // extracts U and V from one row of UYVY (chroma in even bytes) with no
       // vertical averaging.  Aligned (movdqa) variant.
  3766 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3767                          uint8* dst_u, uint8* dst_v, int pix) {
  3768   asm volatile (
  3769     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3770     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3771     "sub       %1,%2                           \n"
  3772     LABELALIGN
  3773   "1:                                          \n"
  3774     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  3775     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3776     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // pand keeps the chroma bytes (even positions) of each word
  3777     "pand      %%xmm5,%%xmm0                   \n"
  3778     "pand      %%xmm5,%%xmm1                   \n"
  3779     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3780     "movdqa    %%xmm0,%%xmm1                   \n"
  3781     "pand      %%xmm5,%%xmm0                   \n"
  3782     "packuswb  %%xmm0,%%xmm0                   \n"
  3783     "psrlw     $0x8,%%xmm1                     \n"
  3784     "packuswb  %%xmm1,%%xmm1                   \n"
  3785     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3786     BUNDLEALIGN
  3787     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3788     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3789     "sub       $0x10,%3                        \n"
  3790     "jg        1b                              \n"
  3791   : "+r"(src_uyvy),    // %0
  3792     "+r"(dst_u),       // %1
  3793     "+r"(dst_v),       // %2
  3794     "+r"(pix)          // %3
  3796   : "memory", "cc"
  3797 #if defined(__native_client__) && defined(__x86_64__)
  3798     , "r14"
  3799 #endif
  3800 #if defined(__SSE2__)
  3801     , "xmm0", "xmm1", "xmm5"
  3802 #endif
  3803   );
       // UYVYToYRow_Unaligned_SSE2: unaligned (movdqu) version of
       // UYVYToYRow_SSE2 — luma is in odd bytes, so psrlw $8 then packuswb
       // compacts 32 source bytes into 16 Y bytes per iteration.
  3806 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
  3807                                uint8* dst_y, int pix) {
  3808   asm volatile (
  3809     LABELALIGN
  3810   "1:                                          \n"
  3811     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3812     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3813     "lea       " MEMLEA(0x20,0) ",%0           \n"
  3814     "psrlw     $0x8,%%xmm0                     \n"
  3815     "psrlw     $0x8,%%xmm1                     \n"
  3816     "packuswb  %%xmm1,%%xmm0                   \n"
  3817     "sub       $0x10,%2                        \n"
  3818     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  3819     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3820     "jg        1b                              \n"
  3821   : "+r"(src_uyvy),  // %0
  3822     "+r"(dst_y),     // %1
  3823     "+r"(pix)        // %2
  3825   : "memory", "cc"
  3826 #if defined(__SSE2__)
  3827     , "xmm0", "xmm1"
  3828 #endif
  3829   );
       // UYVYToUVRow_Unaligned_SSE2: unaligned (movdqu) version of
       // UYVYToUVRow_SSE2 — averages the chroma of two UYVY rows (4:2:0) and
       // de-interleaves into separate U and V planes, 16 pixels per pass.
  3832 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
  3833                                 uint8* dst_u, uint8* dst_v, int pix) {
  3834   asm volatile (
  3835     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3836     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3837     "sub       %1,%2                           \n"
  3838     LABELALIGN
  3839   "1:                                          \n"
  3840     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3841     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3842     BUNDLEALIGN
  3843     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
  3844     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
  3845     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // vertical average with row N+stride, then keep chroma (even bytes)
  3846     "pavgb     %%xmm2,%%xmm0                   \n"
  3847     "pavgb     %%xmm3,%%xmm1                   \n"
  3848     "pand      %%xmm5,%%xmm0                   \n"
  3849     "pand      %%xmm5,%%xmm1                   \n"
  3850     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3851     "movdqa    %%xmm0,%%xmm1                   \n"
  3852     "pand      %%xmm5,%%xmm0                   \n"
  3853     "packuswb  %%xmm0,%%xmm0                   \n"
  3854     "psrlw     $0x8,%%xmm1                     \n"
  3855     "packuswb  %%xmm1,%%xmm1                   \n"
  3856     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3857     BUNDLEALIGN
  3858     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3859     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3860     "sub       $0x10,%3                        \n"
  3861     "jg        1b                              \n"
  3862   : "+r"(src_uyvy),    // %0
  3863     "+r"(dst_u),       // %1
  3864     "+r"(dst_v),       // %2
  3865     "+r"(pix)          // %3
  3866   : "r"((intptr_t)(stride_uyvy))  // %4
  3867   : "memory", "cc"
  3868 #if defined(__native_client__) && defined(__x86_64__)
  3869     , "r14"
  3870 #endif
  3871 #if defined(__SSE2__)
  3872     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3873 #endif
  3874   );
       // UYVYToUV422Row_Unaligned_SSE2: unaligned (movdqu) version of
       // UYVYToUV422Row_SSE2 — extracts U and V from a single UYVY row
       // (4:2:2, no vertical averaging), 16 pixels per pass.
  3877 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
  3878                                    uint8* dst_u, uint8* dst_v, int pix) {
  3879   asm volatile (
  3880     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3881     "psrlw     $0x8,%%xmm5                     \n"
           // %2 becomes the offset dst_v - dst_u (both planes share index %1)
  3882     "sub       %1,%2                           \n"
  3883     LABELALIGN
  3884   "1:                                          \n"
  3885     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  3886     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  3887     "lea       " MEMLEA(0x20,0) ",%0           \n"
           // pand keeps the chroma bytes (even positions) of each word
  3888     "pand      %%xmm5,%%xmm0                   \n"
  3889     "pand      %%xmm5,%%xmm1                   \n"
  3890     "packuswb  %%xmm1,%%xmm0                   \n"
           // split interleaved UVUV...: pand keeps U, >>8 keeps V
  3891     "movdqa    %%xmm0,%%xmm1                   \n"
  3892     "pand      %%xmm5,%%xmm0                   \n"
  3893     "packuswb  %%xmm0,%%xmm0                   \n"
  3894     "psrlw     $0x8,%%xmm1                     \n"
  3895     "packuswb  %%xmm1,%%xmm1                   \n"
  3896     "movq      %%xmm0," MEMACCESS(1) "         \n"
  3897     BUNDLEALIGN
  3898     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  3899     "lea       " MEMLEA(0x8,1) ",%1            \n"
  3900     "sub       $0x10,%3                        \n"
  3901     "jg        1b                              \n"
  3902   : "+r"(src_uyvy),    // %0
  3903     "+r"(dst_u),       // %1
  3904     "+r"(dst_v),       // %2
  3905     "+r"(pix)          // %3
  3907   : "memory", "cc"
  3908 #if defined(__native_client__) && defined(__x86_64__)
  3909     , "r14"
  3910 #endif
  3911 #if defined(__SSE2__)
  3912     , "xmm0", "xmm1", "xmm5"
  3913 #endif
  3914   );
  3918 #ifdef HAS_ARGBBLENDROW_SSE2
  3919 // Blend 8 pixels at a time.
       // ARGBBlendRow_SSE2: alpha-blend src_argb0 over src_argb1 into
       // dst_argb.  Per channel: result = src + ((dst * (256 - src_alpha))
       // >> 8), and the result alpha byte is forced to 0xff (por xmm4).
       // Structure: label 10 = 1-pixel lead-in until dst is 16-byte aligned,
       // label 41 = 4-pixel main loop (unaligned loads, aligned store),
       // label 91 = 1-pixel tail, label 99 = done.
       // Constants: xmm7 = 0x0001 per word (to turn 255-a into 256-a),
       // xmm6 = 0x00ff word mask (low bytes), xmm5 = 0xff00 word mask
       // (high bytes), xmm4 = 0xff000000 per dword (alpha lane).
  3920 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  3921                        uint8* dst_argb, int width) {
  3922   asm volatile (
  3923     "pcmpeqb   %%xmm7,%%xmm7                   \n"
  3924     "psrlw     $0xf,%%xmm7                     \n"
  3925     "pcmpeqb   %%xmm6,%%xmm6                   \n"
  3926     "psrlw     $0x8,%%xmm6                     \n"
  3927     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  3928     "psllw     $0x8,%%xmm5                     \n"
  3929     "pcmpeqb   %%xmm4,%%xmm4                   \n"
  3930     "pslld     $0x18,%%xmm4                    \n"
  3931     "sub       $0x1,%3                         \n"
  3932     "je        91f                             \n"
  3933     "jl        99f                             \n"
  3935     // 1 pixel loop until destination pointer is aligned.
  3936   "10:                                         \n"
  3937     "test      $0xf,%2                         \n"
  3938     "je        19f                             \n"
  3939     "movd      " MEMACCESS(0) ",%%xmm3         \n"
  3940     "lea       " MEMLEA(0x4,0) ",%0            \n"
  3941     "movdqa    %%xmm3,%%xmm0                   \n"
           // invert the alpha byte: a -> 255 - a
  3942     "pxor      %%xmm4,%%xmm3                   \n"
  3943     "movd      " MEMACCESS(1) ",%%xmm2         \n"
           // broadcast each pixel's (255-a) word to both of its words
  3944     "psrlw     $0x8,%%xmm3                     \n"
  3945     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  3946     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  3947     "pand      %%xmm6,%%xmm2                   \n"
           // +1 makes the multiplier 256-a
  3948     "paddw     %%xmm7,%%xmm3                   \n"
  3949     "pmullw    %%xmm3,%%xmm2                   \n"
  3950     "movd      " MEMACCESS(1) ",%%xmm1         \n"
  3951     "lea       " MEMLEA(0x4,1) ",%1            \n"
  3952     "psrlw     $0x8,%%xmm1                     \n"
           // force result alpha to 0xff
  3953     "por       %%xmm4,%%xmm0                   \n"
  3954     "pmullw    %%xmm3,%%xmm1                   \n"
  3955     "psrlw     $0x8,%%xmm2                     \n"
  3956     "paddusb   %%xmm2,%%xmm0                   \n"
  3957     "pand      %%xmm5,%%xmm1                   \n"
  3958     "paddusb   %%xmm1,%%xmm0                   \n"
  3959     "sub       $0x1,%3                         \n"
  3960     "movd      %%xmm0," MEMACCESS(2) "         \n"
  3961     "lea       " MEMLEA(0x4,2) ",%2            \n"
  3962     "jge       10b                             \n"
  3964   "19:                                         \n"
  3965     "add       $1-4,%3                         \n"
  3966     "jl        49f                             \n"
  3968     // 4 pixel loop.
  3969     LABELALIGN
  3970   "41:                                         \n"
  3971     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
  3972     "lea       " MEMLEA(0x10,0) ",%0           \n"
  3973     "movdqa    %%xmm3,%%xmm0                   \n"
  3974     "pxor      %%xmm4,%%xmm3                   \n"
  3975     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
  3976     "psrlw     $0x8,%%xmm3                     \n"
  3977     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  3978     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  3979     "pand      %%xmm6,%%xmm2                   \n"
  3980     "paddw     %%xmm7,%%xmm3                   \n"
  3981     "pmullw    %%xmm3,%%xmm2                   \n"
  3982     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  3983     "lea       " MEMLEA(0x10,1) ",%1           \n"
  3984     "psrlw     $0x8,%%xmm1                     \n"
  3985     "por       %%xmm4,%%xmm0                   \n"
  3986     "pmullw    %%xmm3,%%xmm1                   \n"
  3987     "psrlw     $0x8,%%xmm2                     \n"
  3988     "paddusb   %%xmm2,%%xmm0                   \n"
  3989     "pand      %%xmm5,%%xmm1                   \n"
  3990     "paddusb   %%xmm1,%%xmm0                   \n"
  3991     "sub       $0x4,%3                         \n"
           // aligned store is safe: the lead-in loop aligned %2 to 16 bytes
  3992     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  3993     "lea       " MEMLEA(0x10,2) ",%2           \n"
  3994     "jge       41b                             \n"
  3996   "49:                                         \n"
  3997     "add       $0x3,%3                         \n"
  3998     "jl        99f                             \n"
  4000     // 1 pixel loop.
  4001   "91:                                         \n"
  4002     "movd      " MEMACCESS(0) ",%%xmm3         \n"
  4003     "lea       " MEMLEA(0x4,0) ",%0            \n"
  4004     "movdqa    %%xmm3,%%xmm0                   \n"
  4005     "pxor      %%xmm4,%%xmm3                   \n"
  4006     "movd      " MEMACCESS(1) ",%%xmm2         \n"
  4007     "psrlw     $0x8,%%xmm3                     \n"
  4008     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  4009     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  4010     "pand      %%xmm6,%%xmm2                   \n"
  4011     "paddw     %%xmm7,%%xmm3                   \n"
  4012     "pmullw    %%xmm3,%%xmm2                   \n"
  4013     "movd      " MEMACCESS(1) ",%%xmm1         \n"
  4014     "lea       " MEMLEA(0x4,1) ",%1            \n"
  4015     "psrlw     $0x8,%%xmm1                     \n"
  4016     "por       %%xmm4,%%xmm0                   \n"
  4017     "pmullw    %%xmm3,%%xmm1                   \n"
  4018     "psrlw     $0x8,%%xmm2                     \n"
  4019     "paddusb   %%xmm2,%%xmm0                   \n"
  4020     "pand      %%xmm5,%%xmm1                   \n"
  4021     "paddusb   %%xmm1,%%xmm0                   \n"
  4022     "sub       $0x1,%3                         \n"
  4023     "movd      %%xmm0," MEMACCESS(2) "         \n"
  4024     "lea       " MEMLEA(0x4,2) ",%2            \n"
  4025     "jge       91b                             \n"
  4026   "99:                                         \n"
  4027   : "+r"(src_argb0),    // %0
  4028     "+r"(src_argb1),    // %1
  4029     "+r"(dst_argb),     // %2
  4030     "+r"(width)         // %3
  4032   : "memory", "cc"
  4033 #if defined(__SSE2__)
  4034     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4035 #endif
  4036   );
  4038 #endif  // HAS_ARGBBLENDROW_SSE2
  4040 #ifdef HAS_ARGBBLENDROW_SSSE3
  4041 // Shuffle table for isolating alpha.
       // pshufb control mask: for each of the four ARGB pixels in an XMM
       // register, copy the pixel's alpha byte (source byte indices 3, 7,
       // 11, 15) into the low byte of both 16-bit lanes of that pixel.
       // The 0x80 entries zero the corresponding high bytes, yielding eight
       // alpha words ready for pmullw.
  4042 static uvec8 kShuffleAlpha = {
  4043   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  4044   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  4045 };
  4047 // Blend 8 pixels at a time
  4048 // Shuffle table for reversing the bytes.
  4050 // Same as SSE2, but replaces
  4051 //    psrlw      xmm3, 8          // alpha
  4052 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
  4053 //    pshuflw    xmm3, xmm3,0F5h
  4054 // with..
  4055 //    pshufb     xmm3, kShuffleAlpha // alpha
  4057 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  4058                         uint8* dst_argb, int width) {
  4059   asm volatile (
  4060     "pcmpeqb   %%xmm7,%%xmm7                   \n"
  4061     "psrlw     $0xf,%%xmm7                     \n"
  4062     "pcmpeqb   %%xmm6,%%xmm6                   \n"
  4063     "psrlw     $0x8,%%xmm6                     \n"
  4064     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  4065     "psllw     $0x8,%%xmm5                     \n"
  4066     "pcmpeqb   %%xmm4,%%xmm4                   \n"
  4067     "pslld     $0x18,%%xmm4                    \n"
  4068     "sub       $0x1,%3                         \n"
  4069     "je        91f                             \n"
  4070     "jl        99f                             \n"
  4072     // 1 pixel loop until destination pointer is aligned.
  4073   "10:                                         \n"
  4074     "test      $0xf,%2                         \n"
  4075     "je        19f                             \n"
  4076     "movd      " MEMACCESS(0) ",%%xmm3         \n"
  4077     "lea       " MEMLEA(0x4,0) ",%0            \n"
  4078     "movdqa    %%xmm3,%%xmm0                   \n"
  4079     "pxor      %%xmm4,%%xmm3                   \n"
  4080     "movd      " MEMACCESS(1) ",%%xmm2         \n"
  4081     "pshufb    %4,%%xmm3                       \n"
  4082     "pand      %%xmm6,%%xmm2                   \n"
  4083     "paddw     %%xmm7,%%xmm3                   \n"
  4084     "pmullw    %%xmm3,%%xmm2                   \n"
  4085     "movd      " MEMACCESS(1) ",%%xmm1         \n"
  4086     "lea       " MEMLEA(0x4,1) ",%1            \n"
  4087     "psrlw     $0x8,%%xmm1                     \n"
  4088     "por       %%xmm4,%%xmm0                   \n"
  4089     "pmullw    %%xmm3,%%xmm1                   \n"
  4090     "psrlw     $0x8,%%xmm2                     \n"
  4091     "paddusb   %%xmm2,%%xmm0                   \n"
  4092     "pand      %%xmm5,%%xmm1                   \n"
  4093     "paddusb   %%xmm1,%%xmm0                   \n"
  4094     "sub       $0x1,%3                         \n"
  4095     "movd      %%xmm0," MEMACCESS(2) "         \n"
  4096     "lea       " MEMLEA(0x4,2) ",%2            \n"
  4097     "jge       10b                             \n"
  4099   "19:                                         \n"
  4100     "add       $1-4,%3                         \n"
  4101     "jl        49f                             \n"
  4102     "test      $0xf,%0                         \n"
  4103     "jne       41f                             \n"
  4104     "test      $0xf,%1                         \n"
  4105     "jne       41f                             \n"
  4107     // 4 pixel loop.
  4108     LABELALIGN
  4109   "40:                                         \n"
  4110     "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
  4111     "lea       " MEMLEA(0x10,0) ",%0           \n"
  4112     "movdqa    %%xmm3,%%xmm0                   \n"
  4113     "pxor      %%xmm4,%%xmm3                   \n"
  4114     "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
  4115     "pshufb    %4,%%xmm3                       \n"
  4116     "pand      %%xmm6,%%xmm2                   \n"
  4117     "paddw     %%xmm7,%%xmm3                   \n"
  4118     "pmullw    %%xmm3,%%xmm2                   \n"
  4119     "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
  4120     "lea       " MEMLEA(0x10,1) ",%1           \n"
  4121     "psrlw     $0x8,%%xmm1                     \n"
  4122     "por       %%xmm4,%%xmm0                   \n"
  4123     "pmullw    %%xmm3,%%xmm1                   \n"
  4124     "psrlw     $0x8,%%xmm2                     \n"
  4125     "paddusb   %%xmm2,%%xmm0                   \n"
  4126     "pand      %%xmm5,%%xmm1                   \n"
  4127     "paddusb   %%xmm1,%%xmm0                   \n"
  4128     "sub       $0x4,%3                         \n"
  4129     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  4130     "lea       " MEMLEA(0x10,2) ",%2           \n"
  4131     "jge       40b                             \n"
  4132     "jmp       49f                             \n"
  4134     // 4 pixel unaligned loop.
  4135     LABELALIGN
  4136   "41:                                         \n"
  4137     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
  4138     "lea       " MEMLEA(0x10,0) ",%0           \n"
  4139     "movdqa    %%xmm3,%%xmm0                   \n"
  4140     "pxor      %%xmm4,%%xmm3                   \n"
  4141     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
  4142     "pshufb    %4,%%xmm3                       \n"
  4143     "pand      %%xmm6,%%xmm2                   \n"
  4144     "paddw     %%xmm7,%%xmm3                   \n"
  4145     "pmullw    %%xmm3,%%xmm2                   \n"
  4146     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  4147     "lea       " MEMLEA(0x10,1) ",%1           \n"
  4148     "psrlw     $0x8,%%xmm1                     \n"
  4149     "por       %%xmm4,%%xmm0                   \n"
  4150     "pmullw    %%xmm3,%%xmm1                   \n"
  4151     "psrlw     $0x8,%%xmm2                     \n"
  4152     "paddusb   %%xmm2,%%xmm0                   \n"
  4153     "pand      %%xmm5,%%xmm1                   \n"
  4154     "paddusb   %%xmm1,%%xmm0                   \n"
  4155     "sub       $0x4,%3                         \n"
  4156     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  4157     "lea       " MEMLEA(0x10,2) ",%2           \n"
  4158     "jge       41b                             \n"
  4160   "49:                                         \n"
  4161     "add       $0x3,%3                         \n"
  4162     "jl        99f                             \n"
  4164     // 1 pixel loop.
  4165   "91:                                         \n"
  4166     "movd      " MEMACCESS(0) ",%%xmm3         \n"
  4167     "lea       " MEMLEA(0x4,0) ",%0            \n"
  4168     "movdqa    %%xmm3,%%xmm0                   \n"
  4169     "pxor      %%xmm4,%%xmm3                   \n"
  4170     "movd      " MEMACCESS(1) ",%%xmm2         \n"
  4171     "pshufb    %4,%%xmm3                       \n"
  4172     "pand      %%xmm6,%%xmm2                   \n"
  4173     "paddw     %%xmm7,%%xmm3                   \n"
  4174     "pmullw    %%xmm3,%%xmm2                   \n"
  4175     "movd      " MEMACCESS(1) ",%%xmm1         \n"
  4176     "lea       " MEMLEA(0x4,1) ",%1            \n"
  4177     "psrlw     $0x8,%%xmm1                     \n"
  4178     "por       %%xmm4,%%xmm0                   \n"
  4179     "pmullw    %%xmm3,%%xmm1                   \n"
  4180     "psrlw     $0x8,%%xmm2                     \n"
  4181     "paddusb   %%xmm2,%%xmm0                   \n"
  4182     "pand      %%xmm5,%%xmm1                   \n"
  4183     "paddusb   %%xmm1,%%xmm0                   \n"
  4184     "sub       $0x1,%3                         \n"
  4185     "movd      %%xmm0," MEMACCESS(2) "         \n"
  4186     "lea       " MEMLEA(0x4,2) ",%2            \n"
  4187     "jge       91b                             \n"
  4188   "99:                                         \n"
  4189   : "+r"(src_argb0),    // %0
  4190     "+r"(src_argb1),    // %1
  4191     "+r"(dst_argb),     // %2
  4192     "+r"(width)         // %3
  4193   : "m"(kShuffleAlpha)  // %4
  4194   : "memory", "cc"
  4195 #if defined(__SSE2__)
  4196     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  4197 #endif
  4198   );
  4200 #endif  // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 ARGB pixels (16 bytes) at a time.
// Premultiplies each color channel by the pixel's alpha (c = c * a >> 8)
// while the alpha byte itself is copied through unchanged.
// Uses movdqa, so src and dst rows must be aligned to 16 bytes.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm4 = per-pixel alpha mask 0xff000000.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    // xmm5 = per-pixel color mask 0x00ffffff.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 4 source pixels
    "punpcklbw %%xmm0,%%xmm0                   \n"  // low 2 pixels to 16-bit lanes
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"  // broadcast alpha word per pixel
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // channel * alpha, high 16 bits
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"  // high 2 pixels to 16-bit lanes
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"  // reload to recover original alpha
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"  // isolate original alpha bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep attenuated B,G,R only
    "por       %%xmm2,%%xmm0                   \n"  // merge original alpha back in
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSE2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha.
// Bytes 3/7 (then 11/15) are each pixel's alpha byte; index 128 writes zero.
// The shuffle leaves each pixel's three 16-bit color lanes holding
// (a << 8 | a) with the alpha lane zeroed, ready for pmulhuw.
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 ARGB pixels (16 bytes) at a time.
// Premultiplies B, G and R by the pixel's alpha; the alpha byte is preserved.
// NOTE(review): all loads/stores are movdqu, so unlike the SSE2 version this
// routine does not require 16-byte alignment (the old "aligned to 16 bytes"
// comment was stale).
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm3 = per-pixel alpha mask 0xff000000.
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"  // alpha shuffle, pixels 0-1
    "movdqa    %4,%%xmm5                       \n"  // alpha shuffle, pixels 2-3

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"  // alpha replicated, low 2 pixels
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"  // low 2 pixels to 16-bit lanes
    "pmulhuw   %%xmm1,%%xmm0                   \n"  // channel * alpha
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"  // alpha replicated, high 2 pixels
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload for original alpha
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // restore alpha bytes
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 ARGB pixels (16 bytes) at a time.
// Reverses premultiplied alpha: each color channel is multiplied by a
// per-alpha factor looked up in fixed_invtbl8 (defined elsewhere in this
// file; presumably a fixed-point reciprocal-of-alpha table — verify against
// its definition).  Uses movdqu, so rows need not be aligned.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;  // scratch GPR used to index the reciprocal table
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha of pixel 0
    "punpcklbw %%xmm0,%%xmm0                   \n"  // low 2 pixels to 16-bit lanes
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha of pixel 1
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // spread factor across color lanes
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"  // factors for pixels 0 and 1
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // rescale pixels 0-1
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha of pixel 2
    "punpckhbw %%xmm1,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha of pixel 3
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"  // rescale pixels 2-3
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// (The old comment said 64 bytes; the loop consumes 0x20 = 32 bytes.)
// Luma is computed with the JPeg full-range weights kARGBToYJ (15,75,38,
// which sum to 128, hence the >> 7) plus rounding bias kAddYJ64 (defined
// elsewhere in this file), then replicated into B, G and R; the original
// alpha byte is preserved.  movdqa requires 16-byte aligned rows.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // luma weights
    "movdqa    %4,%%xmm5                       \n"  // rounding bias

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 8 weighted sums
    "paddw     %%xmm5,%%xmm0                   \n"  // round
    "psrlw     $0x7,%%xmm0                     \n"  // weights sum to 128: >> 7
    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 gray bytes
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"  // extract 8 alpha bytes
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // gray,gray pairs
    "punpcklbw %%xmm2,%%xmm3                   \n"  // gray,alpha pairs
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"  // g,g,g,a pixels, first 4
    "punpckhwd %%xmm3,%%xmm1                   \n"  // g,g,g,a pixels, next 4
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone; each vec8 is one output channel's
// per-channel weights (B,G,R,A order, alpha weight 0) repeated 4 times.
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Operates in place on dst_argb; the alpha channel is preserved.
// movdqa loads/stores require dst_argb to be 16-byte aligned.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"  // sepia B weights
    "movdqa    %3,%%xmm3                       \n"  // sepia G weights
    "movdqa    %4,%%xmm4                       \n"  // sepia R weights

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"  // 8 new B values
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"  // 8 new G values
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave B,G
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"  // 8 new R values
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"  // original alpha bytes
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"  // interleave R,A
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"  // B,G,R,A pixels, first 4
    "punpckhwd %%xmm5,%%xmm1                   \n"  // B,G,R,A pixels, next 4
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with a color matrix.
// Same as Sepia except the matrix is caller-provided.
// matrix_argb is 16 signed bytes, read as four 4-byte rows of per-channel
// weights, one row per output channel (B, G, R and A); each output channel
// is a pmaddubsw/phaddsw dot product of the pixel with its row, >> 6.
// Pixel loads/stores are movdqa (rows must be 16-byte aligned); the matrix
// itself is loaded with movdqu.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // whole 16-byte matrix
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // row 0: B weights
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // row 1: G weights
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // row 2: R weights
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // row 3: A weights

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"  // B channel
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"  // G channel
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"  // interleave B,G
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"  // R channel
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"  // A channel
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"  // interleave R,A
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"  // B,G,R,A pixels, first 4
    "punpckhwd %%xmm1,%%xmm6                   \n"  // B,G,R,A pixels, next 4
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) in place:
//   c = ((c * scale) >> 16) * interval_size + interval_offset
// for each color channel; the alpha byte is preserved unchanged.
// movdqa requires dst_argb to be 16-byte aligned.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Broadcast scale, interval_size and interval_offset into the color
    // word lanes of xmm2, xmm3 and xmm4.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack
    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // xmm6 = alpha mask 0xff000000
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // low 2 pixels to 16-bit lanes
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // (c * scale) >> 16
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"  // high 2 pixels to 16-bit lanes
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"  // reload for original alpha
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // restore alpha bytes
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 ARGB pixels (16 bytes) at a time by the packed ARGB 'value'.
// Each of the four bytes of every pixel (alpha included) is scaled by the
// corresponding byte of value: roughly out = src * value_byte >> 8.
// movdqa requires src and dst rows 16-byte aligned.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    // Spread value's 4 bytes into 16-bit lanes and duplicate across both
    // quadwords so one register shades two pixels per multiply.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // low 2 pixels to 16-bit lanes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // high 2 pixels to 16-bit lanes
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // channel * value byte
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time:
// roughly dst = (src0 * src1) >> 8 per byte channel.
// Uses movdqu, so rows need not be aligned.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 pixels of row 0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 4 pixels of row 1
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // row 0 bytes duplicated
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"  // row 1 bytes zero-extended
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // low 2 pixels
    "pmulhuw   %%xmm3,%%xmm1                   \n"  // high 2 pixels
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb); clamps at 255 rather than wrapping.
// Uses movdqu, so rows need not be aligned.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels (row0 - row1), 4 pixels at a time.
// Per-byte saturating subtract (psubusb); clamps at 0 rather than wrapping.
// Uses movdqu, so rows need not be aligned.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"  // saturating byte subtract
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// src_y0/src_y1/src_y2 are three source rows; for each output pixel the
// difference between columns 0 and 2 of the three rows is combined with
// weights 1,2,1 and the absolute value is stored.  8 pixels per iteration.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Convert %1..%3 into offsets from %0 so a single pointer increment
    // advances all four streams.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row0, col+0
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"  // row0, col+2
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // row0 column difference
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // row1 column difference
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // row2 column difference
    "paddw     %%xmm2,%%xmm0                   \n"  // row0 + row2
    "paddw     %%xmm1,%%xmm0                   \n"  // + 2 * row1 (added twice)
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(x) = max(x, -x)
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%4                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELXROW_SSE2
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// src_y0/src_y1 are two source rows; for each output pixel the row
// difference at column offsets 0, 1 and 2 is combined with weights 1,2,1
// and the absolute value is stored.  8 pixels per iteration.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    // Convert %1 and %2 into offsets from %0 so a single pointer increment
    // advances all three streams.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row0, col+0
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // row difference, col+0
    BUNDLEALIGN
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"  // col+1
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // row difference, col+1
    BUNDLEALIGN
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"  // col+2
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // row difference, col+2
    "paddw     %%xmm2,%%xmm0                   \n"  // col0 + col2
    "paddw     %%xmm1,%%xmm0                   \n"  // + 2 * col1 (added twice)
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(x) = max(x, -x)
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%3                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELYROW_SSE2
  4863 #ifdef HAS_SOBELROW_SSE2
  4864 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  4865 // A = 255
  4866 // R = Sobel
  4867 // G = Sobel
  4868 // B = Sobel
  4869 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  4870                    uint8* dst_argb, int width) {
  4871   asm volatile (
  4872     "sub       %0,%1                           \n"
  4873     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  4874     "pslld     $0x18,%%xmm5                    \n"
  4876     // 8 pixel loop.
  4877     LABELALIGN
  4878   "1:                                          \n"
  4879     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  4880     MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
  4881     "lea       " MEMLEA(0x10,0) ",%0           \n"
  4882     "paddusb   %%xmm1,%%xmm0                   \n"
  4883     "movdqa    %%xmm0,%%xmm2                   \n"
  4884     "punpcklbw %%xmm0,%%xmm2                   \n"
  4885     "punpckhbw %%xmm0,%%xmm0                   \n"
  4886     "movdqa    %%xmm2,%%xmm1                   \n"
  4887     "punpcklwd %%xmm2,%%xmm1                   \n"
  4888     "punpckhwd %%xmm2,%%xmm2                   \n"
  4889     "por       %%xmm5,%%xmm1                   \n"
  4890     "por       %%xmm5,%%xmm2                   \n"
  4891     "movdqa    %%xmm0,%%xmm3                   \n"
  4892     "punpcklwd %%xmm0,%%xmm3                   \n"
  4893     "punpckhwd %%xmm0,%%xmm0                   \n"
  4894     "por       %%xmm5,%%xmm3                   \n"
  4895     "por       %%xmm5,%%xmm0                   \n"
  4896     "sub       $0x10,%3                        \n"
  4897     "movdqa    %%xmm1," MEMACCESS(2) "         \n"
  4898     "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
  4899     "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
  4900     "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
  4901     "lea       " MEMLEA(0x40,2) ",%2           \n"
  4902     "jg        1b                              \n"
  4903   : "+r"(src_sobelx),  // %0
  4904     "+r"(src_sobely),  // %1
  4905     "+r"(dst_argb),    // %2
  4906     "+r"(width)        // %3
  4908   : "memory", "cc"
  4909 #if defined(__native_client__) && defined(__x86_64__)
  4910     , "r14"
  4911 #endif
  4912 #if defined(__SSE2__)
  4913     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4914 #endif
  4915   );
  4917 #endif  // HAS_SOBELROW_SSE2
  4919 #ifdef HAS_SOBELTOPLANEROW_SSE2
  4920 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  4921 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  4922                           uint8* dst_y, int width) {
  4923   asm volatile (
  4924     "sub       %0,%1                           \n"
  4925     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  4926     "pslld     $0x18,%%xmm5                    \n"
  4928     // 8 pixel loop.
  4929     LABELALIGN
  4930   "1:                                          \n"
  4931     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  4932     MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
  4933     "lea       " MEMLEA(0x10,0) ",%0           \n"
  4934     "paddusb   %%xmm1,%%xmm0                   \n"
  4935     "sub       $0x10,%3                        \n"
  4936     "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  4937     "lea       " MEMLEA(0x10,2) ",%2           \n"
  4938     "jg        1b                              \n"
  4939   : "+r"(src_sobelx),  // %0
  4940     "+r"(src_sobely),  // %1
  4941     "+r"(dst_y),       // %2
  4942     "+r"(width)        // %3
  4944   : "memory", "cc"
  4945 #if defined(__native_client__) && defined(__x86_64__)
  4946     , "r14"
  4947 #endif
  4948 #if defined(__SSE2__)
  4949     , "xmm0", "xmm1"
  4950 #endif
  4951   );
  4953 #endif  // HAS_SOBELTOPLANEROW_SSE2
  4955 #ifdef HAS_SOBELXYROW_SSE2
  4956 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  4957 // A = 255
  4958 // R = Sobel X
  4959 // G = Sobel
  4960 // B = Sobel Y
  4961 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  4962                      uint8* dst_argb, int width) {
  4963   asm volatile (
  4964     "sub       %0,%1                           \n"
  4965     "pcmpeqb   %%xmm5,%%xmm5                   \n"
  4967     // 8 pixel loop.
  4968     LABELALIGN
  4969   "1:                                          \n"
  4970     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  4971     MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
  4972     "lea       " MEMLEA(0x10,0) ",%0           \n"
  4973     "movdqa    %%xmm0,%%xmm2                   \n"
  4974     "paddusb   %%xmm1,%%xmm2                   \n"
  4975     "movdqa    %%xmm0,%%xmm3                   \n"
  4976     "punpcklbw %%xmm5,%%xmm3                   \n"
  4977     "punpckhbw %%xmm5,%%xmm0                   \n"
  4978     "movdqa    %%xmm1,%%xmm4                   \n"
  4979     "punpcklbw %%xmm2,%%xmm4                   \n"
  4980     "punpckhbw %%xmm2,%%xmm1                   \n"
  4981     "movdqa    %%xmm4,%%xmm6                   \n"
  4982     "punpcklwd %%xmm3,%%xmm6                   \n"
  4983     "punpckhwd %%xmm3,%%xmm4                   \n"
  4984     "movdqa    %%xmm1,%%xmm7                   \n"
  4985     "punpcklwd %%xmm0,%%xmm7                   \n"
  4986     "punpckhwd %%xmm0,%%xmm1                   \n"
  4987     "sub       $0x10,%3                        \n"
  4988     "movdqa    %%xmm6," MEMACCESS(2) "         \n"
  4989     "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
  4990     "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
  4991     "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
  4992     "lea       " MEMLEA(0x40,2) ",%2           \n"
  4993     "jg        1b                              \n"
  4994   : "+r"(src_sobelx),  // %0
  4995     "+r"(src_sobely),  // %1
  4996     "+r"(dst_argb),    // %2
  4997     "+r"(width)        // %3
  4999   : "memory", "cc"
  5000 #if defined(__native_client__) && defined(__x86_64__)
  5001     , "r14"
  5002 #endif
  5003 #if defined(__SSE2__)
  5004     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  5005 #endif
  5006   );
  5008 #endif  // HAS_SOBELXYROW_SSE2
  5010 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  5011 // Creates a table of cumulative sums where each value is a sum of all values
  5012 // above and to the left of the value, inclusive of the value.
  5013 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
  5014                                   const int32* previous_cumsum, int width) {
  5015   asm volatile (
  5016     "pxor      %%xmm0,%%xmm0                   \n"
  5017     "pxor      %%xmm1,%%xmm1                   \n"
  5018     "sub       $0x4,%3                         \n"
  5019     "jl        49f                             \n"
  5020     "test      $0xf,%1                         \n"
  5021     "jne       49f                             \n"
  5023   // 4 pixel loop                              \n"
  5024     LABELALIGN
  5025   "40:                                         \n"
  5026     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
  5027     "lea       " MEMLEA(0x10,0) ",%0           \n"
  5028     "movdqa    %%xmm2,%%xmm4                   \n"
  5029     "punpcklbw %%xmm1,%%xmm2                   \n"
  5030     "movdqa    %%xmm2,%%xmm3                   \n"
  5031     "punpcklwd %%xmm1,%%xmm2                   \n"
  5032     "punpckhwd %%xmm1,%%xmm3                   \n"
  5033     "punpckhbw %%xmm1,%%xmm4                   \n"
  5034     "movdqa    %%xmm4,%%xmm5                   \n"
  5035     "punpcklwd %%xmm1,%%xmm4                   \n"
  5036     "punpckhwd %%xmm1,%%xmm5                   \n"
  5037     "paddd     %%xmm2,%%xmm0                   \n"
  5038     "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
  5039     "paddd     %%xmm0,%%xmm2                   \n"
  5040     "paddd     %%xmm3,%%xmm0                   \n"
  5041     "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
  5042     "paddd     %%xmm0,%%xmm3                   \n"
  5043     "paddd     %%xmm4,%%xmm0                   \n"
  5044     "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
  5045     "paddd     %%xmm0,%%xmm4                   \n"
  5046     "paddd     %%xmm5,%%xmm0                   \n"
  5047     "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
  5048     "lea       " MEMLEA(0x40,2) ",%2           \n"
  5049     "paddd     %%xmm0,%%xmm5                   \n"
  5050     "movdqa    %%xmm2," MEMACCESS(1) "         \n"
  5051     "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
  5052     "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
  5053     "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
  5054     "lea       " MEMLEA(0x40,1) ",%1           \n"
  5055     "sub       $0x4,%3                         \n"
  5056     "jge       40b                             \n"
  5058   "49:                                         \n"
  5059     "add       $0x3,%3                         \n"
  5060     "jl        19f                             \n"
  5062   // 1 pixel loop                              \n"
  5063     LABELALIGN
  5064   "10:                                         \n"
  5065     "movd      " MEMACCESS(0) ",%%xmm2         \n"
  5066     "lea       " MEMLEA(0x4,0) ",%0            \n"
  5067     "punpcklbw %%xmm1,%%xmm2                   \n"
  5068     "punpcklwd %%xmm1,%%xmm2                   \n"
  5069     "paddd     %%xmm2,%%xmm0                   \n"
  5070     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
  5071     "lea       " MEMLEA(0x10,2) ",%2           \n"
  5072     "paddd     %%xmm0,%%xmm2                   \n"
  5073     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
  5074     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5075     "sub       $0x1,%3                         \n"
  5076     "jge       10b                             \n"
  5078   "19:                                         \n"
  5079   : "+r"(row),  // %0
  5080     "+r"(cumsum),  // %1
  5081     "+r"(previous_cumsum),  // %2
  5082     "+r"(width)  // %3
  5084   : "memory", "cc"
  5085 #if defined(__SSE2__)
  5086     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  5087 #endif
  5088   );
  5090 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
  5092 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5093 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  5094                                     int width, int area, uint8* dst,
  5095                                     int count) {
  5096   asm volatile (
  5097     "movd      %5,%%xmm5                       \n"
  5098     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
  5099     "rcpss     %%xmm5,%%xmm4                   \n"
  5100     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
  5101     "sub       $0x4,%3                         \n"
  5102     "jl        49f                             \n"
  5103     "cmpl      $0x80,%5                        \n"
  5104     "ja        40f                             \n"
  5106     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  5107     "pcmpeqb   %%xmm6,%%xmm6                   \n"
  5108     "psrld     $0x10,%%xmm6                    \n"
  5109     "cvtdq2ps  %%xmm6,%%xmm6                   \n"
  5110     "addps     %%xmm6,%%xmm5                   \n"
  5111     "mulps     %%xmm4,%%xmm5                   \n"
  5112     "cvtps2dq  %%xmm5,%%xmm5                   \n"
  5113     "packssdw  %%xmm5,%%xmm5                   \n"
  5115   // 4 pixel small loop                        \n"
  5116     LABELALIGN
  5117   "4:                                         \n"
  5118     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  5119     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  5120     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  5121     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
  5122     BUNDLEALIGN
  5123     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  5124     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
  5125     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
  5126     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
  5127     "lea       " MEMLEA(0x40,0) ",%0           \n"
  5128     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  5129     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
  5130     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
  5131     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
  5132     BUNDLEALIGN
  5133     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  5134     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
  5135     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
  5136     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
  5137     "lea       " MEMLEA(0x40,1) ",%1           \n"
  5138     "packssdw  %%xmm1,%%xmm0                   \n"
  5139     "packssdw  %%xmm3,%%xmm2                   \n"
  5140     "pmulhuw   %%xmm5,%%xmm0                   \n"
  5141     "pmulhuw   %%xmm5,%%xmm2                   \n"
  5142     "packuswb  %%xmm2,%%xmm0                   \n"
  5143     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  5144     "lea       " MEMLEA(0x10,2) ",%2           \n"
  5145     "sub       $0x4,%3                         \n"
  5146     "jge       4b                              \n"
  5147     "jmp       49f                             \n"
  5149   // 4 pixel loop                              \n"
  5150     LABELALIGN
  5151   "40:                                         \n"
  5152     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  5153     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  5154     "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  5155     "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
  5156     BUNDLEALIGN
  5157     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  5158     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
  5159     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
  5160     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
  5161     "lea       " MEMLEA(0x40,0) ",%0           \n"
  5162     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  5163     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
  5164     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
  5165     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
  5166     BUNDLEALIGN
  5167     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  5168     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
  5169     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
  5170     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
  5171     "lea       " MEMLEA(0x40,1) ",%1           \n"
  5172     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
  5173     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
  5174     "mulps     %%xmm4,%%xmm0                   \n"
  5175     "mulps     %%xmm4,%%xmm1                   \n"
  5176     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
  5177     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
  5178     "mulps     %%xmm4,%%xmm2                   \n"
  5179     "mulps     %%xmm4,%%xmm3                   \n"
  5180     "cvtps2dq  %%xmm0,%%xmm0                   \n"
  5181     "cvtps2dq  %%xmm1,%%xmm1                   \n"
  5182     "cvtps2dq  %%xmm2,%%xmm2                   \n"
  5183     "cvtps2dq  %%xmm3,%%xmm3                   \n"
  5184     "packssdw  %%xmm1,%%xmm0                   \n"
  5185     "packssdw  %%xmm3,%%xmm2                   \n"
  5186     "packuswb  %%xmm2,%%xmm0                   \n"
  5187     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  5188     "lea       " MEMLEA(0x10,2) ",%2           \n"
  5189     "sub       $0x4,%3                         \n"
  5190     "jge       40b                             \n"
  5192   "49:                                         \n"
  5193     "add       $0x3,%3                         \n"
  5194     "jl        19f                             \n"
  5196   // 1 pixel loop                              \n"
  5197     LABELALIGN
  5198   "10:                                         \n"
  5199     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  5200     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  5201     "lea       " MEMLEA(0x10,0) ",%0           \n"
  5202     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  5203     BUNDLEALIGN
  5204     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  5205     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5206     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
  5207     "mulps     %%xmm4,%%xmm0                   \n"
  5208     "cvtps2dq  %%xmm0,%%xmm0                   \n"
  5209     "packssdw  %%xmm0,%%xmm0                   \n"
  5210     "packuswb  %%xmm0,%%xmm0                   \n"
  5211     "movd      %%xmm0," MEMACCESS(2) "         \n"
  5212     "lea       " MEMLEA(0x4,2) ",%2            \n"
  5213     "sub       $0x1,%3                         \n"
  5214     "jge       10b                             \n"
  5215   "19:                                         \n"
  5216   : "+r"(topleft),  // %0
  5217     "+r"(botleft),  // %1
  5218     "+r"(dst),      // %2
  5219     "+rm"(count)    // %3
  5220   : "r"((intptr_t)(width)),  // %4
  5221     "rm"(area)     // %5
  5222   : "memory", "cc"
  5223 #if defined(__native_client__) && defined(__x86_64__)
  5224     , "r14"
  5225 #endif
  5226 #if defined(__SSE2__)
  5227     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  5228 #endif
  5229   );
  5231 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5233 #ifdef HAS_ARGBAFFINEROW_SSE2
  5234 // Copy ARGB pixels from source image with slope to a row of destination.
  5235 LIBYUV_API
  5236 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
  5237                         uint8* dst_argb, const float* src_dudv, int width) {
  5238   intptr_t src_argb_stride_temp = src_argb_stride;
  5239   intptr_t temp = 0;
  5240   asm volatile (
  5241     "movq      " MEMACCESS(3) ",%%xmm2         \n"
  5242     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
  5243     "shl       $0x10,%1                        \n"
  5244     "add       $0x4,%1                         \n"
  5245     "movd      %1,%%xmm5                       \n"
  5246     "sub       $0x4,%4                         \n"
  5247     "jl        49f                             \n"
  5249     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
  5250     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  5251     "movdqa    %%xmm2,%%xmm0                   \n"
  5252     "addps     %%xmm7,%%xmm0                   \n"
  5253     "movlhps   %%xmm0,%%xmm2                   \n"
  5254     "movdqa    %%xmm7,%%xmm4                   \n"
  5255     "addps     %%xmm4,%%xmm4                   \n"
  5256     "movdqa    %%xmm2,%%xmm3                   \n"
  5257     "addps     %%xmm4,%%xmm3                   \n"
  5258     "addps     %%xmm4,%%xmm4                   \n"
  5260   // 4 pixel loop                              \n"
  5261     LABELALIGN
  5262   "40:                                         \n"
  5263     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
  5264     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
  5265     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
  5266     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
  5267     "movd      %%xmm0,%k1                      \n"
  5268     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  5269     "movd      %%xmm0,%k5                      \n"
  5270     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  5271     BUNDLEALIGN
  5272     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
  5273     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
  5274     "punpckldq %%xmm6,%%xmm1                   \n"
  5275     "addps     %%xmm4,%%xmm2                   \n"
  5276     "movq      %%xmm1," MEMACCESS(2) "         \n"
  5277     "movd      %%xmm0,%k1                      \n"
  5278     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  5279     "movd      %%xmm0,%k5                      \n"
  5280     BUNDLEALIGN
  5281     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
  5282     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
  5283     "punpckldq %%xmm6,%%xmm0                   \n"
  5284     "addps     %%xmm4,%%xmm3                   \n"
  5285     "sub       $0x4,%4                         \n"
  5286     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
  5287     "lea       " MEMLEA(0x10,2) ",%2           \n"
  5288     "jge       40b                             \n"
  5290   "49:                                         \n"
  5291     "add       $0x3,%4                         \n"
  5292     "jl        19f                             \n"
  5294   // 1 pixel loop                              \n"
  5295     LABELALIGN
  5296   "10:                                         \n"
  5297     "cvttps2dq %%xmm2,%%xmm0                   \n"
  5298     "packssdw  %%xmm0,%%xmm0                   \n"
  5299     "pmaddwd   %%xmm5,%%xmm0                   \n"
  5300     "addps     %%xmm7,%%xmm2                   \n"
  5301     "movd      %%xmm0,%k1                      \n"
  5302     BUNDLEALIGN
  5303     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
  5304     "sub       $0x1,%4                         \n"
  5305     "movd      %%xmm0," MEMACCESS(2) "         \n"
  5306     "lea       " MEMLEA(0x04,2) ",%2           \n"
  5307     "jge       10b                             \n"
  5308   "19:                                         \n"
  5309   : "+r"(src_argb),  // %0
  5310     "+r"(src_argb_stride_temp),  // %1
  5311     "+r"(dst_argb),  // %2
  5312     "+r"(src_dudv),  // %3
  5313     "+rm"(width),    // %4
  5314     "+r"(temp)   // %5
  5316   : "memory", "cc"
  5317 #if defined(__native_client__) && defined(__x86_64__)
  5318     , "r14"
  5319 #endif
  5320 #if defined(__SSE2__)
  5321     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  5322 #endif
  5323   );
  5325 #endif  // HAS_ARGBAFFINEROW_SSE2
  5327 #ifdef HAS_INTERPOLATEROW_SSSE3
  5328 // Bilinear filter 16x2 -> 16x1
  5329 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  5330                           ptrdiff_t src_stride, int dst_width,
  5331                           int source_y_fraction) {
  5332   asm volatile (
  5333     "sub       %1,%0                           \n"
  5334     "shr       %3                              \n"
  5335     "cmp       $0x0,%3                         \n"
  5336     "je        100f                            \n"
  5337     "cmp       $0x20,%3                        \n"
  5338     "je        75f                             \n"
  5339     "cmp       $0x40,%3                        \n"
  5340     "je        50f                             \n"
  5341     "cmp       $0x60,%3                        \n"
  5342     "je        25f                             \n"
  5344     "movd      %3,%%xmm0                       \n"
  5345     "neg       %3                              \n"
  5346     "add       $0x80,%3                        \n"
  5347     "movd      %3,%%xmm5                       \n"
  5348     "punpcklbw %%xmm0,%%xmm5                   \n"
  5349     "punpcklwd %%xmm5,%%xmm5                   \n"
  5350     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  5352     // General purpose row blend.
  5353     LABELALIGN
  5354   "1:                                          \n"
  5355     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5356     MEMOPREG(movdqa,0x00,1,4,1,xmm2)
  5357     "movdqa    %%xmm0,%%xmm1                   \n"
  5358     "punpcklbw %%xmm2,%%xmm0                   \n"
  5359     "punpckhbw %%xmm2,%%xmm1                   \n"
  5360     "pmaddubsw %%xmm5,%%xmm0                   \n"
  5361     "pmaddubsw %%xmm5,%%xmm1                   \n"
  5362     "psrlw     $0x7,%%xmm0                     \n"
  5363     "psrlw     $0x7,%%xmm1                     \n"
  5364     "packuswb  %%xmm1,%%xmm0                   \n"
  5365     "sub       $0x10,%2                        \n"
  5366     BUNDLEALIGN
  5367     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
  5368     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5369     "jg        1b                              \n"
  5370     "jmp       99f                             \n"
  5372     // Blend 25 / 75.
  5373     LABELALIGN
  5374   "25:                                         \n"
  5375     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5376     MEMOPREG(movdqa,0x00,1,4,1,xmm1)
  5377     "pavgb     %%xmm1,%%xmm0                   \n"
  5378     "pavgb     %%xmm1,%%xmm0                   \n"
  5379     "sub       $0x10,%2                        \n"
  5380     BUNDLEALIGN
  5381     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
  5382     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5383     "jg        25b                             \n"
  5384     "jmp       99f                             \n"
  5386     // Blend 50 / 50.
  5387     LABELALIGN
  5388   "50:                                         \n"
  5389     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5390     MEMOPREG(movdqa,0x00,1,4,1,xmm1)
  5391     "pavgb     %%xmm1,%%xmm0                   \n"
  5392     "sub       $0x10,%2                        \n"
  5393     BUNDLEALIGN
  5394     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
  5395     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5396     "jg        50b                             \n"
  5397     "jmp       99f                             \n"
  5399     // Blend 75 / 25.
  5400     LABELALIGN
  5401   "75:                                         \n"
  5402     "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
  5403     MEMOPREG(movdqa,0x00,1,4,1,xmm0)
  5404     "pavgb     %%xmm1,%%xmm0                   \n"
  5405     "pavgb     %%xmm1,%%xmm0                   \n"
  5406     "sub       $0x10,%2                        \n"
  5407     BUNDLEALIGN
  5408     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
  5409     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5410     "jg        75b                             \n"
  5411     "jmp       99f                             \n"
  5413     // Blend 100 / 0 - Copy row unchanged.
  5414     LABELALIGN
  5415   "100:                                        \n"
  5416     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5417     "sub       $0x10,%2                        \n"
  5418     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
  5419     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5420     "jg        100b                            \n"
  5422   "99:                                         \n"
  5423   : "+r"(dst_ptr),    // %0
  5424     "+r"(src_ptr),    // %1
  5425     "+r"(dst_width),  // %2
  5426     "+r"(source_y_fraction)  // %3
  5427   : "r"((intptr_t)(src_stride))  // %4
  5428   : "memory", "cc"
  5429 #if defined(__native_client__) && defined(__x86_64__)
  5430     , "r14"
  5431 #endif
  5432 #if defined(__SSE2__)
  5433     , "xmm0", "xmm1", "xmm2", "xmm5"
  5434 #endif
  5435   );
  5437 #endif  // HAS_INTERPOLATEROW_SSSE3
  5439 #ifdef HAS_INTERPOLATEROW_SSE2
  5440 // Bilinear filter 16x2 -> 16x1
  5441 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  5442                          ptrdiff_t src_stride, int dst_width,
  5443                          int source_y_fraction) {
  5444   asm volatile (
  5445     "sub       %1,%0                           \n"
  5446     "shr       %3                              \n"
  5447     "cmp       $0x0,%3                         \n"
  5448     "je        100f                            \n"
  5449     "cmp       $0x20,%3                        \n"
  5450     "je        75f                             \n"
  5451     "cmp       $0x40,%3                        \n"
  5452     "je        50f                             \n"
  5453     "cmp       $0x60,%3                        \n"
  5454     "je        25f                             \n"
  5456     "movd      %3,%%xmm0                       \n"
  5457     "neg       %3                              \n"
  5458     "add       $0x80,%3                        \n"
  5459     "movd      %3,%%xmm5                       \n"
  5460     "punpcklbw %%xmm0,%%xmm5                   \n"
  5461     "punpcklwd %%xmm5,%%xmm5                   \n"
  5462     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  5463     "pxor      %%xmm4,%%xmm4                   \n"
  5465     // General purpose row blend.
  5466     LABELALIGN
  5467   "1:                                          \n"
  5468     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5469     MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
  5470     "movdqa    %%xmm0,%%xmm1                   \n"
  5471     "movdqa    %%xmm2,%%xmm3                   \n"
  5472     "punpcklbw %%xmm4,%%xmm2                   \n"
  5473     "punpckhbw %%xmm4,%%xmm3                   \n"
  5474     "punpcklbw %%xmm4,%%xmm0                   \n"
  5475     "punpckhbw %%xmm4,%%xmm1                   \n"
  5476     "psubw     %%xmm0,%%xmm2                   \n"
  5477     "psubw     %%xmm1,%%xmm3                   \n"
  5478     "paddw     %%xmm2,%%xmm2                   \n"
  5479     "paddw     %%xmm3,%%xmm3                   \n"
  5480     "pmulhw    %%xmm5,%%xmm2                   \n"
  5481     "pmulhw    %%xmm5,%%xmm3                   \n"
  5482     "paddw     %%xmm2,%%xmm0                   \n"
  5483     "paddw     %%xmm3,%%xmm1                   \n"
  5484     "packuswb  %%xmm1,%%xmm0                   \n"
  5485     "sub       $0x10,%2                        \n"
  5486     BUNDLEALIGN
  5487     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
  5488     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5489     "jg        1b                              \n"
  5490     "jmp       99f                             \n"
  5492     // Blend 25 / 75.
  5493     LABELALIGN
  5494   "25:                                         \n"
  5495     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5496     MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
  5497     "pavgb     %%xmm1,%%xmm0                   \n"
  5498     "pavgb     %%xmm1,%%xmm0                   \n"
  5499     "sub       $0x10,%2                        \n"
  5500     BUNDLEALIGN
  5501     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
  5502     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5503     "jg        25b                             \n"
  5504     "jmp       99f                             \n"
  5506     // Blend 50 / 50.
  5507     LABELALIGN
  5508   "50:                                         \n"
  5509     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5510     MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
  5511     "pavgb     %%xmm1,%%xmm0                   \n"
  5512     "sub       $0x10,%2                        \n"
  5513     BUNDLEALIGN
  5514     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
  5515     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5516     "jg        50b                             \n"
  5517     "jmp       99f                             \n"
  5519     // Blend 75 / 25.
  5520     LABELALIGN
  5521   "75:                                         \n"
  5522     "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
  5523     MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
  5524     "pavgb     %%xmm1,%%xmm0                   \n"
  5525     "pavgb     %%xmm1,%%xmm0                   \n"
  5526     "sub       $0x10,%2                        \n"
  5527     BUNDLEALIGN
  5528     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
  5529     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5530     "jg        75b                             \n"
  5531     "jmp       99f                             \n"
  5533     // Blend 100 / 0 - Copy row unchanged.
  5534     LABELALIGN
  5535   "100:                                        \n"
  5536     "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
  5537     "sub       $0x10,%2                        \n"
  5538     MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
  5539     "lea       " MEMLEA(0x10,1) ",%1           \n"
  5540     "jg        100b                            \n"
  5542   "99:                                         \n"
  5543   : "+r"(dst_ptr),    // %0
  5544     "+r"(src_ptr),    // %1
  5545     "+r"(dst_width),  // %2
  5546     "+r"(source_y_fraction)  // %3
  5547   : "r"((intptr_t)(src_stride))  // %4
  5548   : "memory", "cc"
  5549 #if defined(__native_client__) && defined(__x86_64__)
  5550     , "r14"
  5551 #endif
  5552 #if defined(__SSE2__)
  5553     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  5554 #endif
  5555   );
  5557 #endif  // HAS_INTERPOLATEROW_SSE2
  5559 #ifdef HAS_INTERPOLATEROW_SSSE3
  5560 // Bilinear filter 16x2 -> 16x1
// Vertically blends two rows of bytes (unaligned pointers):
//   dst[i] = src[i] * (256 - f) / 256 + src[i + stride] * f / 256
// where f = source_y_fraction in [0..255].  Fractions 0, 64, 128 and 192
// are special-cased with pavgb; other fractions use a per-pair
// pmaddubsw multiply.  Operands: %0 dst_ptr, %1 src_ptr, %2 dst_width
// (bytes, multiple of 16 assumed by the 0x10 steps), %3 fraction,
// %4 src_stride.
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  asm volatile (
    // Turn dst_ptr into (dst - src) so one index register serves both rows.
    "sub       %1,%0                           \n"
    // Halve the fraction so it fits a signed-byte multiplier (0..128).
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    // Build xmm5 = byte pairs {128 - f/2, f/2} replicated across the
    // register, the multiplier for pmaddubsw below.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqu    %%xmm0,%%xmm1                   \n"
    // Interleave row0/row1 bytes, weighted-sum each pair, then divide by
    // 128 (psrlw 7) to renormalize.
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Blend 25 / 75.  Two pavgb against row1 -> (row0 + 3*row1 + 2) / 4.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.  Single byte average of the two rows.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.  Rows swapped vs. the 25 case, then double pavgb.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
  5669 #endif   // HAS_INTERPOLATEROW_SSSE3
  5671 #ifdef HAS_INTERPOLATEROW_SSE2
  5672 // Bilinear filter 16x2 -> 16x1
// SSE2 (no pmaddubsw) variant of the unaligned vertical row blend:
//   dst[i] = src[i] * (256 - f) / 256 + src[i + stride] * f / 256
// with f = source_y_fraction in [0..255].  The general path widens
// bytes to words and scales the doubled row difference with pmulhw;
// f == 0, 64, 128, 192 take pavgb shortcuts.  Operands: %0 dst_ptr,
// %1 src_ptr, %2 dst_width (bytes), %3 fraction, %4 src_stride.
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  asm volatile (
    // dst becomes an offset from src so one register indexes both rows.
    "sub       %1,%0                           \n"
    // Halve the fraction (0..128) before the comparisons below.
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    // Broadcast the fraction into every word of xmm5; xmm4 = zero for
    // the byte->word unpacks.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    // Widen both rows to 16-bit words (low half in xmm0/xmm2, high in
    // xmm1/xmm3).
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    // diff = row1 - row0, doubled, then scaled by the fraction via the
    // signed high multiply, and added back onto row0.
    "psubw     %%xmm0,%%xmm2                   \n"
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Blend 25 / 75.  Double pavgb toward row1.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.  Single byte average.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.  Rows swapped relative to the 25 case.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
  5789 #endif  // HAS_INTERPOLATEROW_SSE2
  5791 #ifdef HAS_HALFROW_SSE2
// Averages each byte of a row with the byte src_uv_stride below it
// (rounded pavgb average) and writes pix bytes to dst_uv.  Pointers are
// expected 16-byte aligned (movdqa); pix is consumed 16 bytes per
// iteration.  %0 src_uv, %1 dst_uv (kept as dst - src), %2 pix,
// %3 src_uv_stride.
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
    // dst becomes an offset from src so %0 indexes both buffers.
    "sub       %0,%1                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_uv),  // %1
    "+r"(pix)      // %2
  : "r"((intptr_t)(src_uv_stride))  // %3
  : "memory", "cc"
#if defined(__SSE2__)
      , "xmm0"
#endif
  );
  5814 #endif  // HAS_HALFROW_SSE2
  5816 #ifdef HAS_ARGBTOBAYERROW_SSSE3
// Selects one byte per ARGB pixel using the 4-byte 'selector' as a
// pshufb control replicated across the register, producing 8 Bayer
// bytes from 8 ARGB pixels per iteration.  src_argb must be 16-byte
// aligned (movdqa); pix is the output byte count.
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
    // NaCL caveat - assumes movd is from GPR
    // Broadcast the 4 selector indices to all four dwords of xmm5.
    "movd      %3,%%xmm5                       \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Gather the selected byte of each pixel into the low dword of each
    // register, then merge the two results into one 8-byte run.
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  : "g"(selector)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
  5845 #endif  // HAS_ARGBTOBAYERROW_SSSE3
  5847 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
// Extracts the G channel of each ARGB pixel (byte 1 of every dword)
// into a packed run of bytes: 8 G bytes from 8 pixels per iteration.
// 'selector' is accepted for signature parity but unused here.
// src_argb must be 16-byte aligned (movdqa).
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    // xmm5 = 0x000000FF in each dword: mask for the shifted-down G byte.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Shift G into the low byte of each dword and isolate it.
    "psrld     $0x8,%%xmm0                     \n"
    "psrld     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    // Narrow dwords -> words -> bytes; only the low 8 bytes are stored
    // by the movq below, so the packuswb high half is irrelevant.
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
  5878 #endif  // HAS_ARGBTOBAYERGGROW_SSE2
  5880 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
  5881 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 channels of each ARGB pixel according to the 16-byte
// pshufb control vector at 'shuffler'; 8 pixels per iteration.  Both
// pixel pointers and the shuffler must be 16-byte aligned (movdqa).
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
// Same channel reorder as ARGBShuffleRow_SSSE3, but using movdqu so the
// pixel pointers need not be aligned.  The shuffler itself is still
// loaded with movdqa and so must be 16-byte aligned.
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
  5935 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
  5937 #ifdef HAS_ARGBSHUFFLEROW_AVX2
  5938 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5939 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  5940                          const uint8* shuffler, int pix) {
  5941   asm volatile (
  5942     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
  5943     LABELALIGN
  5944   "1:                                          \n"
  5945     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
  5946     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
  5947     "lea       " MEMLEA(0x40,0) ",%0           \n"
  5948     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
  5949     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
  5950     "sub       $0x10,%2                        \n"
  5951     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
  5952     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
  5953     "lea       " MEMLEA(0x40,1) ",%1           \n"
  5954     "jg        1b                              \n"
  5955   : "+r"(src_argb),  // %0
  5956     "+r"(dst_argb),  // %1
  5957     "+r"(pix)        // %2
  5958   : "r"(shuffler)    // %3
  5959   : "memory", "cc"
  5960 #if defined(__SSE2__)
  5961     , "xmm0", "xmm1", "xmm5"
  5962 #endif
  5963   );
  5965 #endif  // HAS_ARGBSHUFFLEROW_AVX2
  5967 #ifdef HAS_ARGBSHUFFLEROW_SSE2
  5968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback channel reorder (no pshufb).  Reads the 4-byte selector
// at 'shuffler' and dispatches: four common orders (0.1.2.3, 3.2.1.0,
// 3.0.1.2, 2.1.0.3 encoded little-endian below) get vectorized paths
// that widen bytes to words and permute with pshuflw/pshufhw; any other
// order falls back to a byte-at-a-time table lookup.  pixel_temp (%2,
// pinned to edx/rdx for the %b2 byte accesses) is scratch.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"
    // Load the selector dword and pick a specialized path if it matches.
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"
    // Generic path: for each of the 4 output bytes, read its source
    // index from the shuffler, fetch that byte of the pixel, store it.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Selector 0x00010203: reverse bytes within each pixel (shuffle 0x1b).
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"
    // Selector 0x00030201: rotate bytes within each pixel (shuffle 0x39).
    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"
    // Selector 0x02010003: per-pixel shuffle 0x93.
    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"
    // Selector 0x03000102: per-pixel shuffle 0xc6.
    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        3012b                           \n"
  "99:                                         \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)         // %3
  : "r"(shuffler)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
  6091 #endif  // HAS_ARGBSHUFFLEROW_SSE2
  6093 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (full-res Y, half-res U and V) into interleaved
// YUY2 (Y0 U0 Y1 V0 ...): per iteration, 8 U + 8 V bytes are
// interleaved to UV, then woven with 16 Y bytes into 32 output bytes.
// %0 src_y, %1 src_u, %2 src_v (kept as v - u), %3 dst_frame, %4 width.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // src_v becomes an offset from src_u so %1 indexes both planes.
    "sub       %1,%2                             \n"
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = U0 V0 U1 V1 ...
    "punpcklbw %%xmm3,%%xmm2                     \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    // Weave Y with UV: low half -> Y0 U0 Y1 V0 ..., high half likewise.
    "punpcklbw %%xmm2,%%xmm0                     \n"
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
  6131 #endif  // HAS_I422TOYUY2ROW_SSE2
  6133 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...): same
// structure as I422ToYUY2Row_SSE2 but with the UV register as the
// punpck destination so chroma comes first in each output pair.
// %0 src_y, %1 src_u, %2 src_v (kept as v - u), %3 dst_frame, %4 width.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // src_v becomes an offset from src_u so %1 indexes both planes.
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = U0 V0 U1 V1 ...
    "punpcklbw %%xmm3,%%xmm2                     \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    // Weave UV with Y: low half -> U0 Y0 V0 Y1 ..., high half likewise.
    "punpcklbw %%xmm0,%%xmm1                     \n"
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
  6171 #endif  // HAS_I422TOUYVYROW_SSE2
  6173 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial per channel:
//   out = c0 + c1*x + c2*x^2 + c3*x^3
// where poly points at four 4-float coefficient vectors (c0 at +0x00,
// c1 at +0x10, c2 at +0x20, c3 at +0x30, one float per ARGB channel).
// Processes 2 pixels (8 channels) per iteration: bytes are widened to
// float, evaluated, truncated (cvttps2dq) and saturated back to bytes.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"
    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Widen 8 bytes to 8 floats: pixel 0 in xmm0, pixel 1 in xmm4.
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep x in xmm1/xmm5; accumulate c0 + c1*x in xmm0/xmm4.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = x^2; xmm1/xmm5 = x^3.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    // Scale by c2 and c3, then sum all terms.
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to ints and saturate back down to 8 output bytes.
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x2,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
  6229 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
  6231 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// Evaluate a cubic polynomial per channel byte of each ARGB pixel:
//   dst = (uint8)clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// poly holds 4 rows of 4 floats (C0..C3, one coefficient per channel),
// each broadcast to a 256-bit register up front. Uses FMA3 (vfmadd*) in
// addition to AVX2 (vpmovzxbd/vpackusdw/vpermq), so the CPU must support
// both. Processes 2 pixels per iteration; width presumably positive and
// even (bottom-tested loop) -- caller must guarantee.
  6232 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
  6233                             uint8* dst_argb, const float* poly,
  6234                             int width) {
  6235   asm volatile (
  6236     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"  // ymm4 = C0
  6237     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // ymm5 = C1
  6238     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // ymm6 = C2
  6239     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // ymm7 = C3
  6241     // 2 pixel loop.
  6242     LABELALIGN
  6243   "1:                                          \n"
  6244     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
  6245     "lea         " MEMLEA(0x8,0) ",%0          \n"
  6246     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
  6247     "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
  6248     "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
  6249     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
  6250     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
  6251     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
  6252     "vcvttps2dq  %%ymm0,%%ymm0                 \n"  // truncate to dwords
  6253     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"  // dwords -> words (per 128-bit lane)
  6254     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // gather lanes into low half
  6255     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"  // words -> bytes, saturating
  6256     "sub         $0x2,%2                       \n"
  6257     "vmovq       %%xmm0," MEMACCESS(1) "       \n"  // store 2 result pixels
  6258     "lea         " MEMLEA(0x8,1) ",%1          \n"
  6259     "jg          1b                            \n"
  6260     "vzeroupper                                \n"  // avoid AVX->SSE transition penalty
  6261   : "+r"(src_argb),  // %0
  6262     "+r"(dst_argb),  // %1
  6263     "+r"(width)      // %2
  6264   : "r"(poly)        // %3
  6265   : "memory", "cc"
  6266 #if defined(__SSE2__)
  6267 // TODO(fbarchard): declare ymm usage when applicable.
  6268     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  6269 #endif
  6270   );
  6272 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
  6274 #ifdef HAS_ARGBCOLORTABLEROW_X86
  6275 // Tranform ARGB pixels with color table.
// In-place, per-channel table lookup: for each of the 4 bytes of each
// ARGB pixel, byte value b in channel c (0..3) is replaced by
// table_argb[b * 4 + c] (the table has a 4-byte stride per index, one
// column per channel). 1 pixel per iteration; width presumably > 0
// (bottom-tested loop). pixel_temp is tied to %edx/%rdx ("+d") so its
// low byte is addressable as %b1.
  6276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
  6277                            int width) {
  6278   uintptr_t pixel_temp = 0u;
  6279   asm volatile (
  6280     // 1 pixel loop.
  6281     LABELALIGN
  6282   "1:                                          \n"
  6283     "movzb     " MEMACCESS(0) ",%1             \n"  // B: load, look up, store back
  6284     "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance dst by one pixel early
  6285     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
  6286     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
  6287     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // G
  6288     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
  6289     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
  6290     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // R
  6291     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
  6292     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
  6293     "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"  // A
  6294     MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
  6295     "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
  6296     "dec       %2                              \n"
  6297     "jg        1b                              \n"
  6298   : "+r"(dst_argb),   // %0
  6299     "+d"(pixel_temp), // %1
  6300     "+r"(width)       // %2
  6301   : "r"(table_argb)   // %3
  6302   : "memory", "cc");
  6304 #endif  // HAS_ARGBCOLORTABLEROW_X86
  6306 #ifdef HAS_RGBCOLORTABLEROW_X86
  6307 // Tranform RGB pixels with color table.
// Same in-place lookup as the ARGB variant, but only the B, G and R bytes
// are rewritten (table offsets 0x0..0x2); the alpha byte of each pixel is
// left untouched. Pixels are still 4 bytes wide (dst advances by 0x4 and
// the table stride is 4 per index). 1 pixel per iteration; width
// presumably > 0. "+d" ties pixel_temp to rdx so %b1 names its low byte.
  6308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  6309   uintptr_t pixel_temp = 0u;
  6310   asm volatile (
  6311     // 1 pixel loop.
  6312     LABELALIGN
  6313   "1:                                          \n"
  6314     "movzb     " MEMACCESS(0) ",%1             \n"  // B
  6315     "lea       " MEMLEA(0x4,0) ",%0            \n"
  6316     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
  6317     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
  6318     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // G
  6319     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
  6320     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
  6321     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // R; alpha (-0x1) is skipped
  6322     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
  6323     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
  6324     "dec       %2                              \n"
  6325     "jg        1b                              \n"
  6326   : "+r"(dst_argb),   // %0
  6327     "+d"(pixel_temp), // %1
  6328     "+r"(width)       // %2
  6329   : "r"(table_argb)   // %3
  6330   : "memory", "cc");
  6332 #endif  // HAS_RGBCOLORTABLEROW_X86
  6334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  6335 // Tranform RGB pixels with luma table.
// For each ARGB pixel, a luma value is computed as a weighted sum of the
// pixel's bytes (pmaddubsw/phaddw with the 4 signed byte weights packed in
// lumacoeff -- presumably the A weight is 0; confirm against callers).
// The luma is masked to its high byte (pand with 0xFF00 per word), so it
// selects one of 256 possible 256-byte sub-tables at luma + (value & 0xFF00).
// B, G and R are then each looked up in that sub-table; A is copied
// unchanged (the 0x3/0x7/0xb/0xf stores below have no table indirection).
// Processes 4 pixels per iteration; width presumably positive and a
// multiple of 4 (bottom-tested loop). pixel_temp/table_temp are pinned to
// rdx/rax ("+d"/"+a") so %b0 and %k1 sub-register names are valid.
  6336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  6337                                  int width,
  6338                                  const uint8* luma, uint32 lumacoeff) {
  6339   uintptr_t pixel_temp = 0u;
  6340   uintptr_t table_temp = 0u;
  6341   asm volatile (
  6342     "movd      %6,%%xmm3                       \n"  // broadcast lumacoeff
  6343     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  6344     "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0xFF00 per word:
  6345     "psllw     $0x8,%%xmm4                     \n"  // mask keeps luma's high byte
  6346     "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for zero-extension
  6348     // 4 pixel loop.
  6349     LABELALIGN
  6350   "1:                                          \n"
  6351     "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // 4 ARGB pixels
  6352     "pmaddubsw %%xmm3,%%xmm0                   \n"  // unsigned bytes * signed coeffs
  6353     "phaddw    %%xmm0,%%xmm0                   \n"  // -> one luma word per pixel
  6354     "pand      %%xmm4,%%xmm0                   \n"  // quantize to multiple of 0x100
  6355     "punpcklwd %%xmm5,%%xmm0                   \n"  // words -> dwords
  6356     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  6357     "add       %5,%1                           \n"  // table_temp = luma + offset
  6358     "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate next pixel's offset in
  6360     "movzb     " MEMACCESS(2) ",%0             \n"  // pixel 0: look up B,G,R; copy A
  6361     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6362     "mov       %b0," MEMACCESS(3) "            \n"
  6363     "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
  6364     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6365     "mov       %b0," MEMACCESS2(0x1,3) "       \n"
  6366     "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
  6367     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6368     "mov       %b0," MEMACCESS2(0x2,3) "       \n"
  6369     "movzb     " MEMACCESS2(0x3,2) ",%0        \n"  // alpha: straight copy
  6370     "mov       %b0," MEMACCESS2(0x3,3) "       \n"
  6372     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  6373     "add       %5,%1                           \n"
  6374     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  6376     "movzb     " MEMACCESS2(0x4,2) ",%0        \n"  // pixel 1
  6377     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6378     "mov       %b0," MEMACCESS2(0x4,3) "       \n"
  6379     BUNDLEALIGN
  6380     "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
  6381     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6382     "mov       %b0," MEMACCESS2(0x5,3) "       \n"
  6383     "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
  6384     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6385     "mov       %b0," MEMACCESS2(0x6,3) "       \n"
  6386     "movzb     " MEMACCESS2(0x7,2) ",%0        \n"  // alpha: straight copy
  6387     "mov       %b0," MEMACCESS2(0x7,3) "       \n"
  6389     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  6390     "add       %5,%1                           \n"
  6391     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  6393     "movzb     " MEMACCESS2(0x8,2) ",%0        \n"  // pixel 2
  6394     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6395     "mov       %b0," MEMACCESS2(0x8,3) "       \n"
  6396     "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
  6397     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6398     "mov       %b0," MEMACCESS2(0x9,3) "       \n"
  6399     "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
  6400     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6401     "mov       %b0," MEMACCESS2(0xa,3) "       \n"
  6402     "movzb     " MEMACCESS2(0xb,2) ",%0        \n"  // alpha: straight copy
  6403     "mov       %b0," MEMACCESS2(0xb,3) "       \n"
  6405     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  6406     "add       %5,%1                           \n"
  6408     "movzb     " MEMACCESS2(0xc,2) ",%0        \n"  // pixel 3
  6409     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6410     "mov       %b0," MEMACCESS2(0xc,3) "       \n"
  6411     "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
  6412     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6413     "mov       %b0," MEMACCESS2(0xd,3) "       \n"
  6414     "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
  6415     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  6416     "mov       %b0," MEMACCESS2(0xe,3) "       \n"
  6417     "movzb     " MEMACCESS2(0xf,2) ",%0        \n"  // alpha: straight copy
  6418     "mov       %b0," MEMACCESS2(0xf,3) "       \n"
  6419     "sub       $0x4,%4                         \n"
  6420     "lea       " MEMLEA(0x10,2) ",%2           \n"
  6421     "lea       " MEMLEA(0x10,3) ",%3           \n"
  6422     "jg        1b                              \n"
  6423   : "+d"(pixel_temp),  // %0
  6424     "+a"(table_temp),  // %1
  6425     "+r"(src_argb),    // %2
  6426     "+r"(dst_argb),    // %3
  6427     "+rm"(width)       // %4
  6428   : "r"(luma),         // %5
  6429     "rm"(lumacoeff)    // %6
  6430   : "memory", "cc"
  6431 #if defined(__SSE2__)
  6432     , "xmm0", "xmm3", "xmm4", "xmm5"
  6433 #endif
  6434   );
  6436 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
  6438 #endif  // defined(__x86_64__) || defined(__i386__)
  6440 #ifdef __cplusplus
  6441 }  // extern "C"
  6442 }  // namespace libyuv
  6443 #endif

mercurial