media/libyuv/source/row_posix.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/source/row_posix.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,6443 @@
     1.4 +/*
     1.5 + *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "libyuv/row.h"
    1.15 +
    1.16 +#ifdef __cplusplus
    1.17 +namespace libyuv {
    1.18 +extern "C" {
    1.19 +#endif
    1.20 +
    1.21 +// This module is for GCC x86 and x64.
    1.22 +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
    1.23 +
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB.
// Per-channel luma weights in memory order B,G,R,A, replicated for 4 pixels
// so the 16-byte vector covers one xmm register's worth of ARGB data.
// Presumably consumed via pmaddubsw by the ARGBToY row kernels (not visible
// in this chunk) -- verify against the functions that reference it.
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Same B,G,R,A layout as kARGBToY but with full-range (JPEG) luma weights.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
    1.36 +
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Chroma (U/V) weight tables.  Each 16-byte vector holds a 4-entry
// per-channel weight pattern repeated 4 times; the channel order of each
// pattern matches the memory layout of the pixel format in the table name
// (e.g. ARGB in memory is B,G,R,A).  The *J variants are full-range (JPEG)
// coefficients, matching kARGBToYJ above.

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA (memory order A,R,G,B -- weights shifted accordingly).
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR (memory order R,G,B,A).
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA (memory order A,B,G,R).
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Luma bias: +16 per byte (video-range Y offset).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// Rounding bias of +64 per 16-bit lane for the JPEG luma path.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Chroma bias: +128 per byte re-centers signed U/V around 128.
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Chroma bias for the JPEG path, expressed as 16-bit lanes (0x8080 each).
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
   1.111 +
#ifdef HAS_RGB24TOARGBROW_SSSE3

// pshufb shuffle tables.  Each byte selects a source byte index; a value
// with the high bit set (128u) makes pshufb write zero to that byte.

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRGB24.  First 8 + next 4
// (same split layout as kShuffleMaskARGBToRGB24_0, with R and B swapped).
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
   1.144 +
#if defined(TESTING) && defined(__x86_64__)
// Scratch routine, compiled only when TESTING is defined, on x86-64.
// The long runs of mov/lea/add over every general-purpose register are
// effectively no-op encoding fodder between .p2align directives -- this
// looks like scaffolding for inspecting generated machine code rather
// than a real row kernel.  The final loop copies 8 bytes from src_y into
// xmm0 and stores 16 bytes to dst_argb, advancing dst by 0x20 per
// iteration.
// NOTE(review): "add 0x10,%%eax" (and friends) lack the '$' immediate
// prefix, so they assemble as loads from absolute address 0x10; and
// "lea (%%rsp),%%esp" truncates the stack pointer to 32 bits.  This
// function cannot execute safely -- treat it as disassembly-inspection
// material only; confirm intent before ever enabling TESTING.
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  5                               \n"
    "mov       %%eax,%%eax                     \n"
    "mov       %%ebx,%%ebx                     \n"
    "mov       %%ecx,%%ecx                     \n"
    "mov       %%edx,%%edx                     \n"
    "mov       %%esi,%%esi                     \n"
    "mov       %%edi,%%edi                     \n"
    "mov       %%ebp,%%ebp                     \n"
    "mov       %%esp,%%esp                     \n"
    ".p2align  5                               \n"
    "mov       %%r8d,%%r8d                     \n"
    "mov       %%r9d,%%r9d                     \n"
    "mov       %%r10d,%%r10d                   \n"
    "mov       %%r11d,%%r11d                   \n"
    "mov       %%r12d,%%r12d                   \n"
    "mov       %%r13d,%%r13d                   \n"
    "mov       %%r14d,%%r14d                   \n"
    "mov       %%r15d,%%r15d                   \n"
    ".p2align  5                               \n"
    "lea       (%%rax),%%eax                   \n"
    "lea       (%%rbx),%%ebx                   \n"
    "lea       (%%rcx),%%ecx                   \n"
    "lea       (%%rdx),%%edx                   \n"
    "lea       (%%rsi),%%esi                   \n"
    "lea       (%%rdi),%%edi                   \n"
    "lea       (%%rbp),%%ebp                   \n"
    "lea       (%%rsp),%%esp                   \n"
    ".p2align  5                               \n"
    "lea       (%%r8),%%r8d                    \n"
    "lea       (%%r9),%%r9d                    \n"
    "lea       (%%r10),%%r10d                  \n"
    "lea       (%%r11),%%r11d                  \n"
    "lea       (%%r12),%%r12d                  \n"
    "lea       (%%r13),%%r13d                  \n"
    "lea       (%%r14),%%r14d                  \n"
    "lea       (%%r15),%%r15d                  \n"

    ".p2align  5                               \n"
    "lea       0x10(%%rax),%%eax               \n"
    "lea       0x10(%%rbx),%%ebx               \n"
    "lea       0x10(%%rcx),%%ecx               \n"
    "lea       0x10(%%rdx),%%edx               \n"
    "lea       0x10(%%rsi),%%esi               \n"
    "lea       0x10(%%rdi),%%edi               \n"
    "lea       0x10(%%rbp),%%ebp               \n"
    "lea       0x10(%%rsp),%%esp               \n"
    ".p2align  5                               \n"
    "lea       0x10(%%r8),%%r8d                \n"
    "lea       0x10(%%r9),%%r9d                \n"
    "lea       0x10(%%r10),%%r10d              \n"
    "lea       0x10(%%r11),%%r11d              \n"
    "lea       0x10(%%r12),%%r12d              \n"
    "lea       0x10(%%r13),%%r13d              \n"
    "lea       0x10(%%r14),%%r14d              \n"
    "lea       0x10(%%r15),%%r15d              \n"

    ".p2align  5                               \n"
    "add       0x10,%%eax                      \n"  // NOTE(review): missing '$' -- memory operand
    "add       0x10,%%ebx                      \n"
    "add       0x10,%%ecx                      \n"
    "add       0x10,%%edx                      \n"
    "add       0x10,%%esi                      \n"
    "add       0x10,%%edi                      \n"
    "add       0x10,%%ebp                      \n"
    "add       0x10,%%esp                      \n"
    ".p2align  5                               \n"
    "add       0x10,%%r8d                      \n"
    "add       0x10,%%r9d                      \n"
    "add       0x10,%%r10d                     \n"
    "add       0x10,%%r11d                     \n"
    "add       0x10,%%r12d                     \n"
    "add       0x10,%%r13d                     \n"
    "add       0x10,%%r14d                     \n"
    "add       0x10,%%r15d                     \n"

    ".p2align  2                               \n"
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 (high 8 zero)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // TESTING
   1.242 +
   1.243 +#ifdef HAS_I400TOARGBROW_SSE2
// Converts a row of grey (I400 / Y-only) pixels to ARGB: each Y byte is
// replicated into B, G and R, and alpha is forced to 0xff.  Processes 8
// pixels (8 source bytes -> 32 destination bytes) per loop iteration.
// Uses aligned stores (movdqa), so dst_argb must be 16-byte aligned; see
// I400ToARGBRow_Unaligned_SSE2 below for unaligned destinations.
// NOTE(review): no remainder handling -- assumes pix > 0 and a multiple
// of 8 (presumably guaranteed by the any-width wrappers; verify).
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // double each byte: YY
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // YYYY for low 4 pixels
    "punpckhwd %%xmm1,%%xmm1                   \n"  // YYYY for high 4 pixels
    "por       %%xmm5,%%xmm0                   \n"  // set alpha = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
   1.273 +
// Same conversion as I400ToARGBRow_SSE2 (grey -> ARGB with opaque alpha),
// but writes with movdqu so dst_argb need not be 16-byte aligned.
// NOTE(review): as above, assumes pix > 0 and a multiple of 8.
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // double each byte: YY
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // YYYY for low 4 pixels
    "punpckhwd %%xmm1,%%xmm1                   \n"  // YYYY for high 4 pixels
    "por       %%xmm5,%%xmm0                   \n"  // set alpha = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // unaligned stores
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
   1.304 +#endif  // HAS_I400TOARGBROW_SSE2
   1.305 +
   1.306 +#ifdef HAS_RGB24TOARGBROW_SSSE3
// Converts 16 pixels of 24-bit RGB24 (48 source bytes) to ARGB (64 dest
// bytes) per iteration.  Three unaligned 16-byte loads are realigned with
// palignr so each xmm holds 4 whole pixels, expanded to 4-byte pixels via
// pshufb with kShuffleMaskRGB24ToARGB, then alpha 0xff is OR'd in.
// Source reads are movdqu (any alignment); stores are movdqa, so dst_argb
// is assumed 16-byte aligned -- TODO(review): confirm callers guarantee it.
// NOTE(review): assumes pix > 0 and a multiple of 16.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"  // advance src 48 bytes
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"  // advance dst 64 bytes
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.347 +
// Converts 16 pixels of 24-bit RAW (R,G,B byte order) to ARGB per
// iteration.  Identical structure to RGB24ToARGBRow_SSSE3 above; only the
// shuffle table differs (kShuffleMaskRAWToARGB swaps R and B).
// movdqu loads (any src alignment), movdqa stores (dst_argb assumed
// 16-byte aligned).  Assumes pix > 0 and a multiple of 16.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"  // advance src 48 bytes
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"  // advance dst 64 bytes
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.388 +
// Converts 8 RGB565 pixels (16 source bytes) to ARGB (32 dest bytes) per
// iteration.  5/6-bit fields are widened to 8 bits by multiplication:
// pmulhuw with 0x0108 maps a 5-bit value v (pre-shifted to the top of the
// word) to (v << 3) | (v >> 2), and 0x2080 does the equivalent expansion
// for the 6-bit green field.  Alpha is forced to 0xff via xmm7.
// "sub %0,%1" twice biases dst so the (%1,%0,2) addressing below yields
// the true output pointer -- dst advances 2 bytes per source byte.
// Stores are movdqa: the computed dst address is assumed 16-byte aligned.
// NOTE(review): assumes pix > 0 and a multiple of 8.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5-bit -> 8-bit multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // 6-bit -> 8-bit multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800 red mask
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x07e0 green mask
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha bytes
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // dst -= 2 * src so that
    "sub       %0,%1                           \n"  // (dst + src*2) is output
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // blue to top of word
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = R|B byte pairs
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "por       %%xmm7,%%xmm0                   \n"  // xmm0 = A|G byte pairs
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
   1.442 +
// Converts 8 ARGB1555 pixels (16 source bytes) to ARGB (32 dest bytes) per
// iteration.  5-bit R/G/B fields are widened via pmulhuw with 0x0108 and
// 0x4200; the 1-bit alpha is broadcast into a full byte by psraw $0x8
// (arithmetic shift replicates bit 15 across the high byte) masked with
// 0xff00.  Same dst-bias addressing trick as RGB565ToARGBRow_SSE2: dst is
// pre-decremented by 2*src so (%1,%0,2) addresses the output.
// movdqa stores: the computed dst address is assumed 16-byte aligned.
// NOTE(review): assumes pix > 0 and a multiple of 8.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5-bit -> 8-bit multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // multiplier for green field
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"  // xmm4 = 0x03e0
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha mask
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // dst -= 2 * src (see above)
    "sub       %0,%1                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // drop alpha bit
    "psllw     $0xb,%%xmm2                     \n"  // blue to top of word
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = R|B byte pairs
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // broadcast alpha bit
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "pand      %%xmm7,%%xmm2                   \n"  // alpha -> 0x00 or 0xff
    "por       %%xmm2,%%xmm0                   \n"  // xmm0 = A|G byte pairs
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
   1.499 +
// Converts 8 ARGB4444 pixels (16 source bytes) to ARGB (32 dest bytes) per
// iteration.  Each 4-bit nibble n is widened to the 8-bit value (n << 4) | n
// by splitting low/high nibbles (masks 0x0f / 0xf0), shifting, and OR-ing,
// then the byte streams are interleaved with punpck.  Same dst-bias
// addressing trick as RGB565ToARGBRow_SSE2 above.
// movdqa stores: the computed dst address is assumed 16-byte aligned.
// NOTE(review): assumes pix > 0 and a multiple of 8.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // xmm4 = low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = high-nibble mask
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"  // dst -= 2 * src (see above)
    "sub       %0,%1                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // low nibbles (B, R)
    "pand      %%xmm5,%%xmm2                   \n"  // high nibbles (G, A)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // (n << 4) | n
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave to BGRA
    "punpckhbw %%xmm2,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.543 +
// Converts 16 ARGB pixels (64 source bytes) to 24-bit RGB24 (48 dest
// bytes) per iteration.  Each 16-byte register is compacted to 12 bytes by
// pshufb with kShuffleMaskARGBToRGB24 (alpha bytes dropped, tail zeroed),
// then the four 12-byte results are stitched into three contiguous
// 16-byte stores with pslldq/psrldq/por.  Both loads and stores are
// movdqu, so neither src nor dst needs alignment.
// NOTE(review): assumes pix > 0 and a multiple of 16.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // drop alpha: 12 bytes each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"  // stitch 12B chunks into
    "psrldq    $0x4,%%xmm1                     \n"  // three 16B outputs
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // advance dst 48 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
   1.584 +
// Converts 16 ARGB pixels to 24-bit RAW (R,G,B byte order) per iteration.
// Identical structure to ARGBToRGB24Row_SSSE3 above; only the shuffle
// table differs (kShuffleMaskARGBToRAW swaps R and B while dropping alpha).
// movdqu loads and stores: no alignment requirements.
// NOTE(review): assumes pix > 0 and a multiple of 16.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // drop alpha, swap R/B
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"  // stitch 12B chunks into
    "psrldq    $0x4,%%xmm1                     \n"  // three 16B outputs
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // advance dst 48 bytes
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
   1.625 +
// Converts 4 ARGB pixels (16 source bytes) to RGB565 (8 dest bytes) per
// iteration.  Per 32-bit pixel: blue >> 3 -> bits 4:0, green >> 5 (of the
// word) -> bits 10:5, red (via pslld 8 / psrad 16) -> bits 15:11; results
// are OR'd and narrowed with packssdw, then 8 bytes are stored with movq.
// NOTE(review): the load is movdqa, so src must be 16-byte aligned --
// confirm callers only pass aligned rows to this variant.
// Assumes pix > 0 and a multiple of 4.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0x0000001f
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x000007e0
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xfffff800
    "pslld     $0xb,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels (aligned)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"  // position red
    "psrld     $0x3,%%xmm1                     \n"  // position blue
    "psrld     $0x5,%%xmm2                     \n"  // position green
    "psrad     $0x10,%%xmm0                    \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"  // assemble 565 in low words
    "packssdw  %%xmm0,%%xmm0                   \n"  // narrow dwords -> words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 RGB565 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.665 +
   1.666 +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
   1.667 +  asm volatile (
   1.668 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
   1.669 +    "psrld     $0x1b,%%xmm4                    \n"
   1.670 +    "movdqa    %%xmm4,%%xmm5                   \n"
   1.671 +    "pslld     $0x5,%%xmm5                     \n"
   1.672 +    "movdqa    %%xmm4,%%xmm6                   \n"
   1.673 +    "pslld     $0xa,%%xmm6                     \n"
   1.674 +    "pcmpeqb   %%xmm7,%%xmm7                   \n"
   1.675 +    "pslld     $0xf,%%xmm7                     \n"
   1.676 +    LABELALIGN
   1.677 +  "1:                                          \n"
   1.678 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
   1.679 +    "movdqa    %%xmm0,%%xmm1                   \n"
   1.680 +    "movdqa    %%xmm0,%%xmm2                   \n"
   1.681 +    "movdqa    %%xmm0,%%xmm3                   \n"
   1.682 +    "psrad     $0x10,%%xmm0                    \n"
   1.683 +    "psrld     $0x3,%%xmm1                     \n"
   1.684 +    "psrld     $0x6,%%xmm2                     \n"
   1.685 +    "psrld     $0x9,%%xmm3                     \n"
   1.686 +    "pand      %%xmm7,%%xmm0                   \n"
   1.687 +    "pand      %%xmm4,%%xmm1                   \n"
   1.688 +    "pand      %%xmm5,%%xmm2                   \n"
   1.689 +    "pand      %%xmm6,%%xmm3                   \n"
   1.690 +    "por       %%xmm1,%%xmm0                   \n"
   1.691 +    "por       %%xmm3,%%xmm2                   \n"
   1.692 +    "por       %%xmm2,%%xmm0                   \n"
   1.693 +    "packssdw  %%xmm0,%%xmm0                   \n"
   1.694 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
   1.695 +    "movq      %%xmm0," MEMACCESS(1) "         \n"
   1.696 +    "lea       " MEMACCESS2(0x8,1) ",%1        \n"
   1.697 +    "sub       $0x4,%2                         \n"
   1.698 +    "jg        1b                              \n"
   1.699 +  : "+r"(src),  // %0
   1.700 +    "+r"(dst),  // %1
   1.701 +    "+r"(pix)   // %2
   1.702 +  :
   1.703 +  : "memory", "cc"
   1.704 +#if defined(__SSE2__)
   1.705 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   1.706 +#endif
   1.707 +  );
   1.708 +}
   1.709 +
// Convert ARGB pixels (memory byte order B,G,R,A) to ARGB4444 (16 bits:
// A in 15:12, R in 11:8, G in 7:4, B in 3:0), keeping the top 4 bits of
// each channel; 4 pixels per iteration. Uses aligned loads (movdqa), so
// src must be 16-byte aligned; the loop assumes pix is a multiple of 4.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    // xmm4 = 0xf000 per word: selects the high nibble of the G and A bytes.
    // xmm3 = 0x00f0 per word: selects the high nibble of the B and R bytes.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // B/R high nibbles
    "pand      %%xmm4,%%xmm1                   \n"  // G/A high nibbles
    "psrlq     $0x4,%%xmm0                     \n"  // B,R nibbles to bits 3:0
    "psrlq     $0x8,%%xmm1                     \n"  // G,A nibbles to bits 7:4
    "por       %%xmm1,%%xmm0                   \n"  // words: G:B then A:R
    "packuswb  %%xmm0,%%xmm0                   \n"  // pack words to bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB4444 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // NOTE(review): xmm2 is listed as clobbered but never used above;
    // harmless (merely conservative).
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
   1.741 +#endif  // HAS_RGB24TOARGBROW_SSSE3
   1.742 +
   1.743 +#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels to 16 Y (luma) bytes per loop iteration.
// kARGBToY (defined at the top of this file) holds per-byte coefficients
// {B=13, G=65, R=33, A=0}, so Y = ((13*B + 65*G + 33*R) >> 7) + kAddY16.
// kAddY16 is defined elsewhere; presumably the +16 video-range offset --
// confirm against its definition.
// Aligned loads/stores (movdqa): src_argb and dst_y must be 16-byte
// aligned; the loop assumes pix is a multiple of 16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // words: 13B+65G, 33R+0A
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> 8 Y words
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.780 +
// Same computation as ARGBToYRow_SSSE3 (Y = ((13B + 65G + 33R) >> 7) +
// kAddY16, 16 pixels per iteration) but with unaligned loads/stores
// (movdqu), so no alignment requirement on src_argb or dst_y.
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // words: 13B+65G, 33R+0A
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> 8 Y words
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.817 +#endif  // HAS_ARGBTOYROW_SSSE3
   1.818 +
   1.819 +#ifdef HAS_ARGBTOYJROW_SSSE3
// JPeg (full-range) variant of ARGBToYRow_SSSE3: uses kARGBToYJ
// coefficients {B=15, G=75, R=38} (see top of file) and, unlike the video
// range version, adds kAddYJ64 before the >>7 (per its name presumably 64,
// i.e. rounding) instead of adding a +16 offset afterwards.
// Aligned loads/stores (movdqa); 16 pixels per iteration.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // words: 15B+75G, 38R+0A
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> 8 Y words
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // rounding bias
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.857 +
// Same computation as ARGBToYJRow_SSSE3 (full-range Y with kARGBToYJ
// coefficients and kAddYJ64 rounding) but with unaligned loads/stores
// (movdqu), so no alignment requirement on src_argb or dst_y.
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // words: 15B+75G, 38R+0A
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum pairs -> 8 Y words
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // rounding bias
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
   1.895 +#endif  // HAS_ARGBTOYJROW_SSSE3
   1.896 +
   1.897 +#ifdef HAS_ARGBTOUVROW_SSSE3
   1.898 +// TODO(fbarchard): pass xmm constants to single block of assembly.
   1.899 +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
   1.900 +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
   1.901 +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
   1.902 +// and considered unsafe.
// Compute 2x2-subsampled chroma for 16 ARGB pixels spanning two rows
// (src_argb0 and src_argb0 + src_stride_argb), producing 8 U and 8 V bytes
// per iteration. kARGBToU = {112,-74,-38,0} and kARGBToV = {-18,-94,112,0}
// (see top of file), so U = ((112B - 74G - 38R) >> 8) + bias and
// V = ((-18B - 94G + 112R) >> 8) + bias, where the bias is kAddUV128
// (per its name presumably +128 -- confirm in row.h).
// Aligned loads (movdqa); width assumed to be a multiple of 16.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Constants are preloaded in a separate asm block (see the fpic/GPR
  // note above); xmm3/xmm4/xmm5 must survive into the main block, which
  // therefore does not list them as clobbered.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the row at src + stride.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: gather even/odd pixels, then pavgb the pair.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // divide by 256
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
   1.968 +
   1.969 +// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
// JPeg (full-range) variant of ARGBToUVRow_SSSE3: same 2x2 subsampling of
// 16 pixels across two rows, but uses kARGBToUJ = {127,-84,-43,0} and
// kARGBToVJ coefficients, and adds kAddUVJ128 to the word sums BEFORE the
// >>8 (rounding) rather than a byte bias after packing.
// Aligned loads (movdqa); width assumed to be a multiple of 16.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  // Constants preloaded in a separate asm block (see the fpic/GPR note
  // above); xmm3/xmm4/xmm5 must survive into the main block.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToUJ),  // %0
    "m"(kARGBToVJ),  // %1
    "m"(kAddUVJ128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the row at src + stride.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: gather even/odd pixels, then pavgb the pair.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // bias/round before shift
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // divide by 256
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1036 +
// Same computation as ARGBToUVRow_SSSE3 (2x2-subsampled U/V from two rows,
// kARGBToU/kARGBToV coefficients, kAddUV128 bias) but with unaligned loads
// (movdqu). pavgb cannot take an unaligned memory operand, so the second
// row is loaded into xmm7 first and then averaged register-to-register.
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Constants preloaded in a separate asm block (see the fpic/GPR note
  // above); xmm3/xmm4/xmm5 must survive into the main block.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),         // %0
    "m"(kARGBToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the row at src + stride.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: gather even/odd pixels, then pavgb the pair.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // divide by 256
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1106 +
// Unaligned (movdqu) variant of ARGBToUVJRow_SSSE3: full-range U/V with
// kARGBToUJ/kARGBToVJ coefficients and kAddUVJ128 added before the >>8
// (rounding). The second row is loaded into xmm7 and averaged
// register-to-register since pavgb needs an aligned memory operand.
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  // Constants preloaded in a separate asm block (see the fpic/GPR note
  // above); xmm3/xmm4/xmm5 must survive into the main block.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToUJ),         // %0
    "m"(kARGBToVJ),         // %1
    "m"(kAddUVJ128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the row at src + stride.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: gather even/odd pixels, then pavgb the pair.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // bias/round before shift
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // divide by 256
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1177 +
// Per-pixel chroma (4:4:4, no subsampling): produces 16 U and 16 V bytes
// from 16 ARGB pixels per iteration. Makes two passes over the same 64
// source bytes -- first with the U coefficients (kARGBToU), stored to
// dst_u, then with the V coefficients (kARGBToV), stored at
// dst_u + (dst_v - dst_u). kAddUV128 biases the result to unsigned.
// Aligned loads/stores (movdqa); width assumed a multiple of 16.
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  // Constants preloaded in a separate asm block (see the fpic/GPR note
  // above ARGBToUVRow_SSSE3); xmm3/xmm4/xmm5 must survive into the main
  // block below.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    // Pass 1: U for 16 pixels.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 U bytes
    // Pass 2: reload the same 16 pixels and compute V.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
  1.1242 +
// Same computation as ARGBToUV444Row_SSSE3 (per-pixel U and V, two passes
// over the same 64 source bytes) but with unaligned loads/stores (movdqu),
// so no alignment requirement on src_argb, dst_u or dst_v.
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
                                    uint8* dst_v, int width) {
  // Constants preloaded in a separate asm block (see the fpic/GPR note
  // above ARGBToUVRow_SSSE3); xmm3/xmm4/xmm5 must survive into the main
  // block below.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    // Pass 1: U for 16 pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 U bytes
    // Pass 2: reload the same 16 pixels and compute V.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
  1.1307 +
// Convert 16 ARGB pixels (64 bytes) per loop iteration into 8 U and 8 V
// samples (4:2:2 sampling: horizontal pairs of pixels are averaged with
// pavgb via the shufps even/odd split before the coefficient multiply).
// Aligned variant: src_argb0 must be 16-byte aligned (movdqa loads).
// width is assumed to be a positive multiple of 16.
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 are not clobbered between the
  // two asm statements (libyuv idiom) -- confirm if porting.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Split even/odd pixels with shufps, then pavgb to average each
    // horizontal pair of ARGB pixels before the chroma multiply.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1368 +
// Unaligned (movdqu) variant of ARGBToUV422Row_SSSE3: 16 ARGB pixels per
// iteration -> 8 U + 8 V samples (4:2:2, horizontal pair averaging).
// Identical arithmetic to the aligned version; only the source loads differ.
// width is assumed to be a positive multiple of 16.
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 survive between asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average each horizontal pixel pair (even/odd split via shufps).
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1429 +
// Convert 16 BGRA pixels (64 bytes) per loop iteration to 16 luma (Y) bytes:
// per-pixel dot product with kBGRAToY (pmaddubsw + phaddw), >>7, then +16
// bias (kAddY16) for video-range Y.  Aligned variant: src_bgra and dst_y
// must be 16-byte aligned (movdqa).  pix is assumed a positive multiple
// of 16.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = BGRA->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets flags for "jg 1b"; the mov/lea below leave EFLAGS intact.
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1466 +
// Unaligned (movdqu) variant of BGRAToYRow_SSSE3: 16 BGRA pixels per
// iteration -> 16 Y bytes; identical arithmetic, only load/store
// instructions differ.  pix is assumed a positive multiple of 16.
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = BGRA->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1503 +
// Convert 16x2 BGRA pixels per loop iteration to 8 U and 8 V samples
// (4:2:0 sampling): the row at src_bgra0 is averaged with the row at
// src_bgra0 + src_stride_bgra (pavgb), then horizontal pixel pairs are
// averaged before the chroma dot products.  Aligned variant (movdqa):
// both rows must be 16-byte aligned.  width is assumed a positive
// multiple of 16.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 survive between asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the second row (%0 + stride in %4).
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: even/odd pixel split via shufps, then pavgb.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1569 +
// Unaligned (movdqu) variant of BGRAToUVRow_SSSE3: 16x2 BGRA pixels per
// iteration -> 8 U + 8 V (4:2:0).  Because pavgb cannot take an unaligned
// memory operand, the second row is loaded into xmm7 first and averaged
// register-to-register.  width is assumed a positive multiple of 16.
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 survive between asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the second row, staged through xmm7.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: even/odd pixel split via shufps, then pavgb.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1639 +
// Convert 16 ABGR pixels (64 bytes) per loop iteration to 16 luma (Y)
// bytes using the kABGRToY coefficients; same pipeline as BGRAToYRow_SSSE3
// (pmaddubsw + phaddw, >>7, +16 bias).  Aligned variant (movdqa).
// pix is assumed a positive multiple of 16.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = ABGR->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1676 +
// Unaligned (movdqu) variant of ABGRToYRow_SSSE3: 16 ABGR pixels per
// iteration -> 16 Y bytes; identical arithmetic, only load/store
// instructions differ.  pix is assumed a positive multiple of 16.
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = ABGR->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1713 +
// Convert 16 RGBA pixels (64 bytes) per loop iteration to 16 luma (Y)
// bytes using the kRGBAToY coefficients; same pipeline as BGRAToYRow_SSSE3.
// Aligned variant (movdqa).  pix is assumed a positive multiple of 16.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = RGBA->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1750 +
// Unaligned (movdqu) variant of RGBAToYRow_SSSE3: 16 RGBA pixels per
// iteration -> 16 Y bytes; identical arithmetic, only load/store
// instructions differ.  pix is assumed a positive multiple of 16.
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = +16 luma bias, xmm4 = RGBA->Y coefficients.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.1787 +
// Convert 16x2 ABGR pixels per loop iteration to 8 U and 8 V samples
// (4:2:0 sampling): vertical average with the row at src_abgr0 +
// src_stride_abgr, horizontal pair average, then kABGRToU/kABGRToV dot
// products.  Aligned variant (movdqa): both rows must be 16-byte aligned.
// width is assumed a positive multiple of 16.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 survive between asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kABGRToU),         // %0
    "m"(kABGRToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the second row (%0 + stride in %4).
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: even/odd pixel split via shufps, then pavgb.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1853 +
// Unaligned (movdqu) variant of ABGRToUVRow_SSSE3: 16x2 ABGR pixels per
// iteration -> 8 U + 8 V (4:2:0).  The second row is staged through xmm7
// because pavgb cannot take an unaligned memory operand.
// width is assumed a positive multiple of 16.
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants: xmm4 = U coefficients, xmm3 = V coefficients,
  // xmm5 = +128 bias.
  // NOTE(review): assumes xmm3/xmm4/xmm5 survive between asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kABGRToU),         // %0
    "m"(kABGRToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    // %2 becomes (dst_v - dst_u) so V is stored at (%1,%2,1).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average vertically with the second row, staged through xmm7.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontally: even/odd pixel split via shufps, then pavgb.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes (U) to dst_u, high 8 bytes (V) to dst_v.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1923 +
// RGBA -> chroma (U,V) row conversion with 2x2 subsampling: reads two rows
// of RGBA (src_rgba0 and src_rgba0 + src_stride_rgba), averages each 2x2
// pixel block with pavgb, applies the kRGBAToU / kRGBAToV coefficient
// vectors with pmaddubsw, biases by kAddUV128, and writes 8 bytes to dst_u
// and 8 bytes to dst_v per loop iteration (16 source pixels per row).
// Uses movdqa loads, so both source rows must be 16-byte aligned; width is
// assumed to be a positive multiple of 16 ("sub $0x10" + jg loop).
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload loop-invariant constants: xmm4 = U coefficients, xmm3 = V
  // coefficients, xmm5 = 0x80 bias. NOTE(review): these registers must
  // survive into the second asm statement below (they are deliberately
  // absent from its clobber list) -- fragile but the established pattern
  // throughout this file.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kRGBAToU),         // %0
    "m"(kRGBAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u so one base register reaches both outputs
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // shufps 0x88/0xdd splits even/odd pixels; pavgb then completes the
    // horizontal average, leaving 8 averaged RGBA samples in xmm0/xmm2.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes = U row
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)  high 8 bytes = V row
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_rgba0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.1989 +
// Unaligned-source variant of RGBAToUVRow_SSSE3: identical math, but loads
// both rows with movdqu (second-row samples go through xmm7 because pavgb
// with a memory operand would require alignment). No alignment requirement
// on the source rows; width is still assumed to be a positive multiple
// of 16.
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload loop-invariant constants: xmm4 = U coefficients, xmm3 = V
  // coefficients, xmm5 = 0x80 bias. NOTE(review): relies on these surviving
  // between the two asm statements (not in the second clobber list).
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kRGBAToU),         // %0
    "m"(kRGBAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Horizontal 2x average of adjacent pixel pairs (even/odd split via
    // shufps, then pavgb).
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes = U row
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)  high 8 bytes = V row
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_rgba0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
  1.2059 +#endif  // HAS_ARGBTOUVROW_SSSE3
  1.2060 +
  1.2061 +#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point YUV->RGB conversion coefficients, 6 fractional bits.
#define UB 127 /* min(63,(int8)(2.018 * 64)) */
// NOTE(review): the comment on UB looks stale -- min(63, ...) cannot yield
// 127; confirm against the intended reference coefficients.
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias
// NOTE(review): BB/BG/BR expand unparenthesized. They are only used inside
// the initializer braces below, where this is harmless, but do not reuse
// them in arithmetic expressions without adding parentheses.
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

// All tables the YUVTORGB / YVUTORGB asm fragments reference, packed into
// one struct so a single base register ([kYuvConstants]) plus the fixed
// byte offset noted beside each member reaches every table. The last three
// members are the VU-ordered (NV21) variants of the first three.
struct {
  vec8 kUVToB;  // 0
  vec8 kUVToG;  // 16
  vec8 kUVToR;  // 32
  vec16 kUVBiasB;  // 48
  vec16 kUVBiasG;  // 64
  vec16 kUVBiasR;  // 80
  vec16 kYSub16;  // 96
  vec16 kYToRgb;  // 112
  vec8 kVUToB;  // 128
  vec8 kVUToG;  // 144
  vec8 kVUToR;  // 160
} static SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
  1.2102 +
  1.2103 +
// Read 8 UV from 444 (one U and one V byte per pixel) and interleave into
// xmm0 as U0 V0 U1 V1 ... V addresses are formed as u_buf + (v_buf -
// u_buf); callers must have executed "sub %[u_buf],%[v_buf]" first.
// (Original comment said "411" -- this is the 4:4:4 reader.)
#define READYUV444                                                             \
    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"
  1.2111 +
// Read 4 UV from 422, upsample to 8 UV: interleave 4 U + 4 V bytes, then
// duplicate each UV pair with punpcklwd so every two pixels share one UV
// sample. Requires v_buf to hold (v_buf - u_buf); see READYUV444.
#define READYUV422                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
  1.2120 +
// Read 2 UV from 411, upsample to 8 UV: interleave 2 U + 2 V bytes, then
// replicate each UV pair 4x (punpcklwd + punpckldq) so every four pixels
// share one UV sample. Requires v_buf to hold (v_buf - u_buf).
#define READYUV411                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "punpckldq  %%xmm0,%%xmm0                                   \n"
  1.2130 +
// Read 4 UV from NV12 (already byte-interleaved U,V in a single plane),
// upsample to 8 UV by duplicating each 16-bit UV pair with punpcklwd.
#define READNV12                                                               \
    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
  1.2136 +
// Convert 8 pixels: 8 UV (interleaved U,V in xmm0, from a READ* macro
// above) and 8 Y (loaded from [y_buf]) into packed bytes xmm0=B, xmm1=G,
// xmm2=R. Uses the UV-ordered tables at kYuvConstants offsets 0/16/32,
// biases at 48/64/80, and the Y terms at 96/112. Expects xmm4 == 0 so
// punpcklbw zero-extends Y to 16 bits; clobbers xmm3.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
  1.2161 +
// Convert 8 pixels: 8 VU (byte-swapped chroma, as read from NV21) and 8 Y
// to packed bytes xmm0=B, xmm1=G, xmm2=R. Identical to YUVTORGB except the
// coefficient tables come from the VU-ordered entries at kYuvConstants
// offsets 128/144/160. Expects xmm4 == 0; clobbers xmm3.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
  1.2186 +
// Convert one row of planar 4:4:4 YUV to ARGB, 8 pixels per iteration.
// Alpha is forced to 0xFF (pcmpeqb xmm5). Stores use movdqa, so dst_argb
// must be 16-byte aligned; width is assumed to be a positive multiple of 8.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf for the READ* macros
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB
    // Interleave B,G,R,A bytes into two 16-byte ARGB stores (8 pixels).
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2225 +
// Convert one row of planar 4:2:2 YUV to 24-bit RGB24, 8 pixels (24 output
// bytes) per iteration. The two pshufb masks squeeze 32-bit pixels down to
// 3 bytes each; palignr stitches the two halves. Destination stores are
// movq/movdqu, so dst_rgb24 need not be aligned.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#if defined(__i386__)
  // On i386 the shuffle masks are preloaded in a separate asm statement to
  // free up operand slots; xmm5/xmm6 must survive into the main block.
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
#endif
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
  1.2283 +
// Convert one row of planar 4:2:2 YUV to RAW (3 bytes/pixel), 8 pixels per
// iteration. Structurally identical to I422ToRGB24Row_SSSE3 but uses the
// kShuffleMaskARGBToRAW masks, which pack the channels in the opposite
// byte order.
void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_raw,
                               int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#if defined(__i386__)
  // Preload shuffle masks separately on i386; xmm5/xmm6 must survive into
  // the main asm statement.
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
#endif
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
  1.2341 +
// Convert one row of planar 4:2:2 YUV to ARGB, 8 pixels per iteration.
// Alpha is forced to 0xFF. Stores use movdqa, so dst_argb must be 16-byte
// aligned; width is assumed to be a positive multiple of 8.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    // Interleave B,G,R,A into two 16-byte ARGB stores.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2380 +
// Convert one row of planar 4:1:1 YUV to ARGB, 8 pixels per iteration
// (each UV sample covers 4 pixels via READYUV411's replication). Alpha is
// forced to 0xFF. Aligned movdqa stores; width assumed a positive multiple
// of 8.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2419 +
// Convert one row of NV12 (planar Y + interleaved UV plane) to ARGB, 8
// pixels per iteration. Alpha forced to 0xFF; aligned movdqa stores; width
// assumed a positive multiple of 8. READNV12 needs no second index
// register, hence no r14 clobber even under Native Client x86-64.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2453 +
// Convert one row of NV21 (planar Y + interleaved VU plane) to ARGB, 8
// pixels per iteration. Same as NV12ToARGBRow_SSSE3 but uses YVUTORGB,
// whose tables (kYuvConstants offsets 128/144/160) account for the swapped
// V,U byte order. Alpha forced to 0xFF; aligned movdqa stores.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YVUTORGB
    LABELALIGN
  "1:                                          \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2487 +
// Unaligned-destination variant of I444ToARGBRow_SSSE3: identical
// conversion, but stores with movdqu so dst_argb need not be 16-byte
// aligned. Width assumed a positive multiple of 8; alpha forced to 0xFF.
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes an offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0, required by YUVTORGB
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2526 +
// Convert one row of 4:2:2 planar YUV (one U/V pair per two pixels) to
// interleaved ARGB, 8 pixels per loop iteration.  "Unaligned" variant:
// stores use movdqu, so dst_argb need not be 16-byte aligned.
// READYUV422 / YUVTORGB are macros defined earlier in this file; they
// presumably leave converted B/G/R bytes in xmm0/xmm1/xmm2 -- the
// punpck sequence weaves those, plus an all-0xff alpha from xmm5, into
// ARGB byte order.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2565 +
// Convert one row of 4:1:1 planar YUV (one U/V pair per four pixels) to
// interleaved ARGB, 8 pixels per loop iteration.  "Unaligned" variant:
// stores use movdqu, so dst_argb need not be 16-byte aligned.
// READYUV411 / YUVTORGB are macros defined earlier in this file; they
// presumably leave converted B/G/R bytes in xmm0/xmm1/xmm2 -- the
// punpck sequence weaves those, plus an all-0xff alpha from xmm5, into
// ARGB byte order.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2604 +
// Convert one row of NV12 (Y plane + interleaved UV plane) to ARGB,
// 8 pixels per loop iteration.  "Unaligned" variant: stores use movdqu,
// so dst_argb need not be 16-byte aligned.  READNV12 / YUVTORGB are
// macros defined earlier in this file; the punpck sequence weaves the
// converted channels plus an all-0xff alpha (xmm5) into ARGB order.
// Unlike the planar variants, there is no v_buf rebase -- U and V come
// interleaved from the single uv_buf.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2638 +
// Convert one row of NV21 (Y plane + interleaved VU plane) to ARGB,
// 8 pixels per loop iteration.  Same structure as the NV12 variant but
// uses YVUTORGB instead of YUVTORGB -- presumably the macro variant that
// accounts for the swapped V/U byte order of NV21.  "Unaligned" variant:
// stores use movdqu, so dst_argb need not be 16-byte aligned.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2672 +
// Convert one row of 4:2:2 planar YUV to interleaved BGRA, 8 pixels per
// loop iteration.  Aligned variant: stores use movdqa, so dst_bgra must
// be 16-byte aligned.  READYUV422 / YUVTORGB are macros defined earlier
// in this file.  Note that pcmpeqb re-creates the all-0xff alpha in xmm5
// inside the loop: unlike the ARGB weave, here xmm5 is a punpck
// *destination* ("punpcklbw %%xmm2,%%xmm5") and is consumed each
// iteration.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2712 +
// Convert one row of 4:2:2 planar YUV to interleaved ABGR, 8 pixels per
// loop iteration.  Aligned variant: stores use movdqa, so dst_abgr must
// be 16-byte aligned.  READYUV422 / YUVTORGB are macros defined earlier
// in this file; the punpck sequence weaves the converted channels plus
// the all-0xff alpha in xmm5 (set once before the loop; xmm5 is only a
// punpck source here, so it survives iterations) into ABGR byte order.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2751 +
// Convert one row of 4:2:2 planar YUV to interleaved RGBA, 8 pixels per
// loop iteration.  Aligned variant: stores use movdqa, so dst_rgba must
// be 16-byte aligned.  READYUV422 / YUVTORGB are macros defined earlier
// in this file.  pcmpeqb re-creates the all-0xff alpha in xmm5 inside
// the loop because xmm5 is a punpck *destination*
// ("punpcklbw %%xmm0,%%xmm5") and is consumed each iteration.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm2,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2791 +
// Convert one row of 4:2:2 planar YUV to interleaved BGRA, 8 pixels per
// loop iteration.  "Unaligned" variant of I422ToBGRARow_SSSE3: identical
// weave, but stores use movdqu so dst_bgra need not be 16-byte aligned.
// pcmpeqb re-creates the all-0xff alpha in xmm5 inside the loop because
// xmm5 is a punpck *destination* and is consumed each iteration.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_bgra,
                                          int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2831 +
// Convert one row of 4:2:2 planar YUV to interleaved ABGR, 8 pixels per
// loop iteration.  "Unaligned" variant of I422ToABGRRow_SSSE3: identical
// weave, but stores use movdqu so dst_abgr need not be 16-byte aligned.
// The all-0xff alpha in xmm5 is set once before the loop; xmm5 is only a
// punpck source here, so it survives iterations.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_abgr,
                                          int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2870 +
// Convert one row of 4:2:2 planar YUV to interleaved RGBA, 8 pixels per
// loop iteration.  "Unaligned" variant of I422ToRGBARow_SSSE3: identical
// weave, but stores use movdqu so dst_rgba need not be 16-byte aligned.
// pcmpeqb re-creates the all-0xff alpha in xmm5 inside the loop because
// xmm5 is a punpck *destination* and is consumed each iteration.
// NOTE(review): loop assumes width is a positive multiple of 8.
void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_rgba,
                                          int width) {
  asm volatile (
    // Rebase v_buf relative to u_buf so one index register reaches both.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm2,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.2910 +
  1.2911 +#endif  // HAS_I422TOARGBROW_SSSE3
  1.2912 +
  1.2913 +#ifdef HAS_YTOARGBROW_SSE2
  1.2914 +void YToARGBRow_SSE2(const uint8* y_buf,
  1.2915 +                     uint8* dst_argb,
  1.2916 +                     int width) {
  1.2917 +  asm volatile (
  1.2918 +    "pxor      %%xmm5,%%xmm5                   \n"
  1.2919 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
  1.2920 +    "pslld     $0x18,%%xmm4                    \n"
  1.2921 +    "mov       $0x00100010,%%eax               \n"
  1.2922 +    "movd      %%eax,%%xmm3                    \n"
  1.2923 +    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  1.2924 +    "mov       $0x004a004a,%%eax               \n"
  1.2925 +    "movd      %%eax,%%xmm2                    \n"
  1.2926 +    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
  1.2927 +    LABELALIGN
  1.2928 +  "1:                                          \n"
  1.2929 +    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  1.2930 +    "movq      " MEMACCESS(0) ",%%xmm0         \n"
  1.2931 +    "lea       " MEMLEA(0x8,0) ",%0            \n"
  1.2932 +    "punpcklbw %%xmm5,%%xmm0                   \n"
  1.2933 +    "psubusw   %%xmm3,%%xmm0                   \n"
  1.2934 +    "pmullw    %%xmm2,%%xmm0                   \n"
  1.2935 +    "psrlw     $6, %%xmm0                      \n"
  1.2936 +    "packuswb  %%xmm0,%%xmm0                   \n"
  1.2937 +
  1.2938 +    // Step 2: Weave into ARGB
  1.2939 +    "punpcklbw %%xmm0,%%xmm0                   \n"
  1.2940 +    "movdqa    %%xmm0,%%xmm1                   \n"
  1.2941 +    "punpcklwd %%xmm0,%%xmm0                   \n"
  1.2942 +    "punpckhwd %%xmm1,%%xmm1                   \n"
  1.2943 +    "por       %%xmm4,%%xmm0                   \n"
  1.2944 +    "por       %%xmm4,%%xmm1                   \n"
  1.2945 +    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  1.2946 +    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
  1.2947 +    "lea       " MEMLEA(0x20,1) ",%1           \n"
  1.2948 +
  1.2949 +    "sub       $0x8,%2                         \n"
  1.2950 +    "jg        1b                              \n"
  1.2951 +  : "+r"(y_buf),     // %0
  1.2952 +    "+r"(dst_argb),  // %1
  1.2953 +    "+rm"(width)     // %2
  1.2954 +  :
  1.2955 +  : "memory", "cc", "eax"
  1.2956 +#if defined(__SSE2__)
  1.2957 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  1.2958 +#endif
  1.2959 +  );
  1.2960 +}
  1.2961 +#endif  // HAS_YTOARGBROW_SSE2
  1.2962 +
  1.2963 +#ifdef HAS_MIRRORROW_SSSE3
// pshufb control mask that reverses all 16 bytes of an xmm register;
// used by MirrorRow_SSSE3 to mirror a row 16 pixels at a time.
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
  1.2968 +
// Mirror (horizontally reverse) a row of bytes, 16 per loop iteration.
// Reads src from the end backwards (src is pre-biased by -16 and indexed
// with the decreasing width counter) while writing dst forwards; pshufb
// with kShuffleMirror reverses each 16-byte vector.  movdqa load and
// store: both src+width and dst must be 16-byte aligned.
// NOTE(review): loop assumes width is a positive multiple of 16.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
  1.2995 +#endif  // HAS_MIRRORROW_SSSE3
  1.2996 +
  1.2997 +#ifdef HAS_MIRRORROW_SSE2
// Mirror (horizontally reverse) a row of bytes, 16 per loop iteration,
// using only SSE2 (no pshufb): swaps bytes within each word via
// psllw/psrlw/por, then reverses words with pshuflw/pshufhw and swaps
// the two qwords with pshufd $0x4e.  movdqu load/store, so neither src
// nor dst needs alignment.  Reads src backwards, writes dst forwards.
// NOTE(review): loop assumes width is a positive multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
    "movdqa    %%xmm0,%%xmm1                   \n"
    "psllw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1)",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
  1.3029 +#endif  // HAS_MIRRORROW_SSE2
  1.3030 +
  1.3031 +#ifdef HAS_MIRRORROW_UV_SSSE3
// pshufb control mask for MirrorUVRow_SSSE3: gathers the even (U) bytes
// of an interleaved UV vector, reversed, into the low 8 bytes, and the
// odd (V) bytes, reversed, into the high 8 bytes -- mirroring and
// deinterleaving in one shuffle.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirror a row of interleaved UV bytes while splitting it into separate
// U and V planes, 8 UV pairs (16 source bytes) per loop iteration.
// src is pre-biased to the last 16 bytes (width*2 - 16) and walked
// backwards; pshufb with kShuffleMirrorUV puts the reversed U bytes in
// the low qword (movlpd -> dst_u) and the reversed V bytes in the high
// qword (movhpd -> dst_v, addressed relative to dst_u after the sub).
// movdqa load: src rows must be 16-byte aligned.
// NOTE(review): loop assumes width is a positive multiple of 8.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
    // Rebase dst_v relative to dst_u so one register addresses both.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0            \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    "sub       $8,%3                           \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
  1.3068 +#endif  // HAS_MIRRORROW_UV_SSSE3
  1.3069 +
  1.3070 +#ifdef HAS_ARGBMIRRORROW_SSSE3
// pshufb control mask that reverses the order of the four 32-bit pixels
// in an xmm register while keeping the byte order within each pixel;
// used by ARGBMirrorRow_SSSE3.
static uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};
  1.3075 +
// Mirror a row of 32-bit ARGB pixels, 4 pixels (16 bytes) per loop
// iteration.  src is pre-biased to the last 16 bytes (width*4 - 16) and
// walked backwards while dst is written forwards; pshufb with
// kARGBShuffleMirror reverses pixel order within each vector.  movdqa
// load and store: both src and dst must be 16-byte aligned.
// NOTE(review): loop assumes width is a positive multiple of 4.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
  1.3100 +#endif  // HAS_ARGBMIRRORROW_SSSE3
  1.3101 +
  1.3102 +#ifdef HAS_SPLITUVROW_SSE2
// De-interleave a row of UV pairs into separate U and V planes, 16 pairs
// (32 source bytes) per loop iteration.  xmm5 is a 0x00FF-per-word mask
// (pcmpeqb all-ones then psrlw $8); pand keeps the even bytes (U) and
// psrlw $8 extracts the odd bytes (V); packuswb repacks each to 16
// bytes.  Aligned variant: movdqa loads/stores, so src_uv, dst_u and
// dst_v must all be 16-byte aligned.  dst_v is addressed relative to
// dst_u after the initial sub.
// NOTE(review): loop assumes pix is a positive multiple of 16.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
    "lea        " MEMLEA(0x20,0) ",%0            \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm0," MEMACCESS(1) "          \n"
    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1            \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3140 +
// De-interleave a row of UV pairs into separate U and V planes, 16 pairs
// (32 source bytes) per loop iteration.  "Unaligned" variant of
// SplitUVRow_SSE2: identical masking/packing, but movdqu loads/stores so
// src_uv, dst_u and dst_v need not be 16-byte aligned.
// NOTE(review): loop assumes pix is a positive multiple of 16.
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    // Rebase dst_v relative to dst_u so one register addresses both.
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
    "lea        " MEMLEA(0x20,0) ",%0            \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1            \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3179 +#endif  // HAS_SPLITUVROW_SSE2
  1.3180 +
  1.3181 +#ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V rows into a packed UV row (U in even bytes,
// V in odd bytes). Reads 16 U and 16 V bytes and writes 32 UV bytes per
// iteration; width is decremented 16 at a time. Aligned variant: all
// loads/stores are movdqa, so src/dst pointers must be 16-byte aligned.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub       %0,%1                             \n"  // %1 = src_v - src_u
    LABELALIGN
  "1:                                            \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm2                     \n"
    "punpcklbw %%xmm1,%%xmm0                     \n"  // low 8 UV pairs
    "punpckhbw %%xmm1,%%xmm2                     \n"  // high 8 UV pairs
    "movdqa    %%xmm0," MEMACCESS(2) "           \n"
    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
    "lea       " MEMLEA(0x20,2) ",%2             \n"
    "sub       $0x10,%3                          \n"
    "jg        1b                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
  1.3213 +
// Same as MergeUVRow_SSE2 (interleave U and V rows into packed UV) but
// tolerates unaligned pointers by using movdqu for all loads and stores.
// Processes 16 UV pairs per iteration.
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  asm volatile (
    "sub       %0,%1                             \n"  // %1 = src_v - src_u
    LABELALIGN
  "1:                                            \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm2                     \n"
    "punpcklbw %%xmm1,%%xmm0                     \n"  // low 8 UV pairs
    "punpckhbw %%xmm1,%%xmm2                     \n"  // high 8 UV pairs
    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
    "lea       " MEMLEA(0x20,2) ",%2             \n"
    "sub       $0x10,%3                          \n"
    "jg        1b                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
  1.3245 +#endif  // HAS_MERGEUVROW_SSE2
  1.3246 +
  1.3247 +#ifdef HAS_COPYROW_SSE2
// Copies a row of bytes, 32 bytes per iteration, with aligned SSE2
// loads/stores (movdqa): src and dst must be 16-byte aligned and count is
// consumed 32 at a time.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
  1.3270 +#endif  // HAS_COPYROW_SSE2
  1.3271 +
  1.3272 +#ifdef HAS_COPYROW_X86
// Copies a row of bytes using rep movsl (32-bit string move). width is
// divided by 4, so only width & ~3 bytes are copied; any remaining 1-3
// tail bytes are NOT copied — callers are expected to pass a multiple of 4.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%2                         \n"  // count in dwords
    "rep movsl " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
  1.3285 +#endif  // HAS_COPYROW_X86
  1.3286 +
  1.3287 +#ifdef HAS_COPYROW_ERMS
  1.3288 +// Unaligned Multiple of 1.
// Copies a row of bytes using rep movsb, which is fast on CPUs with
// Enhanced Rep MovSB (ERMS). Handles any byte count and any alignment.
// Unaligned Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
  1.3300 +#endif  // HAS_COPYROW_ERMS
  1.3301 +
  1.3302 +#ifdef HAS_ARGBCOPYALPHAROW_SSE2
  1.3303 +// width in pixels
// Copies only the alpha channel from src ARGB pixels into dst ARGB pixels,
// preserving dst's RGB bytes. Processes 8 pixels (32 bytes) per iteration
// with aligned loads/stores.
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000 (A mask)
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff (RGB mask)
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"  // alpha from src
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"  // RGB from dst
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"  // combine
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.3338 +#endif  // HAS_ARGBCOPYALPHAROW_SSE2
  1.3339 +
  1.3340 +#ifdef HAS_ARGBCOPYALPHAROW_AVX2
  1.3341 +// width in pixels
// AVX2 version of ARGBCopyAlphaRow: copies src alpha into dst, keeping dst
// RGB. ymm0 = 0x00ffffff per pixel; vpblendvb selects the RGB bytes (mask
// high bit set) from dst memory and the alpha byte from src. Processes
// 16 pixels (64 bytes) per iteration.
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // 0x00ffffff blend mask
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
  1.3369 +#endif  // HAS_ARGBCOPYALPHAROW_AVX2
  1.3370 +
  1.3371 +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  1.3372 +// width in pixels
// Copies a row of Y (luma) bytes into the alpha channel of dst ARGB pixels,
// preserving dst RGB. Reads 8 Y bytes and updates 8 ARGB pixels (32 bytes)
// per iteration. punpcklbw duplicates each Y into a word; the punpck*wd
// pair spreads those words across dwords so each Y lands in the dword's
// high byte. Note: punpckhwd interleaves into xmm3 whose other words are
// uninitialized, but those garbage bytes are cleared by the pand with the
// 0xff000000 alpha mask, so only the Y byte survives.
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"  // 0xff000000 alpha mask
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"  // 0x00ffffff RGB mask
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"  // keep Y in alpha position
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"  // keep dst RGB
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
  1.3409 +#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
  1.3410 +
  1.3411 +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  1.3412 +// width in pixels
// AVX2 version of ARGBCopyYToAlphaRow: zero-extends 8 Y bytes to dwords
// (vpmovzxbd), shifts each Y into the alpha byte (vpslld $24), then blends
// with dst so RGB bytes (mask 0x00ffffff) come from dst memory. Processes
// 16 pixels per iteration.
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // 0x00ffffff blend mask
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"  // Y -> alpha byte
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
  1.3442 +#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
  1.3443 +
  1.3444 +#ifdef HAS_SETROW_X86
// Fills a row with the 32-bit value v32 using rep stosl. width is in bytes
// and is divided by 4, so only width & ~3 bytes are written — callers are
// expected to pass a multiple of 4.
void SetRow_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%1                         \n"  // count in dwords
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
  1.3455 +
  1.3456 +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
  1.3457 +                   int dst_stride, int height) {
  1.3458 +  for (int y = 0; y < height; ++y) {
  1.3459 +    size_t width_tmp = (size_t)(width);
  1.3460 +    uint32* d = (uint32*)(dst);
  1.3461 +    asm volatile (
  1.3462 +      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
  1.3463 +      : "+D"(d),         // %0
  1.3464 +        "+c"(width_tmp)  // %1
  1.3465 +      : "a"(v32)         // %2
  1.3466 +      : "memory", "cc");
  1.3467 +    dst += dst_stride;
  1.3468 +  }
  1.3469 +}
  1.3470 +#endif  // HAS_SETROW_X86
  1.3471 +
  1.3472 +#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y (luma) plane from a YUY2 row. In YUY2, Y occupies the even
// bytes, so the row is masked with 0x00ff per word and packed. Reads 32
// source bytes and writes 16 Y bytes per iteration; aligned loads/stores.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even (Y) bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.3499 +
// Extracts U and V planes from a YUY2 row, averaging vertically with the
// next row (at stride_yuy2 bytes) for 2x2 chroma subsampling. UV bytes are
// the odd bytes of YUY2, isolated by psrlw $8. Writes 8 U and 8 V bytes
// per iteration (pix consumed 16 at a time); aligned loads.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (UV) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3544 +
// Extracts U and V from a single YUY2 row (4:2:2 — no vertical averaging,
// unlike YUY2ToUVRow_SSE2). Writes 8 U and 8 V bytes per iteration; pix is
// consumed 16 at a time; aligned loads.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (UV) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.3584 +
// Unaligned variant of YUY2ToYRow_SSE2: extracts Y (even bytes) from a
// YUY2 row using movdqu loads/stores. 16 Y bytes per iteration.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even (Y) bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.3612 +
// Unaligned variant of YUY2ToUVRow_SSE2: averages two YUY2 rows and
// extracts U and V (odd bytes) with movdqu loads. Writes 8 U and 8 V bytes
// per iteration; pix consumed 16 at a time.
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (UV) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3658 +
// Unaligned variant of YUY2ToUV422Row_SSE2: extracts U and V from a single
// YUY2 row (no vertical averaging) with movdqu loads. Writes 8 U and 8 V
// bytes per iteration; pix consumed 16 at a time.
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (UV) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.3698 +
// Extracts the Y (luma) plane from a UYVY row. In UYVY, Y occupies the odd
// bytes, so psrlw $8 is used instead of a mask (contrast YUY2ToYRow).
// Reads 32 source bytes and writes 16 Y bytes per iteration; aligned.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (Y) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
  1.3723 +
// Extracts U and V planes from a UYVY row, averaging vertically with the
// next row (at stride_uyvy bytes). UV occupies the even bytes of UYVY, so
// pand with 0x00ff is used (contrast psrlw in the YUY2 version). Writes
// 8 U and 8 V bytes per iteration; pix consumed 16 at a time; aligned.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even (UV) bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3768 +
// Extracts U and V from a single UYVY row (4:2:2 — no vertical averaging).
// UV are the even bytes of UYVY. Writes 8 U and 8 V bytes per iteration;
// pix consumed 16 at a time; aligned loads.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even (UV) bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.3808 +
// Unaligned variant of UYVYToYRow_SSE2: extracts Y (odd bytes) from a
// UYVY row using movdqu loads/stores. 16 Y bytes per iteration.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (Y) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
  1.3834 +
// Unaligned variant of UYVYToUVRow_SSE2: averages two UYVY rows and
// extracts U and V (even bytes) with movdqu loads. Writes 8 U and 8 V
// bytes per iteration; pix consumed 16 at a time.
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep even (UV) bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of UV
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of UV
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
  1.3879 +
         +// Extract chroma (U, V) planes from a single UYVY row (4:2:2 — no row
         +// averaging, unlike UYVYToUVRow). Unaligned-load variant. Processes
         +// 16 pixels (32 bytes) per iteration; writes 8 U and 8 V bytes.
  1.3880 +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
  1.3881 +                                   uint8* dst_u, uint8* dst_v, int pix) {
  1.3882 +  asm volatile (
         +    // xmm5 = 0x00ff per-word mask selecting the chroma byte of each lane.
  1.3883 +    "pcmpeqb   %%xmm5,%%xmm5                   \n"
  1.3884 +    "psrlw     $0x8,%%xmm5                     \n"
         +    // %2 becomes (dst_v - dst_u); both planes are addressed off dst_u.
  1.3885 +    "sub       %1,%2                           \n"
  1.3886 +    LABELALIGN
  1.3887 +  "1:                                          \n"
  1.3888 +    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1.3889 +    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.3890 +    "lea       " MEMLEA(0x20,0) ",%0           \n"
         +    // Mask to chroma bytes, then deinterleave UV into separate U and V.
  1.3891 +    "pand      %%xmm5,%%xmm0                   \n"
  1.3892 +    "pand      %%xmm5,%%xmm1                   \n"
  1.3893 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.3894 +    "movdqa    %%xmm0,%%xmm1                   \n"
  1.3895 +    "pand      %%xmm5,%%xmm0                   \n"
  1.3896 +    "packuswb  %%xmm0,%%xmm0                   \n"
  1.3897 +    "psrlw     $0x8,%%xmm1                     \n"
  1.3898 +    "packuswb  %%xmm1,%%xmm1                   \n"
  1.3899 +    "movq      %%xmm0," MEMACCESS(1) "         \n"
  1.3900 +    BUNDLEALIGN
  1.3901 +    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
  1.3902 +    "lea       " MEMLEA(0x8,1) ",%1            \n"
  1.3903 +    "sub       $0x10,%3                        \n"
  1.3904 +    "jg        1b                              \n"
  1.3905 +  : "+r"(src_uyvy),    // %0
  1.3906 +    "+r"(dst_u),       // %1
  1.3907 +    "+r"(dst_v),       // %2
  1.3908 +    "+r"(pix)          // %3
  1.3909 +  :
  1.3910 +  : "memory", "cc"
  1.3911 +#if defined(__native_client__) && defined(__x86_64__)
  1.3912 +    , "r14"
  1.3913 +#endif
  1.3914 +#if defined(__SSE2__)
  1.3915 +    , "xmm0", "xmm1", "xmm5"
  1.3916 +#endif
  1.3917 +  );
  1.3918 +}
  1.3919 +#endif  // HAS_YUY2TOYROW_SSE2
  1.3920 +
  1.3921 +#ifdef HAS_ARGBBLENDROW_SSE2
  1.3922 +// Blend 8 pixels at a time.
         +// Alpha-blend one row: dst = src0 over src1, using src0's per-pixel alpha.
         +// Result alpha is forced to 255. Structure: a 1-pixel lead-in loop runs
         +// until dst is 16-byte aligned (movdqa store in the main loop), then a
         +// 4-pixel main loop, then a 1-pixel tail loop for the remainder.
  1.3923 +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  1.3924 +                       uint8* dst_argb, int width) {
  1.3925 +  asm volatile (
         +    // Constant registers:
         +    //   xmm7 = 0x0001 per word (rounding bias added to alpha)
         +    //   xmm6 = 0x00ff per word (low-byte mask)
         +    //   xmm5 = 0xff00 per word (high-byte mask)
         +    //   xmm4 = 0xff000000 per dword (alpha-channel mask)
  1.3926 +    "pcmpeqb   %%xmm7,%%xmm7                   \n"
  1.3927 +    "psrlw     $0xf,%%xmm7                     \n"
  1.3928 +    "pcmpeqb   %%xmm6,%%xmm6                   \n"
  1.3929 +    "psrlw     $0x8,%%xmm6                     \n"
  1.3930 +    "pcmpeqb   %%xmm5,%%xmm5                   \n"
  1.3931 +    "psllw     $0x8,%%xmm5                     \n"
  1.3932 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
  1.3933 +    "pslld     $0x18,%%xmm4                    \n"
  1.3934 +    "sub       $0x1,%3                         \n"
  1.3935 +    "je        91f                             \n"
  1.3936 +    "jl        99f                             \n"
  1.3937 +
  1.3938 +    // 1 pixel loop until destination pointer is aligned.
  1.3939 +  "10:                                         \n"
  1.3940 +    "test      $0xf,%2                         \n"
  1.3941 +    "je        19f                             \n"
  1.3942 +    "movd      " MEMACCESS(0) ",%%xmm3         \n"
  1.3943 +    "lea       " MEMLEA(0x4,0) ",%0            \n"
  1.3944 +    "movdqa    %%xmm3,%%xmm0                   \n"
         +    // xmm3 alpha is inverted (255 - a) via xor with the alpha mask;
         +    // the math below therefore scales src1 by (256 - a) effectively.
  1.3945 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.3946 +    "movd      " MEMACCESS(1) ",%%xmm2         \n"
  1.3947 +    "psrlw     $0x8,%%xmm3                     \n"
  1.3948 +    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  1.3949 +    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  1.3950 +    "pand      %%xmm6,%%xmm2                   \n"
  1.3951 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.3952 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.3953 +    "movd      " MEMACCESS(1) ",%%xmm1         \n"
  1.3954 +    "lea       " MEMLEA(0x4,1) ",%1            \n"
  1.3955 +    "psrlw     $0x8,%%xmm1                     \n"
  1.3956 +    "por       %%xmm4,%%xmm0                   \n"
  1.3957 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.3958 +    "psrlw     $0x8,%%xmm2                     \n"
  1.3959 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.3960 +    "pand      %%xmm5,%%xmm1                   \n"
  1.3961 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.3962 +    "sub       $0x1,%3                         \n"
  1.3963 +    "movd      %%xmm0," MEMACCESS(2) "         \n"
  1.3964 +    "lea       " MEMLEA(0x4,2) ",%2            \n"
  1.3965 +    "jge       10b                             \n"
  1.3966 +
  1.3967 +  "19:                                         \n"
  1.3968 +    "add       $1-4,%3                         \n"
  1.3969 +    "jl        49f                             \n"
  1.3970 +
  1.3971 +    // 4 pixel loop.
  1.3972 +    LABELALIGN
  1.3973 +  "41:                                         \n"
  1.3974 +    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
  1.3975 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.3976 +    "movdqa    %%xmm3,%%xmm0                   \n"
  1.3977 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.3978 +    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
         +    // Broadcast each pixel's (inverted) alpha into all 4 words of its lane.
  1.3979 +    "psrlw     $0x8,%%xmm3                     \n"
  1.3980 +    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  1.3981 +    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  1.3982 +    "pand      %%xmm6,%%xmm2                   \n"
  1.3983 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.3984 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.3985 +    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  1.3986 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.3987 +    "psrlw     $0x8,%%xmm1                     \n"
  1.3988 +    "por       %%xmm4,%%xmm0                   \n"
  1.3989 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.3990 +    "psrlw     $0x8,%%xmm2                     \n"
  1.3991 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.3992 +    "pand      %%xmm5,%%xmm1                   \n"
  1.3993 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.3994 +    "sub       $0x4,%3                         \n"
         +    // Aligned store — lead-in loop guaranteed dst alignment.
  1.3995 +    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1.3996 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.3997 +    "jge       41b                             \n"
  1.3998 +
  1.3999 +  "49:                                         \n"
  1.4000 +    "add       $0x3,%3                         \n"
  1.4001 +    "jl        99f                             \n"
  1.4002 +
  1.4003 +    // 1 pixel loop.
  1.4004 +  "91:                                         \n"
  1.4005 +    "movd      " MEMACCESS(0) ",%%xmm3         \n"
  1.4006 +    "lea       " MEMLEA(0x4,0) ",%0            \n"
  1.4007 +    "movdqa    %%xmm3,%%xmm0                   \n"
  1.4008 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.4009 +    "movd      " MEMACCESS(1) ",%%xmm2         \n"
  1.4010 +    "psrlw     $0x8,%%xmm3                     \n"
  1.4011 +    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
  1.4012 +    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
  1.4013 +    "pand      %%xmm6,%%xmm2                   \n"
  1.4014 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.4015 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.4016 +    "movd      " MEMACCESS(1) ",%%xmm1         \n"
  1.4017 +    "lea       " MEMLEA(0x4,1) ",%1            \n"
  1.4018 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4019 +    "por       %%xmm4,%%xmm0                   \n"
  1.4020 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.4021 +    "psrlw     $0x8,%%xmm2                     \n"
  1.4022 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.4023 +    "pand      %%xmm5,%%xmm1                   \n"
  1.4024 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4025 +    "sub       $0x1,%3                         \n"
  1.4026 +    "movd      %%xmm0," MEMACCESS(2) "         \n"
  1.4027 +    "lea       " MEMLEA(0x4,2) ",%2            \n"
  1.4028 +    "jge       91b                             \n"
  1.4029 +  "99:                                         \n"
  1.4030 +  : "+r"(src_argb0),    // %0
  1.4031 +    "+r"(src_argb1),    // %1
  1.4032 +    "+r"(dst_argb),     // %2
  1.4033 +    "+r"(width)         // %3
  1.4034 +  :
  1.4035 +  : "memory", "cc"
  1.4036 +#if defined(__SSE2__)
  1.4037 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  1.4038 +#endif
  1.4039 +  );
  1.4040 +}
  1.4041 +#endif  // HAS_ARGBBLENDROW_SSE2
  1.4042 +
  1.4043 +#ifdef HAS_ARGBBLENDROW_SSSE3
  1.4044 +// Shuffle table for isolating alpha.
         +// pshufb control: replicates each pixel's alpha byte (offsets 3/7/11/15)
         +// into the low byte of both 16-bit lanes of that pixel; 0x80 entries
         +// zero the high bytes, yielding 8 alpha words for pmullw.
  1.4045 +static uvec8 kShuffleAlpha = {
  1.4046 +  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  1.4047 +  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  1.4048 +};
  1.4049 +
  1.4050 +// Blend 8 pixels at a time
  1.4051 +// Shuffle table for reversing the bytes.
  1.4052 +
  1.4053 +// Same as SSE2, but replaces
  1.4054 +//    psrlw      xmm3, 8          // alpha
  1.4055 +//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
  1.4056 +//    pshuflw    xmm3, xmm3,0F5h
  1.4057 +// with..
  1.4058 +//    pshufb     xmm3, kShuffleAlpha // alpha
  1.4059 +
         +// Alpha-blend one row (src0 over src1, result alpha = 255). Same math as
         +// ARGBBlendRow_SSE2 but uses pshufb with kShuffleAlpha to broadcast alpha,
         +// replacing the psrlw/pshufhw/pshuflw sequence. Adds an aligned 4-pixel
         +// fast path (label 40) when both sources are 16-byte aligned.
  1.4060 +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  1.4061 +                        uint8* dst_argb, int width) {
  1.4062 +  asm volatile (
         +    // Constants: xmm7 = 0x0001/word, xmm6 = 0x00ff/word, xmm5 = 0xff00/word,
         +    // xmm4 = 0xff000000/dword (alpha mask).
  1.4063 +    "pcmpeqb   %%xmm7,%%xmm7                   \n"
  1.4064 +    "psrlw     $0xf,%%xmm7                     \n"
  1.4065 +    "pcmpeqb   %%xmm6,%%xmm6                   \n"
  1.4066 +    "psrlw     $0x8,%%xmm6                     \n"
  1.4067 +    "pcmpeqb   %%xmm5,%%xmm5                   \n"
  1.4068 +    "psllw     $0x8,%%xmm5                     \n"
  1.4069 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
  1.4070 +    "pslld     $0x18,%%xmm4                    \n"
  1.4071 +    "sub       $0x1,%3                         \n"
  1.4072 +    "je        91f                             \n"
  1.4073 +    "jl        99f                             \n"
  1.4074 +
  1.4075 +    // 1 pixel loop until destination pointer is aligned.
  1.4076 +  "10:                                         \n"
  1.4077 +    "test      $0xf,%2                         \n"
  1.4078 +    "je        19f                             \n"
  1.4079 +    "movd      " MEMACCESS(0) ",%%xmm3         \n"
  1.4080 +    "lea       " MEMLEA(0x4,0) ",%0            \n"
  1.4081 +    "movdqa    %%xmm3,%%xmm0                   \n"
         +    // Invert alpha (255 - a), then broadcast it with the shuffle table.
  1.4082 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.4083 +    "movd      " MEMACCESS(1) ",%%xmm2         \n"
  1.4084 +    "pshufb    %4,%%xmm3                       \n"
  1.4085 +    "pand      %%xmm6,%%xmm2                   \n"
  1.4086 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.4087 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.4088 +    "movd      " MEMACCESS(1) ",%%xmm1         \n"
  1.4089 +    "lea       " MEMLEA(0x4,1) ",%1            \n"
  1.4090 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4091 +    "por       %%xmm4,%%xmm0                   \n"
  1.4092 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.4093 +    "psrlw     $0x8,%%xmm2                     \n"
  1.4094 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.4095 +    "pand      %%xmm5,%%xmm1                   \n"
  1.4096 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4097 +    "sub       $0x1,%3                         \n"
  1.4098 +    "movd      %%xmm0," MEMACCESS(2) "         \n"
  1.4099 +    "lea       " MEMLEA(0x4,2) ",%2            \n"
  1.4100 +    "jge       10b                             \n"
  1.4101 +
  1.4102 +  "19:                                         \n"
  1.4103 +    "add       $1-4,%3                         \n"
  1.4104 +    "jl        49f                             \n"
         +    // Dispatch: if either source is unaligned, use the movdqu loop at 41.
  1.4105 +    "test      $0xf,%0                         \n"
  1.4106 +    "jne       41f                             \n"
  1.4107 +    "test      $0xf,%1                         \n"
  1.4108 +    "jne       41f                             \n"
  1.4109 +
  1.4110 +    // 4 pixel loop.
  1.4111 +    LABELALIGN
  1.4112 +  "40:                                         \n"
  1.4113 +    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
  1.4114 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4115 +    "movdqa    %%xmm3,%%xmm0                   \n"
  1.4116 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.4117 +    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
  1.4118 +    "pshufb    %4,%%xmm3                       \n"
  1.4119 +    "pand      %%xmm6,%%xmm2                   \n"
  1.4120 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.4121 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.4122 +    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
  1.4123 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.4124 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4125 +    "por       %%xmm4,%%xmm0                   \n"
  1.4126 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.4127 +    "psrlw     $0x8,%%xmm2                     \n"
  1.4128 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.4129 +    "pand      %%xmm5,%%xmm1                   \n"
  1.4130 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4131 +    "sub       $0x4,%3                         \n"
  1.4132 +    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1.4133 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.4134 +    "jge       40b                             \n"
  1.4135 +    "jmp       49f                             \n"
  1.4136 +
  1.4137 +    // 4 pixel unaligned loop.
  1.4138 +    LABELALIGN
  1.4139 +  "41:                                         \n"
  1.4140 +    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
  1.4141 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4142 +    "movdqa    %%xmm3,%%xmm0                   \n"
  1.4143 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.4144 +    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
  1.4145 +    "pshufb    %4,%%xmm3                       \n"
  1.4146 +    "pand      %%xmm6,%%xmm2                   \n"
  1.4147 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.4148 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.4149 +    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  1.4150 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.4151 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4152 +    "por       %%xmm4,%%xmm0                   \n"
  1.4153 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.4154 +    "psrlw     $0x8,%%xmm2                     \n"
  1.4155 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.4156 +    "pand      %%xmm5,%%xmm1                   \n"
  1.4157 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4158 +    "sub       $0x4,%3                         \n"
         +    // Destination store is still aligned (guaranteed by the lead-in loop).
  1.4159 +    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1.4160 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.4161 +    "jge       41b                             \n"
  1.4162 +
  1.4163 +  "49:                                         \n"
  1.4164 +    "add       $0x3,%3                         \n"
  1.4165 +    "jl        99f                             \n"
  1.4166 +
  1.4167 +    // 1 pixel loop.
  1.4168 +  "91:                                         \n"
  1.4169 +    "movd      " MEMACCESS(0) ",%%xmm3         \n"
  1.4170 +    "lea       " MEMLEA(0x4,0) ",%0            \n"
  1.4171 +    "movdqa    %%xmm3,%%xmm0                   \n"
  1.4172 +    "pxor      %%xmm4,%%xmm3                   \n"
  1.4173 +    "movd      " MEMACCESS(1) ",%%xmm2         \n"
  1.4174 +    "pshufb    %4,%%xmm3                       \n"
  1.4175 +    "pand      %%xmm6,%%xmm2                   \n"
  1.4176 +    "paddw     %%xmm7,%%xmm3                   \n"
  1.4177 +    "pmullw    %%xmm3,%%xmm2                   \n"
  1.4178 +    "movd      " MEMACCESS(1) ",%%xmm1         \n"
  1.4179 +    "lea       " MEMLEA(0x4,1) ",%1            \n"
  1.4180 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4181 +    "por       %%xmm4,%%xmm0                   \n"
  1.4182 +    "pmullw    %%xmm3,%%xmm1                   \n"
  1.4183 +    "psrlw     $0x8,%%xmm2                     \n"
  1.4184 +    "paddusb   %%xmm2,%%xmm0                   \n"
  1.4185 +    "pand      %%xmm5,%%xmm1                   \n"
  1.4186 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4187 +    "sub       $0x1,%3                         \n"
  1.4188 +    "movd      %%xmm0," MEMACCESS(2) "         \n"
  1.4189 +    "lea       " MEMLEA(0x4,2) ",%2            \n"
  1.4190 +    "jge       91b                             \n"
  1.4191 +  "99:                                         \n"
  1.4192 +  : "+r"(src_argb0),    // %0
  1.4193 +    "+r"(src_argb1),    // %1
  1.4194 +    "+r"(dst_argb),     // %2
  1.4195 +    "+r"(width)         // %3
  1.4196 +  : "m"(kShuffleAlpha)  // %4
  1.4197 +  : "memory", "cc"
  1.4198 +#if defined(__SSE2__)
  1.4199 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  1.4200 +#endif
  1.4201 +  );
  1.4202 +}
  1.4203 +#endif  // HAS_ARGBBLENDROW_SSSE3
  1.4204 +
  1.4205 +#ifdef HAS_ARGBATTENUATEROW_SSE2
  1.4206 +// Attenuate 4 pixels at a time.
  1.4207 +// aligned to 16 bytes
         +// Premultiply (attenuate) B, G, R by the pixel's alpha; alpha channel is
         +// preserved. 4 pixels per iteration; src and dst must be 16-byte aligned
         +// (movdqa loads/stores).
  1.4208 +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4209 +  asm volatile (
         +    // xmm4 = 0xff000000/dword (alpha mask); xmm5 = 0x00ffffff/dword (color mask).
  1.4210 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
  1.4211 +    "pslld     $0x18,%%xmm4                    \n"
  1.4212 +    "pcmpeqb   %%xmm5,%%xmm5                   \n"
  1.4213 +    "psrld     $0x8,%%xmm5                     \n"
  1.4214 +
  1.4215 +    // 4 pixel loop.
  1.4216 +    LABELALIGN
  1.4217 +  "1:                                          \n"
         +    // Low 2 pixels: widen bytes to words (b|b), broadcast alpha word with
         +    // pshufhw/pshuflw, then pmulhuw gives (x*257*a*257)>>16 ≈ x*a scaled.
  1.4218 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.4219 +    "punpcklbw %%xmm0,%%xmm0                   \n"
  1.4220 +    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
  1.4221 +    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
  1.4222 +    "pmulhuw   %%xmm2,%%xmm0                   \n"
         +    // High 2 pixels: same, via punpckhbw.
  1.4223 +    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
  1.4224 +    "punpckhbw %%xmm1,%%xmm1                   \n"
  1.4225 +    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
  1.4226 +    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
  1.4227 +    "pmulhuw   %%xmm2,%%xmm1                   \n"
         +    // Recombine: keep original alpha, insert attenuated B/G/R.
  1.4228 +    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
  1.4229 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4230 +    "psrlw     $0x8,%%xmm0                     \n"
  1.4231 +    "pand      %%xmm4,%%xmm2                   \n"
  1.4232 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4233 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.4234 +    "pand      %%xmm5,%%xmm0                   \n"
  1.4235 +    "por       %%xmm2,%%xmm0                   \n"
  1.4236 +    "sub       $0x4,%2                         \n"
  1.4237 +    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  1.4238 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.4239 +    "jg        1b                              \n"
  1.4240 +  : "+r"(src_argb),    // %0
  1.4241 +    "+r"(dst_argb),    // %1
  1.4242 +    "+r"(width)        // %2
  1.4243 +  :
  1.4244 +  : "memory", "cc"
  1.4245 +#if defined(__SSE2__)
  1.4246 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1.4247 +#endif
  1.4248 +  );
  1.4249 +}
  1.4250 +#endif  // HAS_ARGBATTENUATEROW_SSE2
  1.4251 +
  1.4252 +#ifdef HAS_ARGBATTENUATEROW_SSSE3
  1.4253 +// Shuffle table duplicating alpha
         +// pshufb controls duplicating each pixel's alpha byte across the 6 bytes
         +// (3 words) of its B/G/R channels; 0x80 entries zero the alpha word so
         +// the alpha channel itself is not attenuated. kShuffleAlpha0 handles
         +// pixels 0-1 (alpha at offsets 3 and 7), kShuffleAlpha1 pixels 2-3
         +// (offsets 11 and 15).
  1.4254 +static uvec8 kShuffleAlpha0 = {
  1.4255 +  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  1.4256 +};
  1.4257 +static uvec8 kShuffleAlpha1 = {
  1.4258 +  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  1.4259 +  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  1.4260 +};
  1.4261 +// Attenuate 4 pixels at a time.
  1.4262 +// aligned to 16 bytes
         +// Premultiply B, G, R by alpha using pshufb alpha broadcast (SSSE3);
         +// alpha channel preserved. 4 pixels per iteration; unaligned loads/stores.
  1.4263 +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4264 +  asm volatile (
         +    // xmm3 = 0xff000000/dword alpha mask; xmm4/xmm5 = alpha-duplication
         +    // shuffle tables for the low/high pixel pairs.
  1.4265 +    "pcmpeqb   %%xmm3,%%xmm3                   \n"
  1.4266 +    "pslld     $0x18,%%xmm3                    \n"
  1.4267 +    "movdqa    %3,%%xmm4                       \n"
  1.4268 +    "movdqa    %4,%%xmm5                       \n"
  1.4269 +
  1.4270 +    // 4 pixel loop.
  1.4271 +    LABELALIGN
  1.4272 +  "1:                                          \n"
         +    // Low 2 pixels: xmm0 = broadcast alpha words, xmm1 = widened color
         +    // words (b|b); pmulhuw multiplies and keeps the high 16 bits.
  1.4273 +    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1.4274 +    "pshufb    %%xmm4,%%xmm0                   \n"
  1.4275 +    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
  1.4276 +    "punpcklbw %%xmm1,%%xmm1                   \n"
  1.4277 +    "pmulhuw   %%xmm1,%%xmm0                   \n"
         +    // High 2 pixels.
  1.4278 +    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
  1.4279 +    "pshufb    %%xmm5,%%xmm1                   \n"
  1.4280 +    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
  1.4281 +    "punpckhbw %%xmm2,%%xmm2                   \n"
  1.4282 +    "pmulhuw   %%xmm2,%%xmm1                   \n"
         +    // Merge attenuated B/G/R with the original alpha bytes.
  1.4283 +    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
  1.4284 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4285 +    "pand      %%xmm3,%%xmm2                   \n"
  1.4286 +    "psrlw     $0x8,%%xmm0                     \n"
  1.4287 +    "psrlw     $0x8,%%xmm1                     \n"
  1.4288 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.4289 +    "por       %%xmm2,%%xmm0                   \n"
  1.4290 +    "sub       $0x4,%2                         \n"
  1.4291 +    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  1.4292 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.4293 +    "jg        1b                              \n"
  1.4294 +  : "+r"(src_argb),    // %0
  1.4295 +    "+r"(dst_argb),    // %1
  1.4296 +    "+r"(width)        // %2
  1.4297 +  : "m"(kShuffleAlpha0),  // %3
  1.4298 +    "m"(kShuffleAlpha1)  // %4
  1.4299 +  : "memory", "cc"
  1.4300 +#if defined(__SSE2__)
  1.4301 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1.4302 +#endif
  1.4303 +  );
  1.4304 +}
  1.4305 +#endif  // HAS_ARGBATTENUATEROW_SSSE3
  1.4306 +
  1.4307 +#ifdef HAS_ARGBUNATTENUATEROW_SSE2
  1.4308 +// Unattenuate 4 pixels at a time.
  1.4309 +// aligned to 16 bytes
         +// Un-premultiply: divide B, G, R by alpha via a per-alpha reciprocal
         +// lookup in fixed_invtbl8 (declared elsewhere; presumably a 256-entry
         +// fixed-point 1/alpha table — TODO confirm against its definition).
         +// 4 pixels per iteration; unaligned loads/stores. `alpha` is a scratch
         +// register for the extracted alpha byte used as the table index.
  1.4310 +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  1.4311 +                             int width) {
  1.4312 +  uintptr_t alpha = 0;
  1.4313 +  asm volatile (
  1.4314 +    // 4 pixel loop.
  1.4315 +    LABELALIGN
  1.4316 +  "1:                                          \n"
         +    // Pixels 0-1: widen to words, fetch reciprocal for each pixel's alpha
         +    // (bytes at offsets 3 and 7), replicate it across the lane, multiply.
  1.4317 +    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
  1.4318 +    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
  1.4319 +    "punpcklbw %%xmm0,%%xmm0                   \n"
  1.4320 +    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
  1.4321 +    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
  1.4322 +    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
  1.4323 +    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
  1.4324 +    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
  1.4325 +    "movlhps   %%xmm3,%%xmm2                   \n"
  1.4326 +    "pmulhuw   %%xmm2,%%xmm0                   \n"
         +    // Pixels 2-3: same with alpha bytes at offsets 11 and 15.
  1.4327 +    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
  1.4328 +    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
  1.4329 +    "punpckhbw %%xmm1,%%xmm1                   \n"
  1.4330 +    BUNDLEALIGN
  1.4331 +    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
  1.4332 +    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
  1.4333 +    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
  1.4334 +    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
  1.4335 +    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
  1.4336 +    "movlhps   %%xmm3,%%xmm2                   \n"
  1.4337 +    "pmulhuw   %%xmm2,%%xmm1                   \n"
  1.4338 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4339 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.4340 +    "sub       $0x4,%2                         \n"
  1.4341 +    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
  1.4342 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.4343 +    "jg        1b                              \n"
  1.4344 +  : "+r"(src_argb),    // %0
  1.4345 +    "+r"(dst_argb),    // %1
  1.4346 +    "+r"(width),       // %2
  1.4347 +    "+r"(alpha)        // %3
  1.4348 +  : "r"(fixed_invtbl8)  // %4
  1.4349 +  : "memory", "cc"
  1.4350 +#if defined(__native_client__) && defined(__x86_64__)
  1.4351 +    , "r14"
  1.4352 +#endif
  1.4353 +#if defined(__SSE2__)
  1.4354 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1.4355 +#endif
  1.4356 +  );
  1.4357 +}
  1.4358 +#endif  // HAS_ARGBUNATTENUATEROW_SSE2
  1.4359 +
  1.4360 +#ifdef HAS_ARGBGRAYROW_SSSE3
  1.4361 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
         +// Convert 8 ARGB pixels (32 bytes) to gray ARGB: luma is computed with the
         +// full-range JPeg coefficients (kARGBToYJ) and written to B, G and R;
         +// alpha is preserved. Requires 16-byte aligned src and dst (movdqa).
  1.4362 +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4363 +  asm volatile (
         +    // xmm4 = luma coefficients; xmm5 = kAddYJ64 (defined elsewhere —
         +    // NOTE(review): presumably a +64 word bias for round-to-nearest before
         +    // the >>7; confirm against its definition).
  1.4364 +    "movdqa    %3,%%xmm4                       \n"
  1.4365 +    "movdqa    %4,%%xmm5                       \n"
  1.4366 +
  1.4367 +    // 8 pixel loop.
  1.4368 +    LABELALIGN
  1.4369 +  "1:                                          \n"
         +    // Weighted sum of B,G,R per pixel, biased and scaled to 8-bit luma.
  1.4370 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.4371 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.4372 +    "pmaddubsw %%xmm4,%%xmm0                   \n"
  1.4373 +    "pmaddubsw %%xmm4,%%xmm1                   \n"
  1.4374 +    "phaddw    %%xmm1,%%xmm0                   \n"
  1.4375 +    "paddw     %%xmm5,%%xmm0                   \n"
  1.4376 +    "psrlw     $0x7,%%xmm0                     \n"
  1.4377 +    "packuswb  %%xmm0,%%xmm0                   \n"
         +    // Extract the 8 alpha bytes.
  1.4378 +    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
  1.4379 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
  1.4380 +    "lea       " MEMLEA(0x20,0) ",%0           \n"
  1.4381 +    "psrld     $0x18,%%xmm2                    \n"
  1.4382 +    "psrld     $0x18,%%xmm3                    \n"
  1.4383 +    "packuswb  %%xmm3,%%xmm2                   \n"
  1.4384 +    "packuswb  %%xmm2,%%xmm2                   \n"
         +    // Interleave to Y,Y,Y,A per pixel and store 8 gray ARGB pixels.
  1.4385 +    "movdqa    %%xmm0,%%xmm3                   \n"
  1.4386 +    "punpcklbw %%xmm0,%%xmm0                   \n"
  1.4387 +    "punpcklbw %%xmm2,%%xmm3                   \n"
  1.4388 +    "movdqa    %%xmm0,%%xmm1                   \n"
  1.4389 +    "punpcklwd %%xmm3,%%xmm0                   \n"
  1.4390 +    "punpckhwd %%xmm3,%%xmm1                   \n"
  1.4391 +    "sub       $0x8,%2                         \n"
  1.4392 +    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
  1.4393 +    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
  1.4394 +    "lea       " MEMLEA(0x20,1) ",%1           \n"
  1.4395 +    "jg        1b                              \n"
  1.4396 +  : "+r"(src_argb),   // %0
  1.4397 +    "+r"(dst_argb),   // %1
  1.4398 +    "+r"(width)       // %2
  1.4399 +  : "m"(kARGBToYJ),   // %3
  1.4400 +    "m"(kAddYJ64)     // %4
  1.4401 +  : "memory", "cc"
  1.4402 +#if defined(__SSE2__)
  1.4403 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1.4404 +#endif
  1.4405 +  );
  1.4406 +}
  1.4407 +#endif  // HAS_ARGBGRAYROW_SSSE3
  1.4408 +
  1.4409 +#ifdef HAS_ARGBSEPIAROW_SSSE3
  1.4410 +//    b = (r * 35 + g * 68 + b * 17) >> 7
  1.4411 +//    g = (r * 45 + g * 88 + b * 22) >> 7
  1.4412 +//    r = (r * 50 + g * 98 + b * 24) >> 7
  1.4413 +// Constant for ARGB color to sepia tone
         +// pmaddubsw coefficient tables for the sepia transform; each pixel's new
         +// channel is a B/G/R weighted sum >> 7 (formulas above). Byte order per
         +// pixel is B,G,R,A — the A weight is 0, so alpha never contributes.
  1.4414 +static vec8 kARGBToSepiaB = {
  1.4415 +  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  1.4416 +};
  1.4417 +
  1.4418 +static vec8 kARGBToSepiaG = {
  1.4419 +  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  1.4420 +};
  1.4421 +
  1.4422 +static vec8 kARGBToSepiaR = {
  1.4423 +  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  1.4424 +};
  1.4425 +
  1.4426 +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
         +// Apply the sepia transform in place to 8 ARGB pixels (32 bytes) per
         +// iteration; alpha is preserved. dst_argb must be 16-byte aligned (movdqa).
  1.4427 +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  1.4428 +  asm volatile (
         +    // xmm2/xmm3/xmm4 = B/G/R coefficient tables.
  1.4429 +    "movdqa    %2,%%xmm2                       \n"
  1.4430 +    "movdqa    %3,%%xmm3                       \n"
  1.4431 +    "movdqa    %4,%%xmm4                       \n"
  1.4432 +
  1.4433 +    // 8 pixel loop.
  1.4434 +    LABELALIGN
  1.4435 +  "1:                                          \n"
         +    // New B channel for all 8 pixels -> low 8 bytes of xmm0.
  1.4436 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.4437 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
  1.4438 +    "pmaddubsw %%xmm2,%%xmm0                   \n"
  1.4439 +    "pmaddubsw %%xmm2,%%xmm6                   \n"
  1.4440 +    "phaddw    %%xmm6,%%xmm0                   \n"
  1.4441 +    "psrlw     $0x7,%%xmm0                     \n"
  1.4442 +    "packuswb  %%xmm0,%%xmm0                   \n"
         +    // New G channel, then interleave B,G pairs.
  1.4443 +    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
  1.4444 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.4445 +    "pmaddubsw %%xmm3,%%xmm5                   \n"
  1.4446 +    "pmaddubsw %%xmm3,%%xmm1                   \n"
  1.4447 +    "phaddw    %%xmm1,%%xmm5                   \n"
  1.4448 +    "psrlw     $0x7,%%xmm5                     \n"
  1.4449 +    "packuswb  %%xmm5,%%xmm5                   \n"
  1.4450 +    "punpcklbw %%xmm5,%%xmm0                   \n"
         +    // New R channel.
  1.4451 +    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
  1.4452 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.4453 +    "pmaddubsw %%xmm4,%%xmm5                   \n"
  1.4454 +    "pmaddubsw %%xmm4,%%xmm1                   \n"
  1.4455 +    "phaddw    %%xmm1,%%xmm5                   \n"
  1.4456 +    "psrlw     $0x7,%%xmm5                     \n"
  1.4457 +    "packuswb  %%xmm5,%%xmm5                   \n"
         +    // Original alpha bytes, interleaved with R as R,A pairs.
  1.4458 +    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
  1.4459 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.4460 +    "psrld     $0x18,%%xmm6                    \n"
  1.4461 +    "psrld     $0x18,%%xmm1                    \n"
  1.4462 +    "packuswb  %%xmm1,%%xmm6                   \n"
  1.4463 +    "packuswb  %%xmm6,%%xmm6                   \n"
  1.4464 +    "punpcklbw %%xmm6,%%xmm5                   \n"
         +    // Combine B,G and R,A pairs into full B,G,R,A pixels and store in place.
  1.4465 +    "movdqa    %%xmm0,%%xmm1                   \n"
  1.4466 +    "punpcklwd %%xmm5,%%xmm0                   \n"
  1.4467 +    "punpckhwd %%xmm5,%%xmm1                   \n"
  1.4468 +    "sub       $0x8,%1                         \n"
  1.4469 +    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
  1.4470 +    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
  1.4471 +    "lea       " MEMLEA(0x20,0) ",%0           \n"
  1.4472 +    "jg        1b                              \n"
  1.4473 +  : "+r"(dst_argb),      // %0
  1.4474 +    "+r"(width)          // %1
  1.4475 +  : "m"(kARGBToSepiaB),  // %2
  1.4476 +    "m"(kARGBToSepiaG),  // %3
  1.4477 +    "m"(kARGBToSepiaR)   // %4
  1.4478 +  : "memory", "cc"
  1.4479 +#if defined(__SSE2__)
  1.4480 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1.4481 +#endif
  1.4482 +  );
  1.4483 +}
  1.4484 +#endif  // HAS_ARGBSEPIAROW_SSSE3
  1.4485 +
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb points to 16 signed bytes: four 4-byte rows of per-channel
// coefficients.  Each output channel is a weighted sum of the four source
// channels (pmaddubsw + phaddsw), arithmetically shifted right by 6
// (coefficients are effectively 6-bit fixed point) and saturated to bytes.
// Uses movdqa, so src_argb and dst_argb must be 16-byte aligned.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    // Broadcast each dword of the matrix: xmm2..xmm5 = coefficient rows 0..3.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Output channels 0 and 1: rows 0 (xmm2) and 1 (xmm3).
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    // Scale down by 64 and saturate to unsigned bytes.
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // Output channels 2 and 3: rows 2 (xmm4) and 3 (xmm5).
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Interleave the two channel pairs into full 4-byte pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
  1.4551 +
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
// In-place posterize: each color byte becomes
//   ((v * scale) >> 16) * interval_size + interval_offset,
// while the original alpha bytes are preserved via the xmm6 mask.
// dst_argb must be 16-byte aligned (movdqa loads and stores).
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Replicate each parameter's low word across the color-channel word
    // lanes (the alpha lanes receive the parameter's high word, normally 0;
    // alpha is restored from the mask afterwards anyway).
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word widen
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"  // 0xff000000 alpha mask

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // (v * scale) >> 16
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"  // original pixels, for alpha
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"  // keep only alpha bytes
    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // merge original alpha back
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"  // store in place
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
  1.4604 +
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
// 'value' supplies one multiplier byte per ARGB channel; each channel is
// scaled by its multiplier with an 8.8 fixed-point high multiply
// (pmulhuw + psrlw 8).  src_argb and dst_argb must be 16-byte aligned.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    // Expand the 4 value bytes to words (each byte duplicated: v<<8|v) and
    // replicate across both pixel lanes of the register.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Widen source bytes by self-duplication, multiply-high by the value
    // words, then drop the extra 8 fractional bits.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
  1.4643 +
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per channel: dst ~= (src0 * src1) / 255.  src0 bytes are widened by
// self-duplication (v*257) and src1 bytes zero-extended, so pmulhuw
// computes (src0 * 257 * src1) >> 16.  Unaligned pointers OK (movdqu).
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for widening src1

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"  // register-to-register copy
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0: bytes -> v*257 words
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1: zero-extend to words
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
  1.4683 +
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb): every channel, including alpha,
// clamps at 255 independently.  Unaligned pointers are allowed (movdqu).
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
  1.4713 +
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb): dst = max(src0 - src1, 0) for
// every channel, including alpha.  Unaligned pointers are allowed (movdqu).
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"  // saturating byte subtract
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
  1.4743 +
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// src_y0/src_y1/src_y2 are three consecutive scanlines.  %1..%3 are rebased
// as offsets from %0 so one pointer increment advances all four streams.
// Computes |d0 + 2*d1 + d2| with dN = rowN[x] - rowN[x+2] (sign flipped vs.
// the matrix above, but the absolute value makes the result identical),
// saturated to a byte.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word widen

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Row 0: row[x] - row[x+2] as signed words.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    // Row 1 (weight 2: added twice below).
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    BUNDLEALIGN
    // Row 2.
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value via max(x, -x), then saturate to unsigned bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%4                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELXROW_SSE2
  1.4805 +
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Vertical gradient from two scanlines: computes |d0 + 2*d1 + d2| with
// dN = src_y0[x+N] - src_y1[x+N] (sign flipped vs. the matrix above, but
// the absolute value gives the same result), saturated to a byte.
// src_y1 and dst_sobely are rebased as offsets from src_y0.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word widen

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Column x: y0[x] - y1[x] as signed words.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    // Column x+1 (weight 2: added twice below).
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    BUNDLEALIGN
    // Column x+2.
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value via max(x, -x), then saturate to unsigned bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%3                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELYROW_SSE2
  1.4865 +
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Saturating add of the two planes, then each sum byte is replicated into
// B, G and R with alpha forced to 255.  src_sobely is addressed relative
// to src_sobelx; pointers must be 16-byte aligned (movdqa).
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"  // 0xff000000 alpha mask

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // s = sobelx + sobely
    // Replicate each sum byte four times (s,s,s,s) per pixel and OR in alpha.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm1," MEMACCESS(2) "         \n"
    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELROW_SSE2
  1.4921 +
  1.4922 +#ifdef HAS_SOBELTOPLANEROW_SSE2
  1.4923 +// Adds Sobel X and Sobel Y and stores Sobel into a plane.
  1.4924 +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  1.4925 +                          uint8* dst_y, int width) {
  1.4926 +  asm volatile (
  1.4927 +    "sub       %0,%1                           \n"
  1.4928 +    "pcmpeqb   %%xmm5,%%xmm5                   \n"
  1.4929 +    "pslld     $0x18,%%xmm5                    \n"
  1.4930 +
  1.4931 +    // 8 pixel loop.
  1.4932 +    LABELALIGN
  1.4933 +  "1:                                          \n"
  1.4934 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.4935 +    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
  1.4936 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.4937 +    "paddusb   %%xmm1,%%xmm0                   \n"
  1.4938 +    "sub       $0x10,%3                        \n"
  1.4939 +    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
  1.4940 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.4941 +    "jg        1b                              \n"
  1.4942 +  : "+r"(src_sobelx),  // %0
  1.4943 +    "+r"(src_sobely),  // %1
  1.4944 +    "+r"(dst_y),       // %2
  1.4945 +    "+r"(width)        // %3
  1.4946 +  :
  1.4947 +  : "memory", "cc"
  1.4948 +#if defined(__native_client__) && defined(__x86_64__)
  1.4949 +    , "r14"
  1.4950 +#endif
  1.4951 +#if defined(__SSE2__)
  1.4952 +    , "xmm0", "xmm1"
  1.4953 +#endif
  1.4954 +  );
  1.4955 +}
  1.4956 +#endif  // HAS_SOBELTOPLANEROW_SSE2
  1.4957 +
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// src_sobely is addressed relative to src_sobelx; all pointers must be
// 16-byte aligned (movdqa).
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all-0xff bytes for alpha

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = sobelx
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = sobel = x + y
    // Build (R,A) byte pairs from sobelx and 0xff...
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "punpckhbw %%xmm5,%%xmm0                   \n"
    // ...(B,G) pairs from sobely and sobel...
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    // ...then interleave the word pairs into full BGRA pixels.
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm6," MEMACCESS(2) "         \n"
    "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_SOBELXYROW_SSE2
  1.5012 +
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// Each pixel's 4 bytes are widened to 4 int32 lanes; xmm0 carries the
// running sum along the row and previous_cumsum supplies the row above.
// The fast path requires cumsum to be 16-byte aligned; previous_cumsum is
// read with movdqa there, so it presumably must share that alignment --
// the check only tests cumsum (NOTE(review): verify against callers).
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running row sum
    "pxor      %%xmm1,%%xmm1                   \n"  // zero for widening
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"  // cumsum 16-byte aligned?
    "jne       49f                             \n"

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    // Widen 16 source bytes to four vectors of 4 int32s (xmm2..xmm5).
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // For each pixel: fold into the running sum, then add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // undo bias for remainder
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
  1.5094 +
  1.5095 +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  1.5096 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  1.5097 +                                    int width, int area, uint8* dst,
  1.5098 +                                    int count) {
  1.5099 +  asm volatile (
  1.5100 +    "movd      %5,%%xmm5                       \n"
  1.5101 +    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
  1.5102 +    "rcpss     %%xmm5,%%xmm4                   \n"
  1.5103 +    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
  1.5104 +    "sub       $0x4,%3                         \n"
  1.5105 +    "jl        49f                             \n"
  1.5106 +    "cmpl      $0x80,%5                        \n"
  1.5107 +    "ja        40f                             \n"
  1.5108 +
  1.5109 +    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  1.5110 +    "pcmpeqb   %%xmm6,%%xmm6                   \n"
  1.5111 +    "psrld     $0x10,%%xmm6                    \n"
  1.5112 +    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
  1.5113 +    "addps     %%xmm6,%%xmm5                   \n"
  1.5114 +    "mulps     %%xmm4,%%xmm5                   \n"
  1.5115 +    "cvtps2dq  %%xmm5,%%xmm5                   \n"
  1.5116 +    "packssdw  %%xmm5,%%xmm5                   \n"
  1.5117 +
  1.5118 +  // 4 pixel small loop                        \n"
  1.5119 +    LABELALIGN
  1.5120 +  "4:                                         \n"
  1.5121 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.5122 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.5123 +    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1.5124 +    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
  1.5125 +    BUNDLEALIGN
  1.5126 +    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  1.5127 +    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
  1.5128 +    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
  1.5129 +    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
  1.5130 +    "lea       " MEMLEA(0x40,0) ",%0           \n"
  1.5131 +    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  1.5132 +    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
  1.5133 +    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
  1.5134 +    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
  1.5135 +    BUNDLEALIGN
  1.5136 +    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  1.5137 +    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
  1.5138 +    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
  1.5139 +    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
  1.5140 +    "lea       " MEMLEA(0x40,1) ",%1           \n"
  1.5141 +    "packssdw  %%xmm1,%%xmm0                   \n"
  1.5142 +    "packssdw  %%xmm3,%%xmm2                   \n"
  1.5143 +    "pmulhuw   %%xmm5,%%xmm0                   \n"
  1.5144 +    "pmulhuw   %%xmm5,%%xmm2                   \n"
  1.5145 +    "packuswb  %%xmm2,%%xmm0                   \n"
  1.5146 +    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  1.5147 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.5148 +    "sub       $0x4,%3                         \n"
  1.5149 +    "jge       4b                              \n"
  1.5150 +    "jmp       49f                             \n"
  1.5151 +
  1.5152 +  // 4 pixel loop                              \n"
  1.5153 +    LABELALIGN
  1.5154 +  "40:                                         \n"
  1.5155 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.5156 +    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
  1.5157 +    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
  1.5158 +    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
  1.5159 +    BUNDLEALIGN
  1.5160 +    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  1.5161 +    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
  1.5162 +    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
  1.5163 +    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
  1.5164 +    "lea       " MEMLEA(0x40,0) ",%0           \n"
  1.5165 +    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  1.5166 +    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
  1.5167 +    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
  1.5168 +    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
  1.5169 +    BUNDLEALIGN
  1.5170 +    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  1.5171 +    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
  1.5172 +    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
  1.5173 +    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
  1.5174 +    "lea       " MEMLEA(0x40,1) ",%1           \n"
  1.5175 +    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
  1.5176 +    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
  1.5177 +    "mulps     %%xmm4,%%xmm0                   \n"
  1.5178 +    "mulps     %%xmm4,%%xmm1                   \n"
  1.5179 +    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
  1.5180 +    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
  1.5181 +    "mulps     %%xmm4,%%xmm2                   \n"
  1.5182 +    "mulps     %%xmm4,%%xmm3                   \n"
  1.5183 +    "cvtps2dq  %%xmm0,%%xmm0                   \n"
  1.5184 +    "cvtps2dq  %%xmm1,%%xmm1                   \n"
  1.5185 +    "cvtps2dq  %%xmm2,%%xmm2                   \n"
  1.5186 +    "cvtps2dq  %%xmm3,%%xmm3                   \n"
  1.5187 +    "packssdw  %%xmm1,%%xmm0                   \n"
  1.5188 +    "packssdw  %%xmm3,%%xmm2                   \n"
  1.5189 +    "packuswb  %%xmm2,%%xmm0                   \n"
  1.5190 +    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
  1.5191 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.5192 +    "sub       $0x4,%3                         \n"
  1.5193 +    "jge       40b                             \n"
  1.5194 +
  1.5195 +  "49:                                         \n"
  1.5196 +    "add       $0x3,%3                         \n"
  1.5197 +    "jl        19f                             \n"
  1.5198 +
  1.5199 +  // 1 pixel loop                              \n"
  1.5200 +    LABELALIGN
  1.5201 +  "10:                                         \n"
  1.5202 +    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
  1.5203 +    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
  1.5204 +    "lea       " MEMLEA(0x10,0) ",%0           \n"
  1.5205 +    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
  1.5206 +    BUNDLEALIGN
  1.5207 +    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
  1.5208 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5209 +    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
  1.5210 +    "mulps     %%xmm4,%%xmm0                   \n"
  1.5211 +    "cvtps2dq  %%xmm0,%%xmm0                   \n"
  1.5212 +    "packssdw  %%xmm0,%%xmm0                   \n"
  1.5213 +    "packuswb  %%xmm0,%%xmm0                   \n"
  1.5214 +    "movd      %%xmm0," MEMACCESS(2) "         \n"
  1.5215 +    "lea       " MEMLEA(0x4,2) ",%2            \n"
  1.5216 +    "sub       $0x1,%3                         \n"
  1.5217 +    "jge       10b                             \n"
  1.5218 +  "19:                                         \n"
  1.5219 +  : "+r"(topleft),  // %0
  1.5220 +    "+r"(botleft),  // %1
  1.5221 +    "+r"(dst),      // %2
  1.5222 +    "+rm"(count)    // %3
  1.5223 +  : "r"((intptr_t)(width)),  // %4
  1.5224 +    "rm"(area)     // %5
  1.5225 +  : "memory", "cc"
  1.5226 +#if defined(__native_client__) && defined(__x86_64__)
  1.5227 +    , "r14"
  1.5228 +#endif
  1.5229 +#if defined(__SSE2__)
  1.5230 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1.5231 +#endif
  1.5232 +  );
  1.5233 +}
  1.5234 +#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  1.5235 +
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv holds 4 floats: the starting source position (x, y) followed by
// the per-pixel step (dx, dy).  Each destination pixel is fetched from
// src_argb at byte offset x * 4 + y * src_argb_stride (computed with
// pmaddwd on truncated integer coordinates).
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  // Stride widened to pointer size; packed with the constant 4 into the
  // pmaddwd multiplier below.  temp receives per-pixel byte offsets.
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = start (x, y)
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = step (dx, dy)
    "shl       $0x10,%1                        \n"  // %1 = stride << 16 | 4,
    "add       $0x4,%1                         \n"  //   the pmaddwd multiplier
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"  // fewer than 4 pixels left

    // Prepare 4 positions per iteration: xmm2 = positions 0,1;
    // xmm3 = positions 2,3; xmm4 = step * 4 to advance both pairs.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // duplicate (dx, dy)
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast multiplier
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"  // position 1 = pos 0 + step
    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = pos 0 | pos 1
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = step * 2
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = pos 2 | pos 3
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = step * 4

  // 4 pixel loop                              \n"
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "movd      %%xmm0,%k1                      \n"  // offset of pixel 0
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate next offset down
    "movd      %%xmm0,%k5                      \n"  // offset of pixel 1
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"  // combine pixels 0 and 1
    "addps     %%xmm4,%%xmm2                   \n"  // advance positions 0,1
    "movq      %%xmm1," MEMACCESS(2) "         \n"  // store pixels 0,1
    "movd      %%xmm0,%k1                      \n"  // offset of pixel 2
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"  // offset of pixel 3
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"  // combine pixels 2 and 3
    "addps     %%xmm4,%%xmm3                   \n"  // advance positions 2,3
    "sub       $0x4,%4                         \n"
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"  // store pixels 2,3
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"  // 1 to 3 pixels remain
    "jl        19f                             \n"

  // 1 pixel loop                              \n"
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // position float to int
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "addps     %%xmm7,%%xmm2                   \n"  // advance by one step
    "movd      %%xmm0,%k1                      \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "sub       $0x1,%4                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "+r"(temp)   // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
  1.5329 +
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends two rows: out = (row0 * (128 - f) + row1 * f) / 128, where
// f = source_y_fraction / 2 (0..128).  Fractions 0, 32, 64 and 96 take
// fast paths using pavgb or a plain copy.  Pointers must be 16-byte
// aligned (movdqa); dst_width is assumed to be a multiple of 16 here.
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    // %0 becomes dst - src so stores can address dst as (%1,%0,1) while
    // only %1 (src) is advanced through the loop.
    "sub       %1,%0                           \n"
    "shr       %3                              \n"  // f = fraction / 2
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0: copy row 0
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 32: 75/25 blend
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 64: 50/50 blend
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 96: 25/75 blend

    // Broadcast byte pairs (128 - f, f) into xmm5 for pmaddubsw.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           // row 1 = src + stride
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave rows 0 and 1
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"  // r0*(128-f) + r1*f
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // / 128
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           // store 16 pixels to dst
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg twice = 1/4 row 0
    "pavgb     %%xmm1,%%xmm0                   \n"  //   + 3/4 row 1
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg twice = 3/4 row 0
    "pavgb     %%xmm1,%%xmm0                   \n"  //   + 1/4 row 1
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3
  1.5441 +
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
// SSE2 variant (no pmaddubsw): the general path computes
// row0 + (((row1 - row0) * 2 * w) >> 16) per pixel, where the weight word
// w packs f and 128 - f for f = source_y_fraction / 2; this approximates
// (row0 * (128 - f) + row1 * f) / 128.  Fractions 0, 32, 64 and 96 take
// fast paths with pavgb or a plain copy.  Pointers must be 16-byte aligned.
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    // %0 becomes dst - src so stores can address dst as (%1,%0,1) while
    // only %1 (src) is advanced through the loop.
    "sub       %1,%0                           \n"
    "shr       %3                              \n"  // f = fraction / 2
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0: copy row 0
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 32: 75/25 blend
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 64: 50/50 blend
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 96: 25/75 blend

    // Broadcast weight words built from f and 128 - f into xmm5.
    // xmm4 = 0 is used to unpack pixel bytes to words.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"  // row 1 bytes to words
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"  // row 0 bytes to words
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"  // row1 - row0
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"  // diff * 2
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"  // (diff * 2 * w) >> 16
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"  // row0 + weighted diff
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg twice = 1/4 row 0
    "pavgb     %%xmm1,%%xmm0                   \n"  //   + 3/4 row 1
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"  // avg twice = 3/4 row 0
    "pavgb     %%xmm1,%%xmm0                   \n"  //   + 1/4 row 1
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_INTERPOLATEROW_SSE2
  1.5561 +
  1.5562 +#ifdef HAS_INTERPOLATEROW_SSSE3
  1.5563 +// Bilinear filter 16x2 -> 16x1
  1.5564 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  1.5565 +                                    ptrdiff_t src_stride, int dst_width,
  1.5566 +                                    int source_y_fraction) {
  1.5567 +  asm volatile (
  1.5568 +    "sub       %1,%0                           \n"
  1.5569 +    "shr       %3                              \n"
  1.5570 +    "cmp       $0x0,%3                         \n"
  1.5571 +    "je        100f                            \n"
  1.5572 +    "cmp       $0x20,%3                        \n"
  1.5573 +    "je        75f                             \n"
  1.5574 +    "cmp       $0x40,%3                        \n"
  1.5575 +    "je        50f                             \n"
  1.5576 +    "cmp       $0x60,%3                        \n"
  1.5577 +    "je        25f                             \n"
  1.5578 +
  1.5579 +    "movd      %3,%%xmm0                       \n"
  1.5580 +    "neg       %3                              \n"
  1.5581 +    "add       $0x80,%3                        \n"
  1.5582 +    "movd      %3,%%xmm5                       \n"
  1.5583 +    "punpcklbw %%xmm0,%%xmm5                   \n"
  1.5584 +    "punpcklwd %%xmm5,%%xmm5                   \n"
  1.5585 +    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  1.5586 +
  1.5587 +    // General purpose row blend.
  1.5588 +    LABELALIGN
  1.5589 +  "1:                                          \n"
  1.5590 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5591 +    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
  1.5592 +    "movdqu    %%xmm0,%%xmm1                   \n"
  1.5593 +    "punpcklbw %%xmm2,%%xmm0                   \n"
  1.5594 +    "punpckhbw %%xmm2,%%xmm1                   \n"
  1.5595 +    "pmaddubsw %%xmm5,%%xmm0                   \n"
  1.5596 +    "pmaddubsw %%xmm5,%%xmm1                   \n"
  1.5597 +    "psrlw     $0x7,%%xmm0                     \n"
  1.5598 +    "psrlw     $0x7,%%xmm1                     \n"
  1.5599 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.5600 +    "sub       $0x10,%2                        \n"
  1.5601 +    BUNDLEALIGN
  1.5602 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  1.5603 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5604 +    "jg        1b                              \n"
  1.5605 +    "jmp       99f                             \n"
  1.5606 +
  1.5607 +    // Blend 25 / 75.
  1.5608 +    LABELALIGN
  1.5609 +  "25:                                         \n"
  1.5610 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5611 +    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
  1.5612 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5613 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5614 +    "sub       $0x10,%2                        \n"
  1.5615 +    BUNDLEALIGN
  1.5616 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  1.5617 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5618 +    "jg        25b                             \n"
  1.5619 +    "jmp       99f                             \n"
  1.5620 +
  1.5621 +    // Blend 50 / 50.
  1.5622 +    LABELALIGN
  1.5623 +  "50:                                         \n"
  1.5624 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5625 +    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
  1.5626 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5627 +    "sub       $0x10,%2                        \n"
  1.5628 +    BUNDLEALIGN
  1.5629 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  1.5630 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5631 +    "jg        50b                             \n"
  1.5632 +    "jmp       99f                             \n"
  1.5633 +
  1.5634 +    // Blend 75 / 25.
  1.5635 +    LABELALIGN
  1.5636 +  "75:                                         \n"
  1.5637 +    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  1.5638 +    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
  1.5639 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5640 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5641 +    "sub       $0x10,%2                        \n"
  1.5642 +    BUNDLEALIGN
  1.5643 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  1.5644 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5645 +    "jg        75b                             \n"
  1.5646 +    "jmp       99f                             \n"
  1.5647 +
  1.5648 +    // Blend 100 / 0 - Copy row unchanged.
  1.5649 +    LABELALIGN
  1.5650 +  "100:                                        \n"
  1.5651 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5652 +    "sub       $0x10,%2                        \n"
  1.5653 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
  1.5654 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5655 +    "jg        100b                            \n"
  1.5656 +
  1.5657 +  "99:                                         \n"
  1.5658 +  : "+r"(dst_ptr),    // %0
  1.5659 +    "+r"(src_ptr),    // %1
  1.5660 +    "+r"(dst_width),  // %2
  1.5661 +    "+r"(source_y_fraction)  // %3
  1.5662 +  : "r"((intptr_t)(src_stride))  // %4
  1.5663 +  : "memory", "cc"
  1.5664 +#if defined(__native_client__) && defined(__x86_64__)
  1.5665 +    , "r14"
  1.5666 +#endif
  1.5667 +#if defined(__SSE2__)
  1.5668 +    , "xmm0", "xmm1", "xmm2", "xmm5"
  1.5669 +#endif
  1.5670 +  );
  1.5671 +}
  1.5672 +#endif   // HAS_INTERPOLATEROW_SSSE3
  1.5673 +
  1.5674 +#ifdef HAS_INTERPOLATEROW_SSE2
  1.5675 +// Bilinear filter 16x2 -> 16x1
  1.5676 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  1.5677 +                                   ptrdiff_t src_stride, int dst_width,
  1.5678 +                                   int source_y_fraction) {
  1.5679 +  asm volatile (
  1.5680 +    "sub       %1,%0                           \n"
  1.5681 +    "shr       %3                              \n"
  1.5682 +    "cmp       $0x0,%3                         \n"
  1.5683 +    "je        100f                            \n"
  1.5684 +    "cmp       $0x20,%3                        \n"
  1.5685 +    "je        75f                             \n"
  1.5686 +    "cmp       $0x40,%3                        \n"
  1.5687 +    "je        50f                             \n"
  1.5688 +    "cmp       $0x60,%3                        \n"
  1.5689 +    "je        25f                             \n"
  1.5690 +
  1.5691 +    "movd      %3,%%xmm0                       \n"
  1.5692 +    "neg       %3                              \n"
  1.5693 +    "add       $0x80,%3                        \n"
  1.5694 +    "movd      %3,%%xmm5                       \n"
  1.5695 +    "punpcklbw %%xmm0,%%xmm5                   \n"
  1.5696 +    "punpcklwd %%xmm5,%%xmm5                   \n"
  1.5697 +    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
  1.5698 +    "pxor      %%xmm4,%%xmm4                   \n"
  1.5699 +
  1.5700 +    // General purpose row blend.
  1.5701 +    LABELALIGN
  1.5702 +  "1:                                          \n"
  1.5703 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5704 +    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
  1.5705 +    "movdqu    %%xmm0,%%xmm1                   \n"
  1.5706 +    "movdqu    %%xmm2,%%xmm3                   \n"
  1.5707 +    "punpcklbw %%xmm4,%%xmm2                   \n"
  1.5708 +    "punpckhbw %%xmm4,%%xmm3                   \n"
  1.5709 +    "punpcklbw %%xmm4,%%xmm0                   \n"
  1.5710 +    "punpckhbw %%xmm4,%%xmm1                   \n"
  1.5711 +    "psubw     %%xmm0,%%xmm2                   \n"
  1.5712 +    "psubw     %%xmm1,%%xmm3                   \n"
  1.5713 +    "paddw     %%xmm2,%%xmm2                   \n"
  1.5714 +    "paddw     %%xmm3,%%xmm3                   \n"
  1.5715 +    "pmulhw    %%xmm5,%%xmm2                   \n"
  1.5716 +    "pmulhw    %%xmm5,%%xmm3                   \n"
  1.5717 +    "paddw     %%xmm2,%%xmm0                   \n"
  1.5718 +    "paddw     %%xmm3,%%xmm1                   \n"
  1.5719 +    "packuswb  %%xmm1,%%xmm0                   \n"
  1.5720 +    "sub       $0x10,%2                        \n"
  1.5721 +    BUNDLEALIGN
  1.5722 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
  1.5723 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5724 +    "jg        1b                              \n"
  1.5725 +    "jmp       99f                             \n"
  1.5726 +
  1.5727 +    // Blend 25 / 75.
  1.5728 +    LABELALIGN
  1.5729 +  "25:                                         \n"
  1.5730 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5731 +    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
  1.5732 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5733 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5734 +    "sub       $0x10,%2                        \n"
  1.5735 +    BUNDLEALIGN
  1.5736 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
  1.5737 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5738 +    "jg        25b                             \n"
  1.5739 +    "jmp       99f                             \n"
  1.5740 +
  1.5741 +    // Blend 50 / 50.
  1.5742 +    LABELALIGN
  1.5743 +  "50:                                         \n"
  1.5744 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5745 +    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
  1.5746 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5747 +    "sub       $0x10,%2                        \n"
  1.5748 +    BUNDLEALIGN
  1.5749 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
  1.5750 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5751 +    "jg        50b                             \n"
  1.5752 +    "jmp       99f                             \n"
  1.5753 +
  1.5754 +    // Blend 75 / 25.
  1.5755 +    LABELALIGN
  1.5756 +  "75:                                         \n"
  1.5757 +    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
  1.5758 +    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
  1.5759 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5760 +    "pavgb     %%xmm1,%%xmm0                   \n"
  1.5761 +    "sub       $0x10,%2                        \n"
  1.5762 +    BUNDLEALIGN
  1.5763 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
  1.5764 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5765 +    "jg        75b                             \n"
  1.5766 +    "jmp       99f                             \n"
  1.5767 +
  1.5768 +    // Blend 100 / 0 - Copy row unchanged.
  1.5769 +    LABELALIGN
  1.5770 +  "100:                                        \n"
  1.5771 +    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
  1.5772 +    "sub       $0x10,%2                        \n"
  1.5773 +    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
  1.5774 +    "lea       " MEMLEA(0x10,1) ",%1           \n"
  1.5775 +    "jg        100b                            \n"
  1.5776 +
  1.5777 +  "99:                                         \n"
  1.5778 +  : "+r"(dst_ptr),    // %0
  1.5779 +    "+r"(src_ptr),    // %1
  1.5780 +    "+r"(dst_width),  // %2
  1.5781 +    "+r"(source_y_fraction)  // %3
  1.5782 +  : "r"((intptr_t)(src_stride))  // %4
  1.5783 +  : "memory", "cc"
  1.5784 +#if defined(__native_client__) && defined(__x86_64__)
  1.5785 +    , "r14"
  1.5786 +#endif
  1.5787 +#if defined(__SSE2__)
  1.5788 +    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1.5789 +#endif
  1.5790 +  );
  1.5791 +}
  1.5792 +#endif  // HAS_INTERPOLATEROW_SSE2
  1.5793 +
  1.5794 +#ifdef HAS_HALFROW_SSE2
// Vertically averages two rows of interleaved UV bytes:
// dst_uv[i] = pavgb(src_uv[i], src_uv[i + src_uv_stride]) (rounded average).
// Processes 16 bytes per iteration; requires 16-byte aligned src/dst
// (movdqa) and pix to be handled as a multiple of 16.
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
    // Bias dst by -src so one advancing register (%0) indexes both
    // buffers: (%0,%1) addresses the current dst position.
    "sub       %0,%1                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_uv),  // %1
    "+r"(pix)      // %2
  : "r"((intptr_t)(src_uv_stride))  // %3
  : "memory", "cc"
#if defined(__SSE2__)
      , "xmm0"
#endif
  );
}
  1.5817 +#endif  // HAS_HALFROW_SSE2
  1.5818 +
  1.5819 +#ifdef HAS_ARGBTOBAYERROW_SSSE3
// Selects one byte from each 4-byte ARGB pixel using 'selector' as a
// pshufb control (its 4 bytes broadcast across the register), producing
// a single-channel Bayer-style row: 8 pixels (32 bytes) in, 8 bytes out
// per iteration. Requires 16-byte aligned src (movdqa).
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
    // NaCL caveat - assumes movd is from GPR
    "movd      %3,%%xmm5                       \n"
    // Broadcast the 4-byte selector to all four dwords of xmm5.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Gather the chosen byte of each pixel into the low dword of each
    // register, then combine the two 4-byte results into one qword.
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  : "g"(selector)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.5848 +#endif  // HAS_ARGBTOBAYERROW_SSSE3
  1.5849 +
  1.5850 +#ifdef HAS_ARGBTOBAYERGGROW_SSE2
// Extracts the G channel (byte 1) of each ARGB pixel into dst_bayer.
// 'selector' is accepted for signature compatibility with
// ARGBToBayerRow_SSSE3 but is not used - the G position is hard-coded.
// Processes 8 pixels per iteration; requires 16-byte aligned src.
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    // Build mask 0x000000FF in every dword: all-ones shifted right 24.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Shift G down to byte 0 of each pixel, mask off the rest, then
    // narrow dwords -> words -> bytes. Only the low 8 bytes (from the
    // packssdw result) are stored below.
    "psrld     $0x8,%%xmm0                     \n"
    "psrld     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.5881 +#endif  // HAS_ARGBTOBAYERGGROW_SSE2
  1.5882 +
  1.5883 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each ARGB pixel according to the 16-byte pshufb
// control at 'shuffler'. Processes 8 pixels (32 bytes) per iteration;
// requires 16-byte aligned src, dst, and shuffler (movdqa).
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.5911 +
// Same pshufb byte-reorder as ARGBShuffleRow_SSSE3, but uses unaligned
// loads/stores (movdqu) so src_argb and dst_argb may be unaligned.
// The shuffler pointer must still be 16-byte aligned (movdqa load).
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.5938 +#endif  // HAS_ARGBSHUFFLEROW_SSSE3
  1.5939 +
  1.5940 +#ifdef HAS_ARGBSHUFFLEROW_AVX2
  1.5941 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  1.5942 +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  1.5943 +                         const uint8* shuffler, int pix) {
  1.5944 +  asm volatile (
  1.5945 +    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
  1.5946 +    LABELALIGN
  1.5947 +  "1:                                          \n"
  1.5948 +    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
  1.5949 +    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
  1.5950 +    "lea       " MEMLEA(0x40,0) ",%0           \n"
  1.5951 +    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
  1.5952 +    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
  1.5953 +    "sub       $0x10,%2                        \n"
  1.5954 +    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
  1.5955 +    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
  1.5956 +    "lea       " MEMLEA(0x40,1) ",%1           \n"
  1.5957 +    "jg        1b                              \n"
  1.5958 +  : "+r"(src_argb),  // %0
  1.5959 +    "+r"(dst_argb),  // %1
  1.5960 +    "+r"(pix)        // %2
  1.5961 +  : "r"(shuffler)    // %3
  1.5962 +  : "memory", "cc"
  1.5963 +#if defined(__SSE2__)
  1.5964 +    , "xmm0", "xmm1", "xmm5"
  1.5965 +#endif
  1.5966 +  );
  1.5967 +}
  1.5968 +#endif  // HAS_ARGBSHUFFLEROW_AVX2
  1.5969 +
  1.5970 +#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback for pixel byte reordering (no pshufb available).
// Reads the first 4 bytes of 'shuffler' and dispatches to one of four
// specialized word-shuffle loops (4 pixels per iteration) for common
// permutations; any other shuffle falls through to a generic scalar
// byte-at-a-time loop (1 pixel per iteration). pixel_temp ("+d") is a
// scratch register whose low byte (%b2) is addressable for byte moves.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"
    // Load the 4 shuffle indices as one little-endian dword and compare
    // against the recognized fast-path patterns.
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic path: for each pixel, look up each of the 4 output bytes
    // via its shuffler index, one byte at a time.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Fast paths: unpack bytes to words, permute words within each
    // pixel via pshufhw/pshuflw immediates, repack to bytes.
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)         // %3
  : "r"(shuffler)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
  1.6094 +#endif  // HAS_ARGBSHUFFLEROW_SSE2
  1.6095 +
  1.6096 +#ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (full-res Y, half-res U and V) into interleaved
// YUY2 (Y0 U0 Y1 V0 ...). Processes 16 Y values (8 U/V pairs) per
// iteration, writing 32 bytes of output; uses unaligned accesses.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // Bias V by -U so (%1,%2,1) addresses the current V position while
    // only %1 advances.
    "sub       %1,%2                             \n"
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // Interleave U and V bytes -> U0 V0 U1 V1 ...
    "punpcklbw %%xmm3,%%xmm2                     \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    // Interleave Y with the UV stream: Y0 U0 Y1 V0 ...
    "punpcklbw %%xmm2,%%xmm0                     \n"
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
  1.6134 +#endif  // HAS_I422TOYUY2ROW_SSE2
  1.6135 +
  1.6136 +#ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...), the
// byte-swapped counterpart of I422ToYUY2Row_SSE2 above. Processes
// 16 Y values per iteration, writing 32 bytes; unaligned accesses.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    // Bias V by -U so (%1,%2,1) addresses V while only %1 advances.
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // Interleave U and V -> U0 V0 U1 V1 ...
    "punpcklbw %%xmm3,%%xmm2                     \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    // UV goes in the low byte of each pair here (unlike YUY2):
    // U0 Y0 V0 Y1 ...
    "punpcklbw %%xmm0,%%xmm1                     \n"
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
  1.6174 +#endif  // HAS_I422TOUYVYROW_SSE2
  1.6175 +
  1.6176 +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to each ARGB byte:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// where 'poly' holds 4 coefficient vectors of 4 floats each
// (poly[0..3]=C0, [4..7]=C1, [8..11]=C2, [12..15]=C3), one float per
// channel. Processes 2 pixels per iteration. width must be > 0.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Load 2 pixels (8 bytes), widen bytes -> dwords, convert to float.
    // xmm0 = first pixel's 4 channels, xmm4 = second pixel's.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep x in xmm1/xmm5; accumulate C0 + C1*x in xmm0/xmm4.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = x^2; xmm1/xmm5 = x^3.
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    // Add C2*x^2 and C3*x^3 terms.
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    // Convert back to int and saturate to unsigned bytes.
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x2,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
  1.6232 +#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
  1.6233 +
  1.6234 +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 version of ARGBPolynomialRow_SSE2: evaluates the same
// per-channel cubic C0 + C1*x + C2*x^2 + C3*x^3 on 2 pixels (8
// channels) per iteration using fused multiply-adds. Requires a CPU
// with FMA3 in addition to AVX2 (vfmadd instructions below).
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    // Broadcast the 4 coefficient vectors to both 128-bit lanes.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to int, then narrow dwords -> words -> bytes with
    // unsigned saturation; vpermq fixes the lane-crossing order.
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "sub         $0x2,%2                       \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
// TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
  1.6275 +#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
  1.6276 +
  1.6277 +#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with a color table, in place.
// Each of the 4 channel bytes is replaced by a lookup into table_argb,
// which is laid out with a stride of 4 bytes per entry: channel c of
// value v maps to table_argb[v * 4 + c]. pixel_temp ("+d") is scratch;
// its low byte (%b1) must be byte-addressable, hence the "d" constraint.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // dst is advanced first; the four channels are then read/written
    // at negative offsets -4..-1 from the new position.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
  1.6307 +#endif  // HAS_ARGBCOLORTABLEROW_X86
  1.6308 +
  1.6309 +#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with a color table, in place.
// Same per-channel lookup as ARGBColorTableRow_X86 (table stride is
// 4 bytes per entry), but only the first 3 channel bytes are remapped;
// the 4th (alpha) byte of each pixel is left untouched.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // dst is advanced first; channels are accessed at -4..-2 from the
    // new position (offset -1, alpha, is skipped).
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
  1.6335 +#endif  // HAS_RGBCOLORTABLEROW_X86
  1.6336 +
  1.6337 +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  1.6338 +// Tranform RGB pixels with luma table.
  1.6339 +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  1.6340 +                                 int width,
  1.6341 +                                 const uint8* luma, uint32 lumacoeff) {
  1.6342 +  uintptr_t pixel_temp = 0u;
  1.6343 +  uintptr_t table_temp = 0u;
  1.6344 +  asm volatile (
  1.6345 +    "movd      %6,%%xmm3                       \n"
  1.6346 +    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
  1.6347 +    "pcmpeqb   %%xmm4,%%xmm4                   \n"
  1.6348 +    "psllw     $0x8,%%xmm4                     \n"
  1.6349 +    "pxor      %%xmm5,%%xmm5                   \n"
  1.6350 +
  1.6351 +    // 4 pixel loop.
  1.6352 +    LABELALIGN
  1.6353 +  "1:                                          \n"
  1.6354 +    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
  1.6355 +    "pmaddubsw %%xmm3,%%xmm0                   \n"
  1.6356 +    "phaddw    %%xmm0,%%xmm0                   \n"
  1.6357 +    "pand      %%xmm4,%%xmm0                   \n"
  1.6358 +    "punpcklwd %%xmm5,%%xmm0                   \n"
  1.6359 +    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  1.6360 +    "add       %5,%1                           \n"
  1.6361 +    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  1.6362 +
  1.6363 +    "movzb     " MEMACCESS(2) ",%0             \n"
  1.6364 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6365 +    "mov       %b0," MEMACCESS(3) "            \n"
  1.6366 +    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
  1.6367 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6368 +    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
  1.6369 +    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
  1.6370 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6371 +    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
  1.6372 +    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
  1.6373 +    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
  1.6374 +
  1.6375 +    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  1.6376 +    "add       %5,%1                           \n"
  1.6377 +    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  1.6378 +
  1.6379 +    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
  1.6380 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6381 +    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
  1.6382 +    BUNDLEALIGN
  1.6383 +    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
  1.6384 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6385 +    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
  1.6386 +    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
  1.6387 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6388 +    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
  1.6389 +    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
  1.6390 +    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
  1.6391 +
  1.6392 +    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  1.6393 +    "add       %5,%1                           \n"
  1.6394 +    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
  1.6395 +
  1.6396 +    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
  1.6397 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6398 +    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
  1.6399 +    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
  1.6400 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6401 +    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
  1.6402 +    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
  1.6403 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6404 +    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
  1.6405 +    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
  1.6406 +    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
  1.6407 +
  1.6408 +    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
  1.6409 +    "add       %5,%1                           \n"
  1.6410 +
  1.6411 +    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
  1.6412 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6413 +    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
  1.6414 +    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
  1.6415 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6416 +    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
  1.6417 +    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
  1.6418 +    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
  1.6419 +    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
  1.6420 +    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
  1.6421 +    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
  1.6422 +    "sub       $0x4,%4                         \n"
  1.6423 +    "lea       " MEMLEA(0x10,2) ",%2           \n"
  1.6424 +    "lea       " MEMLEA(0x10,3) ",%3           \n"
  1.6425 +    "jg        1b                              \n"
  1.6426 +  : "+d"(pixel_temp),  // %0
  1.6427 +    "+a"(table_temp),  // %1
  1.6428 +    "+r"(src_argb),    // %2
  1.6429 +    "+r"(dst_argb),    // %3
  1.6430 +    "+rm"(width)       // %4
  1.6431 +  : "r"(luma),         // %5
  1.6432 +    "rm"(lumacoeff)    // %6
  1.6433 +  : "memory", "cc"
  1.6434 +#if defined(__SSE2__)
  1.6435 +    , "xmm0", "xmm3", "xmm4", "xmm5"
  1.6436 +#endif
  1.6437 +  );
  1.6438 +}
  1.6439 +#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
  1.6440 +
  1.6441 +#endif  // defined(__x86_64__) || defined(__i386__)
  1.6442 +
  1.6443 +#ifdef __cplusplus
  1.6444 +}  // extern "C"
  1.6445 +}  // namespace libyuv
  1.6446 +#endif

mercurial