media/libyuv/source/row_win.cc

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/source/row_win.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,7284 @@
     1.4 +/*
     1.5 + *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "libyuv/row.h"
    1.15 +
    1.16 +#ifdef __cplusplus
    1.17 +namespace libyuv {
    1.18 +extern "C" {
    1.19 +#endif
    1.20 +
    1.21 +// This module is for Visual C x86.
    1.22 +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
    1.23 +
    1.24 +#ifdef HAS_ARGBTOYROW_SSSE3
    1.25 +
    1.26 +// Constants for ARGB.
    1.27 +static const vec8 kARGBToY = {
    1.28 +  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
    1.29 +};
    1.30 +
     1.31 +// JPEG full range.
    1.32 +static const vec8 kARGBToYJ = {
    1.33 +  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
    1.34 +};
    1.35 +
    1.36 +static const vec8 kARGBToU = {
    1.37 +  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
    1.38 +};
    1.39 +
    1.40 +static const vec8 kARGBToUJ = {
    1.41 +  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
    1.42 +};
    1.43 +
    1.44 +static const vec8 kARGBToV = {
    1.45 +  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
    1.46 +};
    1.47 +
    1.48 +static const vec8 kARGBToVJ = {
    1.49 +  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
    1.50 +};
    1.51 +
     1.52 +// vpermd index table to reorder the vphaddw + vpackuswb result.
    1.53 +static const lvec32 kPermdARGBToY_AVX = {
    1.54 +  0, 4, 1, 5, 2, 6, 3, 7
    1.55 +};
    1.56 +
     1.57 +// vpshufb table to reorder vphaddw + vpackuswb results packed to shorts.
    1.58 +static const lvec8 kShufARGBToUV_AVX = {
    1.59 +  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    1.60 +  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    1.61 +};
    1.62 +
    1.63 +// Constants for BGRA.
    1.64 +static const vec8 kBGRAToY = {
    1.65 +  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
    1.66 +};
    1.67 +
    1.68 +static const vec8 kBGRAToU = {
    1.69 +  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
    1.70 +};
    1.71 +
    1.72 +static const vec8 kBGRAToV = {
    1.73 +  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
    1.74 +};
    1.75 +
    1.76 +// Constants for ABGR.
    1.77 +static const vec8 kABGRToY = {
    1.78 +  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
    1.79 +};
    1.80 +
    1.81 +static const vec8 kABGRToU = {
    1.82 +  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
    1.83 +};
    1.84 +
    1.85 +static const vec8 kABGRToV = {
    1.86 +  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
    1.87 +};
    1.88 +
    1.89 +// Constants for RGBA.
    1.90 +static const vec8 kRGBAToY = {
    1.91 +  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
    1.92 +};
    1.93 +
    1.94 +static const vec8 kRGBAToU = {
    1.95 +  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
    1.96 +};
    1.97 +
    1.98 +static const vec8 kRGBAToV = {
    1.99 +  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
   1.100 +};
   1.101 +
   1.102 +static const uvec8 kAddY16 = {
   1.103 +  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
   1.104 +};
   1.105 +
   1.106 +static const vec16 kAddYJ64 = {
   1.107 +  64, 64, 64, 64, 64, 64, 64, 64
   1.108 +};
   1.109 +
   1.110 +static const uvec8 kAddUV128 = {
   1.111 +  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   1.112 +  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
   1.113 +};
   1.114 +
   1.115 +static const uvec16 kAddUVJ128 = {
   1.116 +  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
   1.117 +};
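// For reference, a scalar sketch of the fixed-point math the constants
// above implement (illustrative helpers only; they are not part of this
// file). pmaddubsw multiplies the unsigned source bytes (memory order
// B, G, R, A for ARGB) by the signed coefficients, phaddw sums the pairs,
// and the shift plus the kAddY16 / kAddUV128 bias finish the value:
//
//   static int ScalarARGBToY(int b, int g, int r) {
//     return ((13 * b + 65 * g + 33 * r) >> 7) + 16;     // kARGBToY + kAddY16
//   }
//   static int ScalarARGBToU(int b, int g, int r) {
//     return ((112 * b - 74 * g - 38 * r) >> 8) + 128;   // kARGBToU + kAddUV128
//   }
//   static int ScalarARGBToV(int b, int g, int r) {
//     return ((-18 * b - 94 * g + 112 * r) >> 8) + 128;  // kARGBToV + kAddUV128
//   }
//
// The BGRA, ABGR and RGBA tables are the same weights permuted to match
// each format's byte order.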
   1.118 +
   1.119 +// Shuffle table for converting RGB24 to ARGB.
   1.120 +static const uvec8 kShuffleMaskRGB24ToARGB = {
   1.121 +  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
   1.122 +};
   1.123 +
   1.124 +// Shuffle table for converting RAW to ARGB.
   1.125 +static const uvec8 kShuffleMaskRAWToARGB = {
   1.126 +  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
   1.127 +};
   1.128 +
   1.129 +// Shuffle table for converting ARGB to RGB24.
   1.130 +static const uvec8 kShuffleMaskARGBToRGB24 = {
   1.131 +  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
   1.132 +};
   1.133 +
   1.134 +// Shuffle table for converting ARGB to RAW.
   1.135 +static const uvec8 kShuffleMaskARGBToRAW = {
   1.136 +  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
   1.137 +};
   1.138 +
    1.139 +// Shuffle table for converting ARGB to RGB24 in I422ToRGB24. First 8 pixels + next 4.
   1.140 +static const uvec8 kShuffleMaskARGBToRGB24_0 = {
   1.141 +  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
   1.142 +};
   1.143 +
    1.144 +// Shuffle table for converting ARGB to RAW in I422ToRAW. First 8 pixels + next 4.
   1.145 +static const uvec8 kShuffleMaskARGBToRAW_0 = {
   1.146 +  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
   1.147 +};
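// The tables above follow pshufb semantics: each output byte selects a
// source byte by index, and an entry with the high bit set (the 128u
// values) zeroes that output byte. A scalar sketch of one application:
//
//   for (int i = 0; i < 16; ++i) {
//     dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
//   }
//
// The 128u entries in the ARGB-to-RGB24/RAW masks blank unused tail bytes;
// in the RGB24/RAW-to-ARGB direction the alpha slot is filled afterwards by
// ORing with a 0xff000000 mask built with pcmpeqb/pslld.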
   1.148 +
   1.149 +// Duplicates gray value 3 times and fills in alpha opaque.
   1.150 +__declspec(naked) __declspec(align(16))
   1.151 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   1.152 +  __asm {
   1.153 +    mov        eax, [esp + 4]        // src_y
   1.154 +    mov        edx, [esp + 8]        // dst_argb
   1.155 +    mov        ecx, [esp + 12]       // pix
   1.156 +    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
   1.157 +    pslld      xmm5, 24
   1.158 +
   1.159 +    align      4
   1.160 +  convertloop:
   1.161 +    movq       xmm0, qword ptr [eax]
   1.162 +    lea        eax,  [eax + 8]
   1.163 +    punpcklbw  xmm0, xmm0
   1.164 +    movdqa     xmm1, xmm0
   1.165 +    punpcklwd  xmm0, xmm0
   1.166 +    punpckhwd  xmm1, xmm1
   1.167 +    por        xmm0, xmm5
   1.168 +    por        xmm1, xmm5
   1.169 +    movdqa     [edx], xmm0
   1.170 +    movdqa     [edx + 16], xmm1
   1.171 +    lea        edx, [edx + 32]
   1.172 +    sub        ecx, 8
   1.173 +    jg         convertloop
   1.174 +    ret
   1.175 +  }
   1.176 +}
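// Scalar equivalent of the expansion above (a sketch, not part of this
// file): punpcklbw/punpcklwd replicate each gray byte four times and the
// por with the 0xff000000 mask forces alpha opaque.
//
//   for (int i = 0; i < pix; ++i) {
//     dst_argb[4 * i + 0] = src_y[i];  // B
//     dst_argb[4 * i + 1] = src_y[i];  // G
//     dst_argb[4 * i + 2] = src_y[i];  // R
//     dst_argb[4 * i + 3] = 255;       // A
//   }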
   1.177 +
   1.178 +__declspec(naked) __declspec(align(16))
   1.179 +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
   1.180 +                                  int pix) {
   1.181 +  __asm {
   1.182 +    mov        eax, [esp + 4]        // src_y
   1.183 +    mov        edx, [esp + 8]        // dst_argb
   1.184 +    mov        ecx, [esp + 12]       // pix
   1.185 +    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
   1.186 +    pslld      xmm5, 24
   1.187 +
   1.188 +    align      4
   1.189 +  convertloop:
   1.190 +    movq       xmm0, qword ptr [eax]
   1.191 +    lea        eax,  [eax + 8]
   1.192 +    punpcklbw  xmm0, xmm0
   1.193 +    movdqa     xmm1, xmm0
   1.194 +    punpcklwd  xmm0, xmm0
   1.195 +    punpckhwd  xmm1, xmm1
   1.196 +    por        xmm0, xmm5
   1.197 +    por        xmm1, xmm5
   1.198 +    movdqu     [edx], xmm0
   1.199 +    movdqu     [edx + 16], xmm1
   1.200 +    lea        edx, [edx + 32]
   1.201 +    sub        ecx, 8
   1.202 +    jg         convertloop
   1.203 +    ret
   1.204 +  }
   1.205 +}
   1.206 +
   1.207 +__declspec(naked) __declspec(align(16))
   1.208 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   1.209 +  __asm {
   1.210 +    mov       eax, [esp + 4]   // src_rgb24
   1.211 +    mov       edx, [esp + 8]   // dst_argb
   1.212 +    mov       ecx, [esp + 12]  // pix
   1.213 +    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
   1.214 +    pslld     xmm5, 24
   1.215 +    movdqa    xmm4, kShuffleMaskRGB24ToARGB
   1.216 +
   1.217 +    align      4
   1.218 + convertloop:
   1.219 +    movdqu    xmm0, [eax]
   1.220 +    movdqu    xmm1, [eax + 16]
   1.221 +    movdqu    xmm3, [eax + 32]
   1.222 +    lea       eax, [eax + 48]
   1.223 +    movdqa    xmm2, xmm3
   1.224 +    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
   1.225 +    pshufb    xmm2, xmm4
   1.226 +    por       xmm2, xmm5
    1.227 +    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
   1.228 +    pshufb    xmm0, xmm4
   1.229 +    movdqa    [edx + 32], xmm2
   1.230 +    por       xmm0, xmm5
   1.231 +    pshufb    xmm1, xmm4
   1.232 +    movdqa    [edx], xmm0
   1.233 +    por       xmm1, xmm5
   1.234 +    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
   1.235 +    pshufb    xmm3, xmm4
   1.236 +    movdqa    [edx + 16], xmm1
   1.237 +    por       xmm3, xmm5
   1.238 +    sub       ecx, 16
   1.239 +    movdqa    [edx + 48], xmm3
   1.240 +    lea       edx, [edx + 64]
   1.241 +    jg        convertloop
   1.242 +    ret
   1.243 +  }
   1.244 +}
   1.245 +
   1.246 +__declspec(naked) __declspec(align(16))
   1.247 +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
   1.248 +                        int pix) {
   1.249 +  __asm {
   1.250 +    mov       eax, [esp + 4]   // src_raw
   1.251 +    mov       edx, [esp + 8]   // dst_argb
   1.252 +    mov       ecx, [esp + 12]  // pix
   1.253 +    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
   1.254 +    pslld     xmm5, 24
   1.255 +    movdqa    xmm4, kShuffleMaskRAWToARGB
   1.256 +
   1.257 +    align      4
   1.258 + convertloop:
   1.259 +    movdqu    xmm0, [eax]
   1.260 +    movdqu    xmm1, [eax + 16]
   1.261 +    movdqu    xmm3, [eax + 32]
   1.262 +    lea       eax, [eax + 48]
   1.263 +    movdqa    xmm2, xmm3
   1.264 +    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
   1.265 +    pshufb    xmm2, xmm4
   1.266 +    por       xmm2, xmm5
    1.267 +    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
   1.268 +    pshufb    xmm0, xmm4
   1.269 +    movdqa    [edx + 32], xmm2
   1.270 +    por       xmm0, xmm5
   1.271 +    pshufb    xmm1, xmm4
   1.272 +    movdqa    [edx], xmm0
   1.273 +    por       xmm1, xmm5
   1.274 +    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
   1.275 +    pshufb    xmm3, xmm4
   1.276 +    movdqa    [edx + 16], xmm1
   1.277 +    por       xmm3, xmm5
   1.278 +    sub       ecx, 16
   1.279 +    movdqa    [edx + 48], xmm3
   1.280 +    lea       edx, [edx + 64]
   1.281 +    jg        convertloop
   1.282 +    ret
   1.283 +  }
   1.284 +}
   1.285 +
   1.286 +// pmul method to replicate bits.
   1.287 +// Math to replicate bits:
   1.288 +// (v << 8) | (v << 3)
   1.289 +// v * 256 + v * 8
   1.290 +// v * (256 + 8)
   1.291 +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
   1.292 +// 20 instructions.
   1.293 +__declspec(naked) __declspec(align(16))
   1.294 +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
   1.295 +                          int pix) {
   1.296 +  __asm {
   1.297 +    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
   1.298 +    movd      xmm5, eax
   1.299 +    pshufd    xmm5, xmm5, 0
   1.300 +    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
   1.301 +    movd      xmm6, eax
   1.302 +    pshufd    xmm6, xmm6, 0
   1.303 +    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
   1.304 +    psllw     xmm3, 11
   1.305 +    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
   1.306 +    psllw     xmm4, 10
   1.307 +    psrlw     xmm4, 5
   1.308 +    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
   1.309 +    psllw     xmm7, 8
   1.310 +
   1.311 +    mov       eax, [esp + 4]   // src_rgb565
   1.312 +    mov       edx, [esp + 8]   // dst_argb
   1.313 +    mov       ecx, [esp + 12]  // pix
   1.314 +    sub       edx, eax
   1.315 +    sub       edx, eax
   1.316 +
   1.317 +    align      4
   1.318 + convertloop:
   1.319 +    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
   1.320 +    movdqa    xmm1, xmm0
   1.321 +    movdqa    xmm2, xmm0
   1.322 +    pand      xmm1, xmm3    // R in upper 5 bits
   1.323 +    psllw     xmm2, 11      // B in upper 5 bits
   1.324 +    pmulhuw   xmm1, xmm5    // * (256 + 8)
   1.325 +    pmulhuw   xmm2, xmm5    // * (256 + 8)
   1.326 +    psllw     xmm1, 8
   1.327 +    por       xmm1, xmm2    // RB
   1.328 +    pand      xmm0, xmm4    // G in middle 6 bits
   1.329 +    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
   1.330 +    por       xmm0, xmm7    // AG
   1.331 +    movdqa    xmm2, xmm1
   1.332 +    punpcklbw xmm1, xmm0
   1.333 +    punpckhbw xmm2, xmm0
   1.334 +    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
   1.335 +    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
   1.336 +    lea       eax, [eax + 16]
   1.337 +    sub       ecx, 8
   1.338 +    jg        convertloop
   1.339 +    ret
   1.340 +  }
   1.341 +}
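// Scalar equivalent of the replication above (a sketch): a 5-bit channel v
// expands to 8 bits as (v << 3) | (v >> 2), which pmulhuw computes as
// v * 0x0108 taken from the high word; the 6-bit green expands as
// (g << 2) | (g >> 4) via the 0x2080 multiplier.
//
//   uint16 rgb = src_rgb565[2 * i] | (src_rgb565[2 * i + 1] << 8);
//   uint8 b = rgb & 0x1f, g = (rgb >> 5) & 0x3f, r = rgb >> 11;
//   dst_argb[4 * i + 0] = (b << 3) | (b >> 2);
//   dst_argb[4 * i + 1] = (g << 2) | (g >> 4);
//   dst_argb[4 * i + 2] = (r << 3) | (r >> 2);
//   dst_argb[4 * i + 3] = 255;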
   1.342 +
   1.343 +// 24 instructions
   1.344 +__declspec(naked) __declspec(align(16))
   1.345 +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
   1.346 +                            int pix) {
   1.347 +  __asm {
   1.348 +    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
   1.349 +    movd      xmm5, eax
   1.350 +    pshufd    xmm5, xmm5, 0
   1.351 +    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
   1.352 +    movd      xmm6, eax
   1.353 +    pshufd    xmm6, xmm6, 0
   1.354 +    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
   1.355 +    psllw     xmm3, 11
   1.356 +    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
   1.357 +    psrlw     xmm4, 6
   1.358 +    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
   1.359 +    psllw     xmm7, 8
   1.360 +
   1.361 +    mov       eax, [esp + 4]   // src_argb1555
   1.362 +    mov       edx, [esp + 8]   // dst_argb
   1.363 +    mov       ecx, [esp + 12]  // pix
   1.364 +    sub       edx, eax
   1.365 +    sub       edx, eax
   1.366 +
   1.367 +    align      4
   1.368 + convertloop:
   1.369 +    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
   1.370 +    movdqa    xmm1, xmm0
   1.371 +    movdqa    xmm2, xmm0
   1.372 +    psllw     xmm1, 1       // R in upper 5 bits
   1.373 +    psllw     xmm2, 11      // B in upper 5 bits
   1.374 +    pand      xmm1, xmm3
   1.375 +    pmulhuw   xmm2, xmm5    // * (256 + 8)
   1.376 +    pmulhuw   xmm1, xmm5    // * (256 + 8)
   1.377 +    psllw     xmm1, 8
   1.378 +    por       xmm1, xmm2    // RB
   1.379 +    movdqa    xmm2, xmm0
   1.380 +    pand      xmm0, xmm4    // G in middle 5 bits
   1.381 +    psraw     xmm2, 8       // A
   1.382 +    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
   1.383 +    pand      xmm2, xmm7
   1.384 +    por       xmm0, xmm2    // AG
   1.385 +    movdqa    xmm2, xmm1
   1.386 +    punpcklbw xmm1, xmm0
   1.387 +    punpckhbw xmm2, xmm0
   1.388 +    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
   1.389 +    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
   1.390 +    lea       eax, [eax + 16]
   1.391 +    sub       ecx, 8
   1.392 +    jg        convertloop
   1.393 +    ret
   1.394 +  }
   1.395 +}
   1.396 +
   1.397 +// 18 instructions.
   1.398 +__declspec(naked) __declspec(align(16))
   1.399 +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
   1.400 +                            int pix) {
   1.401 +  __asm {
   1.402 +    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
   1.403 +    movd      xmm4, eax
   1.404 +    pshufd    xmm4, xmm4, 0
   1.405 +    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
   1.406 +    pslld     xmm5, 4
   1.407 +    mov       eax, [esp + 4]   // src_argb4444
   1.408 +    mov       edx, [esp + 8]   // dst_argb
   1.409 +    mov       ecx, [esp + 12]  // pix
   1.410 +    sub       edx, eax
   1.411 +    sub       edx, eax
   1.412 +
   1.413 +    align      4
   1.414 + convertloop:
   1.415 +    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
   1.416 +    movdqa    xmm2, xmm0
   1.417 +    pand      xmm0, xmm4    // mask low nibbles
   1.418 +    pand      xmm2, xmm5    // mask high nibbles
   1.419 +    movdqa    xmm1, xmm0
   1.420 +    movdqa    xmm3, xmm2
   1.421 +    psllw     xmm1, 4
   1.422 +    psrlw     xmm3, 4
   1.423 +    por       xmm0, xmm1
   1.424 +    por       xmm2, xmm3
   1.425 +    movdqa    xmm1, xmm0
   1.426 +    punpcklbw xmm0, xmm2
   1.427 +    punpckhbw xmm1, xmm2
   1.428 +    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
   1.429 +    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
   1.430 +    lea       eax, [eax + 16]
   1.431 +    sub       ecx, 8
   1.432 +    jg        convertloop
   1.433 +    ret
   1.434 +  }
   1.435 +}
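// Nibble replication sketch: a 4-bit channel v expands to 8 bits as
// (v << 4) | v, i.e. v * 17, which is what the masked shift-and-or above
// computes for all four channels at once:
//
//   uint8 Expand4(uint8 v) { return (uint8)((v << 4) | v); }  // == v * 17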
   1.436 +
   1.437 +__declspec(naked) __declspec(align(16))
   1.438 +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   1.439 +  __asm {
   1.440 +    mov       eax, [esp + 4]   // src_argb
   1.441 +    mov       edx, [esp + 8]   // dst_rgb
   1.442 +    mov       ecx, [esp + 12]  // pix
   1.443 +    movdqa    xmm6, kShuffleMaskARGBToRGB24
   1.444 +
   1.445 +    align      4
   1.446 + convertloop:
   1.447 +    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
   1.448 +    movdqu    xmm1, [eax + 16]
   1.449 +    movdqu    xmm2, [eax + 32]
   1.450 +    movdqu    xmm3, [eax + 48]
   1.451 +    lea       eax, [eax + 64]
   1.452 +    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
   1.453 +    pshufb    xmm1, xmm6
   1.454 +    pshufb    xmm2, xmm6
   1.455 +    pshufb    xmm3, xmm6
   1.456 +    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
   1.457 +    psrldq    xmm1, 4      // 8 bytes from 1
   1.458 +    pslldq    xmm4, 12     // 4 bytes from 1 for 0
   1.459 +    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
   1.460 +    por       xmm0, xmm4   // 4 bytes from 1 for 0
   1.461 +    pslldq    xmm5, 8      // 8 bytes from 2 for 1
   1.462 +    movdqu    [edx], xmm0  // store 0
   1.463 +    por       xmm1, xmm5   // 8 bytes from 2 for 1
   1.464 +    psrldq    xmm2, 8      // 4 bytes from 2
   1.465 +    pslldq    xmm3, 4      // 12 bytes from 3 for 2
   1.466 +    por       xmm2, xmm3   // 12 bytes from 3 for 2
   1.467 +    movdqu    [edx + 16], xmm1   // store 1
   1.468 +    movdqu    [edx + 32], xmm2   // store 2
   1.469 +    lea       edx, [edx + 48]
   1.470 +    sub       ecx, 16
   1.471 +    jg        convertloop
   1.472 +    ret
   1.473 +  }
   1.474 +}
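// The tail of the loop splices the four 12-byte pshufb results into three
// 16-byte stores. In byte terms (a sketch of the shift/or choreography):
//
//   store 0 = reg0 bytes 0..11 | reg1 bytes 0..3  shifted to bytes 12..15
//   store 1 = reg1 bytes 4..11 | reg2 bytes 0..7  shifted to bytes 8..15
//   store 2 = reg2 bytes 8..11 | reg3 bytes 0..11 shifted to bytes 4..15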
   1.475 +
   1.476 +__declspec(naked) __declspec(align(16))
   1.477 +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   1.478 +  __asm {
   1.479 +    mov       eax, [esp + 4]   // src_argb
   1.480 +    mov       edx, [esp + 8]   // dst_rgb
   1.481 +    mov       ecx, [esp + 12]  // pix
   1.482 +    movdqa    xmm6, kShuffleMaskARGBToRAW
   1.483 +
   1.484 +    align      4
   1.485 + convertloop:
   1.486 +    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
   1.487 +    movdqu    xmm1, [eax + 16]
   1.488 +    movdqu    xmm2, [eax + 32]
   1.489 +    movdqu    xmm3, [eax + 48]
   1.490 +    lea       eax, [eax + 64]
   1.491 +    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
   1.492 +    pshufb    xmm1, xmm6
   1.493 +    pshufb    xmm2, xmm6
   1.494 +    pshufb    xmm3, xmm6
   1.495 +    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
   1.496 +    psrldq    xmm1, 4      // 8 bytes from 1
   1.497 +    pslldq    xmm4, 12     // 4 bytes from 1 for 0
   1.498 +    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
   1.499 +    por       xmm0, xmm4   // 4 bytes from 1 for 0
   1.500 +    pslldq    xmm5, 8      // 8 bytes from 2 for 1
   1.501 +    movdqu    [edx], xmm0  // store 0
   1.502 +    por       xmm1, xmm5   // 8 bytes from 2 for 1
   1.503 +    psrldq    xmm2, 8      // 4 bytes from 2
   1.504 +    pslldq    xmm3, 4      // 12 bytes from 3 for 2
   1.505 +    por       xmm2, xmm3   // 12 bytes from 3 for 2
   1.506 +    movdqu    [edx + 16], xmm1   // store 1
   1.507 +    movdqu    [edx + 32], xmm2   // store 2
   1.508 +    lea       edx, [edx + 48]
   1.509 +    sub       ecx, 16
   1.510 +    jg        convertloop
   1.511 +    ret
   1.512 +  }
   1.513 +}
   1.514 +
   1.515 +__declspec(naked) __declspec(align(16))
   1.516 +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   1.517 +  __asm {
   1.518 +    mov       eax, [esp + 4]   // src_argb
   1.519 +    mov       edx, [esp + 8]   // dst_rgb
   1.520 +    mov       ecx, [esp + 12]  // pix
   1.521 +    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
   1.522 +    psrld     xmm3, 27
   1.523 +    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
   1.524 +    psrld     xmm4, 26
   1.525 +    pslld     xmm4, 5
   1.526 +    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
   1.527 +    pslld     xmm5, 11
   1.528 +
   1.529 +    align      4
   1.530 + convertloop:
   1.531 +    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   1.532 +    movdqa    xmm1, xmm0    // B
   1.533 +    movdqa    xmm2, xmm0    // G
   1.534 +    pslld     xmm0, 8       // R
   1.535 +    psrld     xmm1, 3       // B
   1.536 +    psrld     xmm2, 5       // G
   1.537 +    psrad     xmm0, 16      // R
   1.538 +    pand      xmm1, xmm3    // B
   1.539 +    pand      xmm2, xmm4    // G
   1.540 +    pand      xmm0, xmm5    // R
   1.541 +    por       xmm1, xmm2    // BG
   1.542 +    por       xmm0, xmm1    // BGR
   1.543 +    packssdw  xmm0, xmm0
   1.544 +    lea       eax, [eax + 16]
   1.545 +    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
   1.546 +    lea       edx, [edx + 8]
   1.547 +    sub       ecx, 4
   1.548 +    jg        convertloop
   1.549 +    ret
   1.550 +  }
   1.551 +}
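// Scalar equivalent of the pack above (a sketch; Pack565 is a hypothetical
// helper): keep the top bits of each channel and reassemble a little-endian
// 565 word.
//
//   static uint16 Pack565(uint8 b, uint8 g, uint8 r) {
//     return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
//   }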
   1.552 +
   1.553 +// TODO(fbarchard): Improve sign extension/packing.
   1.554 +__declspec(naked) __declspec(align(16))
   1.555 +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   1.556 +  __asm {
   1.557 +    mov       eax, [esp + 4]   // src_argb
   1.558 +    mov       edx, [esp + 8]   // dst_rgb
   1.559 +    mov       ecx, [esp + 12]  // pix
   1.560 +    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
   1.561 +    psrld     xmm4, 27
   1.562 +    movdqa    xmm5, xmm4       // generate mask 0x000003e0
   1.563 +    pslld     xmm5, 5
   1.564 +    movdqa    xmm6, xmm4       // generate mask 0x00007c00
   1.565 +    pslld     xmm6, 10
   1.566 +    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
   1.567 +    pslld     xmm7, 15
   1.568 +
   1.569 +    align      4
   1.570 + convertloop:
   1.571 +    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   1.572 +    movdqa    xmm1, xmm0    // B
   1.573 +    movdqa    xmm2, xmm0    // G
   1.574 +    movdqa    xmm3, xmm0    // R
   1.575 +    psrad     xmm0, 16      // A
   1.576 +    psrld     xmm1, 3       // B
   1.577 +    psrld     xmm2, 6       // G
   1.578 +    psrld     xmm3, 9       // R
   1.579 +    pand      xmm0, xmm7    // A
   1.580 +    pand      xmm1, xmm4    // B
   1.581 +    pand      xmm2, xmm5    // G
   1.582 +    pand      xmm3, xmm6    // R
   1.583 +    por       xmm0, xmm1    // BA
   1.584 +    por       xmm2, xmm3    // GR
   1.585 +    por       xmm0, xmm2    // BGRA
   1.586 +    packssdw  xmm0, xmm0
   1.587 +    lea       eax, [eax + 16]
   1.588 +    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
   1.589 +    lea       edx, [edx + 8]
   1.590 +    sub       ecx, 4
   1.591 +    jg        convertloop
   1.592 +    ret
   1.593 +  }
   1.594 +}
   1.595 +
   1.596 +__declspec(naked) __declspec(align(16))
   1.597 +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   1.598 +  __asm {
   1.599 +    mov       eax, [esp + 4]   // src_argb
   1.600 +    mov       edx, [esp + 8]   // dst_rgb
   1.601 +    mov       ecx, [esp + 12]  // pix
   1.602 +    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
   1.603 +    psllw     xmm4, 12
   1.604 +    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
   1.605 +    psrlw     xmm3, 8
   1.606 +
   1.607 +    align      4
   1.608 + convertloop:
   1.609 +    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   1.610 +    movdqa    xmm1, xmm0
   1.611 +    pand      xmm0, xmm3    // low nibble
   1.612 +    pand      xmm1, xmm4    // high nibble
    1.613 +    psrld     xmm0, 4
    1.614 +    psrld     xmm1, 8
   1.615 +    por       xmm0, xmm1
   1.616 +    packuswb  xmm0, xmm0
   1.617 +    lea       eax, [eax + 16]
   1.618 +    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
   1.619 +    lea       edx, [edx + 8]
   1.620 +    sub       ecx, 4
   1.621 +    jg        convertloop
   1.622 +    ret
   1.623 +  }
   1.624 +}
   1.625 +
   1.626 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
   1.627 +__declspec(naked) __declspec(align(16))
   1.628 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.629 +  __asm {
   1.630 +    mov        eax, [esp + 4]   /* src_argb */
   1.631 +    mov        edx, [esp + 8]   /* dst_y */
   1.632 +    mov        ecx, [esp + 12]  /* pix */
   1.633 +    movdqa     xmm5, kAddY16
   1.634 +    movdqa     xmm4, kARGBToY
   1.635 +
   1.636 +    align      4
   1.637 + convertloop:
   1.638 +    movdqa     xmm0, [eax]
   1.639 +    movdqa     xmm1, [eax + 16]
   1.640 +    movdqa     xmm2, [eax + 32]
   1.641 +    movdqa     xmm3, [eax + 48]
   1.642 +    pmaddubsw  xmm0, xmm4
   1.643 +    pmaddubsw  xmm1, xmm4
   1.644 +    pmaddubsw  xmm2, xmm4
   1.645 +    pmaddubsw  xmm3, xmm4
   1.646 +    lea        eax, [eax + 64]
   1.647 +    phaddw     xmm0, xmm1
   1.648 +    phaddw     xmm2, xmm3
   1.649 +    psrlw      xmm0, 7
   1.650 +    psrlw      xmm2, 7
   1.651 +    packuswb   xmm0, xmm2
   1.652 +    paddb      xmm0, xmm5
   1.653 +    sub        ecx, 16
   1.654 +    movdqa     [edx], xmm0
   1.655 +    lea        edx, [edx + 16]
   1.656 +    jg         convertloop
   1.657 +    ret
   1.658 +  }
   1.659 +}
   1.660 +
   1.661 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
   1.662 +__declspec(naked) __declspec(align(16))
   1.663 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.664 +  __asm {
   1.665 +    mov        eax, [esp + 4]   /* src_argb */
   1.666 +    mov        edx, [esp + 8]   /* dst_y */
   1.667 +    mov        ecx, [esp + 12]  /* pix */
   1.668 +    movdqa     xmm4, kARGBToYJ
   1.669 +    movdqa     xmm5, kAddYJ64
   1.670 +
   1.671 +    align      4
   1.672 + convertloop:
   1.673 +    movdqa     xmm0, [eax]
   1.674 +    movdqa     xmm1, [eax + 16]
   1.675 +    movdqa     xmm2, [eax + 32]
   1.676 +    movdqa     xmm3, [eax + 48]
   1.677 +    pmaddubsw  xmm0, xmm4
   1.678 +    pmaddubsw  xmm1, xmm4
   1.679 +    pmaddubsw  xmm2, xmm4
   1.680 +    pmaddubsw  xmm3, xmm4
   1.681 +    lea        eax, [eax + 64]
   1.682 +    phaddw     xmm0, xmm1
   1.683 +    phaddw     xmm2, xmm3
   1.684 +    paddw      xmm0, xmm5  // Add .5 for rounding.
   1.685 +    paddw      xmm2, xmm5
   1.686 +    psrlw      xmm0, 7
   1.687 +    psrlw      xmm2, 7
   1.688 +    packuswb   xmm0, xmm2
   1.689 +    sub        ecx, 16
   1.690 +    movdqa     [edx], xmm0
   1.691 +    lea        edx, [edx + 16]
   1.692 +    jg         convertloop
   1.693 +    ret
   1.694 +  }
   1.695 +}
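// The J (JPEG) variant differs from ARGBToYRow_SSSE3 in two ways: the
// full-range coefficients of kARGBToYJ (which sum to 128) and true
// round-to-nearest via kAddYJ64 before the shift, with no +16 bias:
//
//   YJ = (15 * b + 75 * g + 38 * r + 64) >> 7;  // 64 is 0.5 in Q7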
   1.696 +
   1.697 +#ifdef HAS_ARGBTOYROW_AVX2
   1.698 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   1.699 +__declspec(naked) __declspec(align(32))
   1.700 +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   1.701 +  __asm {
   1.702 +    mov        eax, [esp + 4]   /* src_argb */
   1.703 +    mov        edx, [esp + 8]   /* dst_y */
   1.704 +    mov        ecx, [esp + 12]  /* pix */
   1.705 +    vbroadcastf128 ymm4, kARGBToY
   1.706 +    vbroadcastf128 ymm5, kAddY16
   1.707 +    vmovdqa    ymm6, kPermdARGBToY_AVX
   1.708 +
   1.709 +    align      4
   1.710 + convertloop:
   1.711 +    vmovdqu    ymm0, [eax]
   1.712 +    vmovdqu    ymm1, [eax + 32]
   1.713 +    vmovdqu    ymm2, [eax + 64]
   1.714 +    vmovdqu    ymm3, [eax + 96]
   1.715 +    vpmaddubsw ymm0, ymm0, ymm4
   1.716 +    vpmaddubsw ymm1, ymm1, ymm4
   1.717 +    vpmaddubsw ymm2, ymm2, ymm4
   1.718 +    vpmaddubsw ymm3, ymm3, ymm4
   1.719 +    lea        eax, [eax + 128]
   1.720 +    vphaddw    ymm0, ymm0, ymm1  // mutates.
   1.721 +    vphaddw    ymm2, ymm2, ymm3
   1.722 +    vpsrlw     ymm0, ymm0, 7
   1.723 +    vpsrlw     ymm2, ymm2, 7
   1.724 +    vpackuswb  ymm0, ymm0, ymm2  // mutates.
   1.725 +    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   1.726 +    vpaddb     ymm0, ymm0, ymm5
   1.727 +    sub        ecx, 32
   1.728 +    vmovdqu    [edx], ymm0
   1.729 +    lea        edx, [edx + 32]
   1.730 +    jg         convertloop
   1.731 +    vzeroupper
   1.732 +    ret
   1.733 +  }
   1.734 +}
   1.735 +#endif  //  HAS_ARGBTOYROW_AVX2
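// Note on kPermdARGBToY_AVX: the 256-bit vphaddw and vpackuswb operate
// within 128-bit lanes, so after both steps the eight 4-pixel groups of Y
// bytes land in order {0, 2, 4, 6, 1, 3, 5, 7}. vpermd with indices
// {0, 4, 1, 5, 2, 6, 3, 7} restores sequential order before the store.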
   1.736 +
    1.737 +#ifdef HAS_ARGBTOYJROW_AVX2
   1.738 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   1.739 +__declspec(naked) __declspec(align(32))
   1.740 +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   1.741 +  __asm {
   1.742 +    mov        eax, [esp + 4]   /* src_argb */
   1.743 +    mov        edx, [esp + 8]   /* dst_y */
   1.744 +    mov        ecx, [esp + 12]  /* pix */
   1.745 +    vbroadcastf128 ymm4, kARGBToYJ
   1.746 +    vbroadcastf128 ymm5, kAddYJ64
   1.747 +    vmovdqa    ymm6, kPermdARGBToY_AVX
   1.748 +
   1.749 +    align      4
   1.750 + convertloop:
   1.751 +    vmovdqu    ymm0, [eax]
   1.752 +    vmovdqu    ymm1, [eax + 32]
   1.753 +    vmovdqu    ymm2, [eax + 64]
   1.754 +    vmovdqu    ymm3, [eax + 96]
   1.755 +    vpmaddubsw ymm0, ymm0, ymm4
   1.756 +    vpmaddubsw ymm1, ymm1, ymm4
   1.757 +    vpmaddubsw ymm2, ymm2, ymm4
   1.758 +    vpmaddubsw ymm3, ymm3, ymm4
   1.759 +    lea        eax, [eax + 128]
   1.760 +    vphaddw    ymm0, ymm0, ymm1  // mutates.
   1.761 +    vphaddw    ymm2, ymm2, ymm3
   1.762 +    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
   1.763 +    vpaddw     ymm2, ymm2, ymm5
   1.764 +    vpsrlw     ymm0, ymm0, 7
   1.765 +    vpsrlw     ymm2, ymm2, 7
   1.766 +    vpackuswb  ymm0, ymm0, ymm2  // mutates.
   1.767 +    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   1.768 +    sub        ecx, 32
   1.769 +    vmovdqu    [edx], ymm0
   1.770 +    lea        edx, [edx + 32]
   1.771 +    jg         convertloop
   1.772 +
   1.773 +    vzeroupper
   1.774 +    ret
   1.775 +  }
   1.776 +}
   1.777 +#endif  //  HAS_ARGBTOYJROW_AVX2
   1.778 +
   1.779 +__declspec(naked) __declspec(align(16))
   1.780 +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.781 +  __asm {
   1.782 +    mov        eax, [esp + 4]   /* src_argb */
   1.783 +    mov        edx, [esp + 8]   /* dst_y */
   1.784 +    mov        ecx, [esp + 12]  /* pix */
   1.785 +    movdqa     xmm5, kAddY16
   1.786 +    movdqa     xmm4, kARGBToY
   1.787 +
   1.788 +    align      4
   1.789 + convertloop:
   1.790 +    movdqu     xmm0, [eax]
   1.791 +    movdqu     xmm1, [eax + 16]
   1.792 +    movdqu     xmm2, [eax + 32]
   1.793 +    movdqu     xmm3, [eax + 48]
   1.794 +    pmaddubsw  xmm0, xmm4
   1.795 +    pmaddubsw  xmm1, xmm4
   1.796 +    pmaddubsw  xmm2, xmm4
   1.797 +    pmaddubsw  xmm3, xmm4
   1.798 +    lea        eax, [eax + 64]
   1.799 +    phaddw     xmm0, xmm1
   1.800 +    phaddw     xmm2, xmm3
   1.801 +    psrlw      xmm0, 7
   1.802 +    psrlw      xmm2, 7
   1.803 +    packuswb   xmm0, xmm2
   1.804 +    paddb      xmm0, xmm5
   1.805 +    sub        ecx, 16
   1.806 +    movdqu     [edx], xmm0
   1.807 +    lea        edx, [edx + 16]
   1.808 +    jg         convertloop
   1.809 +    ret
   1.810 +  }
   1.811 +}
   1.812 +
   1.813 +__declspec(naked) __declspec(align(16))
   1.814 +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.815 +  __asm {
   1.816 +    mov        eax, [esp + 4]   /* src_argb */
   1.817 +    mov        edx, [esp + 8]   /* dst_y */
   1.818 +    mov        ecx, [esp + 12]  /* pix */
   1.819 +    movdqa     xmm4, kARGBToYJ
   1.820 +    movdqa     xmm5, kAddYJ64
   1.821 +
   1.822 +    align      4
   1.823 + convertloop:
   1.824 +    movdqu     xmm0, [eax]
   1.825 +    movdqu     xmm1, [eax + 16]
   1.826 +    movdqu     xmm2, [eax + 32]
   1.827 +    movdqu     xmm3, [eax + 48]
   1.828 +    pmaddubsw  xmm0, xmm4
   1.829 +    pmaddubsw  xmm1, xmm4
   1.830 +    pmaddubsw  xmm2, xmm4
   1.831 +    pmaddubsw  xmm3, xmm4
   1.832 +    lea        eax, [eax + 64]
   1.833 +    phaddw     xmm0, xmm1
   1.834 +    phaddw     xmm2, xmm3
   1.835 +    paddw      xmm0, xmm5
   1.836 +    paddw      xmm2, xmm5
   1.837 +    psrlw      xmm0, 7
   1.838 +    psrlw      xmm2, 7
   1.839 +    packuswb   xmm0, xmm2
   1.840 +    sub        ecx, 16
   1.841 +    movdqu     [edx], xmm0
   1.842 +    lea        edx, [edx + 16]
   1.843 +    jg         convertloop
   1.844 +    ret
   1.845 +  }
   1.846 +}
   1.847 +
   1.848 +__declspec(naked) __declspec(align(16))
   1.849 +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.850 +  __asm {
   1.851 +    mov        eax, [esp + 4]   /* src_argb */
   1.852 +    mov        edx, [esp + 8]   /* dst_y */
   1.853 +    mov        ecx, [esp + 12]  /* pix */
   1.854 +    movdqa     xmm5, kAddY16
   1.855 +    movdqa     xmm4, kBGRAToY
   1.856 +
   1.857 +    align      4
   1.858 + convertloop:
   1.859 +    movdqa     xmm0, [eax]
   1.860 +    movdqa     xmm1, [eax + 16]
   1.861 +    movdqa     xmm2, [eax + 32]
   1.862 +    movdqa     xmm3, [eax + 48]
   1.863 +    pmaddubsw  xmm0, xmm4
   1.864 +    pmaddubsw  xmm1, xmm4
   1.865 +    pmaddubsw  xmm2, xmm4
   1.866 +    pmaddubsw  xmm3, xmm4
   1.867 +    lea        eax, [eax + 64]
   1.868 +    phaddw     xmm0, xmm1
   1.869 +    phaddw     xmm2, xmm3
   1.870 +    psrlw      xmm0, 7
   1.871 +    psrlw      xmm2, 7
   1.872 +    packuswb   xmm0, xmm2
   1.873 +    paddb      xmm0, xmm5
   1.874 +    sub        ecx, 16
   1.875 +    movdqa     [edx], xmm0
   1.876 +    lea        edx, [edx + 16]
   1.877 +    jg         convertloop
   1.878 +    ret
   1.879 +  }
   1.880 +}
   1.881 +
   1.882 +__declspec(naked) __declspec(align(16))
   1.883 +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.884 +  __asm {
   1.885 +    mov        eax, [esp + 4]   /* src_argb */
   1.886 +    mov        edx, [esp + 8]   /* dst_y */
   1.887 +    mov        ecx, [esp + 12]  /* pix */
   1.888 +    movdqa     xmm5, kAddY16
   1.889 +    movdqa     xmm4, kBGRAToY
   1.890 +
   1.891 +    align      4
   1.892 + convertloop:
   1.893 +    movdqu     xmm0, [eax]
   1.894 +    movdqu     xmm1, [eax + 16]
   1.895 +    movdqu     xmm2, [eax + 32]
   1.896 +    movdqu     xmm3, [eax + 48]
   1.897 +    pmaddubsw  xmm0, xmm4
   1.898 +    pmaddubsw  xmm1, xmm4
   1.899 +    pmaddubsw  xmm2, xmm4
   1.900 +    pmaddubsw  xmm3, xmm4
   1.901 +    lea        eax, [eax + 64]
   1.902 +    phaddw     xmm0, xmm1
   1.903 +    phaddw     xmm2, xmm3
   1.904 +    psrlw      xmm0, 7
   1.905 +    psrlw      xmm2, 7
   1.906 +    packuswb   xmm0, xmm2
   1.907 +    paddb      xmm0, xmm5
   1.908 +    sub        ecx, 16
   1.909 +    movdqu     [edx], xmm0
   1.910 +    lea        edx, [edx + 16]
   1.911 +    jg         convertloop
   1.912 +    ret
   1.913 +  }
   1.914 +}
   1.915 +
   1.916 +__declspec(naked) __declspec(align(16))
   1.917 +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.918 +  __asm {
   1.919 +    mov        eax, [esp + 4]   /* src_argb */
   1.920 +    mov        edx, [esp + 8]   /* dst_y */
   1.921 +    mov        ecx, [esp + 12]  /* pix */
   1.922 +    movdqa     xmm5, kAddY16
   1.923 +    movdqa     xmm4, kABGRToY
   1.924 +
   1.925 +    align      4
   1.926 + convertloop:
   1.927 +    movdqa     xmm0, [eax]
   1.928 +    movdqa     xmm1, [eax + 16]
   1.929 +    movdqa     xmm2, [eax + 32]
   1.930 +    movdqa     xmm3, [eax + 48]
   1.931 +    pmaddubsw  xmm0, xmm4
   1.932 +    pmaddubsw  xmm1, xmm4
   1.933 +    pmaddubsw  xmm2, xmm4
   1.934 +    pmaddubsw  xmm3, xmm4
   1.935 +    lea        eax, [eax + 64]
   1.936 +    phaddw     xmm0, xmm1
   1.937 +    phaddw     xmm2, xmm3
   1.938 +    psrlw      xmm0, 7
   1.939 +    psrlw      xmm2, 7
   1.940 +    packuswb   xmm0, xmm2
   1.941 +    paddb      xmm0, xmm5
   1.942 +    sub        ecx, 16
   1.943 +    movdqa     [edx], xmm0
   1.944 +    lea        edx, [edx + 16]
   1.945 +    jg         convertloop
   1.946 +    ret
   1.947 +  }
   1.948 +}
   1.949 +
   1.950 +__declspec(naked) __declspec(align(16))
   1.951 +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.952 +  __asm {
   1.953 +    mov        eax, [esp + 4]   /* src_argb */
   1.954 +    mov        edx, [esp + 8]   /* dst_y */
   1.955 +    mov        ecx, [esp + 12]  /* pix */
   1.956 +    movdqa     xmm5, kAddY16
   1.957 +    movdqa     xmm4, kABGRToY
   1.958 +
   1.959 +    align      4
   1.960 + convertloop:
   1.961 +    movdqu     xmm0, [eax]
   1.962 +    movdqu     xmm1, [eax + 16]
   1.963 +    movdqu     xmm2, [eax + 32]
   1.964 +    movdqu     xmm3, [eax + 48]
   1.965 +    pmaddubsw  xmm0, xmm4
   1.966 +    pmaddubsw  xmm1, xmm4
   1.967 +    pmaddubsw  xmm2, xmm4
   1.968 +    pmaddubsw  xmm3, xmm4
   1.969 +    lea        eax, [eax + 64]
   1.970 +    phaddw     xmm0, xmm1
   1.971 +    phaddw     xmm2, xmm3
   1.972 +    psrlw      xmm0, 7
   1.973 +    psrlw      xmm2, 7
   1.974 +    packuswb   xmm0, xmm2
   1.975 +    paddb      xmm0, xmm5
   1.976 +    sub        ecx, 16
   1.977 +    movdqu     [edx], xmm0
   1.978 +    lea        edx, [edx + 16]
   1.979 +    jg         convertloop
   1.980 +    ret
   1.981 +  }
   1.982 +}
   1.983 +
   1.984 +__declspec(naked) __declspec(align(16))
   1.985 +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   1.986 +  __asm {
   1.987 +    mov        eax, [esp + 4]   /* src_argb */
   1.988 +    mov        edx, [esp + 8]   /* dst_y */
   1.989 +    mov        ecx, [esp + 12]  /* pix */
   1.990 +    movdqa     xmm5, kAddY16
   1.991 +    movdqa     xmm4, kRGBAToY
   1.992 +
   1.993 +    align      4
   1.994 + convertloop:
   1.995 +    movdqa     xmm0, [eax]
   1.996 +    movdqa     xmm1, [eax + 16]
   1.997 +    movdqa     xmm2, [eax + 32]
   1.998 +    movdqa     xmm3, [eax + 48]
   1.999 +    pmaddubsw  xmm0, xmm4
  1.1000 +    pmaddubsw  xmm1, xmm4
  1.1001 +    pmaddubsw  xmm2, xmm4
  1.1002 +    pmaddubsw  xmm3, xmm4
  1.1003 +    lea        eax, [eax + 64]
  1.1004 +    phaddw     xmm0, xmm1
  1.1005 +    phaddw     xmm2, xmm3
  1.1006 +    psrlw      xmm0, 7
  1.1007 +    psrlw      xmm2, 7
  1.1008 +    packuswb   xmm0, xmm2
  1.1009 +    paddb      xmm0, xmm5
  1.1010 +    sub        ecx, 16
  1.1011 +    movdqa     [edx], xmm0
  1.1012 +    lea        edx, [edx + 16]
  1.1013 +    jg         convertloop
  1.1014 +    ret
  1.1015 +  }
  1.1016 +}
  1.1017 +
  1.1018 +__declspec(naked) __declspec(align(16))
  1.1019 +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  1.1020 +  __asm {
  1.1021 +    mov        eax, [esp + 4]   /* src_argb */
  1.1022 +    mov        edx, [esp + 8]   /* dst_y */
  1.1023 +    mov        ecx, [esp + 12]  /* pix */
  1.1024 +    movdqa     xmm5, kAddY16
  1.1025 +    movdqa     xmm4, kRGBAToY
  1.1026 +
  1.1027 +    align      4
  1.1028 + convertloop:
  1.1029 +    movdqu     xmm0, [eax]
  1.1030 +    movdqu     xmm1, [eax + 16]
  1.1031 +    movdqu     xmm2, [eax + 32]
  1.1032 +    movdqu     xmm3, [eax + 48]
  1.1033 +    pmaddubsw  xmm0, xmm4
  1.1034 +    pmaddubsw  xmm1, xmm4
  1.1035 +    pmaddubsw  xmm2, xmm4
  1.1036 +    pmaddubsw  xmm3, xmm4
  1.1037 +    lea        eax, [eax + 64]
  1.1038 +    phaddw     xmm0, xmm1
  1.1039 +    phaddw     xmm2, xmm3
  1.1040 +    psrlw      xmm0, 7
  1.1041 +    psrlw      xmm2, 7
  1.1042 +    packuswb   xmm0, xmm2
  1.1043 +    paddb      xmm0, xmm5
  1.1044 +    sub        ecx, 16
  1.1045 +    movdqu     [edx], xmm0
  1.1046 +    lea        edx, [edx + 16]
  1.1047 +    jg         convertloop
  1.1048 +    ret
  1.1049 +  }
  1.1050 +}
  1.1051 +
  1.1052 +__declspec(naked) __declspec(align(16))
  1.1053 +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1054 +                       uint8* dst_u, uint8* dst_v, int width) {
  1.1055 +  __asm {
  1.1056 +    push       esi
  1.1057 +    push       edi
  1.1058 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1059 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1060 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1061 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1062 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1063 +    movdqa     xmm7, kARGBToU
  1.1064 +    movdqa     xmm6, kARGBToV
  1.1065 +    movdqa     xmm5, kAddUV128
  1.1066 +    sub        edi, edx             // stride from u to v
  1.1067 +
  1.1068 +    align      4
  1.1069 + convertloop:
  1.1070 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1071 +    movdqa     xmm0, [eax]
  1.1072 +    movdqa     xmm1, [eax + 16]
  1.1073 +    movdqa     xmm2, [eax + 32]
  1.1074 +    movdqa     xmm3, [eax + 48]
  1.1075 +    pavgb      xmm0, [eax + esi]
  1.1076 +    pavgb      xmm1, [eax + esi + 16]
  1.1077 +    pavgb      xmm2, [eax + esi + 32]
  1.1078 +    pavgb      xmm3, [eax + esi + 48]
  1.1079 +    lea        eax,  [eax + 64]
  1.1080 +    movdqa     xmm4, xmm0
  1.1081 +    shufps     xmm0, xmm1, 0x88
  1.1082 +    shufps     xmm4, xmm1, 0xdd
  1.1083 +    pavgb      xmm0, xmm4
  1.1084 +    movdqa     xmm4, xmm2
  1.1085 +    shufps     xmm2, xmm3, 0x88
  1.1086 +    shufps     xmm4, xmm3, 0xdd
  1.1087 +    pavgb      xmm2, xmm4
  1.1088 +
  1.1089 +    // step 2 - convert to U and V
  1.1090 +    // from here down is very similar to Y code except
   1.1091 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1092 +    movdqa     xmm1, xmm0
  1.1093 +    movdqa     xmm3, xmm2
  1.1094 +    pmaddubsw  xmm0, xmm7  // U
  1.1095 +    pmaddubsw  xmm2, xmm7
  1.1096 +    pmaddubsw  xmm1, xmm6  // V
  1.1097 +    pmaddubsw  xmm3, xmm6
  1.1098 +    phaddw     xmm0, xmm2
  1.1099 +    phaddw     xmm1, xmm3
  1.1100 +    psraw      xmm0, 8
  1.1101 +    psraw      xmm1, 8
  1.1102 +    packsswb   xmm0, xmm1
  1.1103 +    paddb      xmm0, xmm5            // -> unsigned
  1.1104 +
  1.1105 +    // step 3 - store 8 U and 8 V values
  1.1106 +    sub        ecx, 16
  1.1107 +    movlps     qword ptr [edx], xmm0 // U
  1.1108 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1109 +    lea        edx, [edx + 8]
  1.1110 +    jg         convertloop
  1.1111 +
  1.1112 +    pop        edi
  1.1113 +    pop        esi
  1.1114 +    ret
  1.1115 +  }
  1.1116 +}
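// Step 1 above is a rounding 2x2 box filter: the first pavgb averages the
// two rows, then shufps 0x88/0xdd separates even and odd pixels so the
// second pavgb averages horizontal neighbors. A scalar sketch of one U/V
// pair from a 2x2 block, with AVG(a, b) = (a + b + 1) >> 1 as in pavgb:
//
//   int b = AVG(AVG(b00, b10), AVG(b01, b11));  // likewise g and r
//   int u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
//   int v = ((-18 * b - 94 * g + 112 * r) >> 8) + 128;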
  1.1117 +
  1.1118 +__declspec(naked) __declspec(align(16))
  1.1119 +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1120 +                        uint8* dst_u, uint8* dst_v, int width) {
  1.1121 +  __asm {
  1.1122 +    push       esi
  1.1123 +    push       edi
  1.1124 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1125 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1126 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1127 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1128 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1129 +    movdqa     xmm7, kARGBToUJ
  1.1130 +    movdqa     xmm6, kARGBToVJ
  1.1131 +    movdqa     xmm5, kAddUVJ128
  1.1132 +    sub        edi, edx             // stride from u to v
  1.1133 +
  1.1134 +    align      4
  1.1135 + convertloop:
  1.1136 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1137 +    movdqa     xmm0, [eax]
  1.1138 +    movdqa     xmm1, [eax + 16]
  1.1139 +    movdqa     xmm2, [eax + 32]
  1.1140 +    movdqa     xmm3, [eax + 48]
  1.1141 +    pavgb      xmm0, [eax + esi]
  1.1142 +    pavgb      xmm1, [eax + esi + 16]
  1.1143 +    pavgb      xmm2, [eax + esi + 32]
  1.1144 +    pavgb      xmm3, [eax + esi + 48]
  1.1145 +    lea        eax,  [eax + 64]
  1.1146 +    movdqa     xmm4, xmm0
  1.1147 +    shufps     xmm0, xmm1, 0x88
  1.1148 +    shufps     xmm4, xmm1, 0xdd
  1.1149 +    pavgb      xmm0, xmm4
  1.1150 +    movdqa     xmm4, xmm2
  1.1151 +    shufps     xmm2, xmm3, 0x88
  1.1152 +    shufps     xmm4, xmm3, 0xdd
  1.1153 +    pavgb      xmm2, xmm4
  1.1154 +
  1.1155 +    // step 2 - convert to U and V
  1.1156 +    // from here down is very similar to Y code except
   1.1157 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1158 +    movdqa     xmm1, xmm0
  1.1159 +    movdqa     xmm3, xmm2
  1.1160 +    pmaddubsw  xmm0, xmm7  // U
  1.1161 +    pmaddubsw  xmm2, xmm7
  1.1162 +    pmaddubsw  xmm1, xmm6  // V
  1.1163 +    pmaddubsw  xmm3, xmm6
  1.1164 +    phaddw     xmm0, xmm2
  1.1165 +    phaddw     xmm1, xmm3
  1.1166 +    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
  1.1167 +    paddw      xmm1, xmm5
  1.1168 +    psraw      xmm0, 8
  1.1169 +    psraw      xmm1, 8
  1.1170 +    packsswb   xmm0, xmm1
  1.1171 +
  1.1172 +    // step 3 - store 8 U and 8 V values
  1.1173 +    sub        ecx, 16
  1.1174 +    movlps     qword ptr [edx], xmm0 // U
  1.1175 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1176 +    lea        edx, [edx + 8]
  1.1177 +    jg         convertloop
  1.1178 +
  1.1179 +    pop        edi
  1.1180 +    pop        esi
  1.1181 +    ret
  1.1182 +  }
  1.1183 +}
  1.1184 +
  1.1185 +#ifdef HAS_ARGBTOUVROW_AVX2
  1.1186 +__declspec(naked) __declspec(align(32))
  1.1187 +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  1.1188 +                      uint8* dst_u, uint8* dst_v, int width) {
  1.1189 +  __asm {
  1.1190 +    push       esi
  1.1191 +    push       edi
  1.1192 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1193 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1194 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1195 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1196 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1197 +    vbroadcastf128 ymm5, kAddUV128
  1.1198 +    vbroadcastf128 ymm6, kARGBToV
  1.1199 +    vbroadcastf128 ymm7, kARGBToU
  1.1200 +    sub        edi, edx             // stride from u to v
  1.1201 +
  1.1202 +    align      4
  1.1203 + convertloop:
  1.1204 +    /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1.1205 +    vmovdqu    ymm0, [eax]
  1.1206 +    vmovdqu    ymm1, [eax + 32]
  1.1207 +    vmovdqu    ymm2, [eax + 64]
  1.1208 +    vmovdqu    ymm3, [eax + 96]
  1.1209 +    vpavgb     ymm0, ymm0, [eax + esi]
  1.1210 +    vpavgb     ymm1, ymm1, [eax + esi + 32]
  1.1211 +    vpavgb     ymm2, ymm2, [eax + esi + 64]
  1.1212 +    vpavgb     ymm3, ymm3, [eax + esi + 96]
  1.1213 +    lea        eax,  [eax + 128]
  1.1214 +    vshufps    ymm4, ymm0, ymm1, 0x88
  1.1215 +    vshufps    ymm0, ymm0, ymm1, 0xdd
  1.1216 +    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
  1.1217 +    vshufps    ymm4, ymm2, ymm3, 0x88
  1.1218 +    vshufps    ymm2, ymm2, ymm3, 0xdd
  1.1219 +    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
  1.1220 +
  1.1221 +    // step 2 - convert to U and V
  1.1222 +    // from here down is very similar to Y code except
   1.1223 +    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1.1224 +    vpmaddubsw ymm1, ymm0, ymm7  // U
  1.1225 +    vpmaddubsw ymm3, ymm2, ymm7
  1.1226 +    vpmaddubsw ymm0, ymm0, ymm6  // V
  1.1227 +    vpmaddubsw ymm2, ymm2, ymm6
  1.1228 +    vphaddw    ymm1, ymm1, ymm3  // mutates
  1.1229 +    vphaddw    ymm0, ymm0, ymm2
  1.1230 +    vpsraw     ymm1, ymm1, 8
  1.1231 +    vpsraw     ymm0, ymm0, 8
  1.1232 +    vpacksswb  ymm0, ymm1, ymm0  // mutates
  1.1233 +    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
  1.1234 +    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
  1.1235 +    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
  1.1236 +
  1.1237 +    // step 3 - store 16 U and 16 V values
  1.1238 +    sub         ecx, 32
  1.1239 +    vextractf128 [edx], ymm0, 0 // U
  1.1240 +    vextractf128 [edx + edi], ymm0, 1 // V
  1.1241 +    lea        edx, [edx + 16]
  1.1242 +    jg         convertloop
  1.1243 +
  1.1244 +    pop        edi
  1.1245 +    pop        esi
  1.1246 +    vzeroupper
  1.1247 +    ret
  1.1248 +  }
  1.1249 +}
  1.1250 +#endif  // HAS_ARGBTOUVROW_AVX2
  1.1251 +
  1.1252 +__declspec(naked) __declspec(align(16))
  1.1253 +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1254 +                                 uint8* dst_u, uint8* dst_v, int width) {
  1.1255 +  __asm {
  1.1256 +    push       esi
  1.1257 +    push       edi
  1.1258 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1259 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1260 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1261 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1262 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1263 +    movdqa     xmm7, kARGBToU
  1.1264 +    movdqa     xmm6, kARGBToV
  1.1265 +    movdqa     xmm5, kAddUV128
  1.1266 +    sub        edi, edx             // stride from u to v
  1.1267 +
  1.1268 +    align      4
  1.1269 + convertloop:
  1.1270 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1271 +    movdqu     xmm0, [eax]
  1.1272 +    movdqu     xmm1, [eax + 16]
  1.1273 +    movdqu     xmm2, [eax + 32]
  1.1274 +    movdqu     xmm3, [eax + 48]
  1.1275 +    movdqu     xmm4, [eax + esi]
  1.1276 +    pavgb      xmm0, xmm4
  1.1277 +    movdqu     xmm4, [eax + esi + 16]
  1.1278 +    pavgb      xmm1, xmm4
  1.1279 +    movdqu     xmm4, [eax + esi + 32]
  1.1280 +    pavgb      xmm2, xmm4
  1.1281 +    movdqu     xmm4, [eax + esi + 48]
  1.1282 +    pavgb      xmm3, xmm4
  1.1283 +    lea        eax,  [eax + 64]
  1.1284 +    movdqa     xmm4, xmm0
  1.1285 +    shufps     xmm0, xmm1, 0x88
  1.1286 +    shufps     xmm4, xmm1, 0xdd
  1.1287 +    pavgb      xmm0, xmm4
  1.1288 +    movdqa     xmm4, xmm2
  1.1289 +    shufps     xmm2, xmm3, 0x88
  1.1290 +    shufps     xmm4, xmm3, 0xdd
  1.1291 +    pavgb      xmm2, xmm4
  1.1292 +
  1.1293 +    // step 2 - convert to U and V
  1.1294 +    // from here down is very similar to Y code except
   1.1295 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1296 +    movdqa     xmm1, xmm0
  1.1297 +    movdqa     xmm3, xmm2
  1.1298 +    pmaddubsw  xmm0, xmm7  // U
  1.1299 +    pmaddubsw  xmm2, xmm7
  1.1300 +    pmaddubsw  xmm1, xmm6  // V
  1.1301 +    pmaddubsw  xmm3, xmm6
  1.1302 +    phaddw     xmm0, xmm2
  1.1303 +    phaddw     xmm1, xmm3
  1.1304 +    psraw      xmm0, 8
  1.1305 +    psraw      xmm1, 8
  1.1306 +    packsswb   xmm0, xmm1
  1.1307 +    paddb      xmm0, xmm5            // -> unsigned
  1.1308 +
  1.1309 +    // step 3 - store 8 U and 8 V values
  1.1310 +    sub        ecx, 16
  1.1311 +    movlps     qword ptr [edx], xmm0 // U
  1.1312 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1313 +    lea        edx, [edx + 8]
  1.1314 +    jg         convertloop
  1.1315 +
  1.1316 +    pop        edi
  1.1317 +    pop        esi
  1.1318 +    ret
  1.1319 +  }
  1.1320 +}
  1.1321 +
  1.1322 +__declspec(naked) __declspec(align(16))
  1.1323 +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   1.1324 +                                  uint8* dst_u, uint8* dst_v, int width) {
  1.1325 +  __asm {
  1.1326 +    push       esi
  1.1327 +    push       edi
  1.1328 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1329 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1330 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1331 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1332 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1333 +    movdqa     xmm7, kARGBToUJ
  1.1334 +    movdqa     xmm6, kARGBToVJ
  1.1335 +    movdqa     xmm5, kAddUVJ128
  1.1336 +    sub        edi, edx             // stride from u to v
  1.1337 +
  1.1338 +    align      4
  1.1339 + convertloop:
  1.1340 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1341 +    movdqu     xmm0, [eax]
  1.1342 +    movdqu     xmm1, [eax + 16]
  1.1343 +    movdqu     xmm2, [eax + 32]
  1.1344 +    movdqu     xmm3, [eax + 48]
  1.1345 +    movdqu     xmm4, [eax + esi]
  1.1346 +    pavgb      xmm0, xmm4
  1.1347 +    movdqu     xmm4, [eax + esi + 16]
  1.1348 +    pavgb      xmm1, xmm4
  1.1349 +    movdqu     xmm4, [eax + esi + 32]
  1.1350 +    pavgb      xmm2, xmm4
  1.1351 +    movdqu     xmm4, [eax + esi + 48]
  1.1352 +    pavgb      xmm3, xmm4
  1.1353 +    lea        eax,  [eax + 64]
  1.1354 +    movdqa     xmm4, xmm0
  1.1355 +    shufps     xmm0, xmm1, 0x88
  1.1356 +    shufps     xmm4, xmm1, 0xdd
  1.1357 +    pavgb      xmm0, xmm4
  1.1358 +    movdqa     xmm4, xmm2
  1.1359 +    shufps     xmm2, xmm3, 0x88
  1.1360 +    shufps     xmm4, xmm3, 0xdd
  1.1361 +    pavgb      xmm2, xmm4
  1.1362 +
  1.1363 +    // step 2 - convert to U and V
  1.1364 +    // from here down is very similar to Y code except
   1.1365 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1366 +    movdqa     xmm1, xmm0
  1.1367 +    movdqa     xmm3, xmm2
  1.1368 +    pmaddubsw  xmm0, xmm7  // U
  1.1369 +    pmaddubsw  xmm2, xmm7
  1.1370 +    pmaddubsw  xmm1, xmm6  // V
  1.1371 +    pmaddubsw  xmm3, xmm6
  1.1372 +    phaddw     xmm0, xmm2
  1.1373 +    phaddw     xmm1, xmm3
  1.1374 +    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
  1.1375 +    paddw      xmm1, xmm5
  1.1376 +    psraw      xmm0, 8
  1.1377 +    psraw      xmm1, 8
  1.1378 +    packsswb   xmm0, xmm1
  1.1379 +
  1.1380 +    // step 3 - store 8 U and 8 V values
  1.1381 +    sub        ecx, 16
  1.1382 +    movlps     qword ptr [edx], xmm0 // U
  1.1383 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1384 +    lea        edx, [edx + 8]
  1.1385 +    jg         convertloop
  1.1386 +
  1.1387 +    pop        edi
  1.1388 +    pop        esi
  1.1389 +    ret
  1.1390 +  }
  1.1391 +}
  1.1392 +
  1.1393 +__declspec(naked) __declspec(align(16))
  1.1394 +void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
  1.1395 +                          uint8* dst_u, uint8* dst_v, int width) {
  1.1396 +  __asm {
  1.1397 +    push       edi
  1.1398 +    mov        eax, [esp + 4 + 4]   // src_argb
  1.1399 +    mov        edx, [esp + 4 + 8]   // dst_u
  1.1400 +    mov        edi, [esp + 4 + 12]  // dst_v
  1.1401 +    mov        ecx, [esp + 4 + 16]  // pix
  1.1402 +    movdqa     xmm7, kARGBToU
  1.1403 +    movdqa     xmm6, kARGBToV
  1.1404 +    movdqa     xmm5, kAddUV128
  1.1405 +    sub        edi, edx             // stride from u to v
  1.1406 +
  1.1407 +    align      4
  1.1408 + convertloop:
  1.1409 +    /* convert to U and V */
  1.1410 +    movdqa     xmm0, [eax]          // U
  1.1411 +    movdqa     xmm1, [eax + 16]
  1.1412 +    movdqa     xmm2, [eax + 32]
  1.1413 +    movdqa     xmm3, [eax + 48]
  1.1414 +    pmaddubsw  xmm0, xmm7
  1.1415 +    pmaddubsw  xmm1, xmm7
  1.1416 +    pmaddubsw  xmm2, xmm7
  1.1417 +    pmaddubsw  xmm3, xmm7
  1.1418 +    phaddw     xmm0, xmm1
  1.1419 +    phaddw     xmm2, xmm3
  1.1420 +    psraw      xmm0, 8
  1.1421 +    psraw      xmm2, 8
  1.1422 +    packsswb   xmm0, xmm2
  1.1423 +    paddb      xmm0, xmm5
  1.1424 +    sub        ecx,  16
  1.1425 +    movdqa     [edx], xmm0
  1.1426 +
  1.1427 +    movdqa     xmm0, [eax]          // V
  1.1428 +    movdqa     xmm1, [eax + 16]
  1.1429 +    movdqa     xmm2, [eax + 32]
  1.1430 +    movdqa     xmm3, [eax + 48]
  1.1431 +    pmaddubsw  xmm0, xmm6
  1.1432 +    pmaddubsw  xmm1, xmm6
  1.1433 +    pmaddubsw  xmm2, xmm6
  1.1434 +    pmaddubsw  xmm3, xmm6
  1.1435 +    phaddw     xmm0, xmm1
  1.1436 +    phaddw     xmm2, xmm3
  1.1437 +    psraw      xmm0, 8
  1.1438 +    psraw      xmm2, 8
  1.1439 +    packsswb   xmm0, xmm2
  1.1440 +    paddb      xmm0, xmm5
  1.1441 +    lea        eax,  [eax + 64]
  1.1442 +    movdqa     [edx + edi], xmm0
  1.1443 +    lea        edx,  [edx + 16]
  1.1444 +    jg         convertloop
  1.1445 +
  1.1446 +    pop        edi
  1.1447 +    ret
  1.1448 +  }
  1.1449 +}
  1.1450 +
  1.1451 +__declspec(naked) __declspec(align(16))
  1.1452 +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
  1.1453 +                                    uint8* dst_u, uint8* dst_v, int width) {
  1.1454 +  __asm {
  1.1455 +    push       edi
  1.1456 +    mov        eax, [esp + 4 + 4]   // src_argb
  1.1457 +    mov        edx, [esp + 4 + 8]   // dst_u
  1.1458 +    mov        edi, [esp + 4 + 12]  // dst_v
  1.1459 +    mov        ecx, [esp + 4 + 16]  // pix
  1.1460 +    movdqa     xmm7, kARGBToU
  1.1461 +    movdqa     xmm6, kARGBToV
  1.1462 +    movdqa     xmm5, kAddUV128
  1.1463 +    sub        edi, edx             // stride from u to v
  1.1464 +
  1.1465 +    align      4
  1.1466 + convertloop:
  1.1467 +    /* convert to U and V */
  1.1468 +    movdqu     xmm0, [eax]          // U
  1.1469 +    movdqu     xmm1, [eax + 16]
  1.1470 +    movdqu     xmm2, [eax + 32]
  1.1471 +    movdqu     xmm3, [eax + 48]
  1.1472 +    pmaddubsw  xmm0, xmm7
  1.1473 +    pmaddubsw  xmm1, xmm7
  1.1474 +    pmaddubsw  xmm2, xmm7
  1.1475 +    pmaddubsw  xmm3, xmm7
  1.1476 +    phaddw     xmm0, xmm1
  1.1477 +    phaddw     xmm2, xmm3
  1.1478 +    psraw      xmm0, 8
  1.1479 +    psraw      xmm2, 8
  1.1480 +    packsswb   xmm0, xmm2
  1.1481 +    paddb      xmm0, xmm5
  1.1482 +    sub        ecx,  16
  1.1483 +    movdqu     [edx], xmm0
  1.1484 +
  1.1485 +    movdqu     xmm0, [eax]          // V
  1.1486 +    movdqu     xmm1, [eax + 16]
  1.1487 +    movdqu     xmm2, [eax + 32]
  1.1488 +    movdqu     xmm3, [eax + 48]
  1.1489 +    pmaddubsw  xmm0, xmm6
  1.1490 +    pmaddubsw  xmm1, xmm6
  1.1491 +    pmaddubsw  xmm2, xmm6
  1.1492 +    pmaddubsw  xmm3, xmm6
  1.1493 +    phaddw     xmm0, xmm1
  1.1494 +    phaddw     xmm2, xmm3
  1.1495 +    psraw      xmm0, 8
  1.1496 +    psraw      xmm2, 8
  1.1497 +    packsswb   xmm0, xmm2
  1.1498 +    paddb      xmm0, xmm5
  1.1499 +    lea        eax,  [eax + 64]
  1.1500 +    movdqu     [edx + edi], xmm0
  1.1501 +    lea        edx,  [edx + 16]
  1.1502 +    jg         convertloop
  1.1503 +
  1.1504 +    pop        edi
  1.1505 +    ret
  1.1506 +  }
  1.1507 +}
  1.1508 +
  1.1509 +__declspec(naked) __declspec(align(16))
  1.1510 +void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
  1.1511 +                          uint8* dst_u, uint8* dst_v, int width) {
  1.1512 +  __asm {
  1.1513 +    push       edi
  1.1514 +    mov        eax, [esp + 4 + 4]   // src_argb
  1.1515 +    mov        edx, [esp + 4 + 8]   // dst_u
  1.1516 +    mov        edi, [esp + 4 + 12]  // dst_v
  1.1517 +    mov        ecx, [esp + 4 + 16]  // pix
  1.1518 +    movdqa     xmm7, kARGBToU
  1.1519 +    movdqa     xmm6, kARGBToV
  1.1520 +    movdqa     xmm5, kAddUV128
  1.1521 +    sub        edi, edx             // stride from u to v
  1.1522 +
  1.1523 +    align      4
  1.1524 + convertloop:
   1.1525 +    /* step 1 - subsample 16x1 argb pixels to 8x1 */
  1.1526 +    movdqa     xmm0, [eax]
  1.1527 +    movdqa     xmm1, [eax + 16]
  1.1528 +    movdqa     xmm2, [eax + 32]
  1.1529 +    movdqa     xmm3, [eax + 48]
  1.1530 +    lea        eax,  [eax + 64]
  1.1531 +    movdqa     xmm4, xmm0
  1.1532 +    shufps     xmm0, xmm1, 0x88
  1.1533 +    shufps     xmm4, xmm1, 0xdd
  1.1534 +    pavgb      xmm0, xmm4
  1.1535 +    movdqa     xmm4, xmm2
  1.1536 +    shufps     xmm2, xmm3, 0x88
  1.1537 +    shufps     xmm4, xmm3, 0xdd
  1.1538 +    pavgb      xmm2, xmm4
  1.1539 +
  1.1540 +    // step 2 - convert to U and V
   1.1541 +    // from here down it is very similar to the Y code except
   1.1542 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1543 +    movdqa     xmm1, xmm0
  1.1544 +    movdqa     xmm3, xmm2
  1.1545 +    pmaddubsw  xmm0, xmm7  // U
  1.1546 +    pmaddubsw  xmm2, xmm7
  1.1547 +    pmaddubsw  xmm1, xmm6  // V
  1.1548 +    pmaddubsw  xmm3, xmm6
  1.1549 +    phaddw     xmm0, xmm2
  1.1550 +    phaddw     xmm1, xmm3
  1.1551 +    psraw      xmm0, 8
  1.1552 +    psraw      xmm1, 8
  1.1553 +    packsswb   xmm0, xmm1
  1.1554 +    paddb      xmm0, xmm5            // -> unsigned
  1.1555 +
  1.1556 +    // step 3 - store 8 U and 8 V values
  1.1557 +    sub        ecx, 16
  1.1558 +    movlps     qword ptr [edx], xmm0 // U
  1.1559 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1560 +    lea        edx, [edx + 8]
  1.1561 +    jg         convertloop
  1.1562 +
  1.1563 +    pop        edi
  1.1564 +    ret
  1.1565 +  }
  1.1566 +}
  1.1567 +
  1.1568 +__declspec(naked) __declspec(align(16))
  1.1569 +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
  1.1570 +                                    uint8* dst_u, uint8* dst_v, int width) {
  1.1571 +  __asm {
  1.1572 +    push       edi
  1.1573 +    mov        eax, [esp + 4 + 4]   // src_argb
  1.1574 +    mov        edx, [esp + 4 + 8]   // dst_u
  1.1575 +    mov        edi, [esp + 4 + 12]  // dst_v
  1.1576 +    mov        ecx, [esp + 4 + 16]  // pix
  1.1577 +    movdqa     xmm7, kARGBToU
  1.1578 +    movdqa     xmm6, kARGBToV
  1.1579 +    movdqa     xmm5, kAddUV128
  1.1580 +    sub        edi, edx             // stride from u to v
  1.1581 +
  1.1582 +    align      4
  1.1583 + convertloop:
   1.1584 +    /* step 1 - subsample 16x1 argb pixels to 8x1 */
  1.1585 +    movdqu     xmm0, [eax]
  1.1586 +    movdqu     xmm1, [eax + 16]
  1.1587 +    movdqu     xmm2, [eax + 32]
  1.1588 +    movdqu     xmm3, [eax + 48]
  1.1589 +    lea        eax,  [eax + 64]
  1.1590 +    movdqa     xmm4, xmm0
  1.1591 +    shufps     xmm0, xmm1, 0x88
  1.1592 +    shufps     xmm4, xmm1, 0xdd
  1.1593 +    pavgb      xmm0, xmm4
  1.1594 +    movdqa     xmm4, xmm2
  1.1595 +    shufps     xmm2, xmm3, 0x88
  1.1596 +    shufps     xmm4, xmm3, 0xdd
  1.1597 +    pavgb      xmm2, xmm4
  1.1598 +
  1.1599 +    // step 2 - convert to U and V
   1.1600 +    // from here down it is very similar to the Y code except
   1.1601 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1602 +    movdqa     xmm1, xmm0
  1.1603 +    movdqa     xmm3, xmm2
  1.1604 +    pmaddubsw  xmm0, xmm7  // U
  1.1605 +    pmaddubsw  xmm2, xmm7
  1.1606 +    pmaddubsw  xmm1, xmm6  // V
  1.1607 +    pmaddubsw  xmm3, xmm6
  1.1608 +    phaddw     xmm0, xmm2
  1.1609 +    phaddw     xmm1, xmm3
  1.1610 +    psraw      xmm0, 8
  1.1611 +    psraw      xmm1, 8
  1.1612 +    packsswb   xmm0, xmm1
  1.1613 +    paddb      xmm0, xmm5            // -> unsigned
  1.1614 +
  1.1615 +    // step 3 - store 8 U and 8 V values
  1.1616 +    sub        ecx, 16
  1.1617 +    movlps     qword ptr [edx], xmm0 // U
  1.1618 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1619 +    lea        edx, [edx + 8]
  1.1620 +    jg         convertloop
  1.1621 +
  1.1622 +    pop        edi
  1.1623 +    ret
  1.1624 +  }
  1.1625 +}
  1.1626 +
  1.1627 +__declspec(naked) __declspec(align(16))
  1.1628 +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1629 +                       uint8* dst_u, uint8* dst_v, int width) {
  1.1630 +  __asm {
  1.1631 +    push       esi
  1.1632 +    push       edi
  1.1633 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1634 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1635 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1636 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1637 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1638 +    movdqa     xmm7, kBGRAToU
  1.1639 +    movdqa     xmm6, kBGRAToV
  1.1640 +    movdqa     xmm5, kAddUV128
  1.1641 +    sub        edi, edx             // stride from u to v
  1.1642 +
  1.1643 +    align      4
  1.1644 + convertloop:
  1.1645 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1646 +    movdqa     xmm0, [eax]
  1.1647 +    movdqa     xmm1, [eax + 16]
  1.1648 +    movdqa     xmm2, [eax + 32]
  1.1649 +    movdqa     xmm3, [eax + 48]
  1.1650 +    pavgb      xmm0, [eax + esi]
  1.1651 +    pavgb      xmm1, [eax + esi + 16]
  1.1652 +    pavgb      xmm2, [eax + esi + 32]
  1.1653 +    pavgb      xmm3, [eax + esi + 48]
  1.1654 +    lea        eax,  [eax + 64]
  1.1655 +    movdqa     xmm4, xmm0
  1.1656 +    shufps     xmm0, xmm1, 0x88
  1.1657 +    shufps     xmm4, xmm1, 0xdd
  1.1658 +    pavgb      xmm0, xmm4
  1.1659 +    movdqa     xmm4, xmm2
  1.1660 +    shufps     xmm2, xmm3, 0x88
  1.1661 +    shufps     xmm4, xmm3, 0xdd
  1.1662 +    pavgb      xmm2, xmm4
  1.1663 +
  1.1664 +    // step 2 - convert to U and V
   1.1665 +    // from here down it is very similar to the Y code except
   1.1666 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1667 +    movdqa     xmm1, xmm0
  1.1668 +    movdqa     xmm3, xmm2
  1.1669 +    pmaddubsw  xmm0, xmm7  // U
  1.1670 +    pmaddubsw  xmm2, xmm7
  1.1671 +    pmaddubsw  xmm1, xmm6  // V
  1.1672 +    pmaddubsw  xmm3, xmm6
  1.1673 +    phaddw     xmm0, xmm2
  1.1674 +    phaddw     xmm1, xmm3
  1.1675 +    psraw      xmm0, 8
  1.1676 +    psraw      xmm1, 8
  1.1677 +    packsswb   xmm0, xmm1
  1.1678 +    paddb      xmm0, xmm5            // -> unsigned
  1.1679 +
  1.1680 +    // step 3 - store 8 U and 8 V values
  1.1681 +    sub        ecx, 16
  1.1682 +    movlps     qword ptr [edx], xmm0 // U
  1.1683 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1684 +    lea        edx, [edx + 8]
  1.1685 +    jg         convertloop
  1.1686 +
  1.1687 +    pop        edi
  1.1688 +    pop        esi
  1.1689 +    ret
  1.1690 +  }
  1.1691 +}
  1.1692 +
  1.1693 +__declspec(naked) __declspec(align(16))
  1.1694 +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1695 +                                 uint8* dst_u, uint8* dst_v, int width) {
  1.1696 +  __asm {
  1.1697 +    push       esi
  1.1698 +    push       edi
  1.1699 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1700 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1701 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1702 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1703 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1704 +    movdqa     xmm7, kBGRAToU
  1.1705 +    movdqa     xmm6, kBGRAToV
  1.1706 +    movdqa     xmm5, kAddUV128
  1.1707 +    sub        edi, edx             // stride from u to v
  1.1708 +
  1.1709 +    align      4
  1.1710 + convertloop:
  1.1711 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1712 +    movdqu     xmm0, [eax]
  1.1713 +    movdqu     xmm1, [eax + 16]
  1.1714 +    movdqu     xmm2, [eax + 32]
  1.1715 +    movdqu     xmm3, [eax + 48]
  1.1716 +    movdqu     xmm4, [eax + esi]
  1.1717 +    pavgb      xmm0, xmm4
  1.1718 +    movdqu     xmm4, [eax + esi + 16]
  1.1719 +    pavgb      xmm1, xmm4
  1.1720 +    movdqu     xmm4, [eax + esi + 32]
  1.1721 +    pavgb      xmm2, xmm4
  1.1722 +    movdqu     xmm4, [eax + esi + 48]
  1.1723 +    pavgb      xmm3, xmm4
  1.1724 +    lea        eax,  [eax + 64]
  1.1725 +    movdqa     xmm4, xmm0
  1.1726 +    shufps     xmm0, xmm1, 0x88
  1.1727 +    shufps     xmm4, xmm1, 0xdd
  1.1728 +    pavgb      xmm0, xmm4
  1.1729 +    movdqa     xmm4, xmm2
  1.1730 +    shufps     xmm2, xmm3, 0x88
  1.1731 +    shufps     xmm4, xmm3, 0xdd
  1.1732 +    pavgb      xmm2, xmm4
  1.1733 +
  1.1734 +    // step 2 - convert to U and V
   1.1735 +    // from here down it is very similar to the Y code except
   1.1736 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1737 +    movdqa     xmm1, xmm0
  1.1738 +    movdqa     xmm3, xmm2
  1.1739 +    pmaddubsw  xmm0, xmm7  // U
  1.1740 +    pmaddubsw  xmm2, xmm7
  1.1741 +    pmaddubsw  xmm1, xmm6  // V
  1.1742 +    pmaddubsw  xmm3, xmm6
  1.1743 +    phaddw     xmm0, xmm2
  1.1744 +    phaddw     xmm1, xmm3
  1.1745 +    psraw      xmm0, 8
  1.1746 +    psraw      xmm1, 8
  1.1747 +    packsswb   xmm0, xmm1
  1.1748 +    paddb      xmm0, xmm5            // -> unsigned
  1.1749 +
  1.1750 +    // step 3 - store 8 U and 8 V values
  1.1751 +    sub        ecx, 16
  1.1752 +    movlps     qword ptr [edx], xmm0 // U
  1.1753 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1754 +    lea        edx, [edx + 8]
  1.1755 +    jg         convertloop
  1.1756 +
  1.1757 +    pop        edi
  1.1758 +    pop        esi
  1.1759 +    ret
  1.1760 +  }
  1.1761 +}
  1.1762 +
  1.1763 +__declspec(naked) __declspec(align(16))
  1.1764 +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1765 +                       uint8* dst_u, uint8* dst_v, int width) {
  1.1766 +  __asm {
  1.1767 +    push       esi
  1.1768 +    push       edi
  1.1769 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1770 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1771 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1772 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1773 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1774 +    movdqa     xmm7, kABGRToU
  1.1775 +    movdqa     xmm6, kABGRToV
  1.1776 +    movdqa     xmm5, kAddUV128
  1.1777 +    sub        edi, edx             // stride from u to v
  1.1778 +
  1.1779 +    align      4
  1.1780 + convertloop:
  1.1781 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1782 +    movdqa     xmm0, [eax]
  1.1783 +    movdqa     xmm1, [eax + 16]
  1.1784 +    movdqa     xmm2, [eax + 32]
  1.1785 +    movdqa     xmm3, [eax + 48]
  1.1786 +    pavgb      xmm0, [eax + esi]
  1.1787 +    pavgb      xmm1, [eax + esi + 16]
  1.1788 +    pavgb      xmm2, [eax + esi + 32]
  1.1789 +    pavgb      xmm3, [eax + esi + 48]
  1.1790 +    lea        eax,  [eax + 64]
  1.1791 +    movdqa     xmm4, xmm0
  1.1792 +    shufps     xmm0, xmm1, 0x88
  1.1793 +    shufps     xmm4, xmm1, 0xdd
  1.1794 +    pavgb      xmm0, xmm4
  1.1795 +    movdqa     xmm4, xmm2
  1.1796 +    shufps     xmm2, xmm3, 0x88
  1.1797 +    shufps     xmm4, xmm3, 0xdd
  1.1798 +    pavgb      xmm2, xmm4
  1.1799 +
  1.1800 +    // step 2 - convert to U and V
   1.1801 +    // from here down it is very similar to the Y code except
   1.1802 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1803 +    movdqa     xmm1, xmm0
  1.1804 +    movdqa     xmm3, xmm2
  1.1805 +    pmaddubsw  xmm0, xmm7  // U
  1.1806 +    pmaddubsw  xmm2, xmm7
  1.1807 +    pmaddubsw  xmm1, xmm6  // V
  1.1808 +    pmaddubsw  xmm3, xmm6
  1.1809 +    phaddw     xmm0, xmm2
  1.1810 +    phaddw     xmm1, xmm3
  1.1811 +    psraw      xmm0, 8
  1.1812 +    psraw      xmm1, 8
  1.1813 +    packsswb   xmm0, xmm1
  1.1814 +    paddb      xmm0, xmm5            // -> unsigned
  1.1815 +
  1.1816 +    // step 3 - store 8 U and 8 V values
  1.1817 +    sub        ecx, 16
  1.1818 +    movlps     qword ptr [edx], xmm0 // U
  1.1819 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1820 +    lea        edx, [edx + 8]
  1.1821 +    jg         convertloop
  1.1822 +
  1.1823 +    pop        edi
  1.1824 +    pop        esi
  1.1825 +    ret
  1.1826 +  }
  1.1827 +}
  1.1828 +
  1.1829 +__declspec(naked) __declspec(align(16))
  1.1830 +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1831 +                                 uint8* dst_u, uint8* dst_v, int width) {
  1.1832 +  __asm {
  1.1833 +    push       esi
  1.1834 +    push       edi
  1.1835 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1836 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1837 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1838 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1839 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1840 +    movdqa     xmm7, kABGRToU
  1.1841 +    movdqa     xmm6, kABGRToV
  1.1842 +    movdqa     xmm5, kAddUV128
  1.1843 +    sub        edi, edx             // stride from u to v
  1.1844 +
  1.1845 +    align      4
  1.1846 + convertloop:
  1.1847 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1848 +    movdqu     xmm0, [eax]
  1.1849 +    movdqu     xmm1, [eax + 16]
  1.1850 +    movdqu     xmm2, [eax + 32]
  1.1851 +    movdqu     xmm3, [eax + 48]
  1.1852 +    movdqu     xmm4, [eax + esi]
  1.1853 +    pavgb      xmm0, xmm4
  1.1854 +    movdqu     xmm4, [eax + esi + 16]
  1.1855 +    pavgb      xmm1, xmm4
  1.1856 +    movdqu     xmm4, [eax + esi + 32]
  1.1857 +    pavgb      xmm2, xmm4
  1.1858 +    movdqu     xmm4, [eax + esi + 48]
  1.1859 +    pavgb      xmm3, xmm4
  1.1860 +    lea        eax,  [eax + 64]
  1.1861 +    movdqa     xmm4, xmm0
  1.1862 +    shufps     xmm0, xmm1, 0x88
  1.1863 +    shufps     xmm4, xmm1, 0xdd
  1.1864 +    pavgb      xmm0, xmm4
  1.1865 +    movdqa     xmm4, xmm2
  1.1866 +    shufps     xmm2, xmm3, 0x88
  1.1867 +    shufps     xmm4, xmm3, 0xdd
  1.1868 +    pavgb      xmm2, xmm4
  1.1869 +
  1.1870 +    // step 2 - convert to U and V
   1.1871 +    // from here down it is very similar to the Y code except
   1.1872 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1873 +    movdqa     xmm1, xmm0
  1.1874 +    movdqa     xmm3, xmm2
  1.1875 +    pmaddubsw  xmm0, xmm7  // U
  1.1876 +    pmaddubsw  xmm2, xmm7
  1.1877 +    pmaddubsw  xmm1, xmm6  // V
  1.1878 +    pmaddubsw  xmm3, xmm6
  1.1879 +    phaddw     xmm0, xmm2
  1.1880 +    phaddw     xmm1, xmm3
  1.1881 +    psraw      xmm0, 8
  1.1882 +    psraw      xmm1, 8
  1.1883 +    packsswb   xmm0, xmm1
  1.1884 +    paddb      xmm0, xmm5            // -> unsigned
  1.1885 +
  1.1886 +    // step 3 - store 8 U and 8 V values
  1.1887 +    sub        ecx, 16
  1.1888 +    movlps     qword ptr [edx], xmm0 // U
  1.1889 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1890 +    lea        edx, [edx + 8]
  1.1891 +    jg         convertloop
  1.1892 +
  1.1893 +    pop        edi
  1.1894 +    pop        esi
  1.1895 +    ret
  1.1896 +  }
  1.1897 +}
  1.1898 +
  1.1899 +__declspec(naked) __declspec(align(16))
  1.1900 +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1901 +                       uint8* dst_u, uint8* dst_v, int width) {
  1.1902 +  __asm {
  1.1903 +    push       esi
  1.1904 +    push       edi
  1.1905 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1906 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1907 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1908 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1909 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1910 +    movdqa     xmm7, kRGBAToU
  1.1911 +    movdqa     xmm6, kRGBAToV
  1.1912 +    movdqa     xmm5, kAddUV128
  1.1913 +    sub        edi, edx             // stride from u to v
  1.1914 +
  1.1915 +    align      4
  1.1916 + convertloop:
  1.1917 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1918 +    movdqa     xmm0, [eax]
  1.1919 +    movdqa     xmm1, [eax + 16]
  1.1920 +    movdqa     xmm2, [eax + 32]
  1.1921 +    movdqa     xmm3, [eax + 48]
  1.1922 +    pavgb      xmm0, [eax + esi]
  1.1923 +    pavgb      xmm1, [eax + esi + 16]
  1.1924 +    pavgb      xmm2, [eax + esi + 32]
  1.1925 +    pavgb      xmm3, [eax + esi + 48]
  1.1926 +    lea        eax,  [eax + 64]
  1.1927 +    movdqa     xmm4, xmm0
  1.1928 +    shufps     xmm0, xmm1, 0x88
  1.1929 +    shufps     xmm4, xmm1, 0xdd
  1.1930 +    pavgb      xmm0, xmm4
  1.1931 +    movdqa     xmm4, xmm2
  1.1932 +    shufps     xmm2, xmm3, 0x88
  1.1933 +    shufps     xmm4, xmm3, 0xdd
  1.1934 +    pavgb      xmm2, xmm4
  1.1935 +
  1.1936 +    // step 2 - convert to U and V
   1.1937 +    // from here down it is very similar to the Y code except
   1.1938 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.1939 +    movdqa     xmm1, xmm0
  1.1940 +    movdqa     xmm3, xmm2
  1.1941 +    pmaddubsw  xmm0, xmm7  // U
  1.1942 +    pmaddubsw  xmm2, xmm7
  1.1943 +    pmaddubsw  xmm1, xmm6  // V
  1.1944 +    pmaddubsw  xmm3, xmm6
  1.1945 +    phaddw     xmm0, xmm2
  1.1946 +    phaddw     xmm1, xmm3
  1.1947 +    psraw      xmm0, 8
  1.1948 +    psraw      xmm1, 8
  1.1949 +    packsswb   xmm0, xmm1
  1.1950 +    paddb      xmm0, xmm5            // -> unsigned
  1.1951 +
  1.1952 +    // step 3 - store 8 U and 8 V values
  1.1953 +    sub        ecx, 16
  1.1954 +    movlps     qword ptr [edx], xmm0 // U
  1.1955 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.1956 +    lea        edx, [edx + 8]
  1.1957 +    jg         convertloop
  1.1958 +
  1.1959 +    pop        edi
  1.1960 +    pop        esi
  1.1961 +    ret
  1.1962 +  }
  1.1963 +}
  1.1964 +
  1.1965 +__declspec(naked) __declspec(align(16))
  1.1966 +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1.1967 +                                 uint8* dst_u, uint8* dst_v, int width) {
  1.1968 +  __asm {
  1.1969 +    push       esi
  1.1970 +    push       edi
  1.1971 +    mov        eax, [esp + 8 + 4]   // src_argb
  1.1972 +    mov        esi, [esp + 8 + 8]   // src_stride_argb
  1.1973 +    mov        edx, [esp + 8 + 12]  // dst_u
  1.1974 +    mov        edi, [esp + 8 + 16]  // dst_v
  1.1975 +    mov        ecx, [esp + 8 + 20]  // pix
  1.1976 +    movdqa     xmm7, kRGBAToU
  1.1977 +    movdqa     xmm6, kRGBAToV
  1.1978 +    movdqa     xmm5, kAddUV128
  1.1979 +    sub        edi, edx             // stride from u to v
  1.1980 +
  1.1981 +    align      4
  1.1982 + convertloop:
  1.1983 +    /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1.1984 +    movdqu     xmm0, [eax]
  1.1985 +    movdqu     xmm1, [eax + 16]
  1.1986 +    movdqu     xmm2, [eax + 32]
  1.1987 +    movdqu     xmm3, [eax + 48]
  1.1988 +    movdqu     xmm4, [eax + esi]
  1.1989 +    pavgb      xmm0, xmm4
  1.1990 +    movdqu     xmm4, [eax + esi + 16]
  1.1991 +    pavgb      xmm1, xmm4
  1.1992 +    movdqu     xmm4, [eax + esi + 32]
  1.1993 +    pavgb      xmm2, xmm4
  1.1994 +    movdqu     xmm4, [eax + esi + 48]
  1.1995 +    pavgb      xmm3, xmm4
  1.1996 +    lea        eax,  [eax + 64]
  1.1997 +    movdqa     xmm4, xmm0
  1.1998 +    shufps     xmm0, xmm1, 0x88
  1.1999 +    shufps     xmm4, xmm1, 0xdd
  1.2000 +    pavgb      xmm0, xmm4
  1.2001 +    movdqa     xmm4, xmm2
  1.2002 +    shufps     xmm2, xmm3, 0x88
  1.2003 +    shufps     xmm4, xmm3, 0xdd
  1.2004 +    pavgb      xmm2, xmm4
  1.2005 +
  1.2006 +    // step 2 - convert to U and V
   1.2007 +    // from here down it is very similar to the Y code except
   1.2008 +    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1.2009 +    movdqa     xmm1, xmm0
  1.2010 +    movdqa     xmm3, xmm2
  1.2011 +    pmaddubsw  xmm0, xmm7  // U
  1.2012 +    pmaddubsw  xmm2, xmm7
  1.2013 +    pmaddubsw  xmm1, xmm6  // V
  1.2014 +    pmaddubsw  xmm3, xmm6
  1.2015 +    phaddw     xmm0, xmm2
  1.2016 +    phaddw     xmm1, xmm3
  1.2017 +    psraw      xmm0, 8
  1.2018 +    psraw      xmm1, 8
  1.2019 +    packsswb   xmm0, xmm1
  1.2020 +    paddb      xmm0, xmm5            // -> unsigned
  1.2021 +
  1.2022 +    // step 3 - store 8 U and 8 V values
  1.2023 +    sub        ecx, 16
  1.2024 +    movlps     qword ptr [edx], xmm0 // U
  1.2025 +    movhps     qword ptr [edx + edi], xmm0 // V
  1.2026 +    lea        edx, [edx + 8]
  1.2027 +    jg         convertloop
  1.2028 +
  1.2029 +    pop        edi
  1.2030 +    pop        esi
  1.2031 +    ret
  1.2032 +  }
  1.2033 +}
  1.2034 +#endif  // HAS_ARGBTOYROW_SSSE3
  1.2035 +
  1.2036 +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
  1.2037 +
   1.2038 +#define UB 127 /* 2.018 * 64 = 129.15; limited to the int8 max, 127 */
  1.2039 +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
  1.2040 +#define UR 0
  1.2041 +
  1.2042 +#define VB 0
  1.2043 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
  1.2044 +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
  1.2045 +
  1.2046 +// Bias
   1.2047 +#define BB (UB * 128 + VB * 128)
   1.2048 +#define BG (UG * 128 + VG * 128)
   1.2049 +#define BR (UR * 128 + VR * 128)
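          +
          +// Worked values for the biases above, given UB = 127, UG = -25, UR = 0,
          +// VB = 0, VG = -52, VR = 102:
          +//   BB = 127 * 128 +    0 * 128 =  16256
          +//   BG = -25 * 128 +  -52 * 128 =  -9856
          +//   BR =   0 * 128 +  102 * 128 =  13056
          +// Subtracting these after pmaddubsw recenters the unsigned U/V bytes:
          +//   UB * u + VB * v - BB == UB * (u - 128) + VB * (v - 128).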
  1.2050 +
  1.2051 +#ifdef HAS_I422TOARGBROW_AVX2
  1.2052 +
  1.2053 +static const lvec8 kUVToB_AVX = {
  1.2054 +  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  1.2055 +  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
  1.2056 +};
  1.2057 +static const lvec8 kUVToR_AVX = {
  1.2058 +  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  1.2059 +  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
  1.2060 +};
  1.2061 +static const lvec8 kUVToG_AVX = {
  1.2062 +  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  1.2063 +  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
  1.2064 +};
  1.2065 +static const lvec16 kYToRgb_AVX = {
  1.2066 +  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
  1.2067 +};
  1.2068 +static const lvec16 kYSub16_AVX = {
  1.2069 +  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
  1.2070 +};
  1.2071 +static const lvec16 kUVBiasB_AVX = {
  1.2072 +  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
  1.2073 +};
  1.2074 +static const lvec16 kUVBiasG_AVX = {
  1.2075 +  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
  1.2076 +};
  1.2077 +static const lvec16 kUVBiasR_AVX = {
  1.2078 +  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
  1.2079 +};
  1.2080 +
  1.2081 +// 16 pixels
  1.2082 +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  1.2083 +__declspec(naked) __declspec(align(16))
  1.2084 +void I422ToARGBRow_AVX2(const uint8* y_buf,
  1.2085 +                         const uint8* u_buf,
  1.2086 +                         const uint8* v_buf,
  1.2087 +                         uint8* dst_argb,
  1.2088 +                         int width) {
  1.2089 +  __asm {
  1.2090 +    push       esi
  1.2091 +    push       edi
  1.2092 +    mov        eax, [esp + 8 + 4]   // Y
  1.2093 +    mov        esi, [esp + 8 + 8]   // U
  1.2094 +    mov        edi, [esp + 8 + 12]  // V
  1.2095 +    mov        edx, [esp + 8 + 16]  // argb
  1.2096 +    mov        ecx, [esp + 8 + 20]  // width
  1.2097 +    sub        edi, esi
  1.2098 +    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
  1.2099 +    vpxor      ymm4, ymm4, ymm4
  1.2100 +
  1.2101 +    align      4
  1.2102 + convertloop:
  1.2103 +    vmovq      xmm0, qword ptr [esi]          //  U
  1.2104 +    vmovq      xmm1, qword ptr [esi + edi]    //  V
  1.2105 +    lea        esi,  [esi + 8]
  1.2106 +    vpunpcklbw ymm0, ymm0, ymm1               // UV
  1.2107 +    vpermq     ymm0, ymm0, 0xd8
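          +    // 0xd8 = qwords 0,2,1,3: vpunpck* interleaves within 128-bit lanes, so
          +    // move the upper 4 UV pairs into the high lane for the in-lane unpack.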
  1.2108 +    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
  1.2109 +    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
  1.2110 +    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
  1.2111 +    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
  1.2112 +    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
  1.2113 +    vpsubw     ymm1, ymm1, kUVBiasG_AVX
  1.2114 +    vpsubw     ymm0, ymm0, kUVBiasR_AVX
  1.2115 +
  1.2116 +    // Step 2: Find Y contribution to 16 R,G,B values
  1.2117 +    vmovdqu    xmm3, [eax]                  // NOLINT
  1.2118 +    lea        eax, [eax + 16]
  1.2119 +    vpermq     ymm3, ymm3, 0xd8
  1.2120 +    vpunpcklbw ymm3, ymm3, ymm4
  1.2121 +    vpsubsw    ymm3, ymm3, kYSub16_AVX
  1.2122 +    vpmullw    ymm3, ymm3, kYToRgb_AVX
  1.2123 +    vpaddsw    ymm2, ymm2, ymm3           // B += Y
  1.2124 +    vpaddsw    ymm1, ymm1, ymm3           // G += Y
  1.2125 +    vpaddsw    ymm0, ymm0, ymm3           // R += Y
  1.2126 +    vpsraw     ymm2, ymm2, 6
  1.2127 +    vpsraw     ymm1, ymm1, 6
  1.2128 +    vpsraw     ymm0, ymm0, 6
  1.2129 +    vpackuswb  ymm2, ymm2, ymm2           // B
  1.2130 +    vpackuswb  ymm1, ymm1, ymm1           // G
  1.2131 +    vpackuswb  ymm0, ymm0, ymm0           // R
  1.2132 +
  1.2133 +    // Step 3: Weave into ARGB
  1.2134 +    vpunpcklbw ymm2, ymm2, ymm1           // BG
  1.2135 +    vpermq     ymm2, ymm2, 0xd8
  1.2136 +    vpunpcklbw ymm0, ymm0, ymm5           // RA
  1.2137 +    vpermq     ymm0, ymm0, 0xd8
  1.2138 +    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
  1.2139 +    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
  1.2140 +    vmovdqu    [edx], ymm1
  1.2141 +    vmovdqu    [edx + 32], ymm2
  1.2142 +    lea        edx,  [edx + 64]
  1.2143 +    sub        ecx, 16
  1.2144 +    jg         convertloop
  1.2145 +    vzeroupper
  1.2146 +
  1.2147 +    pop        edi
  1.2148 +    pop        esi
  1.2149 +    ret
  1.2150 +  }
  1.2151 +}
  1.2152 +#endif  // HAS_I422TOARGBROW_AVX2
  1.2153 +
  1.2154 +#ifdef HAS_I422TOARGBROW_SSSE3
  1.2155 +
  1.2156 +static const vec8 kUVToB = {
  1.2157 +  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
  1.2158 +};
  1.2159 +
  1.2160 +static const vec8 kUVToR = {
  1.2161 +  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
  1.2162 +};
  1.2163 +
  1.2164 +static const vec8 kUVToG = {
  1.2165 +  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
  1.2166 +};
  1.2167 +
  1.2168 +static const vec8 kVUToB = {
  1.2169 +  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
  1.2170 +};
  1.2171 +
  1.2172 +static const vec8 kVUToR = {
  1.2173 +  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
  1.2174 +};
  1.2175 +
  1.2176 +static const vec8 kVUToG = {
  1.2177 +  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
  1.2178 +};
  1.2179 +
  1.2180 +static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
  1.2181 +static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
  1.2182 +static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
  1.2183 +static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
  1.2184 +static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
  1.2185 +
   1.2186 +// TODO(fbarchard): Add a read that does half size on Y and treats 420 as 444.
  1.2187 +
  1.2188 +// Read 8 UV from 444.
  1.2189 +#define READYUV444 __asm {                                                     \
  1.2190 +    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
  1.2191 +    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
  1.2192 +    __asm lea        esi,  [esi + 8]                                           \
  1.2193 +    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  1.2194 +  }
  1.2195 +
  1.2196 +// Read 4 UV from 422, upsample to 8 UV.
  1.2197 +#define READYUV422 __asm {                                                     \
  1.2198 +    __asm movd       xmm0, [esi]          /* U */                              \
  1.2199 +    __asm movd       xmm1, [esi + edi]    /* V */                              \
  1.2200 +    __asm lea        esi,  [esi + 4]                                           \
  1.2201 +    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  1.2202 +    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  1.2203 +  }
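          +
          +// Byte-level effect of READYUV422, for illustration: with U = {u0,u1,u2,u3}
          +// and V = {v0,v1,v2,v3} in memory,
          +//   punpcklbw -> xmm0 = u0 v0 u1 v1 u2 v2 u3 v3
          +//   punpcklwd -> xmm0 = u0 v0 u0 v0 u1 v1 u1 v1 ...
          +// so each of the 4 chroma pairs is duplicated to cover 2 of the 8 pixels.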
  1.2204 +
  1.2205 +// Read 2 UV from 411, upsample to 8 UV.
  1.2206 +#define READYUV411 __asm {                                                     \
  1.2207 +    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
  1.2208 +    __asm movd       xmm0, ebx                                                 \
  1.2209 +    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
  1.2210 +    __asm movd       xmm1, ebx                                                 \
  1.2211 +    __asm lea        esi,  [esi + 2]                                           \
  1.2212 +    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  1.2213 +    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  1.2214 +    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  1.2215 +  }
  1.2216 +
  1.2217 +// Read 4 UV from NV12, upsample to 8 UV.
  1.2218 +#define READNV12 __asm {                                                       \
  1.2219 +    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
  1.2220 +    __asm lea        esi,  [esi + 8]                                           \
  1.2221 +    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  1.2222 +  }
  1.2223 +
  1.2224 +// Convert 8 pixels: 8 UV and 8 Y.
  1.2225 +#define YUVTORGB __asm {                                                       \
  1.2226 +    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
  1.2227 +    __asm movdqa     xmm1, xmm0                                                \
  1.2228 +    __asm movdqa     xmm2, xmm0                                                \
  1.2229 +    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
  1.2230 +    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
  1.2231 +    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
  1.2232 +    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
  1.2233 +    __asm psubw      xmm1, kUVBiasG                                            \
  1.2234 +    __asm psubw      xmm2, kUVBiasR                                            \
  1.2235 +    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
  1.2236 +    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
  1.2237 +    __asm lea        eax, [eax + 8]                                            \
  1.2238 +    __asm punpcklbw  xmm3, xmm4                                                \
  1.2239 +    __asm psubsw     xmm3, kYSub16                                             \
  1.2240 +    __asm pmullw     xmm3, kYToRgb                                             \
  1.2241 +    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
  1.2242 +    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
  1.2243 +    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
  1.2244 +    __asm psraw      xmm0, 6                                                   \
  1.2245 +    __asm psraw      xmm1, 6                                                   \
  1.2246 +    __asm psraw      xmm2, 6                                                   \
  1.2247 +    __asm packuswb   xmm0, xmm0           /* B */                              \
  1.2248 +    __asm packuswb   xmm1, xmm1           /* G */                              \
  1.2249 +    __asm packuswb   xmm2, xmm2           /* R */                              \
  1.2250 +  }
  1.2251 +
  1.2252 +// Convert 8 pixels: 8 VU and 8 Y.
  1.2253 +#define YVUTORGB __asm {                                                       \
  1.2254 +    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
  1.2255 +    __asm movdqa     xmm1, xmm0                                                \
  1.2256 +    __asm movdqa     xmm2, xmm0                                                \
  1.2257 +    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
  1.2258 +    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
  1.2259 +    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
  1.2260 +    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
  1.2261 +    __asm psubw      xmm1, kUVBiasG                                            \
  1.2262 +    __asm psubw      xmm2, kUVBiasR                                            \
  1.2263 +    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
  1.2264 +    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
  1.2265 +    __asm lea        eax, [eax + 8]                                            \
  1.2266 +    __asm punpcklbw  xmm3, xmm4                                                \
  1.2267 +    __asm psubsw     xmm3, kYSub16                                             \
  1.2268 +    __asm pmullw     xmm3, kYToRgb                                             \
  1.2269 +    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
  1.2270 +    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
  1.2271 +    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
  1.2272 +    __asm psraw      xmm0, 6                                                   \
  1.2273 +    __asm psraw      xmm1, 6                                                   \
  1.2274 +    __asm psraw      xmm2, 6                                                   \
  1.2275 +    __asm packuswb   xmm0, xmm0           /* B */                              \
  1.2276 +    __asm packuswb   xmm1, xmm1           /* G */                              \
  1.2277 +    __asm packuswb   xmm2, xmm2           /* R */                              \
  1.2278 +  }
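          +
          +// Illustration only (not part of the upstream source): a scalar model of
          +// the YUVTORGB math above, using the fixed-point constants defined earlier
          +// (YG = 74, UB = 127, UG = -25, VG = -52, VR = 102). Helper names are
          +// hypothetical; the real rows keep everything in xmm registers.
          +static __inline uint8 ClampSketch(int v) {
          +  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));  // like packuswb
          +}
          +static __inline void YuvToRgbSketch(uint8 y, uint8 u, uint8 v,
          +                                    uint8* b, uint8* g, uint8* r) {
          +  int y1 = (y - 16) * 74;  // psubsw kYSub16, pmullw kYToRgb
          +  *b = ClampSketch((y1 + 127 * (u - 128)) >> 6);
          +  *g = ClampSketch((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);
          +  *r = ClampSketch((y1 + 102 * (v - 128)) >> 6);
          +}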
  1.2279 +
  1.2280 +// 8 pixels, dest aligned 16.
  1.2281 +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2282 +__declspec(naked) __declspec(align(16))
  1.2283 +void I444ToARGBRow_SSSE3(const uint8* y_buf,
  1.2284 +                         const uint8* u_buf,
  1.2285 +                         const uint8* v_buf,
  1.2286 +                         uint8* dst_argb,
  1.2287 +                         int width) {
  1.2288 +  __asm {
  1.2289 +    push       esi
  1.2290 +    push       edi
  1.2291 +    mov        eax, [esp + 8 + 4]   // Y
  1.2292 +    mov        esi, [esp + 8 + 8]   // U
  1.2293 +    mov        edi, [esp + 8 + 12]  // V
  1.2294 +    mov        edx, [esp + 8 + 16]  // argb
  1.2295 +    mov        ecx, [esp + 8 + 20]  // width
  1.2296 +    sub        edi, esi
  1.2297 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2298 +    pxor       xmm4, xmm4
  1.2299 +
  1.2300 +    align      4
  1.2301 + convertloop:
  1.2302 +    READYUV444
  1.2303 +    YUVTORGB
  1.2304 +
  1.2305 +    // Step 3: Weave into ARGB
  1.2306 +    punpcklbw  xmm0, xmm1           // BG
  1.2307 +    punpcklbw  xmm2, xmm5           // RA
  1.2308 +    movdqa     xmm1, xmm0
  1.2309 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2310 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2311 +    movdqa     [edx], xmm0
  1.2312 +    movdqa     [edx + 16], xmm1
  1.2313 +    lea        edx,  [edx + 32]
  1.2314 +    sub        ecx, 8
  1.2315 +    jg         convertloop
  1.2316 +
  1.2317 +    pop        edi
  1.2318 +    pop        esi
  1.2319 +    ret
  1.2320 +  }
  1.2321 +}
  1.2322 +
   1.2323 +// 8 pixels.
   1.2324 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  1.2325 +__declspec(naked) __declspec(align(16))
  1.2326 +void I422ToRGB24Row_SSSE3(const uint8* y_buf,
  1.2327 +                          const uint8* u_buf,
  1.2328 +                          const uint8* v_buf,
  1.2329 +                          uint8* dst_rgb24,
  1.2330 +                          int width) {
  1.2331 +  __asm {
  1.2332 +    push       esi
  1.2333 +    push       edi
  1.2334 +    mov        eax, [esp + 8 + 4]   // Y
  1.2335 +    mov        esi, [esp + 8 + 8]   // U
  1.2336 +    mov        edi, [esp + 8 + 12]  // V
  1.2337 +    mov        edx, [esp + 8 + 16]  // rgb24
  1.2338 +    mov        ecx, [esp + 8 + 20]  // width
  1.2339 +    sub        edi, esi
  1.2340 +    pxor       xmm4, xmm4
  1.2341 +    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
  1.2342 +    movdqa     xmm6, kShuffleMaskARGBToRGB24
  1.2343 +
  1.2344 +    align      4
  1.2345 + convertloop:
  1.2346 +    READYUV422
  1.2347 +    YUVTORGB
  1.2348 +
  1.2349 +    // Step 3: Weave into RRGB
  1.2350 +    punpcklbw  xmm0, xmm1           // BG
  1.2351 +    punpcklbw  xmm2, xmm2           // RR
  1.2352 +    movdqa     xmm1, xmm0
  1.2353 +    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  1.2354 +    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  1.2355 +    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
  1.2356 +    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
  1.2357 +    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
  1.2358 +    movq       qword ptr [edx], xmm0  // First 8 bytes
   1.2359 +    movdqu     [edx + 8], xmm1      // Last 16 bytes; 24 in all = 8 RGB24 pixels.
  1.2360 +    lea        edx,  [edx + 24]
  1.2361 +    sub        ecx, 8
  1.2362 +    jg         convertloop
  1.2363 +
  1.2364 +    pop        edi
  1.2365 +    pop        esi
  1.2366 +    ret
  1.2367 +  }
  1.2368 +}
  1.2369 +
   1.2370 +// 8 pixels.
   1.2371 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
  1.2372 +__declspec(naked) __declspec(align(16))
  1.2373 +void I422ToRAWRow_SSSE3(const uint8* y_buf,
  1.2374 +                        const uint8* u_buf,
  1.2375 +                        const uint8* v_buf,
  1.2376 +                        uint8* dst_raw,
  1.2377 +                        int width) {
  1.2378 +  __asm {
  1.2379 +    push       esi
  1.2380 +    push       edi
  1.2381 +    mov        eax, [esp + 8 + 4]   // Y
  1.2382 +    mov        esi, [esp + 8 + 8]   // U
  1.2383 +    mov        edi, [esp + 8 + 12]  // V
  1.2384 +    mov        edx, [esp + 8 + 16]  // raw
  1.2385 +    mov        ecx, [esp + 8 + 20]  // width
  1.2386 +    sub        edi, esi
  1.2387 +    pxor       xmm4, xmm4
  1.2388 +    movdqa     xmm5, kShuffleMaskARGBToRAW_0
  1.2389 +    movdqa     xmm6, kShuffleMaskARGBToRAW
  1.2390 +
  1.2391 +    align      4
  1.2392 + convertloop:
  1.2393 +    READYUV422
  1.2394 +    YUVTORGB
  1.2395 +
  1.2396 +    // Step 3: Weave into RRGB
  1.2397 +    punpcklbw  xmm0, xmm1           // BG
  1.2398 +    punpcklbw  xmm2, xmm2           // RR
  1.2399 +    movdqa     xmm1, xmm0
  1.2400 +    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  1.2401 +    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  1.2402 +    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
  1.2403 +    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
  1.2404 +    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
  1.2405 +    movq       qword ptr [edx], xmm0  // First 8 bytes
   1.2406 +    movdqu     [edx + 8], xmm1      // Last 16 bytes; 24 in all = 8 RAW pixels.
  1.2407 +    lea        edx,  [edx + 24]
  1.2408 +    sub        ecx, 8
  1.2409 +    jg         convertloop
  1.2410 +
  1.2411 +    pop        edi
  1.2412 +    pop        esi
  1.2413 +    ret
  1.2414 +  }
  1.2415 +}
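          +
          +// Note: the only difference between I422ToRGB24Row_SSSE3 above and
          +// I422ToRAWRow_SSSE3 is the pair of shuffle masks: RGB24 stores bytes in
          +// B,G,R order and RAW stores them in R,G,B order.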
  1.2416 +
  1.2417 +// 8 pixels, dest unaligned.
   1.2418 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  1.2419 +__declspec(naked) __declspec(align(16))
  1.2420 +void I422ToRGB565Row_SSSE3(const uint8* y_buf,
  1.2421 +                           const uint8* u_buf,
  1.2422 +                           const uint8* v_buf,
  1.2423 +                           uint8* rgb565_buf,
  1.2424 +                           int width) {
  1.2425 +  __asm {
  1.2426 +    push       esi
  1.2427 +    push       edi
  1.2428 +    mov        eax, [esp + 8 + 4]   // Y
  1.2429 +    mov        esi, [esp + 8 + 8]   // U
  1.2430 +    mov        edi, [esp + 8 + 12]  // V
  1.2431 +    mov        edx, [esp + 8 + 16]  // rgb565
  1.2432 +    mov        ecx, [esp + 8 + 20]  // width
  1.2433 +    sub        edi, esi
  1.2434 +    pxor       xmm4, xmm4
  1.2435 +    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
  1.2436 +    psrld      xmm5, 27
  1.2437 +    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
  1.2438 +    psrld      xmm6, 26
  1.2439 +    pslld      xmm6, 5
  1.2440 +    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
  1.2441 +    pslld      xmm7, 11
  1.2442 +
  1.2443 +    align      4
  1.2444 + convertloop:
  1.2445 +    READYUV422
  1.2446 +    YUVTORGB
  1.2447 +
  1.2448 +    // Step 3: Weave into RRGB
  1.2449 +    punpcklbw  xmm0, xmm1           // BG
  1.2450 +    punpcklbw  xmm2, xmm2           // RR
  1.2451 +    movdqa     xmm1, xmm0
  1.2452 +    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  1.2453 +    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  1.2454 +
  1.2455 +    // Step 3b: RRGB -> RGB565
  1.2456 +    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
  1.2457 +    movdqa     xmm2, xmm0    // G
  1.2458 +    pslld      xmm0, 8       // R
  1.2459 +    psrld      xmm3, 3       // B
  1.2460 +    psrld      xmm2, 5       // G
  1.2461 +    psrad      xmm0, 16      // R
  1.2462 +    pand       xmm3, xmm5    // B
  1.2463 +    pand       xmm2, xmm6    // G
  1.2464 +    pand       xmm0, xmm7    // R
  1.2465 +    por        xmm3, xmm2    // BG
  1.2466 +    por        xmm0, xmm3    // BGR
  1.2467 +    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
  1.2468 +    movdqa     xmm2, xmm1    // G
  1.2469 +    pslld      xmm1, 8       // R
  1.2470 +    psrld      xmm3, 3       // B
  1.2471 +    psrld      xmm2, 5       // G
  1.2472 +    psrad      xmm1, 16      // R
  1.2473 +    pand       xmm3, xmm5    // B
  1.2474 +    pand       xmm2, xmm6    // G
  1.2475 +    pand       xmm1, xmm7    // R
  1.2476 +    por        xmm3, xmm2    // BG
  1.2477 +    por        xmm1, xmm3    // BGR
  1.2478 +    packssdw   xmm0, xmm1
  1.2479 +    sub        ecx, 8
  1.2480 +    movdqu     [edx], xmm0   // store 8 pixels of RGB565
  1.2481 +    lea        edx, [edx + 16]
  1.2482 +    jg         convertloop
  1.2483 +
  1.2484 +    pop        edi
  1.2485 +    pop        esi
  1.2486 +    ret
  1.2487 +  }
  1.2488 +}
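          +
          +// For reference, Step 3b above is the scalar packing
          +//   rgb565 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11)
          +// carried out on 4 pixels per register with the 0x0000001f, 0x000007e0 and
          +// 0xfffff800 masks built in xmm5, xmm6 and xmm7.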
  1.2489 +
  1.2490 +// 8 pixels, dest aligned 16.
  1.2491 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2492 +__declspec(naked) __declspec(align(16))
  1.2493 +void I422ToARGBRow_SSSE3(const uint8* y_buf,
  1.2494 +                         const uint8* u_buf,
  1.2495 +                         const uint8* v_buf,
  1.2496 +                         uint8* dst_argb,
  1.2497 +                         int width) {
  1.2498 +  __asm {
  1.2499 +    push       esi
  1.2500 +    push       edi
  1.2501 +    mov        eax, [esp + 8 + 4]   // Y
  1.2502 +    mov        esi, [esp + 8 + 8]   // U
  1.2503 +    mov        edi, [esp + 8 + 12]  // V
  1.2504 +    mov        edx, [esp + 8 + 16]  // argb
  1.2505 +    mov        ecx, [esp + 8 + 20]  // width
  1.2506 +    sub        edi, esi
  1.2507 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2508 +    pxor       xmm4, xmm4
  1.2509 +
  1.2510 +    align      4
  1.2511 + convertloop:
  1.2512 +    READYUV422
  1.2513 +    YUVTORGB
  1.2514 +
  1.2515 +    // Step 3: Weave into ARGB
  1.2516 +    punpcklbw  xmm0, xmm1           // BG
  1.2517 +    punpcklbw  xmm2, xmm5           // RA
  1.2518 +    movdqa     xmm1, xmm0
  1.2519 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2520 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2521 +    movdqa     [edx], xmm0
  1.2522 +    movdqa     [edx + 16], xmm1
  1.2523 +    lea        edx,  [edx + 32]
  1.2524 +    sub        ecx, 8
  1.2525 +    jg         convertloop
  1.2526 +
  1.2527 +    pop        edi
  1.2528 +    pop        esi
  1.2529 +    ret
  1.2530 +  }
  1.2531 +}
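          +
          +// Usage sketch (illustration only, not upstream code): converting a full
          +// I422 image with the row function above. Assumes width is a multiple of 8
          +// and that dst_argb and its stride are 16-byte aligned; otherwise the
          +// _Unaligned variant further below applies.
          +static void I422ToARGBPlaneSketch(const uint8* src_y, int stride_y,
          +                                  const uint8* src_u, int stride_u,
          +                                  const uint8* src_v, int stride_v,
          +                                  uint8* dst_argb, int stride_argb,
          +                                  int width, int height) {
          +  for (int y = 0; y < height; ++y) {
          +    I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, width);
          +    src_y += stride_y;
          +    src_u += stride_u;  // I422 chroma planes: full height, half width
          +    src_v += stride_v;
          +    dst_argb += stride_argb;
          +  }
          +}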
  1.2532 +
  1.2533 +// 8 pixels, dest aligned 16.
  1.2534 +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1.2535 +// Similar to I420, but duplicates UV once more.
  1.2536 +__declspec(naked) __declspec(align(16))
  1.2537 +void I411ToARGBRow_SSSE3(const uint8* y_buf,
  1.2538 +                         const uint8* u_buf,
  1.2539 +                         const uint8* v_buf,
  1.2540 +                         uint8* dst_argb,
  1.2541 +                         int width) {
  1.2542 +  __asm {
  1.2543 +    push       ebx
  1.2544 +    push       esi
  1.2545 +    push       edi
  1.2546 +    mov        eax, [esp + 12 + 4]   // Y
  1.2547 +    mov        esi, [esp + 12 + 8]   // U
  1.2548 +    mov        edi, [esp + 12 + 12]  // V
  1.2549 +    mov        edx, [esp + 12 + 16]  // argb
  1.2550 +    mov        ecx, [esp + 12 + 20]  // width
  1.2551 +    sub        edi, esi
  1.2552 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2553 +    pxor       xmm4, xmm4
  1.2554 +
  1.2555 +    align      4
  1.2556 + convertloop:
  1.2557 +    READYUV411  // modifies EBX
  1.2558 +    YUVTORGB
  1.2559 +
  1.2560 +    // Step 3: Weave into ARGB
  1.2561 +    punpcklbw  xmm0, xmm1           // BG
  1.2562 +    punpcklbw  xmm2, xmm5           // RA
  1.2563 +    movdqa     xmm1, xmm0
  1.2564 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2565 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2566 +    movdqa     [edx], xmm0
  1.2567 +    movdqa     [edx + 16], xmm1
  1.2568 +    lea        edx,  [edx + 32]
  1.2569 +    sub        ecx, 8
  1.2570 +    jg         convertloop
  1.2571 +
  1.2572 +    pop        edi
  1.2573 +    pop        esi
  1.2574 +    pop        ebx
  1.2575 +    ret
  1.2576 +  }
  1.2577 +}
  1.2578 +
  1.2579 +// 8 pixels, dest aligned 16.
  1.2580 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2581 +__declspec(naked) __declspec(align(16))
  1.2582 +void NV12ToARGBRow_SSSE3(const uint8* y_buf,
  1.2583 +                         const uint8* uv_buf,
  1.2584 +                         uint8* dst_argb,
  1.2585 +                         int width) {
  1.2586 +  __asm {
  1.2587 +    push       esi
  1.2588 +    mov        eax, [esp + 4 + 4]   // Y
  1.2589 +    mov        esi, [esp + 4 + 8]   // UV
  1.2590 +    mov        edx, [esp + 4 + 12]  // argb
  1.2591 +    mov        ecx, [esp + 4 + 16]  // width
  1.2592 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2593 +    pxor       xmm4, xmm4
  1.2594 +
  1.2595 +    align      4
  1.2596 + convertloop:
  1.2597 +    READNV12
  1.2598 +    YUVTORGB
  1.2599 +
  1.2600 +    // Step 3: Weave into ARGB
  1.2601 +    punpcklbw  xmm0, xmm1           // BG
  1.2602 +    punpcklbw  xmm2, xmm5           // RA
  1.2603 +    movdqa     xmm1, xmm0
  1.2604 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2605 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2606 +    movdqa     [edx], xmm0
  1.2607 +    movdqa     [edx + 16], xmm1
  1.2608 +    lea        edx,  [edx + 32]
  1.2609 +    sub        ecx, 8
  1.2610 +    jg         convertloop
  1.2611 +
  1.2612 +    pop        esi
  1.2613 +    ret
  1.2614 +  }
  1.2615 +}
  1.2616 +
  1.2617 +// 8 pixels, dest aligned 16.
  1.2618 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2619 +__declspec(naked) __declspec(align(16))
  1.2620 +void NV21ToARGBRow_SSSE3(const uint8* y_buf,
  1.2621 +                         const uint8* uv_buf,
  1.2622 +                         uint8* dst_argb,
  1.2623 +                         int width) {
  1.2624 +  __asm {
  1.2625 +    push       esi
  1.2626 +    mov        eax, [esp + 4 + 4]   // Y
  1.2627 +    mov        esi, [esp + 4 + 8]   // VU
  1.2628 +    mov        edx, [esp + 4 + 12]  // argb
  1.2629 +    mov        ecx, [esp + 4 + 16]  // width
  1.2630 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2631 +    pxor       xmm4, xmm4
  1.2632 +
  1.2633 +    align      4
  1.2634 + convertloop:
  1.2635 +    READNV12
  1.2636 +    YVUTORGB
  1.2637 +
  1.2638 +    // Step 3: Weave into ARGB
  1.2639 +    punpcklbw  xmm0, xmm1           // BG
  1.2640 +    punpcklbw  xmm2, xmm5           // RA
  1.2641 +    movdqa     xmm1, xmm0
  1.2642 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2643 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2644 +    movdqa     [edx], xmm0
  1.2645 +    movdqa     [edx + 16], xmm1
  1.2646 +    lea        edx,  [edx + 32]
  1.2647 +    sub        ecx, 8
  1.2648 +    jg         convertloop
  1.2649 +
  1.2650 +    pop        esi
  1.2651 +    ret
  1.2652 +  }
  1.2653 +}
  1.2654 +
  1.2655 +// 8 pixels, unaligned.
  1.2656 +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2657 +__declspec(naked) __declspec(align(16))
  1.2658 +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2659 +                                   const uint8* u_buf,
  1.2660 +                                   const uint8* v_buf,
  1.2661 +                                   uint8* dst_argb,
  1.2662 +                                   int width) {
  1.2663 +  __asm {
  1.2664 +    push       esi
  1.2665 +    push       edi
  1.2666 +    mov        eax, [esp + 8 + 4]   // Y
  1.2667 +    mov        esi, [esp + 8 + 8]   // U
  1.2668 +    mov        edi, [esp + 8 + 12]  // V
  1.2669 +    mov        edx, [esp + 8 + 16]  // argb
  1.2670 +    mov        ecx, [esp + 8 + 20]  // width
  1.2671 +    sub        edi, esi
  1.2672 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2673 +    pxor       xmm4, xmm4
  1.2674 +
  1.2675 +    align      4
  1.2676 + convertloop:
  1.2677 +    READYUV444
  1.2678 +    YUVTORGB
  1.2679 +
  1.2680 +    // Step 3: Weave into ARGB
  1.2681 +    punpcklbw  xmm0, xmm1           // BG
  1.2682 +    punpcklbw  xmm2, xmm5           // RA
  1.2683 +    movdqa     xmm1, xmm0
  1.2684 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2685 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2686 +    movdqu     [edx], xmm0
  1.2687 +    movdqu     [edx + 16], xmm1
  1.2688 +    lea        edx,  [edx + 32]
  1.2689 +    sub        ecx, 8
  1.2690 +    jg         convertloop
  1.2691 +
  1.2692 +    pop        edi
  1.2693 +    pop        esi
  1.2694 +    ret
  1.2695 +  }
  1.2696 +}
  1.2697 +
  1.2698 +// 8 pixels, unaligned.
  1.2699 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2700 +__declspec(naked) __declspec(align(16))
  1.2701 +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2702 +                                   const uint8* u_buf,
  1.2703 +                                   const uint8* v_buf,
  1.2704 +                                   uint8* dst_argb,
  1.2705 +                                   int width) {
  1.2706 +  __asm {
  1.2707 +    push       esi
  1.2708 +    push       edi
  1.2709 +    mov        eax, [esp + 8 + 4]   // Y
  1.2710 +    mov        esi, [esp + 8 + 8]   // U
  1.2711 +    mov        edi, [esp + 8 + 12]  // V
  1.2712 +    mov        edx, [esp + 8 + 16]  // argb
  1.2713 +    mov        ecx, [esp + 8 + 20]  // width
  1.2714 +    sub        edi, esi
  1.2715 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2716 +    pxor       xmm4, xmm4
  1.2717 +
  1.2718 +    align      4
  1.2719 + convertloop:
  1.2720 +    READYUV422
  1.2721 +    YUVTORGB
  1.2722 +
  1.2723 +    // Step 3: Weave into ARGB
  1.2724 +    punpcklbw  xmm0, xmm1           // BG
  1.2725 +    punpcklbw  xmm2, xmm5           // RA
  1.2726 +    movdqa     xmm1, xmm0
  1.2727 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2728 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2729 +    movdqu     [edx], xmm0
  1.2730 +    movdqu     [edx + 16], xmm1
  1.2731 +    lea        edx,  [edx + 32]
  1.2732 +    sub        ecx, 8
  1.2733 +    jg         convertloop
  1.2734 +
  1.2735 +    pop        edi
  1.2736 +    pop        esi
  1.2737 +    ret
  1.2738 +  }
  1.2739 +}
  1.2740 +
  1.2741 +// 8 pixels, unaligned.
  1.2742 +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
   1.2743 +// Similar to I420, but duplicates UV once more.
  1.2744 +__declspec(naked) __declspec(align(16))
  1.2745 +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2746 +                                   const uint8* u_buf,
  1.2747 +                                   const uint8* v_buf,
  1.2748 +                                   uint8* dst_argb,
  1.2749 +                                   int width) {
  1.2750 +  __asm {
  1.2751 +    push       ebx
  1.2752 +    push       esi
  1.2753 +    push       edi
  1.2754 +    mov        eax, [esp + 12 + 4]   // Y
  1.2755 +    mov        esi, [esp + 12 + 8]   // U
  1.2756 +    mov        edi, [esp + 12 + 12]  // V
  1.2757 +    mov        edx, [esp + 12 + 16]  // argb
  1.2758 +    mov        ecx, [esp + 12 + 20]  // width
  1.2759 +    sub        edi, esi
  1.2760 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2761 +    pxor       xmm4, xmm4
  1.2762 +
  1.2763 +    align      4
  1.2764 + convertloop:
  1.2765 +    READYUV411  // modifies EBX
  1.2766 +    YUVTORGB
  1.2767 +
  1.2768 +    // Step 3: Weave into ARGB
  1.2769 +    punpcklbw  xmm0, xmm1           // BG
  1.2770 +    punpcklbw  xmm2, xmm5           // RA
  1.2771 +    movdqa     xmm1, xmm0
  1.2772 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2773 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2774 +    movdqu     [edx], xmm0
  1.2775 +    movdqu     [edx + 16], xmm1
  1.2776 +    lea        edx,  [edx + 32]
  1.2777 +    sub        ecx, 8
  1.2778 +    jg         convertloop
  1.2779 +
  1.2780 +    pop        edi
  1.2781 +    pop        esi
  1.2782 +    pop        ebx
  1.2783 +    ret
  1.2784 +  }
  1.2785 +}
  1.2786 +
   1.2787 +// 8 pixels, unaligned.
  1.2788 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2789 +__declspec(naked) __declspec(align(16))
  1.2790 +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2791 +                                   const uint8* uv_buf,
  1.2792 +                                   uint8* dst_argb,
  1.2793 +                                   int width) {
  1.2794 +  __asm {
  1.2795 +    push       esi
  1.2796 +    mov        eax, [esp + 4 + 4]   // Y
  1.2797 +    mov        esi, [esp + 4 + 8]   // UV
  1.2798 +    mov        edx, [esp + 4 + 12]  // argb
  1.2799 +    mov        ecx, [esp + 4 + 16]  // width
  1.2800 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2801 +    pxor       xmm4, xmm4
  1.2802 +
  1.2803 +    align      4
  1.2804 + convertloop:
  1.2805 +    READNV12
  1.2806 +    YUVTORGB
  1.2807 +
  1.2808 +    // Step 3: Weave into ARGB
  1.2809 +    punpcklbw  xmm0, xmm1           // BG
  1.2810 +    punpcklbw  xmm2, xmm5           // RA
  1.2811 +    movdqa     xmm1, xmm0
  1.2812 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2813 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2814 +    movdqu     [edx], xmm0
  1.2815 +    movdqu     [edx + 16], xmm1
  1.2816 +    lea        edx,  [edx + 32]
  1.2817 +    sub        ecx, 8
  1.2818 +    jg         convertloop
  1.2819 +
  1.2820 +    pop        esi
  1.2821 +    ret
  1.2822 +  }
  1.2823 +}
  1.2824 +
   1.2825 +// 8 pixels, unaligned.
  1.2826 +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  1.2827 +__declspec(naked) __declspec(align(16))
  1.2828 +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2829 +                                   const uint8* uv_buf,
  1.2830 +                                   uint8* dst_argb,
  1.2831 +                                   int width) {
  1.2832 +  __asm {
  1.2833 +    push       esi
  1.2834 +    mov        eax, [esp + 4 + 4]   // Y
  1.2835 +    mov        esi, [esp + 4 + 8]   // VU
  1.2836 +    mov        edx, [esp + 4 + 12]  // argb
  1.2837 +    mov        ecx, [esp + 4 + 16]  // width
  1.2838 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2839 +    pxor       xmm4, xmm4
  1.2840 +
  1.2841 +    align      4
  1.2842 + convertloop:
  1.2843 +    READNV12
  1.2844 +    YVUTORGB
  1.2845 +
  1.2846 +    // Step 3: Weave into ARGB
  1.2847 +    punpcklbw  xmm0, xmm1           // BG
  1.2848 +    punpcklbw  xmm2, xmm5           // RA
  1.2849 +    movdqa     xmm1, xmm0
  1.2850 +    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  1.2851 +    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  1.2852 +    movdqu     [edx], xmm0
  1.2853 +    movdqu     [edx + 16], xmm1
  1.2854 +    lea        edx,  [edx + 32]
  1.2855 +    sub        ecx, 8
  1.2856 +    jg         convertloop
  1.2857 +
  1.2858 +    pop        esi
  1.2859 +    ret
  1.2860 +  }
  1.2861 +}
  1.2862 +
  1.2863 +__declspec(naked) __declspec(align(16))
  1.2864 +void I422ToBGRARow_SSSE3(const uint8* y_buf,
  1.2865 +                         const uint8* u_buf,
  1.2866 +                         const uint8* v_buf,
  1.2867 +                         uint8* dst_bgra,
  1.2868 +                         int width) {
  1.2869 +  __asm {
  1.2870 +    push       esi
  1.2871 +    push       edi
  1.2872 +    mov        eax, [esp + 8 + 4]   // Y
  1.2873 +    mov        esi, [esp + 8 + 8]   // U
  1.2874 +    mov        edi, [esp + 8 + 12]  // V
  1.2875 +    mov        edx, [esp + 8 + 16]  // bgra
  1.2876 +    mov        ecx, [esp + 8 + 20]  // width
  1.2877 +    sub        edi, esi
  1.2878 +    pxor       xmm4, xmm4
  1.2879 +
  1.2880 +    align      4
  1.2881 + convertloop:
  1.2882 +    READYUV422
  1.2883 +    YUVTORGB
  1.2884 +
  1.2885 +    // Step 3: Weave into BGRA
   1.2886 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha (redone each loop; punpcklbw below consumes xmm5)
  1.2887 +    punpcklbw  xmm1, xmm0           // GB
  1.2888 +    punpcklbw  xmm5, xmm2           // AR
  1.2889 +    movdqa     xmm0, xmm5
  1.2890 +    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
  1.2891 +    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
  1.2892 +    movdqa     [edx], xmm5
  1.2893 +    movdqa     [edx + 16], xmm0
  1.2894 +    lea        edx,  [edx + 32]
  1.2895 +    sub        ecx, 8
  1.2896 +    jg         convertloop
  1.2897 +
  1.2898 +    pop        edi
  1.2899 +    pop        esi
  1.2900 +    ret
  1.2901 +  }
  1.2902 +}
  1.2903 +
  1.2904 +__declspec(naked) __declspec(align(16))
  1.2905 +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
  1.2906 +                                   const uint8* u_buf,
  1.2907 +                                   const uint8* v_buf,
  1.2908 +                                   uint8* dst_bgra,
  1.2909 +                                   int width) {
  1.2910 +  __asm {
  1.2911 +    push       esi
  1.2912 +    push       edi
  1.2913 +    mov        eax, [esp + 8 + 4]   // Y
  1.2914 +    mov        esi, [esp + 8 + 8]   // U
  1.2915 +    mov        edi, [esp + 8 + 12]  // V
  1.2916 +    mov        edx, [esp + 8 + 16]  // bgra
  1.2917 +    mov        ecx, [esp + 8 + 20]  // width
  1.2918 +    sub        edi, esi
  1.2919 +    pxor       xmm4, xmm4
  1.2920 +
  1.2921 +    align      4
  1.2922 + convertloop:
  1.2923 +    READYUV422
  1.2924 +    YUVTORGB
  1.2925 +
  1.2926 +    // Step 3: Weave into BGRA
   1.2927 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha (redone each loop; punpcklbw below consumes xmm5)
  1.2928 +    punpcklbw  xmm1, xmm0           // GB
  1.2929 +    punpcklbw  xmm5, xmm2           // AR
  1.2930 +    movdqa     xmm0, xmm5
  1.2931 +    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
  1.2932 +    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
  1.2933 +    movdqu     [edx], xmm5
  1.2934 +    movdqu     [edx + 16], xmm0
  1.2935 +    lea        edx,  [edx + 32]
  1.2936 +    sub        ecx, 8
  1.2937 +    jg         convertloop
  1.2938 +
  1.2939 +    pop        edi
  1.2940 +    pop        esi
  1.2941 +    ret
  1.2942 +  }
  1.2943 +}
  1.2944 +
  1.2945 +__declspec(naked) __declspec(align(16))
  1.2946 +void I422ToABGRRow_SSSE3(const uint8* y_buf,
  1.2947 +                         const uint8* u_buf,
  1.2948 +                         const uint8* v_buf,
  1.2949 +                         uint8* dst_abgr,
  1.2950 +                         int width) {
  1.2951 +  __asm {
  1.2952 +    push       esi
  1.2953 +    push       edi
  1.2954 +    mov        eax, [esp + 8 + 4]   // Y
  1.2955 +    mov        esi, [esp + 8 + 8]   // U
  1.2956 +    mov        edi, [esp + 8 + 12]  // V
  1.2957 +    mov        edx, [esp + 8 + 16]  // abgr
  1.2958 +    mov        ecx, [esp + 8 + 20]  // width
  1.2959 +    sub        edi, esi
  1.2960 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.2961 +    pxor       xmm4, xmm4
  1.2962 +
  1.2963 +    align      4
  1.2964 + convertloop:
  1.2965 +    READYUV422
  1.2966 +    YUVTORGB
  1.2967 +
  1.2968 +    // Step 3: Weave into ARGB
  1.2969 +    punpcklbw  xmm2, xmm1           // RG
  1.2970 +    punpcklbw  xmm0, xmm5           // BA
  1.2971 +    movdqa     xmm1, xmm2
  1.2972 +    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
  1.2973 +    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
  1.2974 +    movdqa     [edx], xmm2
  1.2975 +    movdqa     [edx + 16], xmm1
  1.2976 +    lea        edx,  [edx + 32]
  1.2977 +    sub        ecx, 8
  1.2978 +    jg         convertloop
  1.2979 +
  1.2980 +    pop        edi
  1.2981 +    pop        esi
  1.2982 +    ret
  1.2983 +  }
  1.2984 +}
  1.2985 +
  1.2986 +__declspec(naked) __declspec(align(16))
  1.2987 +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
  1.2988 +                                   const uint8* u_buf,
  1.2989 +                                   const uint8* v_buf,
  1.2990 +                                   uint8* dst_abgr,
  1.2991 +                                   int width) {
  1.2992 +  __asm {
  1.2993 +    push       esi
  1.2994 +    push       edi
  1.2995 +    mov        eax, [esp + 8 + 4]   // Y
  1.2996 +    mov        esi, [esp + 8 + 8]   // U
  1.2997 +    mov        edi, [esp + 8 + 12]  // V
  1.2998 +    mov        edx, [esp + 8 + 16]  // abgr
  1.2999 +    mov        ecx, [esp + 8 + 20]  // width
  1.3000 +    sub        edi, esi
  1.3001 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  1.3002 +    pxor       xmm4, xmm4
  1.3003 +
  1.3004 +    align      4
  1.3005 + convertloop:
  1.3006 +    READYUV422
  1.3007 +    YUVTORGB
  1.3008 +
  1.3009 +    // Step 3: Weave into ARGB
  1.3010 +    punpcklbw  xmm2, xmm1           // RG
  1.3011 +    punpcklbw  xmm0, xmm5           // BA
  1.3012 +    movdqa     xmm1, xmm2
  1.3013 +    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
  1.3014 +    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
  1.3015 +    movdqu     [edx], xmm2
  1.3016 +    movdqu     [edx + 16], xmm1
  1.3017 +    lea        edx,  [edx + 32]
  1.3018 +    sub        ecx, 8
  1.3019 +    jg         convertloop
  1.3020 +
  1.3021 +    pop        edi
  1.3022 +    pop        esi
  1.3023 +    ret
  1.3024 +  }
  1.3025 +}
  1.3026 +
  1.3027 +__declspec(naked) __declspec(align(16))
  1.3028 +void I422ToRGBARow_SSSE3(const uint8* y_buf,
  1.3029 +                         const uint8* u_buf,
  1.3030 +                         const uint8* v_buf,
  1.3031 +                         uint8* dst_rgba,
  1.3032 +                         int width) {
  1.3033 +  __asm {
  1.3034 +    push       esi
  1.3035 +    push       edi
  1.3036 +    mov        eax, [esp + 8 + 4]   // Y
  1.3037 +    mov        esi, [esp + 8 + 8]   // U
  1.3038 +    mov        edi, [esp + 8 + 12]  // V
  1.3039 +    mov        edx, [esp + 8 + 16]  // rgba
  1.3040 +    mov        ecx, [esp + 8 + 20]  // width
  1.3041 +    sub        edi, esi
  1.3042 +    pxor       xmm4, xmm4
  1.3043 +
  1.3044 +    align      4
  1.3045 + convertloop:
  1.3046 +    READYUV422
  1.3047 +    YUVTORGB
  1.3048 +
  1.3049 +    // Step 3: Weave into RGBA
   1.3050 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha (redone each loop; punpcklbw below consumes xmm5)
  1.3051 +    punpcklbw  xmm1, xmm2           // GR
  1.3052 +    punpcklbw  xmm5, xmm0           // AB
  1.3053 +    movdqa     xmm0, xmm5
  1.3054 +    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
  1.3055 +    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
  1.3056 +    movdqa     [edx], xmm5
  1.3057 +    movdqa     [edx + 16], xmm0
  1.3058 +    lea        edx,  [edx + 32]
  1.3059 +    sub        ecx, 8
  1.3060 +    jg         convertloop
  1.3061 +
  1.3062 +    pop        edi
  1.3063 +    pop        esi
  1.3064 +    ret
  1.3065 +  }
  1.3066 +}
  1.3067 +
  1.3068 +__declspec(naked) __declspec(align(16))
  1.3069 +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
  1.3070 +                                   const uint8* u_buf,
  1.3071 +                                   const uint8* v_buf,
  1.3072 +                                   uint8* dst_rgba,
  1.3073 +                                   int width) {
  1.3074 +  __asm {
  1.3075 +    push       esi
  1.3076 +    push       edi
  1.3077 +    mov        eax, [esp + 8 + 4]   // Y
  1.3078 +    mov        esi, [esp + 8 + 8]   // U
  1.3079 +    mov        edi, [esp + 8 + 12]  // V
  1.3080 +    mov        edx, [esp + 8 + 16]  // rgba
  1.3081 +    mov        ecx, [esp + 8 + 20]  // width
  1.3082 +    sub        edi, esi
  1.3083 +    pxor       xmm4, xmm4
  1.3084 +
  1.3085 +    align      4
  1.3086 + convertloop:
  1.3087 +    READYUV422
  1.3088 +    YUVTORGB
  1.3089 +
  1.3090 +    // Step 3: Weave into RGBA
   1.3091 +    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha (redone each loop; punpcklbw below consumes xmm5)
  1.3092 +    punpcklbw  xmm1, xmm2           // GR
  1.3093 +    punpcklbw  xmm5, xmm0           // AB
  1.3094 +    movdqa     xmm0, xmm5
  1.3095 +    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
  1.3096 +    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
  1.3097 +    movdqu     [edx], xmm5
  1.3098 +    movdqu     [edx + 16], xmm0
  1.3099 +    lea        edx,  [edx + 32]
  1.3100 +    sub        ecx, 8
  1.3101 +    jg         convertloop
  1.3102 +
  1.3103 +    pop        edi
  1.3104 +    pop        esi
  1.3105 +    ret
  1.3106 +  }
  1.3107 +}
  1.3108 +
  1.3109 +#endif  // HAS_I422TOARGBROW_SSSE3
  1.3110 +
  1.3111 +#ifdef HAS_YTOARGBROW_SSE2
  1.3112 +__declspec(naked) __declspec(align(16))
  1.3113 +void YToARGBRow_SSE2(const uint8* y_buf,
  1.3114 +                     uint8* rgb_buf,
  1.3115 +                     int width) {
  1.3116 +  __asm {
  1.3117 +    pxor       xmm5, xmm5
  1.3118 +    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
  1.3119 +    pslld      xmm4, 24
   1.3120 +    mov        eax, 0x00100010      // 16 (Y offset), as two words
   1.3121 +    movd       xmm3, eax
   1.3122 +    pshufd     xmm3, xmm3, 0
   1.3123 +    mov        eax, 0x004a004a      // 74 = round(1.164 * 64) (Y scale)
   1.3124 +    movd       xmm2, eax
   1.3125 +    pshufd     xmm2, xmm2, 0
  1.3126 +    mov        eax, [esp + 4]       // Y
  1.3127 +    mov        edx, [esp + 8]       // rgb
  1.3128 +    mov        ecx, [esp + 12]      // width
  1.3129 +
  1.3130 +    align      4
  1.3131 + convertloop:
  1.3132 +    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  1.3133 +    movq       xmm0, qword ptr [eax]
  1.3134 +    lea        eax, [eax + 8]
  1.3135 +    punpcklbw  xmm0, xmm5           // 0.Y
  1.3136 +    psubusw    xmm0, xmm3
  1.3137 +    pmullw     xmm0, xmm2
  1.3138 +    psrlw      xmm0, 6
  1.3139 +    packuswb   xmm0, xmm0           // G
  1.3140 +
  1.3141 +    // Step 2: Weave into ARGB
  1.3142 +    punpcklbw  xmm0, xmm0           // GG
  1.3143 +    movdqa     xmm1, xmm0
  1.3144 +    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
  1.3145 +    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
  1.3146 +    por        xmm0, xmm4
  1.3147 +    por        xmm1, xmm4
  1.3148 +    movdqa     [edx], xmm0
  1.3149 +    movdqa     [edx + 16], xmm1
  1.3150 +    lea        edx,  [edx + 32]
  1.3151 +    sub        ecx, 8
  1.3152 +    jg         convertloop
  1.3153 +
  1.3154 +    ret
  1.3155 +  }
  1.3156 +}
  1.3157 +#endif  // HAS_YTOARGBROW_SSE2
  1.3158 +
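          +// A hypothetical scalar equivalent of YToARGBRow_SSE2 above (an
          +// illustration, not libyuv's C path): psubusw subtracts 16 with
          +// unsigned saturation, pmullw scales by 74 (74 = round(1.164 * 64)),
          +// psrlw 6 drops the fraction, and packuswb clamps to 255.
          +static void YToARGBRow_C_sketch(const uint8* y_buf, uint8* rgb_buf,
          +                                int width) {
          +  for (int i = 0; i < width; ++i) {
          +    int y = y_buf[i] > 16 ? y_buf[i] - 16 : 0;  // psubusw
          +    int g = (y * 74) >> 6;                      // pmullw + psrlw 6
          +    if (g > 255) g = 255;                       // packuswb saturates
          +    rgb_buf[i * 4 + 0] = (uint8)g;
          +    rgb_buf[i * 4 + 1] = (uint8)g;
          +    rgb_buf[i * 4 + 2] = (uint8)g;
          +    rgb_buf[i * 4 + 3] = 255u;                  // por with 0xff000000
          +  }
          +}
          +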
  1.3159 +#ifdef HAS_MIRRORROW_SSSE3
  1.3160 +// Shuffle table for reversing the bytes.
  1.3161 +static const uvec8 kShuffleMirror = {
  1.3162 +  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  1.3163 +};
  1.3164 +
  1.3165 +__declspec(naked) __declspec(align(16))
  1.3166 +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  1.3167 +  __asm {
  1.3168 +    mov       eax, [esp + 4]   // src
  1.3169 +    mov       edx, [esp + 8]   // dst
  1.3170 +    mov       ecx, [esp + 12]  // width
  1.3171 +    movdqa    xmm5, kShuffleMirror
  1.3172 +    lea       eax, [eax - 16]
  1.3173 +
  1.3174 +    align      4
  1.3175 + convertloop:
  1.3176 +    movdqa    xmm0, [eax + ecx]
  1.3177 +    pshufb    xmm0, xmm5
  1.3178 +    sub       ecx, 16
  1.3179 +    movdqa    [edx], xmm0
  1.3180 +    lea       edx, [edx + 16]
  1.3181 +    jg        convertloop
  1.3182 +    ret
  1.3183 +  }
  1.3184 +}
  1.3185 +#endif  // HAS_MIRRORROW_SSSE3
  1.3186 +
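          +// What the pshufb above computes, as a hypothetical scalar sketch:
          +// kShuffleMirror maps output byte i to input byte 15 - i, and the
          +// loop walks the source backwards via the [eax + ecx] addressing.
          +static void MirrorRow_C_sketch(const uint8* src, uint8* dst,
          +                               int width) {
          +  for (int i = 0; i < width; ++i) {
          +    dst[i] = src[width - 1 - i];
          +  }
          +}
          +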
  1.3187 +#ifdef HAS_MIRRORROW_AVX2
  1.3188 +// Shuffle table for reversing the bytes.
  1.3189 +static const ulvec8 kShuffleMirror_AVX2 = {
  1.3190 +  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
  1.3191 +  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  1.3192 +};
  1.3193 +
  1.3194 +__declspec(naked) __declspec(align(16))
  1.3195 +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  1.3196 +  __asm {
  1.3197 +    mov       eax, [esp + 4]   // src
  1.3198 +    mov       edx, [esp + 8]   // dst
  1.3199 +    mov       ecx, [esp + 12]  // width
  1.3200 +    vmovdqa   ymm5, kShuffleMirror_AVX2
  1.3201 +    lea       eax, [eax - 32]
  1.3202 +
  1.3203 +    align      4
  1.3204 + convertloop:
  1.3205 +    vmovdqu   ymm0, [eax + ecx]
  1.3206 +    vpshufb   ymm0, ymm0, ymm5
   1.3207 +    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
  1.3208 +    sub       ecx, 32
  1.3209 +    vmovdqu   [edx], ymm0
  1.3210 +    lea       edx, [edx + 32]
  1.3211 +    jg        convertloop
  1.3212 +    vzeroupper
  1.3213 +    ret
  1.3214 +  }
  1.3215 +}
  1.3216 +#endif  // HAS_MIRRORROW_AVX2
  1.3217 +
  1.3218 +#ifdef HAS_MIRRORROW_SSE2
   1.3219 +// The SSE2 version uses movdqu, so it can be used on unaligned buffers
   1.3220 +// when the SSSE3 version cannot.
  1.3221 +__declspec(naked) __declspec(align(16))
  1.3222 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  1.3223 +  __asm {
  1.3224 +    mov       eax, [esp + 4]   // src
  1.3225 +    mov       edx, [esp + 8]   // dst
  1.3226 +    mov       ecx, [esp + 12]  // width
  1.3227 +    lea       eax, [eax - 16]
  1.3228 +
  1.3229 +    align      4
  1.3230 + convertloop:
  1.3231 +    movdqu    xmm0, [eax + ecx]
  1.3232 +    movdqa    xmm1, xmm0        // swap bytes
  1.3233 +    psllw     xmm0, 8
  1.3234 +    psrlw     xmm1, 8
  1.3235 +    por       xmm0, xmm1
  1.3236 +    pshuflw   xmm0, xmm0, 0x1b  // swap words
  1.3237 +    pshufhw   xmm0, xmm0, 0x1b
  1.3238 +    pshufd    xmm0, xmm0, 0x4e  // swap qwords
  1.3239 +    sub       ecx, 16
  1.3240 +    movdqu    [edx], xmm0
  1.3241 +    lea       edx, [edx + 16]
  1.3242 +    jg        convertloop
  1.3243 +    ret
  1.3244 +  }
  1.3245 +}
  1.3246 +#endif  // HAS_MIRRORROW_SSE2
  1.3247 +
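          +// Conceptual model (a sketch, not the exact instruction sequence) of
          +// the SSE2 mirror above: a 16 byte reversal done as power-of-two
          +// stages - swap bytes within words, then words, then the two qwords.
          +static void Reverse16_C_sketch(uint8 v[16]) {
          +  for (int step = 1; step < 16; step *= 2) {  // 1, 2, 4, 8
          +    for (int i = 0; i < 16; i += 2 * step) {
          +      for (int j = 0; j < step; ++j) {  // swap adjacent blocks
          +        uint8 t = v[i + j];
          +        v[i + j] = v[i + step + j];
          +        v[i + step + j] = t;
          +      }
          +    }
          +  }
          +}
          +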
  1.3248 +#ifdef HAS_MIRRORROW_UV_SSSE3
  1.3249 +// Shuffle table for reversing the bytes of UV channels.
  1.3250 +static const uvec8 kShuffleMirrorUV = {
  1.3251 +  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  1.3252 +};
  1.3253 +
  1.3254 +__declspec(naked) __declspec(align(16))
  1.3255 +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  1.3256 +                       int width) {
  1.3257 +  __asm {
  1.3258 +    push      edi
  1.3259 +    mov       eax, [esp + 4 + 4]   // src
  1.3260 +    mov       edx, [esp + 4 + 8]   // dst_u
  1.3261 +    mov       edi, [esp + 4 + 12]  // dst_v
  1.3262 +    mov       ecx, [esp + 4 + 16]  // width
  1.3263 +    movdqa    xmm1, kShuffleMirrorUV
  1.3264 +    lea       eax, [eax + ecx * 2 - 16]
  1.3265 +    sub       edi, edx
  1.3266 +
  1.3267 +    align      4
  1.3268 + convertloop:
  1.3269 +    movdqa    xmm0, [eax]
  1.3270 +    lea       eax, [eax - 16]
  1.3271 +    pshufb    xmm0, xmm1
  1.3272 +    sub       ecx, 8
  1.3273 +    movlpd    qword ptr [edx], xmm0
  1.3274 +    movhpd    qword ptr [edx + edi], xmm0
  1.3275 +    lea       edx, [edx + 8]
  1.3276 +    jg        convertloop
  1.3277 +
  1.3278 +    pop       edi
  1.3279 +    ret
  1.3280 +  }
  1.3281 +}
  1.3282 +#endif  // HAS_MIRRORROW_UV_SSSE3
  1.3283 +
  1.3284 +#ifdef HAS_ARGBMIRRORROW_SSSE3
   1.3285 +// Shuffle table for reversing whole ARGB pixels (dwords), not bytes.
  1.3286 +static const uvec8 kARGBShuffleMirror = {
  1.3287 +  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
  1.3288 +};
  1.3289 +
  1.3290 +__declspec(naked) __declspec(align(16))
  1.3291 +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  1.3292 +  __asm {
  1.3293 +    mov       eax, [esp + 4]   // src
  1.3294 +    mov       edx, [esp + 8]   // dst
  1.3295 +    mov       ecx, [esp + 12]  // width
  1.3296 +    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
  1.3297 +    movdqa    xmm5, kARGBShuffleMirror
  1.3298 +
  1.3299 +    align      4
  1.3300 + convertloop:
  1.3301 +    movdqa    xmm0, [eax]
  1.3302 +    lea       eax, [eax - 16]
  1.3303 +    pshufb    xmm0, xmm5
  1.3304 +    sub       ecx, 4
  1.3305 +    movdqa    [edx], xmm0
  1.3306 +    lea       edx, [edx + 16]
  1.3307 +    jg        convertloop
  1.3308 +    ret
  1.3309 +  }
  1.3310 +}
  1.3311 +#endif  // HAS_ARGBMIRRORROW_SSSE3
  1.3312 +
  1.3313 +#ifdef HAS_ARGBMIRRORROW_AVX2
   1.3314 +// Permute table (for vpermd) reversing whole ARGB pixels (dwords).
  1.3315 +static const ulvec32 kARGBShuffleMirror_AVX2 = {
  1.3316 +  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  1.3317 +};
  1.3318 +
  1.3319 +__declspec(naked) __declspec(align(16))
  1.3320 +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  1.3321 +  __asm {
  1.3322 +    mov       eax, [esp + 4]   // src
  1.3323 +    mov       edx, [esp + 8]   // dst
  1.3324 +    mov       ecx, [esp + 12]  // width
  1.3325 +    lea       eax, [eax - 32]
  1.3326 +    vmovdqa   ymm5, kARGBShuffleMirror_AVX2
  1.3327 +
  1.3328 +    align      4
  1.3329 + convertloop:
  1.3330 +    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
  1.3331 +    sub       ecx, 8
  1.3332 +    vmovdqu   [edx], ymm0
  1.3333 +    lea       edx, [edx + 32]
  1.3334 +    jg        convertloop
  1.3335 +    vzeroupper
  1.3336 +    ret
  1.3337 +  }
  1.3338 +}
  1.3339 +#endif  // HAS_ARGBMIRRORROW_AVX2
  1.3340 +
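          +// Hypothetical scalar equivalent of the two ARGB mirrors above:
          +// whole 4 byte pixels are reversed, not bytes, so byte order within
          +// each pixel is preserved. Assumes 4 byte aligned pointers.
          +static void ARGBMirrorRow_C_sketch(const uint8* src, uint8* dst,
          +                                   int width) {
          +  const uint32* s = (const uint32*)src;
          +  uint32* d = (uint32*)dst;
          +  for (int i = 0; i < width; ++i) {
          +    d[i] = s[width - 1 - i];
          +  }
          +}
          +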
  1.3341 +#ifdef HAS_SPLITUVROW_SSE2
  1.3342 +__declspec(naked) __declspec(align(16))
  1.3343 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  1.3344 +  __asm {
  1.3345 +    push       edi
  1.3346 +    mov        eax, [esp + 4 + 4]    // src_uv
  1.3347 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.3348 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.3349 +    mov        ecx, [esp + 4 + 16]   // pix
  1.3350 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.3351 +    psrlw      xmm5, 8
  1.3352 +    sub        edi, edx
  1.3353 +
  1.3354 +    align      4
  1.3355 +  convertloop:
  1.3356 +    movdqa     xmm0, [eax]
  1.3357 +    movdqa     xmm1, [eax + 16]
  1.3358 +    lea        eax,  [eax + 32]
  1.3359 +    movdqa     xmm2, xmm0
  1.3360 +    movdqa     xmm3, xmm1
  1.3361 +    pand       xmm0, xmm5   // even bytes
  1.3362 +    pand       xmm1, xmm5
  1.3363 +    packuswb   xmm0, xmm1
  1.3364 +    psrlw      xmm2, 8      // odd bytes
  1.3365 +    psrlw      xmm3, 8
  1.3366 +    packuswb   xmm2, xmm3
  1.3367 +    movdqa     [edx], xmm0
  1.3368 +    movdqa     [edx + edi], xmm2
  1.3369 +    lea        edx, [edx + 16]
  1.3370 +    sub        ecx, 16
  1.3371 +    jg         convertloop
  1.3372 +
  1.3373 +    pop        edi
  1.3374 +    ret
  1.3375 +  }
  1.3376 +}
  1.3377 +
  1.3378 +__declspec(naked) __declspec(align(16))
  1.3379 +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  1.3380 +                               int pix) {
  1.3381 +  __asm {
  1.3382 +    push       edi
  1.3383 +    mov        eax, [esp + 4 + 4]    // src_uv
  1.3384 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.3385 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.3386 +    mov        ecx, [esp + 4 + 16]   // pix
  1.3387 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.3388 +    psrlw      xmm5, 8
  1.3389 +    sub        edi, edx
  1.3390 +
  1.3391 +    align      4
  1.3392 +  convertloop:
  1.3393 +    movdqu     xmm0, [eax]
  1.3394 +    movdqu     xmm1, [eax + 16]
  1.3395 +    lea        eax,  [eax + 32]
  1.3396 +    movdqa     xmm2, xmm0
  1.3397 +    movdqa     xmm3, xmm1
  1.3398 +    pand       xmm0, xmm5   // even bytes
  1.3399 +    pand       xmm1, xmm5
  1.3400 +    packuswb   xmm0, xmm1
  1.3401 +    psrlw      xmm2, 8      // odd bytes
  1.3402 +    psrlw      xmm3, 8
  1.3403 +    packuswb   xmm2, xmm3
  1.3404 +    movdqu     [edx], xmm0
  1.3405 +    movdqu     [edx + edi], xmm2
  1.3406 +    lea        edx, [edx + 16]
  1.3407 +    sub        ecx, 16
  1.3408 +    jg         convertloop
  1.3409 +
  1.3410 +    pop        edi
  1.3411 +    ret
  1.3412 +  }
  1.3413 +}
  1.3414 +#endif  // HAS_SPLITUVROW_SSE2
  1.3415 +
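          +// Hypothetical scalar equivalent of SplitUVRow: even bytes are U and
          +// odd bytes are V. The asm gets the even bytes by masking with
          +// 0x00ff00ff and the odd bytes by shifting right 8, then packs;
          +// 'sub edi, edx' lets one pointer address both outputs as
          +// [edx] and [edx + edi].
          +static void SplitUVRow_C_sketch(const uint8* src_uv, uint8* dst_u,
          +                                uint8* dst_v, int pix) {
          +  for (int i = 0; i < pix; ++i) {
          +    dst_u[i] = src_uv[i * 2 + 0];
          +    dst_v[i] = src_uv[i * 2 + 1];
          +  }
          +}
          +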
  1.3416 +#ifdef HAS_SPLITUVROW_AVX2
  1.3417 +__declspec(naked) __declspec(align(16))
  1.3418 +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  1.3419 +  __asm {
  1.3420 +    push       edi
  1.3421 +    mov        eax, [esp + 4 + 4]    // src_uv
  1.3422 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.3423 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.3424 +    mov        ecx, [esp + 4 + 16]   // pix
  1.3425 +    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  1.3426 +    vpsrlw     ymm5, ymm5, 8
  1.3427 +    sub        edi, edx
  1.3428 +
  1.3429 +    align      4
  1.3430 +  convertloop:
  1.3431 +    vmovdqu    ymm0, [eax]
  1.3432 +    vmovdqu    ymm1, [eax + 32]
  1.3433 +    lea        eax,  [eax + 64]
  1.3434 +    vpsrlw     ymm2, ymm0, 8      // odd bytes
  1.3435 +    vpsrlw     ymm3, ymm1, 8
  1.3436 +    vpand      ymm0, ymm0, ymm5   // even bytes
  1.3437 +    vpand      ymm1, ymm1, ymm5
  1.3438 +    vpackuswb  ymm0, ymm0, ymm1
  1.3439 +    vpackuswb  ymm2, ymm2, ymm3
  1.3440 +    vpermq     ymm0, ymm0, 0xd8
  1.3441 +    vpermq     ymm2, ymm2, 0xd8
  1.3442 +    vmovdqu    [edx], ymm0
  1.3443 +    vmovdqu    [edx + edi], ymm2
  1.3444 +    lea        edx, [edx + 32]
  1.3445 +    sub        ecx, 32
  1.3446 +    jg         convertloop
  1.3447 +
  1.3448 +    pop        edi
  1.3449 +    vzeroupper
  1.3450 +    ret
  1.3451 +  }
  1.3452 +}
  1.3453 +#endif  // HAS_SPLITUVROW_AVX2
  1.3454 +
  1.3455 +#ifdef HAS_MERGEUVROW_SSE2
  1.3456 +__declspec(naked) __declspec(align(16))
  1.3457 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  1.3458 +                     int width) {
  1.3459 +  __asm {
  1.3460 +    push       edi
  1.3461 +    mov        eax, [esp + 4 + 4]    // src_u
  1.3462 +    mov        edx, [esp + 4 + 8]    // src_v
  1.3463 +    mov        edi, [esp + 4 + 12]   // dst_uv
  1.3464 +    mov        ecx, [esp + 4 + 16]   // width
  1.3465 +    sub        edx, eax
  1.3466 +
  1.3467 +    align      4
  1.3468 +  convertloop:
  1.3469 +    movdqa     xmm0, [eax]      // read 16 U's
  1.3470 +    movdqa     xmm1, [eax + edx]  // and 16 V's
  1.3471 +    lea        eax,  [eax + 16]
  1.3472 +    movdqa     xmm2, xmm0
  1.3473 +    punpcklbw  xmm0, xmm1       // first 8 UV pairs
  1.3474 +    punpckhbw  xmm2, xmm1       // next 8 UV pairs
  1.3475 +    movdqa     [edi], xmm0
  1.3476 +    movdqa     [edi + 16], xmm2
  1.3477 +    lea        edi, [edi + 32]
  1.3478 +    sub        ecx, 16
  1.3479 +    jg         convertloop
  1.3480 +
  1.3481 +    pop        edi
  1.3482 +    ret
  1.3483 +  }
  1.3484 +}
  1.3485 +
  1.3486 +__declspec(naked) __declspec(align(16))
  1.3487 +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
  1.3488 +                               uint8* dst_uv, int width) {
  1.3489 +  __asm {
  1.3490 +    push       edi
  1.3491 +    mov        eax, [esp + 4 + 4]    // src_u
  1.3492 +    mov        edx, [esp + 4 + 8]    // src_v
  1.3493 +    mov        edi, [esp + 4 + 12]   // dst_uv
  1.3494 +    mov        ecx, [esp + 4 + 16]   // width
  1.3495 +    sub        edx, eax
  1.3496 +
  1.3497 +    align      4
  1.3498 +  convertloop:
  1.3499 +    movdqu     xmm0, [eax]      // read 16 U's
  1.3500 +    movdqu     xmm1, [eax + edx]  // and 16 V's
  1.3501 +    lea        eax,  [eax + 16]
  1.3502 +    movdqa     xmm2, xmm0
  1.3503 +    punpcklbw  xmm0, xmm1       // first 8 UV pairs
  1.3504 +    punpckhbw  xmm2, xmm1       // next 8 UV pairs
  1.3505 +    movdqu     [edi], xmm0
  1.3506 +    movdqu     [edi + 16], xmm2
  1.3507 +    lea        edi, [edi + 32]
  1.3508 +    sub        ecx, 16
  1.3509 +    jg         convertloop
  1.3510 +
  1.3511 +    pop        edi
  1.3512 +    ret
  1.3513 +  }
  1.3514 +}
  1.3515 +#endif  //  HAS_MERGEUVROW_SSE2
  1.3516 +
  1.3517 +#ifdef HAS_MERGEUVROW_AVX2
  1.3518 +__declspec(naked) __declspec(align(16))
  1.3519 +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  1.3520 +                     int width) {
  1.3521 +  __asm {
  1.3522 +    push       edi
  1.3523 +    mov        eax, [esp + 4 + 4]    // src_u
  1.3524 +    mov        edx, [esp + 4 + 8]    // src_v
  1.3525 +    mov        edi, [esp + 4 + 12]   // dst_uv
  1.3526 +    mov        ecx, [esp + 4 + 16]   // width
  1.3527 +    sub        edx, eax
  1.3528 +
  1.3529 +    align      4
  1.3530 +  convertloop:
  1.3531 +    vmovdqu    ymm0, [eax]           // read 32 U's
  1.3532 +    vmovdqu    ymm1, [eax + edx]     // and 32 V's
  1.3533 +    lea        eax,  [eax + 32]
  1.3534 +    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
  1.3535 +    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
  1.3536 +    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
  1.3537 +    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
  1.3538 +    vmovdqu    [edi], ymm1
  1.3539 +    vmovdqu    [edi + 32], ymm2
  1.3540 +    lea        edi, [edi + 64]
  1.3541 +    sub        ecx, 32
  1.3542 +    jg         convertloop
  1.3543 +
  1.3544 +    pop        edi
  1.3545 +    vzeroupper
  1.3546 +    ret
  1.3547 +  }
  1.3548 +}
  1.3549 +#endif  //  HAS_MERGEUVROW_AVX2
  1.3550 +
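          +// Hypothetical scalar equivalent of MergeUVRow: interleave one U and
          +// one V byte per pair, which punpcklbw/punpckhbw (and the AVX2
          +// vperm2i128 fixup) do 16 or 32 pairs at a time.
          +static void MergeUVRow_C_sketch(const uint8* src_u, const uint8* src_v,
          +                                uint8* dst_uv, int width) {
          +  for (int i = 0; i < width; ++i) {
          +    dst_uv[i * 2 + 0] = src_u[i];
          +    dst_uv[i * 2 + 1] = src_v[i];
          +  }
          +}
          +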
  1.3551 +#ifdef HAS_COPYROW_SSE2
   1.3552 +// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
  1.3553 +__declspec(naked) __declspec(align(16))
  1.3554 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  1.3555 +  __asm {
  1.3556 +    mov        eax, [esp + 4]   // src
  1.3557 +    mov        edx, [esp + 8]   // dst
  1.3558 +    mov        ecx, [esp + 12]  // count
  1.3559 +
  1.3560 +    align      4
  1.3561 +  convertloop:
  1.3562 +    movdqa     xmm0, [eax]
  1.3563 +    movdqa     xmm1, [eax + 16]
  1.3564 +    lea        eax, [eax + 32]
  1.3565 +    movdqa     [edx], xmm0
  1.3566 +    movdqa     [edx + 16], xmm1
  1.3567 +    lea        edx, [edx + 32]
  1.3568 +    sub        ecx, 32
  1.3569 +    jg         convertloop
  1.3570 +    ret
  1.3571 +  }
  1.3572 +}
  1.3573 +#endif  // HAS_COPYROW_SSE2
  1.3574 +
   1.3575 +// Unaligned; handles any byte count (multiple of 1).
  1.3576 +__declspec(naked) __declspec(align(16))
  1.3577 +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  1.3578 +  __asm {
  1.3579 +    mov        eax, esi
  1.3580 +    mov        edx, edi
  1.3581 +    mov        esi, [esp + 4]   // src
  1.3582 +    mov        edi, [esp + 8]   // dst
  1.3583 +    mov        ecx, [esp + 12]  // count
  1.3584 +    rep movsb
  1.3585 +    mov        edi, edx
  1.3586 +    mov        esi, eax
  1.3587 +    ret
  1.3588 +  }
  1.3589 +}
  1.3590 +
  1.3591 +#ifdef HAS_COPYROW_X86
  1.3592 +__declspec(naked) __declspec(align(16))
  1.3593 +void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  1.3594 +  __asm {
  1.3595 +    mov        eax, esi
  1.3596 +    mov        edx, edi
  1.3597 +    mov        esi, [esp + 4]   // src
  1.3598 +    mov        edi, [esp + 8]   // dst
  1.3599 +    mov        ecx, [esp + 12]  // count
  1.3600 +    shr        ecx, 2
  1.3601 +    rep movsd
  1.3602 +    mov        edi, edx
  1.3603 +    mov        esi, eax
  1.3604 +    ret
  1.3605 +  }
  1.3606 +}
  1.3607 +#endif  // HAS_COPYROW_X86
  1.3608 +
  1.3609 +#ifdef HAS_ARGBCOPYALPHAROW_SSE2
  1.3610 +// width in pixels
  1.3611 +__declspec(naked) __declspec(align(16))
  1.3612 +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  1.3613 +  __asm {
  1.3614 +    mov        eax, [esp + 4]   // src
  1.3615 +    mov        edx, [esp + 8]   // dst
   1.3616 +    mov        ecx, [esp + 12]  // width
  1.3617 +    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
  1.3618 +    pslld      xmm0, 24
  1.3619 +    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
  1.3620 +    psrld      xmm1, 8
  1.3621 +
  1.3622 +    align      4
  1.3623 +  convertloop:
  1.3624 +    movdqa     xmm2, [eax]
  1.3625 +    movdqa     xmm3, [eax + 16]
  1.3626 +    lea        eax, [eax + 32]
  1.3627 +    movdqa     xmm4, [edx]
  1.3628 +    movdqa     xmm5, [edx + 16]
  1.3629 +    pand       xmm2, xmm0
  1.3630 +    pand       xmm3, xmm0
  1.3631 +    pand       xmm4, xmm1
  1.3632 +    pand       xmm5, xmm1
  1.3633 +    por        xmm2, xmm4
  1.3634 +    por        xmm3, xmm5
  1.3635 +    movdqa     [edx], xmm2
  1.3636 +    movdqa     [edx + 16], xmm3
  1.3637 +    lea        edx, [edx + 32]
  1.3638 +    sub        ecx, 8
  1.3639 +    jg         convertloop
  1.3640 +
  1.3641 +    ret
  1.3642 +  }
  1.3643 +}
  1.3644 +#endif  // HAS_ARGBCOPYALPHAROW_SSE2
  1.3645 +
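          +// Hypothetical scalar equivalent of ARGBCopyAlphaRow: replace the
          +// alpha byte of each dst pixel with the alpha byte of src, which the
          +// asm does by masking src with 0xff000000 and dst with 0x00ffffff
          +// and OR-ing the halves together.
          +static void ARGBCopyAlphaRow_C_sketch(const uint8* src, uint8* dst,
          +                                      int width) {
          +  for (int i = 0; i < width; ++i) {
          +    dst[i * 4 + 3] = src[i * 4 + 3];
          +  }
          +}
          +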
  1.3646 +#ifdef HAS_ARGBCOPYALPHAROW_AVX2
  1.3647 +// width in pixels
  1.3648 +__declspec(naked) __declspec(align(16))
  1.3649 +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  1.3650 +  __asm {
  1.3651 +    mov        eax, [esp + 4]   // src
  1.3652 +    mov        edx, [esp + 8]   // dst
   1.3653 +    mov        ecx, [esp + 12]  // width
  1.3654 +    vpcmpeqb   ymm0, ymm0, ymm0
  1.3655 +    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
  1.3656 +
  1.3657 +    align      4
  1.3658 +  convertloop:
  1.3659 +    vmovdqu    ymm1, [eax]
  1.3660 +    vmovdqu    ymm2, [eax + 32]
  1.3661 +    lea        eax, [eax + 64]
  1.3662 +    vpblendvb  ymm1, ymm1, [edx], ymm0
  1.3663 +    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
  1.3664 +    vmovdqu    [edx], ymm1
  1.3665 +    vmovdqu    [edx + 32], ymm2
  1.3666 +    lea        edx, [edx + 64]
  1.3667 +    sub        ecx, 16
  1.3668 +    jg         convertloop
  1.3669 +
  1.3670 +    vzeroupper
  1.3671 +    ret
  1.3672 +  }
  1.3673 +}
  1.3674 +#endif  // HAS_ARGBCOPYALPHAROW_AVX2
  1.3675 +
  1.3676 +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  1.3677 +// width in pixels
  1.3678 +__declspec(naked) __declspec(align(16))
  1.3679 +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  1.3680 +  __asm {
  1.3681 +    mov        eax, [esp + 4]   // src
  1.3682 +    mov        edx, [esp + 8]   // dst
   1.3683 +    mov        ecx, [esp + 12]  // width
  1.3684 +    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
  1.3685 +    pslld      xmm0, 24
  1.3686 +    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
  1.3687 +    psrld      xmm1, 8
  1.3688 +
  1.3689 +    align      4
  1.3690 +  convertloop:
  1.3691 +    movq       xmm2, qword ptr [eax]  // 8 Y's
  1.3692 +    lea        eax, [eax + 8]
   1.3693 +    punpcklbw  xmm2, xmm2      // duplicate Y bytes to words
   1.3694 +    punpckhwd  xmm3, xmm2      // stale low words of xmm3 are masked off below
   1.3695 +    punpcklwd  xmm2, xmm2
  1.3696 +    movdqa     xmm4, [edx]
  1.3697 +    movdqa     xmm5, [edx + 16]
  1.3698 +    pand       xmm2, xmm0
  1.3699 +    pand       xmm3, xmm0
  1.3700 +    pand       xmm4, xmm1
  1.3701 +    pand       xmm5, xmm1
  1.3702 +    por        xmm2, xmm4
  1.3703 +    por        xmm3, xmm5
  1.3704 +    movdqa     [edx], xmm2
  1.3705 +    movdqa     [edx + 16], xmm3
  1.3706 +    lea        edx, [edx + 32]
  1.3707 +    sub        ecx, 8
  1.3708 +    jg         convertloop
  1.3709 +
  1.3710 +    ret
  1.3711 +  }
  1.3712 +}
  1.3713 +#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
  1.3714 +
  1.3715 +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  1.3716 +// width in pixels
  1.3717 +__declspec(naked) __declspec(align(16))
  1.3718 +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  1.3719 +  __asm {
  1.3720 +    mov        eax, [esp + 4]   // src
  1.3721 +    mov        edx, [esp + 8]   // dst
   1.3722 +    mov        ecx, [esp + 12]  // width
  1.3723 +    vpcmpeqb   ymm0, ymm0, ymm0
  1.3724 +    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
  1.3725 +
  1.3726 +    align      4
  1.3727 +  convertloop:
  1.3728 +    vpmovzxbd  ymm1, qword ptr [eax]
  1.3729 +    vpmovzxbd  ymm2, qword ptr [eax + 8]
  1.3730 +    lea        eax, [eax + 16]
  1.3731 +    vpslld     ymm1, ymm1, 24
  1.3732 +    vpslld     ymm2, ymm2, 24
  1.3733 +    vpblendvb  ymm1, ymm1, [edx], ymm0
  1.3734 +    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
  1.3735 +    vmovdqu    [edx], ymm1
  1.3736 +    vmovdqu    [edx + 32], ymm2
  1.3737 +    lea        edx, [edx + 64]
  1.3738 +    sub        ecx, 16
  1.3739 +    jg         convertloop
  1.3740 +
  1.3741 +    vzeroupper
  1.3742 +    ret
  1.3743 +  }
  1.3744 +}
  1.3745 +#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
  1.3746 +
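          +// Hypothetical scalar equivalent of ARGBCopyYToAlphaRow: one Y byte
          +// per pixel becomes that pixel's alpha; the asm widens 8 Y's into
          +// the top byte of each dword and blends them into dst.
          +static void ARGBCopyYToAlphaRow_C_sketch(const uint8* src_y, uint8* dst,
          +                                         int width) {
          +  for (int i = 0; i < width; ++i) {
          +    dst[i * 4 + 3] = src_y[i];
          +  }
          +}
          +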
  1.3747 +#ifdef HAS_SETROW_X86
   1.3748 +// SetRow writes 'count' bytes using a repeated 32 bit value ('count' must be a multiple of 4).
  1.3749 +__declspec(naked) __declspec(align(16))
  1.3750 +void SetRow_X86(uint8* dst, uint32 v32, int count) {
  1.3751 +  __asm {
  1.3752 +    mov        edx, edi
  1.3753 +    mov        edi, [esp + 4]   // dst
  1.3754 +    mov        eax, [esp + 8]   // v32
  1.3755 +    mov        ecx, [esp + 12]  // count
  1.3756 +    shr        ecx, 2
  1.3757 +    rep stosd
  1.3758 +    mov        edi, edx
  1.3759 +    ret
  1.3760 +  }
  1.3761 +}
  1.3762 +
   1.3763 +// ARGBSetRows writes a repeated 32 bit value to 'width' pixels in each of 'height' rows.
  1.3764 +__declspec(naked) __declspec(align(16))
  1.3765 +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
  1.3766 +                   int dst_stride, int height) {
  1.3767 +  __asm {
  1.3768 +    push       esi
  1.3769 +    push       edi
  1.3770 +    push       ebp
  1.3771 +    mov        edi, [esp + 12 + 4]   // dst
  1.3772 +    mov        eax, [esp + 12 + 8]   // v32
  1.3773 +    mov        ebp, [esp + 12 + 12]  // width
  1.3774 +    mov        edx, [esp + 12 + 16]  // dst_stride
  1.3775 +    mov        esi, [esp + 12 + 20]  // height
  1.3776 +    lea        ecx, [ebp * 4]
  1.3777 +    sub        edx, ecx             // stride - width * 4
  1.3778 +
  1.3779 +    align      4
  1.3780 +  convertloop:
  1.3781 +    mov        ecx, ebp
  1.3782 +    rep stosd
  1.3783 +    add        edi, edx
  1.3784 +    sub        esi, 1
  1.3785 +    jg         convertloop
  1.3786 +
  1.3787 +    pop        ebp
  1.3788 +    pop        edi
  1.3789 +    pop        esi
  1.3790 +    ret
  1.3791 +  }
  1.3792 +}
  1.3793 +#endif  // HAS_SETROW_X86
  1.3794 +
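          +// Hypothetical scalar equivalent of ARGBSetRows_X86: store the same
          +// 32 bit value 'width' times per row, stepping dst_stride bytes per
          +// row (the asm precomputes dst_stride - width * 4 so edi needs only
          +// one add per row after rep stosd).
          +static void ARGBSetRows_C_sketch(uint8* dst, uint32 v32, int width,
          +                                 int dst_stride, int height) {
          +  for (int y = 0; y < height; ++y) {
          +    uint32* d = (uint32*)(dst + y * dst_stride);
          +    for (int x = 0; x < width; ++x) {
          +      d[x] = v32;
          +    }
          +  }
          +}
          +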
  1.3795 +#ifdef HAS_YUY2TOYROW_AVX2
  1.3796 +__declspec(naked) __declspec(align(16))
  1.3797 +void YUY2ToYRow_AVX2(const uint8* src_yuy2,
  1.3798 +                     uint8* dst_y, int pix) {
  1.3799 +  __asm {
  1.3800 +    mov        eax, [esp + 4]    // src_yuy2
  1.3801 +    mov        edx, [esp + 8]    // dst_y
  1.3802 +    mov        ecx, [esp + 12]   // pix
  1.3803 +    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
  1.3804 +    vpsrlw     ymm5, ymm5, 8
  1.3805 +
  1.3806 +    align      4
  1.3807 +  convertloop:
  1.3808 +    vmovdqu    ymm0, [eax]
  1.3809 +    vmovdqu    ymm1, [eax + 32]
  1.3810 +    lea        eax,  [eax + 64]
  1.3811 +    vpand      ymm0, ymm0, ymm5   // even bytes are Y
  1.3812 +    vpand      ymm1, ymm1, ymm5
  1.3813 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.3814 +    vpermq     ymm0, ymm0, 0xd8
  1.3815 +    sub        ecx, 32
  1.3816 +    vmovdqu    [edx], ymm0
  1.3817 +    lea        edx, [edx + 32]
  1.3818 +    jg         convertloop
  1.3819 +    vzeroupper
  1.3820 +    ret
  1.3821 +  }
  1.3822 +}
  1.3823 +
  1.3824 +__declspec(naked) __declspec(align(16))
  1.3825 +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  1.3826 +                      uint8* dst_u, uint8* dst_v, int pix) {
  1.3827 +  __asm {
  1.3828 +    push       esi
  1.3829 +    push       edi
  1.3830 +    mov        eax, [esp + 8 + 4]    // src_yuy2
  1.3831 +    mov        esi, [esp + 8 + 8]    // stride_yuy2
  1.3832 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.3833 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.3834 +    mov        ecx, [esp + 8 + 20]   // pix
  1.3835 +    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  1.3836 +    vpsrlw     ymm5, ymm5, 8
  1.3837 +    sub        edi, edx
  1.3838 +
  1.3839 +    align      4
  1.3840 +  convertloop:
  1.3841 +    vmovdqu    ymm0, [eax]
  1.3842 +    vmovdqu    ymm1, [eax + 32]
  1.3843 +    vpavgb     ymm0, ymm0, [eax + esi]
  1.3844 +    vpavgb     ymm1, ymm1, [eax + esi + 32]
  1.3845 +    lea        eax,  [eax + 64]
  1.3846 +    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
  1.3847 +    vpsrlw     ymm1, ymm1, 8
  1.3848 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.3849 +    vpermq     ymm0, ymm0, 0xd8
  1.3850 +    vpand      ymm1, ymm0, ymm5  // U
  1.3851 +    vpsrlw     ymm0, ymm0, 8     // V
  1.3852 +    vpackuswb  ymm1, ymm1, ymm1  // mutates.
  1.3853 +    vpackuswb  ymm0, ymm0, ymm0  // mutates.
  1.3854 +    vpermq     ymm1, ymm1, 0xd8
  1.3855 +    vpermq     ymm0, ymm0, 0xd8
  1.3856 +    vextractf128 [edx], ymm1, 0  // U
  1.3857 +    vextractf128 [edx + edi], ymm0, 0 // V
  1.3858 +    lea        edx, [edx + 16]
  1.3859 +    sub        ecx, 32
  1.3860 +    jg         convertloop
  1.3861 +
  1.3862 +    pop        edi
  1.3863 +    pop        esi
  1.3864 +    vzeroupper
  1.3865 +    ret
  1.3866 +  }
  1.3867 +}
  1.3868 +
  1.3869 +__declspec(naked) __declspec(align(16))
  1.3870 +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  1.3871 +                         uint8* dst_u, uint8* dst_v, int pix) {
  1.3872 +  __asm {
  1.3873 +    push       edi
  1.3874 +    mov        eax, [esp + 4 + 4]    // src_yuy2
  1.3875 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.3876 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.3877 +    mov        ecx, [esp + 4 + 16]   // pix
  1.3878 +    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  1.3879 +    vpsrlw     ymm5, ymm5, 8
  1.3880 +    sub        edi, edx
  1.3881 +
  1.3882 +    align      4
  1.3883 +  convertloop:
  1.3884 +    vmovdqu    ymm0, [eax]
  1.3885 +    vmovdqu    ymm1, [eax + 32]
  1.3886 +    lea        eax,  [eax + 64]
  1.3887 +    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
  1.3888 +    vpsrlw     ymm1, ymm1, 8
  1.3889 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.3890 +    vpermq     ymm0, ymm0, 0xd8
  1.3891 +    vpand      ymm1, ymm0, ymm5  // U
  1.3892 +    vpsrlw     ymm0, ymm0, 8     // V
  1.3893 +    vpackuswb  ymm1, ymm1, ymm1  // mutates.
  1.3894 +    vpackuswb  ymm0, ymm0, ymm0  // mutates.
  1.3895 +    vpermq     ymm1, ymm1, 0xd8
  1.3896 +    vpermq     ymm0, ymm0, 0xd8
  1.3897 +    vextractf128 [edx], ymm1, 0  // U
  1.3898 +    vextractf128 [edx + edi], ymm0, 0 // V
  1.3899 +    lea        edx, [edx + 16]
  1.3900 +    sub        ecx, 32
  1.3901 +    jg         convertloop
  1.3902 +
  1.3903 +    pop        edi
  1.3904 +    vzeroupper
  1.3905 +    ret
  1.3906 +  }
  1.3907 +}
  1.3908 +
  1.3909 +__declspec(naked) __declspec(align(16))
  1.3910 +void UYVYToYRow_AVX2(const uint8* src_uyvy,
  1.3911 +                     uint8* dst_y, int pix) {
  1.3912 +  __asm {
  1.3913 +    mov        eax, [esp + 4]    // src_uyvy
  1.3914 +    mov        edx, [esp + 8]    // dst_y
  1.3915 +    mov        ecx, [esp + 12]   // pix
  1.3916 +
  1.3917 +    align      4
  1.3918 +  convertloop:
  1.3919 +    vmovdqu    ymm0, [eax]
  1.3920 +    vmovdqu    ymm1, [eax + 32]
  1.3921 +    lea        eax,  [eax + 64]
  1.3922 +    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
  1.3923 +    vpsrlw     ymm1, ymm1, 8
  1.3924 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.3925 +    vpermq     ymm0, ymm0, 0xd8
  1.3926 +    sub        ecx, 32
  1.3927 +    vmovdqu    [edx], ymm0
  1.3928 +    lea        edx, [edx + 32]
  1.3929 +    jg         convertloop
   1.3930 +    vzeroupper
   1.3931 +    ret
  1.3932 +  }
  1.3933 +}
  1.3934 +
  1.3935 +__declspec(naked) __declspec(align(16))
  1.3936 +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  1.3937 +                      uint8* dst_u, uint8* dst_v, int pix) {
  1.3938 +  __asm {
  1.3939 +    push       esi
  1.3940 +    push       edi
   1.3941 +    mov        eax, [esp + 8 + 4]    // src_uyvy
   1.3942 +    mov        esi, [esp + 8 + 8]    // stride_uyvy
  1.3943 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.3944 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.3945 +    mov        ecx, [esp + 8 + 20]   // pix
  1.3946 +    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  1.3947 +    vpsrlw     ymm5, ymm5, 8
  1.3948 +    sub        edi, edx
  1.3949 +
  1.3950 +    align      4
  1.3951 +  convertloop:
  1.3952 +    vmovdqu    ymm0, [eax]
  1.3953 +    vmovdqu    ymm1, [eax + 32]
  1.3954 +    vpavgb     ymm0, ymm0, [eax + esi]
  1.3955 +    vpavgb     ymm1, ymm1, [eax + esi + 32]
  1.3956 +    lea        eax,  [eax + 64]
  1.3957 +    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
  1.3958 +    vpand      ymm1, ymm1, ymm5
  1.3959 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.3960 +    vpermq     ymm0, ymm0, 0xd8
  1.3961 +    vpand      ymm1, ymm0, ymm5  // U
  1.3962 +    vpsrlw     ymm0, ymm0, 8     // V
  1.3963 +    vpackuswb  ymm1, ymm1, ymm1  // mutates.
  1.3964 +    vpackuswb  ymm0, ymm0, ymm0  // mutates.
  1.3965 +    vpermq     ymm1, ymm1, 0xd8
  1.3966 +    vpermq     ymm0, ymm0, 0xd8
  1.3967 +    vextractf128 [edx], ymm1, 0  // U
  1.3968 +    vextractf128 [edx + edi], ymm0, 0 // V
  1.3969 +    lea        edx, [edx + 16]
  1.3970 +    sub        ecx, 32
  1.3971 +    jg         convertloop
  1.3972 +
  1.3973 +    pop        edi
  1.3974 +    pop        esi
  1.3975 +    vzeroupper
  1.3976 +    ret
  1.3977 +  }
  1.3978 +}
  1.3979 +
  1.3980 +__declspec(naked) __declspec(align(16))
  1.3981 +void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  1.3982 +                         uint8* dst_u, uint8* dst_v, int pix) {
  1.3983 +  __asm {
  1.3984 +    push       edi
   1.3985 +    mov        eax, [esp + 4 + 4]    // src_uyvy
  1.3986 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.3987 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.3988 +    mov        ecx, [esp + 4 + 16]   // pix
  1.3989 +    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  1.3990 +    vpsrlw     ymm5, ymm5, 8
  1.3991 +    sub        edi, edx
  1.3992 +
  1.3993 +    align      4
  1.3994 +  convertloop:
  1.3995 +    vmovdqu    ymm0, [eax]
  1.3996 +    vmovdqu    ymm1, [eax + 32]
  1.3997 +    lea        eax,  [eax + 64]
  1.3998 +    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
  1.3999 +    vpand      ymm1, ymm1, ymm5
  1.4000 +    vpackuswb  ymm0, ymm0, ymm1   // mutates.
  1.4001 +    vpermq     ymm0, ymm0, 0xd8
  1.4002 +    vpand      ymm1, ymm0, ymm5  // U
  1.4003 +    vpsrlw     ymm0, ymm0, 8     // V
  1.4004 +    vpackuswb  ymm1, ymm1, ymm1  // mutates.
  1.4005 +    vpackuswb  ymm0, ymm0, ymm0  // mutates.
  1.4006 +    vpermq     ymm1, ymm1, 0xd8
  1.4007 +    vpermq     ymm0, ymm0, 0xd8
  1.4008 +    vextractf128 [edx], ymm1, 0  // U
  1.4009 +    vextractf128 [edx + edi], ymm0, 0 // V
  1.4010 +    lea        edx, [edx + 16]
  1.4011 +    sub        ecx, 32
  1.4012 +    jg         convertloop
  1.4013 +
  1.4014 +    pop        edi
  1.4015 +    vzeroupper
  1.4016 +    ret
  1.4017 +  }
  1.4018 +}
  1.4019 +#endif  // HAS_YUY2TOYROW_AVX2
  1.4020 +
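          +// Hypothetical scalar sketch of the packed 4:2:2 extraction done by
          +// the YUY2/UYVY rows in this file. YUY2 memory order is Y0 U0 Y1 V0,
          +// so Y is the even bytes and U/V the odd bytes; UYVY is U0 Y0 V0 Y1,
          +// the opposite parity. The *ToUVRow variants additionally average two
          +// source rows (pavgb/vpavgb) before extracting.
          +static void YUY2ToUV422Row_C_sketch(const uint8* src_yuy2, uint8* dst_u,
          +                                    uint8* dst_v, int pix) {
          +  for (int i = 0; i < pix; i += 2) {  // one U and one V per 2 pixels
          +    dst_u[i / 2] = src_yuy2[i * 2 + 1];
          +    dst_v[i / 2] = src_yuy2[i * 2 + 3];
          +  }
          +}
          +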
  1.4021 +#ifdef HAS_YUY2TOYROW_SSE2
  1.4022 +__declspec(naked) __declspec(align(16))
  1.4023 +void YUY2ToYRow_SSE2(const uint8* src_yuy2,
  1.4024 +                     uint8* dst_y, int pix) {
  1.4025 +  __asm {
  1.4026 +    mov        eax, [esp + 4]    // src_yuy2
  1.4027 +    mov        edx, [esp + 8]    // dst_y
  1.4028 +    mov        ecx, [esp + 12]   // pix
  1.4029 +    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
  1.4030 +    psrlw      xmm5, 8
  1.4031 +
  1.4032 +    align      4
  1.4033 +  convertloop:
  1.4034 +    movdqa     xmm0, [eax]
  1.4035 +    movdqa     xmm1, [eax + 16]
  1.4036 +    lea        eax,  [eax + 32]
  1.4037 +    pand       xmm0, xmm5   // even bytes are Y
  1.4038 +    pand       xmm1, xmm5
  1.4039 +    packuswb   xmm0, xmm1
  1.4040 +    sub        ecx, 16
  1.4041 +    movdqa     [edx], xmm0
  1.4042 +    lea        edx, [edx + 16]
  1.4043 +    jg         convertloop
  1.4044 +    ret
  1.4045 +  }
  1.4046 +}
  1.4047 +
  1.4048 +__declspec(naked) __declspec(align(16))
  1.4049 +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  1.4050 +                      uint8* dst_u, uint8* dst_v, int pix) {
  1.4051 +  __asm {
  1.4052 +    push       esi
  1.4053 +    push       edi
  1.4054 +    mov        eax, [esp + 8 + 4]    // src_yuy2
  1.4055 +    mov        esi, [esp + 8 + 8]    // stride_yuy2
  1.4056 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.4057 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.4058 +    mov        ecx, [esp + 8 + 20]   // pix
  1.4059 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4060 +    psrlw      xmm5, 8
  1.4061 +    sub        edi, edx
  1.4062 +
  1.4063 +    align      4
  1.4064 +  convertloop:
  1.4065 +    movdqa     xmm0, [eax]
  1.4066 +    movdqa     xmm1, [eax + 16]
  1.4067 +    movdqa     xmm2, [eax + esi]
  1.4068 +    movdqa     xmm3, [eax + esi + 16]
  1.4069 +    lea        eax,  [eax + 32]
  1.4070 +    pavgb      xmm0, xmm2
  1.4071 +    pavgb      xmm1, xmm3
  1.4072 +    psrlw      xmm0, 8      // YUYV -> UVUV
  1.4073 +    psrlw      xmm1, 8
  1.4074 +    packuswb   xmm0, xmm1
  1.4075 +    movdqa     xmm1, xmm0
  1.4076 +    pand       xmm0, xmm5  // U
  1.4077 +    packuswb   xmm0, xmm0
  1.4078 +    psrlw      xmm1, 8     // V
  1.4079 +    packuswb   xmm1, xmm1
  1.4080 +    movq       qword ptr [edx], xmm0
  1.4081 +    movq       qword ptr [edx + edi], xmm1
  1.4082 +    lea        edx, [edx + 8]
  1.4083 +    sub        ecx, 16
  1.4084 +    jg         convertloop
  1.4085 +
  1.4086 +    pop        edi
  1.4087 +    pop        esi
  1.4088 +    ret
  1.4089 +  }
  1.4090 +}
  1.4091 +
  1.4092 +__declspec(naked) __declspec(align(16))
  1.4093 +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  1.4094 +                         uint8* dst_u, uint8* dst_v, int pix) {
  1.4095 +  __asm {
  1.4096 +    push       edi
  1.4097 +    mov        eax, [esp + 4 + 4]    // src_yuy2
  1.4098 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.4099 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.4100 +    mov        ecx, [esp + 4 + 16]   // pix
  1.4101 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4102 +    psrlw      xmm5, 8
  1.4103 +    sub        edi, edx
  1.4104 +
  1.4105 +    align      4
  1.4106 +  convertloop:
  1.4107 +    movdqa     xmm0, [eax]
  1.4108 +    movdqa     xmm1, [eax + 16]
  1.4109 +    lea        eax,  [eax + 32]
  1.4110 +    psrlw      xmm0, 8      // YUYV -> UVUV
  1.4111 +    psrlw      xmm1, 8
  1.4112 +    packuswb   xmm0, xmm1
  1.4113 +    movdqa     xmm1, xmm0
  1.4114 +    pand       xmm0, xmm5  // U
  1.4115 +    packuswb   xmm0, xmm0
  1.4116 +    psrlw      xmm1, 8     // V
  1.4117 +    packuswb   xmm1, xmm1
  1.4118 +    movq       qword ptr [edx], xmm0
  1.4119 +    movq       qword ptr [edx + edi], xmm1
  1.4120 +    lea        edx, [edx + 8]
  1.4121 +    sub        ecx, 16
  1.4122 +    jg         convertloop
  1.4123 +
  1.4124 +    pop        edi
  1.4125 +    ret
  1.4126 +  }
  1.4127 +}
  1.4128 +
  1.4129 +__declspec(naked) __declspec(align(16))
  1.4130 +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
  1.4131 +                               uint8* dst_y, int pix) {
  1.4132 +  __asm {
  1.4133 +    mov        eax, [esp + 4]    // src_yuy2
  1.4134 +    mov        edx, [esp + 8]    // dst_y
  1.4135 +    mov        ecx, [esp + 12]   // pix
  1.4136 +    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
  1.4137 +    psrlw      xmm5, 8
  1.4138 +
  1.4139 +    align      4
  1.4140 +  convertloop:
  1.4141 +    movdqu     xmm0, [eax]
  1.4142 +    movdqu     xmm1, [eax + 16]
  1.4143 +    lea        eax,  [eax + 32]
  1.4144 +    pand       xmm0, xmm5   // even bytes are Y
  1.4145 +    pand       xmm1, xmm5
  1.4146 +    packuswb   xmm0, xmm1
  1.4147 +    sub        ecx, 16
  1.4148 +    movdqu     [edx], xmm0
  1.4149 +    lea        edx, [edx + 16]
  1.4150 +    jg         convertloop
  1.4151 +    ret
  1.4152 +  }
  1.4153 +}
  1.4154 +
  1.4155 +__declspec(naked) __declspec(align(16))
  1.4156 +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
  1.4157 +                                uint8* dst_u, uint8* dst_v, int pix) {
  1.4158 +  __asm {
  1.4159 +    push       esi
  1.4160 +    push       edi
  1.4161 +    mov        eax, [esp + 8 + 4]    // src_yuy2
  1.4162 +    mov        esi, [esp + 8 + 8]    // stride_yuy2
  1.4163 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.4164 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.4165 +    mov        ecx, [esp + 8 + 20]   // pix
  1.4166 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4167 +    psrlw      xmm5, 8
  1.4168 +    sub        edi, edx
  1.4169 +
  1.4170 +    align      4
  1.4171 +  convertloop:
  1.4172 +    movdqu     xmm0, [eax]
  1.4173 +    movdqu     xmm1, [eax + 16]
  1.4174 +    movdqu     xmm2, [eax + esi]
  1.4175 +    movdqu     xmm3, [eax + esi + 16]
  1.4176 +    lea        eax,  [eax + 32]
  1.4177 +    pavgb      xmm0, xmm2
  1.4178 +    pavgb      xmm1, xmm3
  1.4179 +    psrlw      xmm0, 8      // YUYV -> UVUV
  1.4180 +    psrlw      xmm1, 8
  1.4181 +    packuswb   xmm0, xmm1
  1.4182 +    movdqa     xmm1, xmm0
  1.4183 +    pand       xmm0, xmm5  // U
  1.4184 +    packuswb   xmm0, xmm0
  1.4185 +    psrlw      xmm1, 8     // V
  1.4186 +    packuswb   xmm1, xmm1
  1.4187 +    movq       qword ptr [edx], xmm0
  1.4188 +    movq       qword ptr [edx + edi], xmm1
  1.4189 +    lea        edx, [edx + 8]
  1.4190 +    sub        ecx, 16
  1.4191 +    jg         convertloop
  1.4192 +
  1.4193 +    pop        edi
  1.4194 +    pop        esi
  1.4195 +    ret
  1.4196 +  }
  1.4197 +}
  1.4198 +
  1.4199 +__declspec(naked) __declspec(align(16))
  1.4200 +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
  1.4201 +                                   uint8* dst_u, uint8* dst_v, int pix) {
  1.4202 +  __asm {
  1.4203 +    push       edi
  1.4204 +    mov        eax, [esp + 4 + 4]    // src_yuy2
  1.4205 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.4206 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.4207 +    mov        ecx, [esp + 4 + 16]   // pix
  1.4208 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4209 +    psrlw      xmm5, 8
  1.4210 +    sub        edi, edx
  1.4211 +
  1.4212 +    align      4
  1.4213 +  convertloop:
  1.4214 +    movdqu     xmm0, [eax]
  1.4215 +    movdqu     xmm1, [eax + 16]
  1.4216 +    lea        eax,  [eax + 32]
  1.4217 +    psrlw      xmm0, 8      // YUYV -> UVUV
  1.4218 +    psrlw      xmm1, 8
  1.4219 +    packuswb   xmm0, xmm1
  1.4220 +    movdqa     xmm1, xmm0
  1.4221 +    pand       xmm0, xmm5  // U
  1.4222 +    packuswb   xmm0, xmm0
  1.4223 +    psrlw      xmm1, 8     // V
  1.4224 +    packuswb   xmm1, xmm1
  1.4225 +    movq       qword ptr [edx], xmm0
  1.4226 +    movq       qword ptr [edx + edi], xmm1
  1.4227 +    lea        edx, [edx + 8]
  1.4228 +    sub        ecx, 16
  1.4229 +    jg         convertloop
  1.4230 +
  1.4231 +    pop        edi
  1.4232 +    ret
  1.4233 +  }
  1.4234 +}
  1.4235 +
  1.4236 +__declspec(naked) __declspec(align(16))
  1.4237 +void UYVYToYRow_SSE2(const uint8* src_uyvy,
  1.4238 +                     uint8* dst_y, int pix) {
  1.4239 +  __asm {
  1.4240 +    mov        eax, [esp + 4]    // src_uyvy
  1.4241 +    mov        edx, [esp + 8]    // dst_y
  1.4242 +    mov        ecx, [esp + 12]   // pix
  1.4243 +
  1.4244 +    align      4
  1.4245 +  convertloop:
  1.4246 +    movdqa     xmm0, [eax]
  1.4247 +    movdqa     xmm1, [eax + 16]
  1.4248 +    lea        eax,  [eax + 32]
  1.4249 +    psrlw      xmm0, 8    // odd bytes are Y
  1.4250 +    psrlw      xmm1, 8
  1.4251 +    packuswb   xmm0, xmm1
  1.4252 +    sub        ecx, 16
  1.4253 +    movdqa     [edx], xmm0
  1.4254 +    lea        edx, [edx + 16]
  1.4255 +    jg         convertloop
  1.4256 +    ret
  1.4257 +  }
  1.4258 +}
  1.4259 +
  1.4260 +__declspec(naked) __declspec(align(16))
  1.4261 +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  1.4262 +                      uint8* dst_u, uint8* dst_v, int pix) {
  1.4263 +  __asm {
  1.4264 +    push       esi
  1.4265 +    push       edi
  1.4266 +    mov        eax, [esp + 8 + 4]    // src_uyvy
  1.4267 +    mov        esi, [esp + 8 + 8]    // stride_uyvy
  1.4268 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.4269 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.4270 +    mov        ecx, [esp + 8 + 20]   // pix
  1.4271 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4272 +    psrlw      xmm5, 8
  1.4273 +    sub        edi, edx
  1.4274 +
  1.4275 +    align      4
  1.4276 +  convertloop:
  1.4277 +    movdqa     xmm0, [eax]
  1.4278 +    movdqa     xmm1, [eax + 16]
  1.4279 +    movdqa     xmm2, [eax + esi]
  1.4280 +    movdqa     xmm3, [eax + esi + 16]
  1.4281 +    lea        eax,  [eax + 32]
  1.4282 +    pavgb      xmm0, xmm2
  1.4283 +    pavgb      xmm1, xmm3
  1.4284 +    pand       xmm0, xmm5   // UYVY -> UVUV
  1.4285 +    pand       xmm1, xmm5
  1.4286 +    packuswb   xmm0, xmm1
  1.4287 +    movdqa     xmm1, xmm0
  1.4288 +    pand       xmm0, xmm5  // U
  1.4289 +    packuswb   xmm0, xmm0
  1.4290 +    psrlw      xmm1, 8     // V
  1.4291 +    packuswb   xmm1, xmm1
  1.4292 +    movq       qword ptr [edx], xmm0
  1.4293 +    movq       qword ptr [edx + edi], xmm1
  1.4294 +    lea        edx, [edx + 8]
  1.4295 +    sub        ecx, 16
  1.4296 +    jg         convertloop
  1.4297 +
  1.4298 +    pop        edi
  1.4299 +    pop        esi
  1.4300 +    ret
  1.4301 +  }
  1.4302 +}
  1.4303 +
  1.4304 +__declspec(naked) __declspec(align(16))
  1.4305 +void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  1.4306 +                         uint8* dst_u, uint8* dst_v, int pix) {
  1.4307 +  __asm {
  1.4308 +    push       edi
  1.4309 +    mov        eax, [esp + 4 + 4]    // src_uyvy
  1.4310 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.4311 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.4312 +    mov        ecx, [esp + 4 + 16]   // pix
  1.4313 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4314 +    psrlw      xmm5, 8
  1.4315 +    sub        edi, edx
  1.4316 +
  1.4317 +    align      4
  1.4318 +  convertloop:
  1.4319 +    movdqa     xmm0, [eax]
  1.4320 +    movdqa     xmm1, [eax + 16]
  1.4321 +    lea        eax,  [eax + 32]
  1.4322 +    pand       xmm0, xmm5   // UYVY -> UVUV
  1.4323 +    pand       xmm1, xmm5
  1.4324 +    packuswb   xmm0, xmm1
  1.4325 +    movdqa     xmm1, xmm0
  1.4326 +    pand       xmm0, xmm5  // U
  1.4327 +    packuswb   xmm0, xmm0
  1.4328 +    psrlw      xmm1, 8     // V
  1.4329 +    packuswb   xmm1, xmm1
  1.4330 +    movq       qword ptr [edx], xmm0
  1.4331 +    movq       qword ptr [edx + edi], xmm1
  1.4332 +    lea        edx, [edx + 8]
  1.4333 +    sub        ecx, 16
  1.4334 +    jg         convertloop
  1.4335 +
  1.4336 +    pop        edi
  1.4337 +    ret
  1.4338 +  }
  1.4339 +}
  1.4340 +
  1.4341 +__declspec(naked) __declspec(align(16))
  1.4342 +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
  1.4343 +                               uint8* dst_y, int pix) {
  1.4344 +  __asm {
  1.4345 +    mov        eax, [esp + 4]    // src_uyvy
  1.4346 +    mov        edx, [esp + 8]    // dst_y
  1.4347 +    mov        ecx, [esp + 12]   // pix
  1.4348 +
  1.4349 +    align      4
  1.4350 +  convertloop:
  1.4351 +    movdqu     xmm0, [eax]
  1.4352 +    movdqu     xmm1, [eax + 16]
  1.4353 +    lea        eax,  [eax + 32]
  1.4354 +    psrlw      xmm0, 8    // odd bytes are Y
  1.4355 +    psrlw      xmm1, 8
  1.4356 +    packuswb   xmm0, xmm1
  1.4357 +    sub        ecx, 16
  1.4358 +    movdqu     [edx], xmm0
  1.4359 +    lea        edx, [edx + 16]
  1.4360 +    jg         convertloop
  1.4361 +    ret
  1.4362 +  }
  1.4363 +}
  1.4364 +
  1.4365 +__declspec(naked) __declspec(align(16))
  1.4366 +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
  1.4367 +                                uint8* dst_u, uint8* dst_v, int pix) {
  1.4368 +  __asm {
  1.4369 +    push       esi
  1.4370 +    push       edi
  1.4371 +    mov        eax, [esp + 8 + 4]    // src_uyvy
  1.4372 +    mov        esi, [esp + 8 + 8]    // stride_uyvy
  1.4373 +    mov        edx, [esp + 8 + 12]   // dst_u
  1.4374 +    mov        edi, [esp + 8 + 16]   // dst_v
  1.4375 +    mov        ecx, [esp + 8 + 20]   // pix
  1.4376 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4377 +    psrlw      xmm5, 8
  1.4378 +    sub        edi, edx
  1.4379 +
  1.4380 +    align      4
  1.4381 +  convertloop:
  1.4382 +    movdqu     xmm0, [eax]
  1.4383 +    movdqu     xmm1, [eax + 16]
  1.4384 +    movdqu     xmm2, [eax + esi]
  1.4385 +    movdqu     xmm3, [eax + esi + 16]
  1.4386 +    lea        eax,  [eax + 32]
  1.4387 +    pavgb      xmm0, xmm2
  1.4388 +    pavgb      xmm1, xmm3
  1.4389 +    pand       xmm0, xmm5   // UYVY -> UVUV
  1.4390 +    pand       xmm1, xmm5
  1.4391 +    packuswb   xmm0, xmm1
  1.4392 +    movdqa     xmm1, xmm0
  1.4393 +    pand       xmm0, xmm5  // U
  1.4394 +    packuswb   xmm0, xmm0
  1.4395 +    psrlw      xmm1, 8     // V
  1.4396 +    packuswb   xmm1, xmm1
  1.4397 +    movq       qword ptr [edx], xmm0
  1.4398 +    movq       qword ptr [edx + edi], xmm1
  1.4399 +    lea        edx, [edx + 8]
  1.4400 +    sub        ecx, 16
  1.4401 +    jg         convertloop
  1.4402 +
  1.4403 +    pop        edi
  1.4404 +    pop        esi
  1.4405 +    ret
  1.4406 +  }
  1.4407 +}
  1.4408 +
  1.4409 +__declspec(naked) __declspec(align(16))
  1.4410 +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
  1.4411 +                                   uint8* dst_u, uint8* dst_v, int pix) {
  1.4412 +  __asm {
  1.4413 +    push       edi
  1.4414 +    mov        eax, [esp + 4 + 4]    // src_uyvy
  1.4415 +    mov        edx, [esp + 4 + 8]    // dst_u
  1.4416 +    mov        edi, [esp + 4 + 12]   // dst_v
  1.4417 +    mov        ecx, [esp + 4 + 16]   // pix
  1.4418 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  1.4419 +    psrlw      xmm5, 8
  1.4420 +    sub        edi, edx
  1.4421 +
  1.4422 +    align      4
  1.4423 +  convertloop:
  1.4424 +    movdqu     xmm0, [eax]
  1.4425 +    movdqu     xmm1, [eax + 16]
  1.4426 +    lea        eax,  [eax + 32]
  1.4427 +    pand       xmm0, xmm5   // UYVY -> UVUV
  1.4428 +    pand       xmm1, xmm5
  1.4429 +    packuswb   xmm0, xmm1
  1.4430 +    movdqa     xmm1, xmm0
  1.4431 +    pand       xmm0, xmm5  // U
  1.4432 +    packuswb   xmm0, xmm0
  1.4433 +    psrlw      xmm1, 8     // V
  1.4434 +    packuswb   xmm1, xmm1
  1.4435 +    movq       qword ptr [edx], xmm0
  1.4436 +    movq       qword ptr [edx + edi], xmm1
  1.4437 +    lea        edx, [edx + 8]
  1.4438 +    sub        ecx, 16
  1.4439 +    jg         convertloop
  1.4440 +
  1.4441 +    pop        edi
  1.4442 +    ret
  1.4443 +  }
  1.4444 +}
  1.4445 +#endif  // HAS_YUY2TOYROW_SSE2
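// A scalar sketch of what the YUY2/UYVY row functions above compute
// (illustrative only; this helper is not part of the library). In YUY2 the
// luma sits in the even bytes and chroma in the odd bytes (hence psrlw 8);
// in UYVY it is the reverse (hence pand with 0x00ff00ff). The two-row UV
// variants average vertically with pavgb, which rounds up.
static void UYVYToUVRow_Sketch(const uint8* src_uyvy, int stride_uyvy,
                               uint8* dst_u, uint8* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    // Bytes 0 and 2 of each 4-byte UYVY group are U and V.
    dst_u[0] = (uint8)((src_uyvy[0] + src_uyvy[stride_uyvy + 0] + 1) >> 1);
    dst_v[0] = (uint8)((src_uyvy[2] + src_uyvy[stride_uyvy + 2] + 1) >> 1);
    src_uyvy += 4;
    dst_u += 1;
    dst_v += 1;
  }
}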
  1.4446 +
  1.4447 +#ifdef HAS_ARGBBLENDROW_SSE2
  1.4448 +// Blend 4 pixels at a time.
  1.4449 +__declspec(naked) __declspec(align(16))
  1.4450 +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  1.4451 +                       uint8* dst_argb, int width) {
  1.4452 +  __asm {
  1.4453 +    push       esi
  1.4454 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.4455 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.4456 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.4457 +    mov        ecx, [esp + 4 + 16]  // width
  1.4458 +    pcmpeqb    xmm7, xmm7       // generate constant 1
  1.4459 +    psrlw      xmm7, 15
  1.4460 +    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
  1.4461 +    psrlw      xmm6, 8
  1.4462 +    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
  1.4463 +    psllw      xmm5, 8
  1.4464 +    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  1.4465 +    pslld      xmm4, 24
  1.4466 +
  1.4467 +    sub        ecx, 1
  1.4468 +    je         convertloop1     // only 1 pixel?
  1.4469 +    jl         convertloop1b
  1.4470 +
  1.4471 +    // 1 pixel loop until destination pointer is aligned.
  1.4472 +  alignloop1:
  1.4473 +    test       edx, 15          // aligned?
  1.4474 +    je         alignloop1b
  1.4475 +    movd       xmm3, [eax]
  1.4476 +    lea        eax, [eax + 4]
  1.4477 +    movdqa     xmm0, xmm3       // src argb
  1.4478 +    pxor       xmm3, xmm4       // ~alpha
  1.4479 +    movd       xmm2, [esi]      // _r_b
  1.4480 +    psrlw      xmm3, 8          // alpha
  1.4481 +    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  1.4482 +    pshuflw    xmm3, xmm3, 0F5h
  1.4483 +    pand       xmm2, xmm6       // _r_b
  1.4484 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4485 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4486 +    movd       xmm1, [esi]      // _a_g
  1.4487 +    lea        esi, [esi + 4]
  1.4488 +    psrlw      xmm1, 8          // _a_g
  1.4489 +    por        xmm0, xmm4       // set alpha to 255
  1.4490 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4491 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4492 +    paddusb    xmm0, xmm2       // + src argb
  1.4493 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4494 +    paddusb    xmm0, xmm1       // + src argb
  1.4495 +    sub        ecx, 1
  1.4496 +    movd       [edx], xmm0
  1.4497 +    lea        edx, [edx + 4]
  1.4498 +    jge        alignloop1
  1.4499 +
  1.4500 +  alignloop1b:
  1.4501 +    add        ecx, 1 - 4
  1.4502 +    jl         convertloop4b
  1.4503 +
  1.4504 +    // 4 pixel loop.
  1.4505 +  convertloop4:
  1.4506 +    movdqu     xmm3, [eax]      // src argb
  1.4507 +    lea        eax, [eax + 16]
  1.4508 +    movdqa     xmm0, xmm3       // src argb
  1.4509 +    pxor       xmm3, xmm4       // ~alpha
  1.4510 +    movdqu     xmm2, [esi]      // _r_b
  1.4511 +    psrlw      xmm3, 8          // alpha
  1.4512 +    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  1.4513 +    pshuflw    xmm3, xmm3, 0F5h
  1.4514 +    pand       xmm2, xmm6       // _r_b
  1.4515 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4516 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4517 +    movdqu     xmm1, [esi]      // _a_g
  1.4518 +    lea        esi, [esi + 16]
  1.4519 +    psrlw      xmm1, 8          // _a_g
  1.4520 +    por        xmm0, xmm4       // set alpha to 255
  1.4521 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4522 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4523 +    paddusb    xmm0, xmm2       // + src argb
  1.4524 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4525 +    paddusb    xmm0, xmm1       // + src argb
  1.4526 +    sub        ecx, 4
  1.4527 +    movdqa     [edx], xmm0
  1.4528 +    lea        edx, [edx + 16]
  1.4529 +    jge        convertloop4
  1.4530 +
  1.4531 +  convertloop4b:
  1.4532 +    add        ecx, 4 - 1
  1.4533 +    jl         convertloop1b
  1.4534 +
  1.4535 +    // 1 pixel loop.
  1.4536 +  convertloop1:
  1.4537 +    movd       xmm3, [eax]      // src argb
  1.4538 +    lea        eax, [eax + 4]
  1.4539 +    movdqa     xmm0, xmm3       // src argb
  1.4540 +    pxor       xmm3, xmm4       // ~alpha
  1.4541 +    movd       xmm2, [esi]      // _r_b
  1.4542 +    psrlw      xmm3, 8          // alpha
  1.4543 +    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  1.4544 +    pshuflw    xmm3, xmm3, 0F5h
  1.4545 +    pand       xmm2, xmm6       // _r_b
  1.4546 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4547 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4548 +    movd       xmm1, [esi]      // _a_g
  1.4549 +    lea        esi, [esi + 4]
  1.4550 +    psrlw      xmm1, 8          // _a_g
  1.4551 +    por        xmm0, xmm4       // set alpha to 255
  1.4552 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4553 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4554 +    paddusb    xmm0, xmm2       // + src argb
  1.4555 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4556 +    paddusb    xmm0, xmm1       // + src argb
  1.4557 +    sub        ecx, 1
  1.4558 +    movd       [edx], xmm0
  1.4559 +    lea        edx, [edx + 4]
  1.4560 +    jge        convertloop1
  1.4561 +
  1.4562 +  convertloop1b:
  1.4563 +    pop        esi
  1.4564 +    ret
  1.4565 +  }
  1.4566 +}
  1.4567 +#endif  // HAS_ARGBBLENDROW_SSE2
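// A scalar sketch of the per-pixel math in ARGBBlendRow_SSE2 above
// (illustrative only; these helper names are not part of the library). Each
// color channel computes src + back * (256 - src_alpha) / 256 with unsigned
// saturation, and the destination alpha is forced to 255.
static uint32 Clamp255_Sketch(uint32 v) {
  return v > 255u ? 255u : v;
}
static void ARGBBlendPixel_Sketch(const uint8* src, const uint8* back,
                                  uint8* dst) {
  const uint32 ia = 256u - src[3];  // paddw xmm3, xmm7 forms 256 - alpha.
  dst[0] = (uint8)Clamp255_Sketch(src[0] + ((back[0] * ia) >> 8));  // B
  dst[1] = (uint8)Clamp255_Sketch(src[1] + ((back[1] * ia) >> 8));  // G
  dst[2] = (uint8)Clamp255_Sketch(src[2] + ((back[2] * ia) >> 8));  // R
  dst[3] = 255u;  // por xmm0, xmm4 sets the output alpha to 255.
}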
  1.4568 +
  1.4569 +#ifdef HAS_ARGBBLENDROW_SSSE3
  1.4570 +// Shuffle table for isolating alpha.
  1.4571 +static const uvec8 kShuffleAlpha = {
  1.4572 +  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  1.4573 +  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  1.4574 +};
  1.4575 +// Same as SSE2, but replaces:
  1.4576 +//    psrlw      xmm3, 8          // alpha
  1.4577 +//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  1.4578 +//    pshuflw    xmm3, xmm3, 0F5h
  1.4579 +// with..
  1.4580 +//    pshufb     xmm3, kShuffleAlpha // alpha
  1.4581 +// Blend 4 pixels at a time.
  1.4582 +
  1.4583 +__declspec(naked) __declspec(align(16))
  1.4584 +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  1.4585 +                        uint8* dst_argb, int width) {
  1.4586 +  __asm {
  1.4587 +    push       esi
  1.4588 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.4589 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.4590 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.4591 +    mov        ecx, [esp + 4 + 16]  // width
  1.4592 +    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
  1.4593 +    psrlw      xmm7, 15
  1.4594 +    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
  1.4595 +    psrlw      xmm6, 8
  1.4596 +    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
  1.4597 +    psllw      xmm5, 8
  1.4598 +    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  1.4599 +    pslld      xmm4, 24
  1.4600 +
  1.4601 +    sub        ecx, 1
  1.4602 +    je         convertloop1     // only 1 pixel?
  1.4603 +    jl         convertloop1b
  1.4604 +
  1.4605 +    // 1 pixel loop until destination pointer is aligned.
  1.4606 +  alignloop1:
  1.4607 +    test       edx, 15          // aligned?
  1.4608 +    je         alignloop1b
  1.4609 +    movd       xmm3, [eax]
  1.4610 +    lea        eax, [eax + 4]
  1.4611 +    movdqa     xmm0, xmm3       // src argb
  1.4612 +    pxor       xmm3, xmm4       // ~alpha
  1.4613 +    movd       xmm2, [esi]      // _r_b
  1.4614 +    pshufb     xmm3, kShuffleAlpha // alpha
  1.4615 +    pand       xmm2, xmm6       // _r_b
  1.4616 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4617 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4618 +    movd       xmm1, [esi]      // _a_g
  1.4619 +    lea        esi, [esi + 4]
  1.4620 +    psrlw      xmm1, 8          // _a_g
  1.4621 +    por        xmm0, xmm4       // set alpha to 255
  1.4622 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4623 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4624 +    paddusb    xmm0, xmm2       // + src argb
  1.4625 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4626 +    paddusb    xmm0, xmm1       // + src argb
  1.4627 +    sub        ecx, 1
  1.4628 +    movd       [edx], xmm0
  1.4629 +    lea        edx, [edx + 4]
  1.4630 +    jge        alignloop1
  1.4631 +
  1.4632 +  alignloop1b:
  1.4633 +    add        ecx, 1 - 4
  1.4634 +    jl         convertloop4b
  1.4635 +
  1.4636 +    test       eax, 15          // unaligned?
  1.4637 +    jne        convertuloop4
  1.4638 +    test       esi, 15          // unaligned?
  1.4639 +    jne        convertuloop4
  1.4640 +
  1.4641 +    // 4 pixel loop.
  1.4642 +  convertloop4:
  1.4643 +    movdqa     xmm3, [eax]      // src argb
  1.4644 +    lea        eax, [eax + 16]
  1.4645 +    movdqa     xmm0, xmm3       // src argb
  1.4646 +    pxor       xmm3, xmm4       // ~alpha
  1.4647 +    movdqa     xmm2, [esi]      // _r_b
  1.4648 +    pshufb     xmm3, kShuffleAlpha // alpha
  1.4649 +    pand       xmm2, xmm6       // _r_b
  1.4650 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4651 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4652 +    movdqa     xmm1, [esi]      // _a_g
  1.4653 +    lea        esi, [esi + 16]
  1.4654 +    psrlw      xmm1, 8          // _a_g
  1.4655 +    por        xmm0, xmm4       // set alpha to 255
  1.4656 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4657 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4658 +    paddusb    xmm0, xmm2       // + src argb
  1.4659 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4660 +    paddusb    xmm0, xmm1       // + src argb
  1.4661 +    sub        ecx, 4
  1.4662 +    movdqa     [edx], xmm0
  1.4663 +    lea        edx, [edx + 16]
  1.4664 +    jge        convertloop4
  1.4665 +    jmp        convertloop4b
  1.4666 +
  1.4667 +    // 4 pixel unaligned loop.
  1.4668 +  convertuloop4:
  1.4669 +    movdqu     xmm3, [eax]      // src argb
  1.4670 +    lea        eax, [eax + 16]
  1.4671 +    movdqa     xmm0, xmm3       // src argb
  1.4672 +    pxor       xmm3, xmm4       // ~alpha
  1.4673 +    movdqu     xmm2, [esi]      // _r_b
  1.4674 +    pshufb     xmm3, kShuffleAlpha // alpha
  1.4675 +    pand       xmm2, xmm6       // _r_b
  1.4676 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4677 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4678 +    movdqu     xmm1, [esi]      // _a_g
  1.4679 +    lea        esi, [esi + 16]
  1.4680 +    psrlw      xmm1, 8          // _a_g
  1.4681 +    por        xmm0, xmm4       // set alpha to 255
  1.4682 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4683 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4684 +    paddusb    xmm0, xmm2       // + src argb
  1.4685 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4686 +    paddusb    xmm0, xmm1       // + src argb
  1.4687 +    sub        ecx, 4
  1.4688 +    movdqa     [edx], xmm0
  1.4689 +    lea        edx, [edx + 16]
  1.4690 +    jge        convertuloop4
  1.4691 +
  1.4692 +  convertloop4b:
  1.4693 +    add        ecx, 4 - 1
  1.4694 +    jl         convertloop1b
  1.4695 +
  1.4696 +    // 1 pixel loop.
  1.4697 +  convertloop1:
  1.4698 +    movd       xmm3, [eax]      // src argb
  1.4699 +    lea        eax, [eax + 4]
  1.4700 +    movdqa     xmm0, xmm3       // src argb
  1.4701 +    pxor       xmm3, xmm4       // ~alpha
  1.4702 +    movd       xmm2, [esi]      // _r_b
  1.4703 +    pshufb     xmm3, kShuffleAlpha // alpha
  1.4704 +    pand       xmm2, xmm6       // _r_b
  1.4705 +    paddw      xmm3, xmm7       // 256 - alpha
  1.4706 +    pmullw     xmm2, xmm3       // _r_b * alpha
  1.4707 +    movd       xmm1, [esi]      // _a_g
  1.4708 +    lea        esi, [esi + 4]
  1.4709 +    psrlw      xmm1, 8          // _a_g
  1.4710 +    por        xmm0, xmm4       // set alpha to 255
  1.4711 +    pmullw     xmm1, xmm3       // _a_g * alpha
  1.4712 +    psrlw      xmm2, 8          // _r_b convert to 8 bits again
  1.4713 +    paddusb    xmm0, xmm2       // + src argb
  1.4714 +    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  1.4715 +    paddusb    xmm0, xmm1       // + src argb
  1.4716 +    sub        ecx, 1
  1.4717 +    movd       [edx], xmm0
  1.4718 +    lea        edx, [edx + 4]
  1.4719 +    jge        convertloop1
  1.4720 +
  1.4721 +  convertloop1b:
  1.4722 +    pop        esi
  1.4723 +    ret
  1.4724 +  }
  1.4725 +}
  1.4726 +#endif  // HAS_ARGBBLENDROW_SSSE3
  1.4727 +
  1.4728 +#ifdef HAS_ARGBATTENUATEROW_SSE2
  1.4729 +// Attenuate 4 pixels at a time.
  1.4730 +// Aligned to 16 bytes.
  1.4731 +__declspec(naked) __declspec(align(16))
  1.4732 +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4733 +  __asm {
  1.4734 +    mov        eax, [esp + 4]   // src_argb0
  1.4735 +    mov        edx, [esp + 8]   // dst_argb
  1.4736 +    mov        ecx, [esp + 12]  // width
  1.4737 +    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  1.4738 +    pslld      xmm4, 24
  1.4739 +    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
  1.4740 +    psrld      xmm5, 8
  1.4741 +
  1.4742 +    align      4
  1.4743 + convertloop:
  1.4744 +    movdqa     xmm0, [eax]      // read 4 pixels
  1.4745 +    punpcklbw  xmm0, xmm0       // first 2
  1.4746 +    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
  1.4747 +    pshuflw    xmm2, xmm2, 0FFh
  1.4748 +    pmulhuw    xmm0, xmm2       // rgb * a
  1.4749 +    movdqa     xmm1, [eax]      // read 4 pixels
  1.4750 +    punpckhbw  xmm1, xmm1       // next 2 pixels
  1.4751 +    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
  1.4752 +    pshuflw    xmm2, xmm2, 0FFh
  1.4753 +    pmulhuw    xmm1, xmm2       // rgb * a
  1.4754 +    movdqa     xmm2, [eax]      // alphas
  1.4755 +    lea        eax, [eax + 16]
  1.4756 +    psrlw      xmm0, 8
  1.4757 +    pand       xmm2, xmm4
  1.4758 +    psrlw      xmm1, 8
  1.4759 +    packuswb   xmm0, xmm1
  1.4760 +    pand       xmm0, xmm5       // keep original alphas
  1.4761 +    por        xmm0, xmm2
  1.4762 +    sub        ecx, 4
  1.4763 +    movdqa     [edx], xmm0
  1.4764 +    lea        edx, [edx + 16]
  1.4765 +    jg         convertloop
  1.4766 +
  1.4767 +    ret
  1.4768 +  }
  1.4769 +}
  1.4770 +#endif  // HAS_ARGBATTENUATEROW_SSE2
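// A scalar sketch of the fixed-point attenuate above (illustrative only;
// the helper is not part of the library). punpcklbw v,v widens each byte to
// the word v * 257 and the duplicated alpha word is a * 257; pmulhuw keeps
// the high 16 bits and psrlw 8 drops 8 more:
//   ((v << 8 | v) * (a << 8 | a)) >> 24, which approximates v * a / 255.
static uint8 Attenuate_Sketch(uint8 v, uint8 a) {
  return (uint8)((((uint32)v * 0x101u) * ((uint32)a * 0x101u)) >> 24);
}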
  1.4771 +
  1.4772 +#ifdef HAS_ARGBATTENUATEROW_SSSE3
  1.4773 +// Shuffle table duplicating alpha.
  1.4774 +static const uvec8 kShuffleAlpha0 = {
  1.4775 +  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  1.4776 +};
  1.4777 +static const uvec8 kShuffleAlpha1 = {
  1.4778 +  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  1.4779 +  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  1.4780 +};
  1.4781 +__declspec(naked) __declspec(align(16))
  1.4782 +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4783 +  __asm {
  1.4784 +    mov        eax, [esp + 4]   // src_argb0
  1.4785 +    mov        edx, [esp + 8]   // dst_argb
  1.4786 +    mov        ecx, [esp + 12]  // width
  1.4787 +    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
  1.4788 +    pslld      xmm3, 24
  1.4789 +    movdqa     xmm4, kShuffleAlpha0
  1.4790 +    movdqa     xmm5, kShuffleAlpha1
  1.4791 +
  1.4792 +    align      4
  1.4793 + convertloop:
  1.4794 +    movdqu     xmm0, [eax]      // read 4 pixels
  1.4795 +    pshufb     xmm0, xmm4       // isolate first 2 alphas
  1.4796 +    movdqu     xmm1, [eax]      // read 4 pixels
  1.4797 +    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
  1.4798 +    pmulhuw    xmm0, xmm1       // rgb * a
  1.4799 +    movdqu     xmm1, [eax]      // read 4 pixels
  1.4800 +    pshufb     xmm1, xmm5       // isolate next 2 alphas
  1.4801 +    movdqu     xmm2, [eax]      // read 4 pixels
  1.4802 +    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
  1.4803 +    pmulhuw    xmm1, xmm2       // rgb * a
  1.4804 +    movdqu     xmm2, [eax]      // mask original alpha
  1.4805 +    lea        eax, [eax + 16]
  1.4806 +    pand       xmm2, xmm3
  1.4807 +    psrlw      xmm0, 8
  1.4808 +    psrlw      xmm1, 8
  1.4809 +    packuswb   xmm0, xmm1
  1.4810 +    por        xmm0, xmm2       // copy original alpha
  1.4811 +    sub        ecx, 4
  1.4812 +    movdqu     [edx], xmm0
  1.4813 +    lea        edx, [edx + 16]
  1.4814 +    jg         convertloop
  1.4815 +
  1.4816 +    ret
  1.4817 +  }
  1.4818 +}
  1.4819 +#endif  // HAS_ARGBATTENUATEROW_SSSE3
  1.4820 +
  1.4821 +#ifdef HAS_ARGBATTENUATEROW_AVX2
  1.4822 +// Shuffle table duplicating alpha.
  1.4823 +static const ulvec8 kShuffleAlpha_AVX2 = {
  1.4824 +  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  1.4825 +  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  1.4826 +  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  1.4827 +  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  1.4828 +};
  1.4829 +__declspec(naked) __declspec(align(16))
  1.4830 +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  1.4831 +  __asm {
  1.4832 +    mov        eax, [esp + 4]   // src_argb0
  1.4833 +    mov        edx, [esp + 8]   // dst_argb
  1.4834 +    mov        ecx, [esp + 12]  // width
  1.4835 +    sub        edx, eax
  1.4836 +    vmovdqa    ymm4, kShuffleAlpha_AVX2
  1.4837 +    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
  1.4838 +    vpslld     ymm5, ymm5, 24
  1.4839 +
  1.4840 +    align      4
  1.4841 + convertloop:
  1.4842 +    vmovdqu    ymm6, [eax]       // read 8 pixels.
  1.4843 +    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  1.4844 +    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  1.4845 +    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
  1.4846 +    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
  1.4847 +    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
  1.4848 +    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
  1.4849 +    vpand      ymm6, ymm6, ymm5  // isolate alpha
  1.4850 +    vpsrlw     ymm0, ymm0, 8
  1.4851 +    vpsrlw     ymm1, ymm1, 8
  1.4852 +    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  1.4853 +    vpor       ymm0, ymm0, ymm6  // copy original alpha
  1.4854 +    sub        ecx, 8
  1.4855 +    vmovdqu    [eax + edx], ymm0
  1.4856 +    lea        eax, [eax + 32]
  1.4857 +    jg         convertloop
  1.4858 +
  1.4859 +    vzeroupper
  1.4860 +    ret
  1.4861 +  }
  1.4862 +}
  1.4863 +#endif  // HAS_ARGBATTENUATEROW_AVX2
  1.4864 +
  1.4865 +#ifdef HAS_ARGBUNATTENUATEROW_SSE2
  1.4866 +// Unattenuate 4 pixels at a time.
  1.4867 +// Aligned to 16 bytes.
  1.4868 +__declspec(naked) __declspec(align(16))
  1.4869 +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  1.4870 +                             int width) {
  1.4871 +  __asm {
  1.4872 +    push       esi
  1.4873 +    push       edi
  1.4874 +    mov        eax, [esp + 8 + 4]   // src_argb0
  1.4875 +    mov        edx, [esp + 8 + 8]   // dst_argb
  1.4876 +    mov        ecx, [esp + 8 + 12]  // width
  1.4877 +
  1.4878 +    align      4
  1.4879 + convertloop:
  1.4880 +    movdqu     xmm0, [eax]      // read 4 pixels
  1.4881 +    movzx      esi, byte ptr [eax + 3]  // first alpha
  1.4882 +    movzx      edi, byte ptr [eax + 7]  // second alpha
  1.4883 +    punpcklbw  xmm0, xmm0       // first 2
  1.4884 +    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
  1.4885 +    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
  1.4886 +    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
  1.4887 +    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
  1.4888 +    movlhps    xmm2, xmm3
  1.4889 +    pmulhuw    xmm0, xmm2       // rgb * ia
  1.4890 +
  1.4891 +    movdqu     xmm1, [eax]      // read 4 pixels
  1.4892 +    movzx      esi, byte ptr [eax + 11]  // third alpha
  1.4893 +    movzx      edi, byte ptr [eax + 15]  // fourth alpha
  1.4894 +    punpckhbw  xmm1, xmm1       // next 2
  1.4895 +    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
  1.4896 +    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
  1.4897 +    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
  1.4898 +    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
  1.4899 +    movlhps    xmm2, xmm3
  1.4900 +    pmulhuw    xmm1, xmm2       // rgb * ia
  1.4901 +    lea        eax, [eax + 16]
  1.4902 +
  1.4903 +    packuswb   xmm0, xmm1
  1.4904 +    sub        ecx, 4
  1.4905 +    movdqu     [edx], xmm0
  1.4906 +    lea        edx, [edx + 16]
  1.4907 +    jg         convertloop
  1.4908 +    pop        edi
  1.4909 +    pop        esi
  1.4910 +    ret
  1.4911 +  }
  1.4912 +}
  1.4913 +#endif  // HAS_ARGBUNATTENUATEROW_SSE2
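// A scalar sketch of the unattenuate above (illustrative only), assuming the
// libyuv convention that fixed_invtbl8[a] holds 65536 / a in its low word
// (and 256 in its high word, so alpha itself passes through pmulhuw
// unchanged). The result approximately undoes the v * a / 255 attenuation;
// packuswb clamps the slight overshoot.
static uint8 Unattenuate_Sketch(uint8 v, uint8 a) {
  const uint32 ia = a ? 65536u / a : 65536u;  // 8.8 fixed-point 1/alpha.
  const uint32 r = (((uint32)v * 0x101u) * ia) >> 16;
  return (uint8)(r > 255u ? 255u : r);
}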
  1.4914 +
  1.4915 +#ifdef HAS_ARGBUNATTENUATEROW_AVX2
  1.4916 +// Shuffle table duplicating alpha.
  1.4917 +static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  1.4918 +  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
  1.4919 +  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
  1.4920 +};
  1.4921 +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  1.4922 +// USE_GATHER is not on by default, due to being a slow instruction.
  1.4923 +#ifdef USE_GATHER
  1.4924 +__declspec(naked) __declspec(align(16))
  1.4925 +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  1.4926 +                             int width) {
  1.4927 +  __asm {
  1.4928 +    mov        eax, [esp + 4]   // src_argb0
  1.4929 +    mov        edx, [esp + 8]   // dst_argb
  1.4930 +    mov        ecx, [esp + 12]  // width
  1.4931 +    sub        edx, eax
  1.4932 +    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
  1.4933 +
  1.4934 +    align      4
  1.4935 + convertloop:
  1.4936 +    vmovdqu    ymm6, [eax]       // read 8 pixels.
  1.4937 +    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
  1.4938 +    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
  1.4939 +    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  1.4940 +    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  1.4941 +    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
  1.4942 +    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
  1.4943 +    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
  1.4944 +    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
  1.4945 +    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
  1.4946 +    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
  1.4947 +    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
  1.4948 +    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  1.4949 +    sub        ecx, 8
  1.4950 +    vmovdqu    [eax + edx], ymm0
  1.4951 +    lea        eax, [eax + 32]
  1.4952 +    jg         convertloop
  1.4953 +
  1.4954 +    vzeroupper
  1.4955 +    ret
  1.4956 +  }
  1.4957 +}
  1.4958 +#else  // USE_GATHER
  1.4959 +__declspec(naked) __declspec(align(16))
  1.4960 +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  1.4961 +                             int width) {
  1.4962 +  __asm {
  1.4963 +
  1.4964 +    mov        eax, [esp + 4]   // src_argb0
  1.4965 +    mov        edx, [esp + 8]   // dst_argb
  1.4966 +    mov        ecx, [esp + 12]  // width
  1.4967 +    sub        edx, eax
  1.4968 +    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
  1.4969 +
  1.4970 +    push       esi
  1.4971 +    push       edi
  1.4972 +
  1.4973 +    align      4
  1.4974 + convertloop:
  1.4975 +    // replace VPGATHER
  1.4976 +    movzx      esi, byte ptr [eax + 3]                 // alpha0
  1.4977 +    movzx      edi, byte ptr [eax + 7]                 // alpha1
  1.4978 +    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
  1.4979 +    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
  1.4980 +    movzx      esi, byte ptr [eax + 11]                // alpha2
  1.4981 +    movzx      edi, byte ptr [eax + 15]                // alpha3
  1.4982 +    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
  1.4983 +    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
  1.4984 +    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
  1.4985 +    movzx      esi, byte ptr [eax + 19]                // alpha4
  1.4986 +    movzx      edi, byte ptr [eax + 23]                // alpha5
  1.4987 +    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
  1.4988 +    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
  1.4989 +    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
  1.4990 +    movzx      esi, byte ptr [eax + 27]                // alpha6
  1.4991 +    movzx      edi, byte ptr [eax + 31]                // alpha7
  1.4992 +    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
  1.4993 +    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
  1.4994 +    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
  1.4995 +    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
  1.4996 +    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
  1.4997 +    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
  1.4998 +    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  1.4999 +    // end of VPGATHER
  1.5000 +
  1.5001 +    vmovdqu    ymm6, [eax]       // read 8 pixels.
  1.5002 +    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  1.5003 +    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  1.5004 +    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
  1.5005 +    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
  1.5006 +    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
  1.5007 +    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
  1.5008 +    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
  1.5009 +    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
  1.5010 +    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  1.5011 +    sub        ecx, 8
  1.5012 +    vmovdqu    [eax + edx], ymm0
  1.5013 +    lea        eax, [eax + 32]
  1.5014 +    jg         convertloop
  1.5015 +
  1.5016 +    pop        edi
  1.5017 +    pop        esi
  1.5018 +    vzeroupper
  1.5019 +    ret
  1.5020 +  }
  1.5021 +}
  1.5022 +#endif  // USE_GATHER
  1.5023 +#endif  // HAS_ARGBUNATTENUATEROW_AVX2
  1.5024 +
  1.5025 +#ifdef HAS_ARGBGRAYROW_SSSE3
  1.5026 +// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
  1.5027 +__declspec(naked) __declspec(align(16))
  1.5028 +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  1.5029 +  __asm {
  1.5030 +    mov        eax, [esp + 4]   /* src_argb */
  1.5031 +    mov        edx, [esp + 8]   /* dst_argb */
  1.5032 +    mov        ecx, [esp + 12]  /* width */
  1.5033 +    movdqa     xmm4, kARGBToYJ
  1.5034 +    movdqa     xmm5, kAddYJ64
  1.5035 +
  1.5036 +    align      4
  1.5037 + convertloop:
  1.5038 +    movdqa     xmm0, [eax]  // G
  1.5039 +    movdqa     xmm1, [eax + 16]
  1.5040 +    pmaddubsw  xmm0, xmm4
  1.5041 +    pmaddubsw  xmm1, xmm4
  1.5042 +    phaddw     xmm0, xmm1
  1.5043 +    paddw      xmm0, xmm5  // Add .5 for rounding.
  1.5044 +    psrlw      xmm0, 7
  1.5045 +    packuswb   xmm0, xmm0   // 8 G bytes
  1.5046 +    movdqa     xmm2, [eax]  // A
  1.5047 +    movdqa     xmm3, [eax + 16]
  1.5048 +    lea        eax, [eax + 32]
  1.5049 +    psrld      xmm2, 24
  1.5050 +    psrld      xmm3, 24
  1.5051 +    packuswb   xmm2, xmm3
  1.5052 +    packuswb   xmm2, xmm2   // 8 A bytes
  1.5053 +    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
  1.5054 +    punpcklbw  xmm0, xmm0   // 8 GG words
  1.5055 +    punpcklbw  xmm3, xmm2   // 8 GA words
  1.5056 +    movdqa     xmm1, xmm0
  1.5057 +    punpcklwd  xmm0, xmm3   // GGGA first 4
  1.5058 +    punpckhwd  xmm1, xmm3   // GGGA next 4
  1.5059 +    sub        ecx, 8
  1.5060 +    movdqa     [edx], xmm0
  1.5061 +    movdqa     [edx + 16], xmm1
  1.5062 +    lea        edx, [edx + 32]
  1.5063 +    jg         convertloop
  1.5064 +    ret
  1.5065 +  }
  1.5066 +}
  1.5067 +#endif  // HAS_ARGBGRAYROW_SSSE3
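// A scalar sketch of the gray conversion above (illustrative only). With the
// kARGBToYJ weights this is full-range JPEG luma, rounded by the kAddYJ64
// bias and broadcast to B, G and R while alpha is preserved:
static void ARGBGrayPixel_Sketch(uint8* p) {
  const uint32 y = (p[0] * 15u + p[1] * 75u + p[2] * 38u + 64u) >> 7;
  p[0] = p[1] = p[2] = (uint8)y;  // p[3] (alpha) is left unchanged.
}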
  1.5068 +
  1.5069 +#ifdef HAS_ARGBSEPIAROW_SSSE3
  1.5070 +//    b = (r * 35 + g * 68 + b * 17) >> 7
  1.5071 +//    g = (r * 45 + g * 88 + b * 22) >> 7
  1.5072 +//    r = (r * 50 + g * 98 + b * 24) >> 7
  1.5073 +// Constant for ARGB color to sepia tone.
  1.5074 +static const vec8 kARGBToSepiaB = {
  1.5075 +  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  1.5076 +};
  1.5077 +
  1.5078 +static const vec8 kARGBToSepiaG = {
  1.5079 +  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  1.5080 +};
  1.5081 +
  1.5082 +static const vec8 kARGBToSepiaR = {
  1.5083 +  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  1.5084 +};
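// A scalar sketch of one sepia pixel using the formulas above (illustrative
// only). The SIMD loop below does the same on 8 pixels with
// pmaddubsw/phaddw, and packuswb provides the saturation to 255:
static void ARGBSepiaPixel_Sketch(uint8* p) {
  const int b = p[0], g = p[1], r = p[2];
  const int sb = (r * 35 + g * 68 + b * 17) >> 7;
  const int sg = (r * 45 + g * 88 + b * 22) >> 7;
  const int sr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(sb > 255 ? 255 : sb);
  p[1] = (uint8)(sg > 255 ? 255 : sg);
  p[2] = (uint8)(sr > 255 ? 255 : sr);  // p[3] (alpha) is unchanged.
}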
  1.5085 +
  1.5086 +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  1.5087 +__declspec(naked) __declspec(align(16))
  1.5088 +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  1.5089 +  __asm {
  1.5090 +    mov        eax, [esp + 4]   /* dst_argb */
  1.5091 +    mov        ecx, [esp + 8]   /* width */
  1.5092 +    movdqa     xmm2, kARGBToSepiaB
  1.5093 +    movdqa     xmm3, kARGBToSepiaG
  1.5094 +    movdqa     xmm4, kARGBToSepiaR
  1.5095 +
  1.5096 +    align      4
  1.5097 + convertloop:
  1.5098 +    movdqa     xmm0, [eax]  // B
  1.5099 +    movdqa     xmm6, [eax + 16]
  1.5100 +    pmaddubsw  xmm0, xmm2
  1.5101 +    pmaddubsw  xmm6, xmm2
  1.5102 +    phaddw     xmm0, xmm6
  1.5103 +    psrlw      xmm0, 7
  1.5104 +    packuswb   xmm0, xmm0   // 8 B values
  1.5105 +    movdqa     xmm5, [eax]  // G
  1.5106 +    movdqa     xmm1, [eax + 16]
  1.5107 +    pmaddubsw  xmm5, xmm3
  1.5108 +    pmaddubsw  xmm1, xmm3
  1.5109 +    phaddw     xmm5, xmm1
  1.5110 +    psrlw      xmm5, 7
  1.5111 +    packuswb   xmm5, xmm5   // 8 G values
  1.5112 +    punpcklbw  xmm0, xmm5   // 8 BG values
  1.5113 +    movdqa     xmm5, [eax]  // R
  1.5114 +    movdqa     xmm1, [eax + 16]
  1.5115 +    pmaddubsw  xmm5, xmm4
  1.5116 +    pmaddubsw  xmm1, xmm4
  1.5117 +    phaddw     xmm5, xmm1
  1.5118 +    psrlw      xmm5, 7
  1.5119 +    packuswb   xmm5, xmm5   // 8 R values
  1.5120 +    movdqa     xmm6, [eax]  // A
  1.5121 +    movdqa     xmm1, [eax + 16]
  1.5122 +    psrld      xmm6, 24
  1.5123 +    psrld      xmm1, 24
  1.5124 +    packuswb   xmm6, xmm1
  1.5125 +    packuswb   xmm6, xmm6   // 8 A values
  1.5126 +    punpcklbw  xmm5, xmm6   // 8 RA values
  1.5127 +    movdqa     xmm1, xmm0   // Weave BG, RA together
  1.5128 +    punpcklwd  xmm0, xmm5   // BGRA first 4
  1.5129 +    punpckhwd  xmm1, xmm5   // BGRA next 4
  1.5130 +    sub        ecx, 8
  1.5131 +    movdqa     [eax], xmm0
  1.5132 +    movdqa     [eax + 16], xmm1
  1.5133 +    lea        eax, [eax + 32]
  1.5134 +    jg         convertloop
  1.5135 +    ret
  1.5136 +  }
  1.5137 +}
  1.5138 +#endif  // HAS_ARGBSEPIAROW_SSSE3
  1.5139 +
  1.5140 +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
  1.5141 +// Transform 8 ARGB pixels (32 bytes) with color matrix.
  1.5142 +// Same as Sepia except matrix is provided.
  1.5143 +// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
  1.5144 +// and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd.
  1.5145 +__declspec(naked) __declspec(align(16))
  1.5146 +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  1.5147 +                              const int8* matrix_argb, int width) {
  1.5148 +  __asm {
  1.5149 +    mov        eax, [esp + 4]   /* src_argb */
  1.5150 +    mov        edx, [esp + 8]   /* dst_argb */
  1.5151 +    mov        ecx, [esp + 12]  /* matrix_argb */
  1.5152 +    movdqu     xmm5, [ecx]
  1.5153 +    pshufd     xmm2, xmm5, 0x00
  1.5154 +    pshufd     xmm3, xmm5, 0x55
  1.5155 +    pshufd     xmm4, xmm5, 0xaa
  1.5156 +    pshufd     xmm5, xmm5, 0xff
  1.5157 +    mov        ecx, [esp + 16]  /* width */
  1.5158 +
  1.5159 +    align      4
  1.5160 + convertloop:
  1.5161 +    movdqa     xmm0, [eax]  // B
  1.5162 +    movdqa     xmm7, [eax + 16]
  1.5163 +    pmaddubsw  xmm0, xmm2
  1.5164 +    pmaddubsw  xmm7, xmm2
  1.5165 +    movdqa     xmm6, [eax]  // G
  1.5166 +    movdqa     xmm1, [eax + 16]
  1.5167 +    pmaddubsw  xmm6, xmm3
  1.5168 +    pmaddubsw  xmm1, xmm3
  1.5169 +    phaddsw    xmm0, xmm7   // B
  1.5170 +    phaddsw    xmm6, xmm1   // G
  1.5171 +    psraw      xmm0, 6      // B
  1.5172 +    psraw      xmm6, 6      // G
  1.5173 +    packuswb   xmm0, xmm0   // 8 B values
  1.5174 +    packuswb   xmm6, xmm6   // 8 G values
  1.5175 +    punpcklbw  xmm0, xmm6   // 8 BG values
  1.5176 +    movdqa     xmm1, [eax]  // R
  1.5177 +    movdqa     xmm7, [eax + 16]
  1.5178 +    pmaddubsw  xmm1, xmm4
  1.5179 +    pmaddubsw  xmm7, xmm4
  1.5180 +    phaddsw    xmm1, xmm7   // R
  1.5181 +    movdqa     xmm6, [eax]  // A
  1.5182 +    movdqa     xmm7, [eax + 16]
  1.5183 +    pmaddubsw  xmm6, xmm5
  1.5184 +    pmaddubsw  xmm7, xmm5
  1.5185 +    phaddsw    xmm6, xmm7   // A
  1.5186 +    psraw      xmm1, 6      // R
  1.5187 +    psraw      xmm6, 6      // A
  1.5188 +    packuswb   xmm1, xmm1   // 8 R values
  1.5189 +    packuswb   xmm6, xmm6   // 8 A values
  1.5190 +    punpcklbw  xmm1, xmm6   // 8 RA values
  1.5191 +    movdqa     xmm6, xmm0   // Weave BG, RA together
  1.5192 +    punpcklwd  xmm0, xmm1   // BGRA first 4
  1.5193 +    punpckhwd  xmm6, xmm1   // BGRA next 4
  1.5194 +    sub        ecx, 8
  1.5195 +    movdqa     [edx], xmm0
  1.5196 +    movdqa     [edx + 16], xmm6
  1.5197 +    lea        eax, [eax + 32]
  1.5198 +    lea        edx, [edx + 32]
  1.5199 +    jg         convertloop
  1.5200 +    ret
  1.5201 +  }
  1.5202 +}
  1.5203 +#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
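// A scalar sketch of the matrix transform above (illustrative only). Each
// output channel is the signed dot product of the BGRA input bytes with one
// 4-byte column of matrix_argb, scaled down by psraw 6 and saturated:
static uint8 ColorMatrixChannel_Sketch(const uint8* p, const int8* m) {
  const int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}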
  1.5204 +
  1.5205 +#ifdef HAS_ARGBQUANTIZEROW_SSE2
  1.5206 +// Quantize 4 ARGB pixels (16 bytes).
  1.5207 +// Aligned to 16 bytes.
  1.5208 +__declspec(naked) __declspec(align(16))
  1.5209 +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
  1.5210 +                          int interval_offset, int width) {
  1.5211 +  __asm {
  1.5212 +    mov        eax, [esp + 4]    /* dst_argb */
  1.5213 +    movd       xmm2, [esp + 8]   /* scale */
  1.5214 +    movd       xmm3, [esp + 12]  /* interval_size */
  1.5215 +    movd       xmm4, [esp + 16]  /* interval_offset */
  1.5216 +    mov        ecx, [esp + 20]   /* width */
  1.5217 +    pshuflw    xmm2, xmm2, 040h
  1.5218 +    pshufd     xmm2, xmm2, 044h
  1.5219 +    pshuflw    xmm3, xmm3, 040h
  1.5220 +    pshufd     xmm3, xmm3, 044h
  1.5221 +    pshuflw    xmm4, xmm4, 040h
  1.5222 +    pshufd     xmm4, xmm4, 044h
  1.5223 +    pxor       xmm5, xmm5  // constant 0
  1.5224 +    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
  1.5225 +    pslld      xmm6, 24
  1.5226 +
  1.5227 +    align      4
  1.5228 + convertloop:
  1.5229 +    movdqa     xmm0, [eax]  // read 4 pixels
  1.5230 +    punpcklbw  xmm0, xmm5   // first 2 pixels
  1.5231 +    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
  1.5232 +    movdqa     xmm1, [eax]  // read 4 pixels
  1.5233 +    punpckhbw  xmm1, xmm5   // next 2 pixels
  1.5234 +    pmulhuw    xmm1, xmm2
  1.5235 +    pmullw     xmm0, xmm3   // * interval_size
  1.5236 +    movdqa     xmm7, [eax]  // read 4 pixels
  1.5237 +    pmullw     xmm1, xmm3
  1.5238 +    pand       xmm7, xmm6   // mask alpha
  1.5239 +    paddw      xmm0, xmm4   // + interval_offset
  1.5240 +    paddw      xmm1, xmm4
  1.5241 +    packuswb   xmm0, xmm1
  1.5242 +    por        xmm0, xmm7
  1.5243 +    sub        ecx, 4
  1.5244 +    movdqa     [eax], xmm0
  1.5245 +    lea        eax, [eax + 16]
  1.5246 +    jg         convertloop
  1.5247 +    ret
  1.5248 +  }
  1.5249 +}
  1.5250 +#endif  // HAS_ARGBQUANTIZEROW_SSE2
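// A scalar sketch of the quantize step above (illustrative only). Each color
// channel is bucketed and re-expanded, while pand/por keep the alpha byte:
//   v = (v * scale >> 16) * interval_size + interval_offset
static uint8 Quantize_Sketch(uint8 v, int scale, int interval_size,
                             int interval_offset) {
  return (uint8)((((uint32)v * (uint32)scale) >> 16) * interval_size +
                 interval_offset);
}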
  1.5251 +
  1.5252 +#ifdef HAS_ARGBSHADEROW_SSE2
  1.5253 +// Shade 4 pixels at a time by specified value.
  1.5254 +// Aligned to 16 bytes.
  1.5255 +__declspec(naked) __declspec(align(16))
  1.5256 +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
  1.5257 +                       uint32 value) {
  1.5258 +  __asm {
  1.5259 +    mov        eax, [esp + 4]   // src_argb
  1.5260 +    mov        edx, [esp + 8]   // dst_argb
  1.5261 +    mov        ecx, [esp + 12]  // width
  1.5262 +    movd       xmm2, [esp + 16]  // value
  1.5263 +    punpcklbw  xmm2, xmm2
  1.5264 +    punpcklqdq xmm2, xmm2
  1.5265 +
  1.5266 +    align      4
  1.5267 + convertloop:
  1.5268 +    movdqa     xmm0, [eax]      // read 4 pixels
  1.5269 +    lea        eax, [eax + 16]
  1.5270 +    movdqa     xmm1, xmm0
  1.5271 +    punpcklbw  xmm0, xmm0       // first 2
  1.5272 +    punpckhbw  xmm1, xmm1       // next 2
  1.5273 +    pmulhuw    xmm0, xmm2       // argb * value
  1.5274 +    pmulhuw    xmm1, xmm2       // argb * value
  1.5275 +    psrlw      xmm0, 8
  1.5276 +    psrlw      xmm1, 8
  1.5277 +    packuswb   xmm0, xmm1
  1.5278 +    sub        ecx, 4
  1.5279 +    movdqa     [edx], xmm0
  1.5280 +    lea        edx, [edx + 16]
  1.5281 +    jg         convertloop
  1.5282 +
  1.5283 +    ret
  1.5284 +  }
  1.5285 +}
  1.5286 +#endif  // HAS_ARGBSHADEROW_SSE2
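// A scalar sketch of the shade multiply above (illustrative only). The pixel
// and the matching byte of 'value' are both byte-doubled into words, so each
// channel is the same fixed-point product as Attenuate:
//   ((v << 8 | v) * (s << 8 | s)) >> 24, approximately v * s / 255.
static uint8 Shade_Sketch(uint8 v, uint8 s) {
  return (uint8)((((uint32)v * 0x101u) * ((uint32)s * 0x101u)) >> 24);
}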
  1.5287 +
  1.5288 +#ifdef HAS_ARGBMULTIPLYROW_SSE2
  1.5289 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  1.5290 +__declspec(naked) __declspec(align(16))
  1.5291 +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  1.5292 +                          uint8* dst_argb, int width) {
  1.5293 +  __asm {
  1.5294 +    push       esi
  1.5295 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5296 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5297 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5298 +    mov        ecx, [esp + 4 + 16]  // width
  1.5299 +    pxor       xmm5, xmm5  // constant 0
  1.5300 +
  1.5301 +    align      4
  1.5302 + convertloop:
  1.5303 +    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  1.5304 +    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
  1.5305 +    movdqa     xmm1, xmm0
  1.5306 +    movdqa     xmm3, xmm2
  1.5307 +    punpcklbw  xmm0, xmm0         // first 2
  1.5308 +    punpckhbw  xmm1, xmm1         // next 2
  1.5309 +    punpcklbw  xmm2, xmm5         // first 2
  1.5310 +    punpckhbw  xmm3, xmm5         // next 2
  1.5311 +    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
  1.5312 +    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
  1.5313 +    lea        eax, [eax + 16]
  1.5314 +    lea        esi, [esi + 16]
  1.5315 +    packuswb   xmm0, xmm1
  1.5316 +    sub        ecx, 4
  1.5317 +    movdqu     [edx], xmm0
  1.5318 +    lea        edx, [edx + 16]
  1.5319 +    jg         convertloop
  1.5320 +
  1.5321 +    pop        esi
  1.5322 +    ret
  1.5323 +  }
  1.5324 +}
  1.5325 +#endif  // HAS_ARGBMULTIPLYROW_SSE2
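// A scalar sketch of the modulate above (illustrative only). One operand is
// byte-doubled (a * 257), the other zero-extended, and pmulhuw keeps the
// high 16 bits, approximating a * b / 255 per channel:
static uint8 Multiply_Sketch(uint8 a, uint8 b) {
  return (uint8)((((uint32)a * 0x101u) * (uint32)b) >> 16);
}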
  1.5326 +
  1.5327 +#ifdef HAS_ARGBADDROW_SSE2
  1.5328 +// Add 2 rows of ARGB pixels together, 4 pixels at a time.
  1.5329 +// TODO(fbarchard): Port this to posix, neon and other math functions.
  1.5330 +__declspec(naked) __declspec(align(16))
  1.5331 +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  1.5332 +                     uint8* dst_argb, int width) {
  1.5333 +  __asm {
  1.5334 +    push       esi
  1.5335 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5336 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5337 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5338 +    mov        ecx, [esp + 4 + 16]  // width
  1.5339 +
  1.5340 +    sub        ecx, 4
  1.5341 +    jl         convertloop49
  1.5342 +
  1.5343 +    align      4
  1.5344 + convertloop4:
  1.5345 +    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  1.5346 +    lea        eax, [eax + 16]
  1.5347 +    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
  1.5348 +    lea        esi, [esi + 16]
  1.5349 +    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
  1.5350 +    sub        ecx, 4
  1.5351 +    movdqu     [edx], xmm0
  1.5352 +    lea        edx, [edx + 16]
  1.5353 +    jge        convertloop4
  1.5354 +
  1.5355 + convertloop49:
  1.5356 +    add        ecx, 4 - 1
  1.5357 +    jl         convertloop19
  1.5358 +
  1.5359 + convertloop1:
  1.5360 +    movd       xmm0, [eax]        // read 1 pixel from src_argb0
  1.5361 +    lea        eax, [eax + 4]
  1.5362 +    movd       xmm1, [esi]        // read 1 pixel from src_argb1
  1.5363 +    lea        esi, [esi + 4]
  1.5364 +    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
  1.5365 +    sub        ecx, 1
  1.5366 +    movd       [edx], xmm0
  1.5367 +    lea        edx, [edx + 4]
  1.5368 +    jge        convertloop1
  1.5369 +
  1.5370 + convertloop19:
  1.5371 +    pop        esi
  1.5372 +    ret
  1.5373 +  }
  1.5374 +}
  1.5375 +#endif  // HAS_ARGBADDROW_SSE2
  1.5376 +
  1.5377 +#ifdef HAS_ARGBSUBTRACTROW_SSE2
  1.5378 +// Subtract one row of ARGB pixels from another, 4 pixels at a time.
  1.5379 +__declspec(naked) __declspec(align(16))
  1.5380 +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  1.5381 +                          uint8* dst_argb, int width) {
  1.5382 +  __asm {
  1.5383 +    push       esi
  1.5384 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5385 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5386 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5387 +    mov        ecx, [esp + 4 + 16]  // width
  1.5388 +
  1.5389 +    align      4
  1.5390 + convertloop:
  1.5391 +    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  1.5392 +    lea        eax, [eax + 16]
  1.5393 +    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
  1.5394 +    lea        esi, [esi + 16]
  1.5395 +    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
  1.5396 +    sub        ecx, 4
  1.5397 +    movdqu     [edx], xmm0
  1.5398 +    lea        edx, [edx + 16]
  1.5399 +    jg         convertloop
  1.5400 +
  1.5401 +    pop        esi
  1.5402 +    ret
  1.5403 +  }
  1.5404 +}
  1.5405 +#endif  // HAS_ARGBSUBTRACTROW_SSE2
  1.5406 +
  1.5407 +#ifdef HAS_ARGBMULTIPLYROW_AVX2
  1.5408 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  1.5409 +__declspec(naked) __declspec(align(16))
  1.5410 +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  1.5411 +                          uint8* dst_argb, int width) {
  1.5412 +  __asm {
  1.5413 +    push       esi
  1.5414 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5415 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5416 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5417 +    mov        ecx, [esp + 4 + 16]  // width
  1.5418 +    vpxor      ymm5, ymm5, ymm5     // constant 0
  1.5419 +
  1.5420 +    align      4
  1.5421 + convertloop:
  1.5422 +    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
  1.5423 +    lea        eax, [eax + 32]
  1.5424 +    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
  1.5425 +    lea        esi, [esi + 32]
  1.5426 +    vpunpcklbw ymm0, ymm1, ymm1   // low 4
  1.5427 +    vpunpckhbw ymm1, ymm1, ymm1   // high 4
  1.5428 +    vpunpcklbw ymm2, ymm3, ymm5   // low 4
  1.5429 +    vpunpckhbw ymm3, ymm3, ymm5   // high 4
  1.5430 +    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
  1.5431 +    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
  1.5432 +    vpackuswb  ymm0, ymm0, ymm1
  1.5433 +    vmovdqu    [edx], ymm0
  1.5434 +    lea        edx, [edx + 32]
  1.5435 +    sub        ecx, 8
  1.5436 +    jg         convertloop
  1.5437 +
  1.5438 +    pop        esi
  1.5439 +    vzeroupper
  1.5440 +    ret
  1.5441 +  }
  1.5442 +}
  1.5443 +#endif  // HAS_ARGBMULTIPLYROW_AVX2
  1.5444 +
  1.5445 +#ifdef HAS_ARGBADDROW_AVX2
  1.5446 +// Add 2 rows of ARGB pixels together, 8 pixels at a time.
  1.5447 +__declspec(naked) __declspec(align(16))
  1.5448 +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  1.5449 +                     uint8* dst_argb, int width) {
  1.5450 +  __asm {
  1.5451 +    push       esi
  1.5452 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5453 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5454 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5455 +    mov        ecx, [esp + 4 + 16]  // width
  1.5456 +
  1.5457 +    align      4
  1.5458 + convertloop:
  1.5459 +    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
  1.5460 +    lea        eax, [eax + 32]
  1.5461 +    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
  1.5462 +    lea        esi, [esi + 32]
  1.5463 +    vmovdqu    [edx], ymm0
  1.5464 +    lea        edx, [edx + 32]
  1.5465 +    sub        ecx, 8
  1.5466 +    jg         convertloop
  1.5467 +
  1.5468 +    pop        esi
  1.5469 +    vzeroupper
  1.5470 +    ret
  1.5471 +  }
  1.5472 +}
  1.5473 +#endif  // HAS_ARGBADDROW_AVX2
  1.5474 +
  1.5475 +#ifdef HAS_ARGBSUBTRACTROW_AVX2
  1.5476 +// Subtract one row of ARGB pixels from another, 8 pixels at a time.
  1.5477 +__declspec(naked) __declspec(align(16))
  1.5478 +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  1.5479 +                          uint8* dst_argb, int width) {
  1.5480 +  __asm {
  1.5481 +    push       esi
  1.5482 +    mov        eax, [esp + 4 + 4]   // src_argb0
  1.5483 +    mov        esi, [esp + 4 + 8]   // src_argb1
  1.5484 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5485 +    mov        ecx, [esp + 4 + 16]  // width
  1.5486 +
  1.5487 +    align      4
  1.5488 + convertloop:
  1.5489 +    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
  1.5490 +    lea        eax, [eax + 32]
  1.5491 +    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
  1.5492 +    lea        esi, [esi + 32]
  1.5493 +    vmovdqu    [edx], ymm0
  1.5494 +    lea        edx, [edx + 32]
  1.5495 +    sub        ecx, 8
  1.5496 +    jg         convertloop
  1.5497 +
  1.5498 +    pop        esi
  1.5499 +    vzeroupper
  1.5500 +    ret
  1.5501 +  }
  1.5502 +}
  1.5503 +#endif  // HAS_ARGBSUBTRACTROW_AVX2
  1.5504 +
  1.5505 +#ifdef HAS_SOBELXROW_SSE2
  1.5506 +// SobelX as a matrix is
  1.5507 +// -1  0  1
  1.5508 +// -2  0  2
  1.5509 +// -1  0  1
  1.5510 +__declspec(naked) __declspec(align(16))
  1.5511 +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
  1.5512 +                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  1.5513 +  __asm {
  1.5514 +    push       esi
  1.5515 +    push       edi
  1.5516 +    mov        eax, [esp + 8 + 4]   // src_y0
  1.5517 +    mov        esi, [esp + 8 + 8]   // src_y1
  1.5518 +    mov        edi, [esp + 8 + 12]  // src_y2
  1.5519 +    mov        edx, [esp + 8 + 16]  // dst_sobelx
  1.5520 +    mov        ecx, [esp + 8 + 20]  // width
  1.5521 +    sub        esi, eax
  1.5522 +    sub        edi, eax
  1.5523 +    sub        edx, eax
  1.5524 +    pxor       xmm5, xmm5  // constant 0
  1.5525 +
  1.5526 +    align      4
  1.5527 + convertloop:
  1.5528 +    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
  1.5529 +    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
  1.5530 +    punpcklbw  xmm0, xmm5
  1.5531 +    punpcklbw  xmm1, xmm5
  1.5532 +    psubw      xmm0, xmm1
  1.5533 +    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
  1.5534 +    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
  1.5535 +    punpcklbw  xmm1, xmm5
  1.5536 +    punpcklbw  xmm2, xmm5
  1.5537 +    psubw      xmm1, xmm2
  1.5538 +    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
  1.5539 +    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
  1.5540 +    punpcklbw  xmm2, xmm5
  1.5541 +    punpcklbw  xmm3, xmm5
  1.5542 +    psubw      xmm2, xmm3
  1.5543 +    paddw      xmm0, xmm2
  1.5544 +    paddw      xmm0, xmm1
  1.5545 +    paddw      xmm0, xmm1
  1.5546 +    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
  1.5547 +    psubw      xmm1, xmm0
  1.5548 +    pmaxsw     xmm0, xmm1
  1.5549 +    packuswb   xmm0, xmm0
  1.5550 +    sub        ecx, 8
  1.5551 +    movq       qword ptr [eax + edx], xmm0
  1.5552 +    lea        eax, [eax + 8]
  1.5553 +    jg         convertloop
  1.5554 +
  1.5555 +    pop        edi
  1.5556 +    pop        esi
  1.5557 +    ret
  1.5558 +  }
  1.5559 +}
  1.5560 +#endif  // HAS_SOBELXROW_SSE2
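// A scalar sketch of one SobelX output (illustrative only), applying the
// matrix above across three rows; the absolute value matches the
// pxor/psubw/pmaxsw sequence and packuswb saturates to 255:
static uint8 SobelX_Sketch(const uint8* y0, const uint8* y1, const uint8* y2) {
  int s = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
  if (s < 0) s = -s;
  return (uint8)(s > 255 ? 255 : s);
}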
  1.5561 +
  1.5562 +#ifdef HAS_SOBELYROW_SSE2
  1.5563 +// SobelY as a matrix is
  1.5564 +// -1 -2 -1
  1.5565 +//  0  0  0
  1.5566 +//  1  2  1
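         +// Scalar sketch (reference only; same abs/clamp as SobelX):
         +//   int a = src_y0[i] - src_y1[i];
         +//   int b = src_y0[i + 1] - src_y1[i + 1];
         +//   int c = src_y0[i + 2] - src_y1[i + 2];
         +//   int sobel = abs(a + b * 2 + c);
         +//   dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);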
  1.5567 +__declspec(naked) __declspec(align(16))
  1.5568 +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
  1.5569 +                    uint8* dst_sobely, int width) {
  1.5570 +  __asm {
  1.5571 +    push       esi
  1.5572 +    mov        eax, [esp + 4 + 4]   // src_y0
  1.5573 +    mov        esi, [esp + 4 + 8]   // src_y1
  1.5574 +    mov        edx, [esp + 4 + 12]  // dst_sobely
  1.5575 +    mov        ecx, [esp + 4 + 16]  // width
  1.5576 +    sub        esi, eax
  1.5577 +    sub        edx, eax
  1.5578 +    pxor       xmm5, xmm5  // constant 0
  1.5579 +
  1.5580 +    align      4
  1.5581 + convertloop:
  1.5582 +    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
  1.5583 +    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
  1.5584 +    punpcklbw  xmm0, xmm5
  1.5585 +    punpcklbw  xmm1, xmm5
  1.5586 +    psubw      xmm0, xmm1
  1.5587 +    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
  1.5588 +    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
  1.5589 +    punpcklbw  xmm1, xmm5
  1.5590 +    punpcklbw  xmm2, xmm5
  1.5591 +    psubw      xmm1, xmm2
  1.5592 +    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
  1.5593 +    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
  1.5594 +    punpcklbw  xmm2, xmm5
  1.5595 +    punpcklbw  xmm3, xmm5
  1.5596 +    psubw      xmm2, xmm3
  1.5597 +    paddw      xmm0, xmm2
  1.5598 +    paddw      xmm0, xmm1
  1.5599 +    paddw      xmm0, xmm1
  1.5600 +    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
  1.5601 +    psubw      xmm1, xmm0
  1.5602 +    pmaxsw     xmm0, xmm1
  1.5603 +    packuswb   xmm0, xmm0
  1.5604 +    sub        ecx, 8
  1.5605 +    movq       qword ptr [eax + edx], xmm0
  1.5606 +    lea        eax, [eax + 8]
  1.5607 +    jg         convertloop
  1.5608 +
  1.5609 +    pop        esi
  1.5610 +    ret
  1.5611 +  }
  1.5612 +}
  1.5613 +#endif  // HAS_SOBELYROW_SSE2
  1.5614 +
  1.5615 +#ifdef HAS_SOBELROW_SSE2
  1.5616 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  1.5617 +// A = 255
  1.5618 +// R = Sobel
  1.5619 +// G = Sobel
  1.5620 +// B = Sobel
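         +// Scalar sketch (reference only; paddusb saturates the sum at 255):
         +//   int s = src_sobelx[i] + src_sobely[i];  // clamped to 255
         +//   dst_argb[i * 4 + 0] = (uint8)s;  // B
         +//   dst_argb[i * 4 + 1] = (uint8)s;  // G
         +//   dst_argb[i * 4 + 2] = (uint8)s;  // R
         +//   dst_argb[i * 4 + 3] = 255;       // A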
  1.5621 +__declspec(naked) __declspec(align(16))
  1.5622 +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  1.5623 +                   uint8* dst_argb, int width) {
  1.5624 +  __asm {
  1.5625 +    push       esi
  1.5626 +    mov        eax, [esp + 4 + 4]   // src_sobelx
  1.5627 +    mov        esi, [esp + 4 + 8]   // src_sobely
  1.5628 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5629 +    mov        ecx, [esp + 4 + 16]  // width
  1.5630 +    sub        esi, eax
  1.5631 +    pcmpeqb    xmm5, xmm5           // alpha 255
  1.5632 +    pslld      xmm5, 24             // 0xff000000
  1.5633 +
  1.5634 +    align      4
  1.5635 + convertloop:
  1.5636 +    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  1.5637 +    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  1.5638 +    lea        eax, [eax + 16]
  1.5639 +    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
  1.5640 +    movdqa     xmm2, xmm0             // GG
  1.5641 +    punpcklbw  xmm2, xmm0             // First 8
  1.5642 +    punpckhbw  xmm0, xmm0             // Next 8
  1.5643 +    movdqa     xmm1, xmm2             // GGGG
  1.5644 +    punpcklwd  xmm1, xmm2             // First 4
  1.5645 +    punpckhwd  xmm2, xmm2             // Next 4
  1.5646 +    por        xmm1, xmm5             // GGGA
  1.5647 +    por        xmm2, xmm5
  1.5648 +    movdqa     xmm3, xmm0             // GGGG
  1.5649 +    punpcklwd  xmm3, xmm0             // Next 4
  1.5650 +    punpckhwd  xmm0, xmm0             // Last 4
  1.5651 +    por        xmm3, xmm5             // GGGA
  1.5652 +    por        xmm0, xmm5
  1.5653 +    sub        ecx, 16
  1.5654 +    movdqa     [edx], xmm1
  1.5655 +    movdqa     [edx + 16], xmm2
  1.5656 +    movdqa     [edx + 32], xmm3
  1.5657 +    movdqa     [edx + 48], xmm0
  1.5658 +    lea        edx, [edx + 64]
  1.5659 +    jg         convertloop
  1.5660 +
  1.5661 +    pop        esi
  1.5662 +    ret
  1.5663 +  }
  1.5664 +}
  1.5665 +#endif  // HAS_SOBELROW_SSE2
  1.5666 +
  1.5667 +#ifdef HAS_SOBELTOPLANEROW_SSE2
  1.5668 +// Adds Sobel X and Sobel Y and stores Sobel into a plane.
  1.5669 +__declspec(naked) __declspec(align(16))
  1.5670 +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  1.5671 +                          uint8* dst_y, int width) {
  1.5672 +  __asm {
  1.5673 +    push       esi
  1.5674 +    mov        eax, [esp + 4 + 4]   // src_sobelx
  1.5675 +    mov        esi, [esp + 4 + 8]   // src_sobely
  1.5676 +    mov        edx, [esp + 4 + 12]  // dst_y
  1.5677 +    mov        ecx, [esp + 4 + 16]  // width
  1.5678 +    sub        esi, eax
  1.5679 +
  1.5680 +    align      4
  1.5681 + convertloop:
  1.5682 +    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  1.5683 +    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  1.5684 +    lea        eax, [eax + 16]
  1.5685 +    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
  1.5686 +    sub        ecx, 16
  1.5687 +    movdqa     [edx], xmm0
  1.5688 +    lea        edx, [edx + 16]
  1.5689 +    jg         convertloop
  1.5690 +
  1.5691 +    pop        esi
  1.5692 +    ret
  1.5693 +  }
  1.5694 +}
  1.5695 +#endif  // HAS_SOBELTOPLANEROW_SSE2
  1.5696 +
  1.5697 +#ifdef HAS_SOBELXYROW_SSE2
  1.5698 +// Mixes Sobel X, Sobel Y and Sobel into ARGB.
  1.5699 +// A = 255
  1.5700 +// R = Sobel X
  1.5701 +// G = Sobel
  1.5702 +// B = Sobel Y
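         +// Scalar sketch (reference only; paddusb saturates G at 255):
         +//   dst_argb[i * 4 + 0] = src_sobely[i];                  // B
         +//   dst_argb[i * 4 + 1] = src_sobelx[i] + src_sobely[i];  // G (clamped)
         +//   dst_argb[i * 4 + 2] = src_sobelx[i];                  // R
         +//   dst_argb[i * 4 + 3] = 255;                            // A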
  1.5703 +__declspec(naked) __declspec(align(16))
  1.5704 +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  1.5705 +                     uint8* dst_argb, int width) {
  1.5706 +  __asm {
  1.5707 +    push       esi
  1.5708 +    mov        eax, [esp + 4 + 4]   // src_sobelx
  1.5709 +    mov        esi, [esp + 4 + 8]   // src_sobely
  1.5710 +    mov        edx, [esp + 4 + 12]  // dst_argb
  1.5711 +    mov        ecx, [esp + 4 + 16]  // width
  1.5712 +    sub        esi, eax
  1.5713 +    pcmpeqb    xmm5, xmm5           // alpha 255
  1.5714 +
  1.5715 +    align      4
  1.5716 + convertloop:
  1.5717 +    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  1.5718 +    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  1.5719 +    lea        eax, [eax + 16]
  1.5720 +    movdqa     xmm2, xmm0
  1.5721 +    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
  1.5722 +    movdqa     xmm3, xmm0             // XA
  1.5723 +    punpcklbw  xmm3, xmm5
  1.5724 +    punpckhbw  xmm0, xmm5
  1.5725 +    movdqa     xmm4, xmm1             // YS
  1.5726 +    punpcklbw  xmm4, xmm2
  1.5727 +    punpckhbw  xmm1, xmm2
  1.5728 +    movdqa     xmm6, xmm4             // YSXA
  1.5729 +    punpcklwd  xmm6, xmm3             // First 4
  1.5730 +    punpckhwd  xmm4, xmm3             // Next 4
  1.5731 +    movdqa     xmm7, xmm1             // YSXA
  1.5732 +    punpcklwd  xmm7, xmm0             // Next 4
  1.5733 +    punpckhwd  xmm1, xmm0             // Last 4
  1.5734 +    sub        ecx, 16
  1.5735 +    movdqa     [edx], xmm6
  1.5736 +    movdqa     [edx + 16], xmm4
  1.5737 +    movdqa     [edx + 32], xmm7
  1.5738 +    movdqa     [edx + 48], xmm1
  1.5739 +    lea        edx, [edx + 64]
  1.5740 +    jg         convertloop
  1.5741 +
  1.5742 +    pop        esi
  1.5743 +    ret
  1.5744 +  }
  1.5745 +}
  1.5746 +#endif  // HAS_SOBELXYROW_SSE2
  1.5747 +
  1.5748 +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  1.5749 +// Consider float CumulativeSum.
  1.5750 +// Consider calling CumulativeSum one row at time as needed.
  1.5751 +// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
  1.5752 +// Convert cumulative sum for an area to an average for 1 pixel.
  1.5753 +// topleft is pointer to top left of CumulativeSum buffer for area.
  1.5754 +// botleft is pointer to bottom left of CumulativeSum buffer.
  1.5755 +// width is the offset, measured in number of ints, from the left edge to the
  1.5756 +//   right edge of the area in the CumulativeSum buffer.
  1.5757 +// area is the number of pixels in the area being averaged.
  1.5758 +// dst points to pixel to store result to.
  1.5759 +// count is number of averaged pixels to produce.
  1.5760 +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
  1.5761 +// aligned.
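         +// Scalar sketch of the box average, per int channel i (the small-block
         +// path below computes the same thing in 0.16 fixed point):
         +//   int32 sum = topleft[i] - topleft[i + width]
         +//             - botleft[i] + botleft[i + width];
         +//   dst[i] = (uint8)(sum * (1.0f / area));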
  1.5762 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  1.5763 +                                    int width, int area, uint8* dst,
  1.5764 +                                    int count) {
  1.5765 +  __asm {
  1.5766 +    mov        eax, topleft  // eax topleft
  1.5767 +    mov        esi, botleft  // esi botleft
  1.5768 +    mov        edx, width
  1.5769 +    movd       xmm5, area
  1.5770 +    mov        edi, dst
  1.5771 +    mov        ecx, count
  1.5772 +    cvtdq2ps   xmm5, xmm5
  1.5773 +    rcpss      xmm4, xmm5  // 1.0f / area
  1.5774 +    pshufd     xmm4, xmm4, 0
  1.5775 +    sub        ecx, 4
  1.5776 +    jl         l4b
  1.5777 +
  1.5778 +    cmp        area, 128  // 128 pixels will not overflow 15 bits.
  1.5779 +    ja         l4
  1.5780 +
  1.5781 +    pshufd     xmm5, xmm5, 0        // area
  1.5782 +    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
  1.5783 +    psrld      xmm6, 16
  1.5784 +    cvtdq2ps   xmm6, xmm6
  1.5785 +    addps      xmm5, xmm6           // (65536.0 + area - 1)
  1.5786 +    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
  1.5787 +    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
  1.5788 +    packssdw   xmm5, xmm5           // 16 bit shorts
  1.5789 +
  1.5790 +    // 4 pixel loop small blocks.
  1.5791 +    align      4
  1.5792 +  s4:
  1.5793 +    // top left
  1.5794 +    movdqa     xmm0, [eax]
  1.5795 +    movdqa     xmm1, [eax + 16]
  1.5796 +    movdqa     xmm2, [eax + 32]
  1.5797 +    movdqa     xmm3, [eax + 48]
  1.5798 +
  1.5799 +    // - top right
  1.5800 +    psubd      xmm0, [eax + edx * 4]
  1.5801 +    psubd      xmm1, [eax + edx * 4 + 16]
  1.5802 +    psubd      xmm2, [eax + edx * 4 + 32]
  1.5803 +    psubd      xmm3, [eax + edx * 4 + 48]
  1.5804 +    lea        eax, [eax + 64]
  1.5805 +
  1.5806 +    // - bottom left
  1.5807 +    psubd      xmm0, [esi]
  1.5808 +    psubd      xmm1, [esi + 16]
  1.5809 +    psubd      xmm2, [esi + 32]
  1.5810 +    psubd      xmm3, [esi + 48]
  1.5811 +
  1.5812 +    // + bottom right
  1.5813 +    paddd      xmm0, [esi + edx * 4]
  1.5814 +    paddd      xmm1, [esi + edx * 4 + 16]
  1.5815 +    paddd      xmm2, [esi + edx * 4 + 32]
  1.5816 +    paddd      xmm3, [esi + edx * 4 + 48]
  1.5817 +    lea        esi, [esi + 64]
  1.5818 +
  1.5819 +    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
  1.5820 +    packssdw   xmm2, xmm3
  1.5821 +
  1.5822 +    pmulhuw    xmm0, xmm5
  1.5823 +    pmulhuw    xmm2, xmm5
  1.5824 +
  1.5825 +    packuswb   xmm0, xmm2
  1.5826 +    movdqu     [edi], xmm0
  1.5827 +    lea        edi, [edi + 16]
  1.5828 +    sub        ecx, 4
  1.5829 +    jge        s4
  1.5830 +
  1.5831 +    jmp        l4b
  1.5832 +
  1.5833 +    // 4 pixel loop
  1.5834 +    align      4
  1.5835 +  l4:
  1.5836 +    // top left
  1.5837 +    movdqa     xmm0, [eax]
  1.5838 +    movdqa     xmm1, [eax + 16]
  1.5839 +    movdqa     xmm2, [eax + 32]
  1.5840 +    movdqa     xmm3, [eax + 48]
  1.5841 +
  1.5842 +    // - top right
  1.5843 +    psubd      xmm0, [eax + edx * 4]
  1.5844 +    psubd      xmm1, [eax + edx * 4 + 16]
  1.5845 +    psubd      xmm2, [eax + edx * 4 + 32]
  1.5846 +    psubd      xmm3, [eax + edx * 4 + 48]
  1.5847 +    lea        eax, [eax + 64]
  1.5848 +
  1.5849 +    // - bottom left
  1.5850 +    psubd      xmm0, [esi]
  1.5851 +    psubd      xmm1, [esi + 16]
  1.5852 +    psubd      xmm2, [esi + 32]
  1.5853 +    psubd      xmm3, [esi + 48]
  1.5854 +
  1.5855 +    // + bottom right
  1.5856 +    paddd      xmm0, [esi + edx * 4]
  1.5857 +    paddd      xmm1, [esi + edx * 4 + 16]
  1.5858 +    paddd      xmm2, [esi + edx * 4 + 32]
  1.5859 +    paddd      xmm3, [esi + edx * 4 + 48]
  1.5860 +    lea        esi, [esi + 64]
  1.5861 +
  1.5862 +    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
  1.5863 +    cvtdq2ps   xmm1, xmm1
  1.5864 +    mulps      xmm0, xmm4
  1.5865 +    mulps      xmm1, xmm4
  1.5866 +    cvtdq2ps   xmm2, xmm2
  1.5867 +    cvtdq2ps   xmm3, xmm3
  1.5868 +    mulps      xmm2, xmm4
  1.5869 +    mulps      xmm3, xmm4
  1.5870 +    cvtps2dq   xmm0, xmm0
  1.5871 +    cvtps2dq   xmm1, xmm1
  1.5872 +    cvtps2dq   xmm2, xmm2
  1.5873 +    cvtps2dq   xmm3, xmm3
  1.5874 +    packssdw   xmm0, xmm1
  1.5875 +    packssdw   xmm2, xmm3
  1.5876 +    packuswb   xmm0, xmm2
  1.5877 +    movdqu     [edi], xmm0
  1.5878 +    lea        edi, [edi + 16]
  1.5879 +    sub        ecx, 4
  1.5880 +    jge        l4
  1.5881 +
  1.5882 +  l4b:
  1.5883 +    add        ecx, 4 - 1
  1.5884 +    jl         l1b
  1.5885 +
  1.5886 +    // 1 pixel loop
  1.5887 +    align      4
  1.5888 +  l1:
  1.5889 +    movdqa     xmm0, [eax]
  1.5890 +    psubd      xmm0, [eax + edx * 4]
  1.5891 +    lea        eax, [eax + 16]
  1.5892 +    psubd      xmm0, [esi]
  1.5893 +    paddd      xmm0, [esi + edx * 4]
  1.5894 +    lea        esi, [esi + 16]
  1.5895 +    cvtdq2ps   xmm0, xmm0
  1.5896 +    mulps      xmm0, xmm4
  1.5897 +    cvtps2dq   xmm0, xmm0
  1.5898 +    packssdw   xmm0, xmm0
  1.5899 +    packuswb   xmm0, xmm0
  1.5900 +    movd       dword ptr [edi], xmm0
  1.5901 +    lea        edi, [edi + 4]
  1.5902 +    sub        ecx, 1
  1.5903 +    jge        l1
  1.5904 +  l1b:
  1.5905 +  }
  1.5906 +}
  1.5907 +#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  1.5908 +
  1.5909 +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  1.5910 +// Creates a table of cumulative sums where each value is a sum of all values
  1.5911 +// above and to the left of the value.
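         +// Scalar sketch (row and cumsum hold ARGB, so 4 ints per pixel; sum[] is
         +// the running total across this row):
         +//   sum[c] += row[x * 4 + c];
         +//   cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];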
  1.5912 +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
  1.5913 +                                  const int32* previous_cumsum, int width) {
  1.5914 +  __asm {
  1.5915 +    mov        eax, row
  1.5916 +    mov        edx, cumsum
  1.5917 +    mov        esi, previous_cumsum
  1.5918 +    mov        ecx, width
  1.5919 +    pxor       xmm0, xmm0
  1.5920 +    pxor       xmm1, xmm1
  1.5921 +
  1.5922 +    sub        ecx, 4
  1.5923 +    jl         l4b
  1.5924 +    test       edx, 15
  1.5925 +    jne        l4b
  1.5926 +
  1.5927 +    // 4 pixel loop
  1.5928 +    align      4
  1.5929 +  l4:
  1.5930 +    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
  1.5931 +    lea        eax, [eax + 16]
  1.5932 +    movdqa     xmm4, xmm2
  1.5933 +
  1.5934 +    punpcklbw  xmm2, xmm1
  1.5935 +    movdqa     xmm3, xmm2
  1.5936 +    punpcklwd  xmm2, xmm1
  1.5937 +    punpckhwd  xmm3, xmm1
  1.5938 +
  1.5939 +    punpckhbw  xmm4, xmm1
  1.5940 +    movdqa     xmm5, xmm4
  1.5941 +    punpcklwd  xmm4, xmm1
  1.5942 +    punpckhwd  xmm5, xmm1
  1.5943 +
  1.5944 +    paddd      xmm0, xmm2
  1.5945 +    movdqa     xmm2, [esi]  // previous row above.
  1.5946 +    paddd      xmm2, xmm0
  1.5947 +
  1.5948 +    paddd      xmm0, xmm3
  1.5949 +    movdqa     xmm3, [esi + 16]
  1.5950 +    paddd      xmm3, xmm0
  1.5951 +
  1.5952 +    paddd      xmm0, xmm4
  1.5953 +    movdqa     xmm4, [esi + 32]
  1.5954 +    paddd      xmm4, xmm0
  1.5955 +
  1.5956 +    paddd      xmm0, xmm5
  1.5957 +    movdqa     xmm5, [esi + 48]
  1.5958 +    lea        esi, [esi + 64]
  1.5959 +    paddd      xmm5, xmm0
  1.5960 +
  1.5961 +    movdqa     [edx], xmm2
  1.5962 +    movdqa     [edx + 16], xmm3
  1.5963 +    movdqa     [edx + 32], xmm4
  1.5964 +    movdqa     [edx + 48], xmm5
  1.5965 +
  1.5966 +    lea        edx, [edx + 64]
  1.5967 +    sub        ecx, 4
  1.5968 +    jge        l4
  1.5969 +
  1.5970 +  l4b:
  1.5971 +    add        ecx, 4 - 1
  1.5972 +    jl         l1b
  1.5973 +
  1.5974 +    // 1 pixel loop
  1.5975 +    align      4
  1.5976 +  l1:
  1.5977 +    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
  1.5978 +    lea        eax, [eax + 4]
  1.5979 +    punpcklbw  xmm2, xmm1
  1.5980 +    punpcklwd  xmm2, xmm1
  1.5981 +    paddd      xmm0, xmm2
  1.5982 +    movdqu     xmm2, [esi]
  1.5983 +    lea        esi, [esi + 16]
  1.5984 +    paddd      xmm2, xmm0
  1.5985 +    movdqu     [edx], xmm2
  1.5986 +    lea        edx, [edx + 16]
  1.5987 +    sub        ecx, 1
  1.5988 +    jge        l1
  1.5989 +
  1.5990 + l1b:
  1.5991 +  }
  1.5992 +}
  1.5993 +#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
  1.5994 +
  1.5995 +#ifdef HAS_ARGBAFFINEROW_SSE2
  1.5996 +// Copy ARGB pixels from source image with slope to a row of destination.
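         +// Scalar sketch (uv_dudv holds {u, v, du, dv} as floats):
         +//   float u = uv_dudv[0], v = uv_dudv[1];
         +//   for (int i = 0; i < width; ++i) {
         +//     int x = (int)u, y = (int)v;
         +//     *(uint32*)(dst_argb + i * 4) =
         +//         *(const uint32*)(src_argb + x * 4 + y * src_argb_stride);
         +//     u += uv_dudv[2];
         +//     v += uv_dudv[3];
         +//   }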
  1.5997 +__declspec(naked) __declspec(align(16))
  1.5998 +LIBYUV_API
  1.5999 +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
  1.6000 +                        uint8* dst_argb, const float* uv_dudv, int width) {
  1.6001 +  __asm {
  1.6002 +    push       esi
  1.6003 +    push       edi
  1.6004 +    mov        eax, [esp + 12]  // src_argb
  1.6005 +    mov        esi, [esp + 16]  // stride
  1.6006 +    mov        edx, [esp + 20]  // dst_argb
  1.6007 +    mov        ecx, [esp + 24]  // pointer to uv_dudv
  1.6008 +    movq       xmm2, qword ptr [ecx]  // uv
  1.6009 +    movq       xmm7, qword ptr [ecx + 8]  // dudv
  1.6010 +    mov        ecx, [esp + 28]  // width
  1.6011 +    shl        esi, 16          // 4, stride
  1.6012 +    add        esi, 4
  1.6013 +    movd       xmm5, esi
  1.6014 +    sub        ecx, 4
  1.6015 +    jl         l4b
  1.6016 +
  1.6017 +    // setup for 4 pixel loop
  1.6018 +    pshufd     xmm7, xmm7, 0x44  // dup dudv
  1.6019 +    pshufd     xmm5, xmm5, 0  // dup 4, stride
  1.6020 +    movdqa     xmm0, xmm2    // x0, y0, x1, y1
  1.6021 +    addps      xmm0, xmm7
  1.6022 +    movlhps    xmm2, xmm0
  1.6023 +    movdqa     xmm4, xmm7
  1.6024 +    addps      xmm4, xmm4    // dudv *= 2
  1.6025 +    movdqa     xmm3, xmm2    // x2, y2, x3, y3
  1.6026 +    addps      xmm3, xmm4
  1.6027 +    addps      xmm4, xmm4    // dudv *= 4
  1.6028 +
  1.6029 +    // 4 pixel loop
  1.6030 +    align      4
  1.6031 +  l4:
  1.6032 +    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
  1.6033 +    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
  1.6034 +    packssdw   xmm0, xmm1    // x, y as 8 shorts
  1.6035 +    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
  1.6036 +    movd       esi, xmm0
  1.6037 +    pshufd     xmm0, xmm0, 0x39  // shift right
  1.6038 +    movd       edi, xmm0
  1.6039 +    pshufd     xmm0, xmm0, 0x39  // shift right
  1.6040 +    movd       xmm1, [eax + esi]  // read pixel 0
  1.6041 +    movd       xmm6, [eax + edi]  // read pixel 1
  1.6042 +    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
  1.6043 +    addps      xmm2, xmm4    // x, y += dx, dy first 2
  1.6044 +    movq       qword ptr [edx], xmm1
  1.6045 +    movd       esi, xmm0
  1.6046 +    pshufd     xmm0, xmm0, 0x39  // shift right
  1.6047 +    movd       edi, xmm0
  1.6048 +    movd       xmm6, [eax + esi]  // read pixel 2
  1.6049 +    movd       xmm0, [eax + edi]  // read pixel 3
  1.6050 +    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
  1.6051 +    addps      xmm3, xmm4    // x, y += dx, dy next 2
  1.6052 +    sub        ecx, 4
  1.6053 +    movq       qword ptr [edx + 8], xmm6
  1.6054 +    lea        edx, [edx + 16]
  1.6055 +    jge        l4
  1.6056 +
  1.6057 +  l4b:
  1.6058 +    add        ecx, 4 - 1
  1.6059 +    jl         l1b
  1.6060 +
  1.6061 +    // 1 pixel loop
  1.6062 +    align      4
  1.6063 +  l1:
  1.6064 +    cvttps2dq  xmm0, xmm2    // x, y float to int
  1.6065 +    packssdw   xmm0, xmm0    // x, y as shorts
  1.6066 +    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
  1.6067 +    addps      xmm2, xmm7    // x, y += dx, dy
  1.6068 +    movd       esi, xmm0
  1.6069 +    movd       xmm0, [eax + esi]  // copy a pixel
  1.6070 +    sub        ecx, 1
  1.6071 +    movd       [edx], xmm0
  1.6072 +    lea        edx, [edx + 4]
  1.6073 +    jge        l1
  1.6074 +  l1b:
  1.6075 +    pop        edi
  1.6076 +    pop        esi
  1.6077 +    ret
  1.6078 +  }
  1.6079 +}
  1.6080 +#endif  // HAS_ARGBAFFINEROW_SSE2
  1.6081 +
  1.6082 +#ifdef HAS_INTERPOLATEROW_AVX2
  1.6083 +// Bilinear filter 16x2 -> 16x1
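         +// Scalar sketch of the general blend (the 0%, 25%, 50%, 75% and 100% cases
         +// below are dispatched to faster pavgb/copy specializations):
         +//   int f = source_y_fraction >> 1;  // 0..127
         +//   dst_ptr[i] = (src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7;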
  1.6084 +__declspec(naked) __declspec(align(16))
  1.6085 +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
  1.6086 +                         ptrdiff_t src_stride, int dst_width,
  1.6087 +                         int source_y_fraction) {
  1.6088 +  __asm {
  1.6089 +    push       esi
  1.6090 +    push       edi
  1.6091 +    mov        edi, [esp + 8 + 4]   // dst_ptr
  1.6092 +    mov        esi, [esp + 8 + 8]   // src_ptr
  1.6093 +    mov        edx, [esp + 8 + 12]  // src_stride
  1.6094 +    mov        ecx, [esp + 8 + 16]  // dst_width
  1.6095 +    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  1.6096 +    shr        eax, 1
  1.6097 +    // Dispatch to specialized filters if applicable.
  1.6098 +    cmp        eax, 0
  1.6099 +    je         xloop100  // 0 / 128.  Blend 100 / 0.
  1.6100 +    sub        edi, esi
  1.6101 +    cmp        eax, 32
  1.6102 +    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  1.6103 +    cmp        eax, 64
  1.6104 +    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  1.6105 +    cmp        eax, 96
  1.6106 +    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  1.6107 +
  1.6108 +    vmovd      xmm0, eax  // high fraction 0..127
  1.6109 +    neg        eax
  1.6110 +    add        eax, 128
  1.6111 +    vmovd      xmm5, eax  // low fraction 128..1
  1.6112 +    vpunpcklbw xmm5, xmm5, xmm0
  1.6113 +    vpunpcklwd xmm5, xmm5, xmm5
  1.6114 +    vpxor      ymm0, ymm0, ymm0
  1.6115 +    vpermd     ymm5, ymm0, ymm5
  1.6116 +
  1.6117 +    align      4
  1.6118 +  xloop:
  1.6119 +    vmovdqu    ymm0, [esi]
  1.6120 +    vmovdqu    ymm2, [esi + edx]
  1.6121 +    vpunpckhbw ymm1, ymm0, ymm2  // mutates
  1.6122 +    vpunpcklbw ymm0, ymm0, ymm2  // mutates
  1.6123 +    vpmaddubsw ymm0, ymm0, ymm5
  1.6124 +    vpmaddubsw ymm1, ymm1, ymm5
  1.6125 +    vpsrlw     ymm0, ymm0, 7
  1.6126 +    vpsrlw     ymm1, ymm1, 7
  1.6127 +    vpackuswb  ymm0, ymm0, ymm1  // unmutates
  1.6128 +    sub        ecx, 32
  1.6129 +    vmovdqu    [esi + edi], ymm0
  1.6130 +    lea        esi, [esi + 32]
  1.6131 +    jg         xloop
  1.6132 +    jmp        xloop99
  1.6133 +
  1.6134 +    // Blend 25 / 75.
  1.6135 +    align      4
  1.6136 +  xloop25:
  1.6137 +    vmovdqu    ymm0, [esi]
  1.6138 +    vpavgb     ymm0, ymm0, [esi + edx]
  1.6139 +    vpavgb     ymm0, ymm0, [esi + edx]
  1.6140 +    sub        ecx, 32
  1.6141 +    vmovdqu    [esi + edi], ymm0
  1.6142 +    lea        esi, [esi + 32]
  1.6143 +    jg         xloop25
  1.6144 +    jmp        xloop99
  1.6145 +
  1.6146 +    // Blend 50 / 50.
  1.6147 +    align      4
  1.6148 +  xloop50:
  1.6149 +    vmovdqu    ymm0, [esi]
  1.6150 +    vpavgb     ymm0, ymm0, [esi + edx]
  1.6151 +    sub        ecx, 32
  1.6152 +    vmovdqu    [esi + edi], ymm0
  1.6153 +    lea        esi, [esi + 32]
  1.6154 +    jg         xloop50
  1.6155 +    jmp        xloop99
  1.6156 +
  1.6157 +    // Blend 75 / 25.
  1.6158 +    align      4
  1.6159 +  xloop75:
  1.6160 +    vmovdqu    ymm0, [esi + edx]
  1.6161 +    vpavgb     ymm0, ymm0, [esi]
  1.6162 +    vpavgb     ymm0, ymm0, [esi]
  1.6163 +    sub        ecx, 32
  1.6164 +    vmovdqu     [esi + edi], ymm0
  1.6165 +    lea        esi, [esi + 32]
  1.6166 +    jg         xloop75
  1.6167 +    jmp        xloop99
  1.6168 +
  1.6169 +    // Blend 100 / 0 - Copy row unchanged.
  1.6170 +    align      4
  1.6171 +  xloop100:
  1.6172 +    rep movsb
  1.6173 +
  1.6174 +  xloop99:
  1.6175 +    pop        edi
  1.6176 +    pop        esi
  1.6177 +    vzeroupper
  1.6178 +    ret
  1.6179 +  }
  1.6180 +}
  1.6181 +#endif  // HAS_INTERPOLATEROW_AVX2
  1.6182 +
  1.6183 +#ifdef HAS_INTERPOLATEROW_SSSE3
  1.6184 +// Bilinear filter 16x2 -> 16x1
  1.6185 +__declspec(naked) __declspec(align(16))
  1.6186 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  1.6187 +                          ptrdiff_t src_stride, int dst_width,
  1.6188 +                          int source_y_fraction) {
  1.6189 +  __asm {
  1.6190 +    push       esi
  1.6191 +    push       edi
  1.6192 +    mov        edi, [esp + 8 + 4]   // dst_ptr
  1.6193 +    mov        esi, [esp + 8 + 8]   // src_ptr
  1.6194 +    mov        edx, [esp + 8 + 12]  // src_stride
  1.6195 +    mov        ecx, [esp + 8 + 16]  // dst_width
  1.6196 +    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  1.6197 +    sub        edi, esi
  1.6198 +    shr        eax, 1
  1.6199 +    // Dispatch to specialized filters if applicable.
  1.6200 +    cmp        eax, 0
  1.6201 +    je         xloop100  // 0 / 128.  Blend 100 / 0.
  1.6202 +    cmp        eax, 32
  1.6203 +    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  1.6204 +    cmp        eax, 64
  1.6205 +    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  1.6206 +    cmp        eax, 96
  1.6207 +    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  1.6208 +
  1.6209 +    movd       xmm0, eax  // high fraction 0..127
  1.6210 +    neg        eax
  1.6211 +    add        eax, 128
  1.6212 +    movd       xmm5, eax  // low fraction 128..1
  1.6213 +    punpcklbw  xmm5, xmm0
  1.6214 +    punpcklwd  xmm5, xmm5
  1.6215 +    pshufd     xmm5, xmm5, 0
  1.6216 +
  1.6217 +    align      4
  1.6218 +  xloop:
  1.6219 +    movdqa     xmm0, [esi]
  1.6220 +    movdqa     xmm2, [esi + edx]
  1.6221 +    movdqa     xmm1, xmm0
  1.6222 +    punpcklbw  xmm0, xmm2
  1.6223 +    punpckhbw  xmm1, xmm2
  1.6224 +    pmaddubsw  xmm0, xmm5
  1.6225 +    pmaddubsw  xmm1, xmm5
  1.6226 +    psrlw      xmm0, 7
  1.6227 +    psrlw      xmm1, 7
  1.6228 +    packuswb   xmm0, xmm1
  1.6229 +    sub        ecx, 16
  1.6230 +    movdqa     [esi + edi], xmm0
  1.6231 +    lea        esi, [esi + 16]
  1.6232 +    jg         xloop
  1.6233 +    jmp        xloop99
  1.6234 +
  1.6235 +    // Blend 25 / 75.
  1.6236 +    align      4
  1.6237 +  xloop25:
  1.6238 +    movdqa     xmm0, [esi]
  1.6239 +    movdqa     xmm1, [esi + edx]
  1.6240 +    pavgb      xmm0, xmm1
  1.6241 +    pavgb      xmm0, xmm1
  1.6242 +    sub        ecx, 16
  1.6243 +    movdqa     [esi + edi], xmm0
  1.6244 +    lea        esi, [esi + 16]
  1.6245 +    jg         xloop25
  1.6246 +    jmp        xloop99
  1.6247 +
  1.6248 +    // Blend 50 / 50.
  1.6249 +    align      4
  1.6250 +  xloop50:
  1.6251 +    movdqa     xmm0, [esi]
  1.6252 +    movdqa     xmm1, [esi + edx]
  1.6253 +    pavgb      xmm0, xmm1
  1.6254 +    sub        ecx, 16
  1.6255 +    movdqa     [esi + edi], xmm0
  1.6256 +    lea        esi, [esi + 16]
  1.6257 +    jg         xloop50
  1.6258 +    jmp        xloop99
  1.6259 +
  1.6260 +    // Blend 75 / 25.
  1.6261 +    align      4
  1.6262 +  xloop75:
  1.6263 +    movdqa     xmm1, [esi]
  1.6264 +    movdqa     xmm0, [esi + edx]
  1.6265 +    pavgb      xmm0, xmm1
  1.6266 +    pavgb      xmm0, xmm1
  1.6267 +    sub        ecx, 16
  1.6268 +    movdqa     [esi + edi], xmm0
  1.6269 +    lea        esi, [esi + 16]
  1.6270 +    jg         xloop75
  1.6271 +    jmp        xloop99
  1.6272 +
  1.6273 +    // Blend 100 / 0 - Copy row unchanged.
  1.6274 +    align      4
  1.6275 +  xloop100:
  1.6276 +    movdqa     xmm0, [esi]
  1.6277 +    sub        ecx, 16
  1.6278 +    movdqa     [esi + edi], xmm0
  1.6279 +    lea        esi, [esi + 16]
  1.6280 +    jg         xloop100
  1.6281 +
  1.6282 +  xloop99:
  1.6283 +    pop        edi
  1.6284 +    pop        esi
  1.6285 +    ret
  1.6286 +  }
  1.6287 +}
  1.6288 +#endif  // HAS_INTERPOLATEROW_SSSE3
  1.6289 +
  1.6290 +#ifdef HAS_INTERPOLATEROW_SSE2
  1.6291 +// Bilinear filter 16x2 -> 16x1
  1.6292 +__declspec(naked) __declspec(align(16))
  1.6293 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  1.6294 +                         ptrdiff_t src_stride, int dst_width,
  1.6295 +                         int source_y_fraction) {
  1.6296 +  __asm {
  1.6297 +    push       esi
  1.6298 +    push       edi
  1.6299 +    mov        edi, [esp + 8 + 4]   // dst_ptr
  1.6300 +    mov        esi, [esp + 8 + 8]   // src_ptr
  1.6301 +    mov        edx, [esp + 8 + 12]  // src_stride
  1.6302 +    mov        ecx, [esp + 8 + 16]  // dst_width
  1.6303 +    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  1.6304 +    sub        edi, esi
  1.6305 +    // Dispatch to specialized filters if applicable.
  1.6306 +    cmp        eax, 0
  1.6307 +    je         xloop100  // 0 / 256.  Blend 100 / 0.
  1.6308 +    cmp        eax, 64
  1.6309 +    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
  1.6310 +    cmp        eax, 128
  1.6311 +    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
  1.6312 +    cmp        eax, 192
  1.6313 +    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
  1.6314 +
  1.6315 +    movd       xmm5, eax            // xmm5 = y fraction
  1.6316 +    punpcklbw  xmm5, xmm5
  1.6317 +    psrlw      xmm5, 1
  1.6318 +    punpcklwd  xmm5, xmm5
  1.6319 +    punpckldq  xmm5, xmm5
  1.6320 +    punpcklqdq xmm5, xmm5
  1.6321 +    pxor       xmm4, xmm4
  1.6322 +
  1.6323 +    align      4
  1.6324 +  xloop:
  1.6325 +    movdqa     xmm0, [esi]  // row0
  1.6326 +    movdqa     xmm2, [esi + edx]  // row1
  1.6327 +    movdqa     xmm1, xmm0
  1.6328 +    movdqa     xmm3, xmm2
  1.6329 +    punpcklbw  xmm2, xmm4
  1.6330 +    punpckhbw  xmm3, xmm4
  1.6331 +    punpcklbw  xmm0, xmm4
  1.6332 +    punpckhbw  xmm1, xmm4
  1.6333 +    psubw      xmm2, xmm0  // row1 - row0
  1.6334 +    psubw      xmm3, xmm1
  1.6335 +    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
  1.6336 +    paddw      xmm3, xmm3
  1.6337 +    pmulhw     xmm2, xmm5  // scale diff
  1.6338 +    pmulhw     xmm3, xmm5
  1.6339 +    paddw      xmm0, xmm2  // sum rows
  1.6340 +    paddw      xmm1, xmm3
  1.6341 +    packuswb   xmm0, xmm1
  1.6342 +    sub        ecx, 16
  1.6343 +    movdqa     [esi + edi], xmm0
  1.6344 +    lea        esi, [esi + 16]
  1.6345 +    jg         xloop
  1.6346 +    jmp        xloop99
  1.6347 +
  1.6348 +    // Blend 25 / 75.
  1.6349 +    align      4
  1.6350 +  xloop25:
  1.6351 +    movdqa     xmm0, [esi]
  1.6352 +    movdqa     xmm1, [esi + edx]
  1.6353 +    pavgb      xmm0, xmm1
  1.6354 +    pavgb      xmm0, xmm1
  1.6355 +    sub        ecx, 16
  1.6356 +    movdqa     [esi + edi], xmm0
  1.6357 +    lea        esi, [esi + 16]
  1.6358 +    jg         xloop25
  1.6359 +    jmp        xloop99
  1.6360 +
  1.6361 +    // Blend 50 / 50.
  1.6362 +    align      4
  1.6363 +  xloop50:
  1.6364 +    movdqa     xmm0, [esi]
  1.6365 +    movdqa     xmm1, [esi + edx]
  1.6366 +    pavgb      xmm0, xmm1
  1.6367 +    sub        ecx, 16
  1.6368 +    movdqa     [esi + edi], xmm0
  1.6369 +    lea        esi, [esi + 16]
  1.6370 +    jg         xloop50
  1.6371 +    jmp        xloop99
  1.6372 +
  1.6373 +    // Blend 75 / 25.
  1.6374 +    align      4
  1.6375 +  xloop75:
  1.6376 +    movdqa     xmm1, [esi]
  1.6377 +    movdqa     xmm0, [esi + edx]
  1.6378 +    pavgb      xmm0, xmm1
  1.6379 +    pavgb      xmm0, xmm1
  1.6380 +    sub        ecx, 16
  1.6381 +    movdqa     [esi + edi], xmm0
  1.6382 +    lea        esi, [esi + 16]
  1.6383 +    jg         xloop75
  1.6384 +    jmp        xloop99
  1.6385 +
  1.6386 +    // Blend 100 / 0 - Copy row unchanged.
  1.6387 +    align      4
  1.6388 +  xloop100:
  1.6389 +    movdqa     xmm0, [esi]
  1.6390 +    sub        ecx, 16
  1.6391 +    movdqa     [esi + edi], xmm0
  1.6392 +    lea        esi, [esi + 16]
  1.6393 +    jg         xloop100
  1.6394 +
  1.6395 +  xloop99:
  1.6396 +    pop        edi
  1.6397 +    pop        esi
  1.6398 +    ret
  1.6399 +  }
  1.6400 +}
  1.6401 +#endif  // HAS_INTERPOLATEROW_SSE2
  1.6402 +
  1.6403 +// Bilinear filter 16x2 -> 16x1
  1.6404 +__declspec(naked) __declspec(align(16))
  1.6405 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  1.6406 +                                    ptrdiff_t src_stride, int dst_width,
  1.6407 +                                    int source_y_fraction) {
  1.6408 +  __asm {
  1.6409 +    push       esi
  1.6410 +    push       edi
  1.6411 +    mov        edi, [esp + 8 + 4]   // dst_ptr
  1.6412 +    mov        esi, [esp + 8 + 8]   // src_ptr
  1.6413 +    mov        edx, [esp + 8 + 12]  // src_stride
  1.6414 +    mov        ecx, [esp + 8 + 16]  // dst_width
  1.6415 +    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  1.6416 +    sub        edi, esi
  1.6417 +    shr        eax, 1
  1.6418 +    // Dispatch to specialized filters if applicable.
  1.6419 +    cmp        eax, 0
  1.6420 +    je         xloop100  // 0 / 128.  Blend 100 / 0.
  1.6421 +    cmp        eax, 32
  1.6422 +    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  1.6423 +    cmp        eax, 64
  1.6424 +    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  1.6425 +    cmp        eax, 96
  1.6426 +    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  1.6427 +
  1.6428 +    movd       xmm0, eax  // high fraction 0..127
  1.6429 +    neg        eax
  1.6430 +    add        eax, 128
  1.6431 +    movd       xmm5, eax  // low fraction 128..1
  1.6432 +    punpcklbw  xmm5, xmm0
  1.6433 +    punpcklwd  xmm5, xmm5
  1.6434 +    pshufd     xmm5, xmm5, 0
  1.6435 +
  1.6436 +    align      4
  1.6437 +  xloop:
  1.6438 +    movdqu     xmm0, [esi]
  1.6439 +    movdqu     xmm2, [esi + edx]
  1.6440 +    movdqa     xmm1, xmm0
  1.6441 +    punpcklbw  xmm0, xmm2
  1.6442 +    punpckhbw  xmm1, xmm2
  1.6443 +    pmaddubsw  xmm0, xmm5
  1.6444 +    pmaddubsw  xmm1, xmm5
  1.6445 +    psrlw      xmm0, 7
  1.6446 +    psrlw      xmm1, 7
  1.6447 +    packuswb   xmm0, xmm1
  1.6448 +    sub        ecx, 16
  1.6449 +    movdqu     [esi + edi], xmm0
  1.6450 +    lea        esi, [esi + 16]
  1.6451 +    jg         xloop
  1.6452 +    jmp        xloop99
  1.6453 +
  1.6454 +    // Blend 25 / 75.
  1.6455 +    align      4
  1.6456 +  xloop25:
  1.6457 +    movdqu     xmm0, [esi]
  1.6458 +    movdqu     xmm1, [esi + edx]
  1.6459 +    pavgb      xmm0, xmm1
  1.6460 +    pavgb      xmm0, xmm1
  1.6461 +    sub        ecx, 16
  1.6462 +    movdqu     [esi + edi], xmm0
  1.6463 +    lea        esi, [esi + 16]
  1.6464 +    jg         xloop25
  1.6465 +    jmp        xloop99
  1.6466 +
  1.6467 +    // Blend 50 / 50.
  1.6468 +    align      4
  1.6469 +  xloop50:
  1.6470 +    movdqu     xmm0, [esi]
  1.6471 +    movdqu     xmm1, [esi + edx]
  1.6472 +    pavgb      xmm0, xmm1
  1.6473 +    sub        ecx, 16
  1.6474 +    movdqu     [esi + edi], xmm0
  1.6475 +    lea        esi, [esi + 16]
  1.6476 +    jg         xloop50
  1.6477 +    jmp        xloop99
  1.6478 +
  1.6479 +    // Blend 75 / 25.
  1.6480 +    align      4
  1.6481 +  xloop75:
  1.6482 +    movdqu     xmm1, [esi]
  1.6483 +    movdqu     xmm0, [esi + edx]
  1.6484 +    pavgb      xmm0, xmm1
  1.6485 +    pavgb      xmm0, xmm1
  1.6486 +    sub        ecx, 16
  1.6487 +    movdqu     [esi + edi], xmm0
  1.6488 +    lea        esi, [esi + 16]
  1.6489 +    jg         xloop75
  1.6490 +    jmp        xloop99
  1.6491 +
  1.6492 +    // Blend 100 / 0 - Copy row unchanged.
  1.6493 +    align      4
  1.6494 +  xloop100:
  1.6495 +    movdqu     xmm0, [esi]
  1.6496 +    sub        ecx, 16
  1.6497 +    movdqu     [esi + edi], xmm0
  1.6498 +    lea        esi, [esi + 16]
  1.6499 +    jg         xloop100
  1.6500 +
  1.6501 +  xloop99:
  1.6502 +    pop        edi
  1.6503 +    pop        esi
  1.6504 +    ret
  1.6505 +  }
  1.6506 +}
  1.6507 +
  1.6508 +#ifdef HAS_INTERPOLATEROW_SSE2
  1.6509 +// Bilinear filter 16x2 -> 16x1
  1.6510 +__declspec(naked) __declspec(align(16))
  1.6511 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  1.6512 +                                   ptrdiff_t src_stride, int dst_width,
  1.6513 +                                   int source_y_fraction) {
  1.6514 +  __asm {
  1.6515 +    push       esi
  1.6516 +    push       edi
  1.6517 +    mov        edi, [esp + 8 + 4]   // dst_ptr
  1.6518 +    mov        esi, [esp + 8 + 8]   // src_ptr
  1.6519 +    mov        edx, [esp + 8 + 12]  // src_stride
  1.6520 +    mov        ecx, [esp + 8 + 16]  // dst_width
  1.6521 +    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  1.6522 +    sub        edi, esi
  1.6523 +    // Dispatch to specialized filters if applicable.
  1.6524 +    cmp        eax, 0
  1.6525 +    je         xloop100  // 0 / 256.  Blend 100 / 0.
  1.6526 +    cmp        eax, 64
  1.6527 +    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
  1.6528 +    cmp        eax, 128
  1.6529 +    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
  1.6530 +    cmp        eax, 192
  1.6531 +    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
  1.6532 +
  1.6533 +    movd       xmm5, eax            // xmm5 = y fraction
  1.6534 +    punpcklbw  xmm5, xmm5
  1.6535 +    psrlw      xmm5, 1
  1.6536 +    punpcklwd  xmm5, xmm5
  1.6537 +    punpckldq  xmm5, xmm5
  1.6538 +    punpcklqdq xmm5, xmm5
  1.6539 +    pxor       xmm4, xmm4
  1.6540 +
  1.6541 +    align      4
  1.6542 +  xloop:
  1.6543 +    movdqu     xmm0, [esi]  // row0
  1.6544 +    movdqu     xmm2, [esi + edx]  // row1
  1.6545 +    movdqa     xmm1, xmm0
  1.6546 +    movdqa     xmm3, xmm2
  1.6547 +    punpcklbw  xmm2, xmm4
  1.6548 +    punpckhbw  xmm3, xmm4
  1.6549 +    punpcklbw  xmm0, xmm4
  1.6550 +    punpckhbw  xmm1, xmm4
  1.6551 +    psubw      xmm2, xmm0  // row1 - row0
  1.6552 +    psubw      xmm3, xmm1
  1.6553 +    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
  1.6554 +    paddw      xmm3, xmm3
  1.6555 +    pmulhw     xmm2, xmm5  // scale diff
  1.6556 +    pmulhw     xmm3, xmm5
  1.6557 +    paddw      xmm0, xmm2  // sum rows
  1.6558 +    paddw      xmm1, xmm3
  1.6559 +    packuswb   xmm0, xmm1
  1.6560 +    sub        ecx, 16
  1.6561 +    movdqu     [esi + edi], xmm0
  1.6562 +    lea        esi, [esi + 16]
  1.6563 +    jg         xloop
  1.6564 +    jmp        xloop99
  1.6565 +
  1.6566 +    // Blend 25 / 75.
  1.6567 +    align      4
  1.6568 +  xloop25:
  1.6569 +    movdqu     xmm0, [esi]
  1.6570 +    movdqu     xmm1, [esi + edx]
  1.6571 +    pavgb      xmm0, xmm1
  1.6572 +    pavgb      xmm0, xmm1
  1.6573 +    sub        ecx, 16
  1.6574 +    movdqu     [esi + edi], xmm0
  1.6575 +    lea        esi, [esi + 16]
  1.6576 +    jg         xloop25
  1.6577 +    jmp        xloop99
  1.6578 +
  1.6579 +    // Blend 50 / 50.
  1.6580 +    align      4
  1.6581 +  xloop50:
  1.6582 +    movdqu     xmm0, [esi]
  1.6583 +    movdqu     xmm1, [esi + edx]
  1.6584 +    pavgb      xmm0, xmm1
  1.6585 +    sub        ecx, 16
  1.6586 +    movdqu     [esi + edi], xmm0
  1.6587 +    lea        esi, [esi + 16]
  1.6588 +    jg         xloop50
  1.6589 +    jmp        xloop99
  1.6590 +
  1.6591 +    // Blend 75 / 25.
  1.6592 +    align      4
  1.6593 +  xloop75:
  1.6594 +    movdqu     xmm1, [esi]
  1.6595 +    movdqu     xmm0, [esi + edx]
  1.6596 +    pavgb      xmm0, xmm1
  1.6597 +    pavgb      xmm0, xmm1
  1.6598 +    sub        ecx, 16
  1.6599 +    movdqu     [esi + edi], xmm0
  1.6600 +    lea        esi, [esi + 16]
  1.6601 +    jg         xloop75
  1.6602 +    jmp        xloop99
  1.6603 +
  1.6604 +    // Blend 100 / 0 - Copy row unchanged.
  1.6605 +    align      4
  1.6606 +  xloop100:
  1.6607 +    movdqu     xmm0, [esi]
  1.6608 +    sub        ecx, 16
  1.6609 +    movdqu     [esi + edi], xmm0
  1.6610 +    lea        esi, [esi + 16]
  1.6611 +    jg         xloop100
  1.6612 +
  1.6613 +  xloop99:
  1.6614 +    pop        edi
  1.6615 +    pop        esi
  1.6616 +    ret
  1.6617 +  }
  1.6618 +}
  1.6619 +#endif  // HAS_INTERPOLATEROW_SSE2
  1.6620 +
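         +// Averages two rows of UV vertically; pavgb rounds up:
         +//   dst_uv[i] = (src_uv[i] + src_uv[i + src_uv_stride] + 1) >> 1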
  1.6621 +__declspec(naked) __declspec(align(16))
  1.6622 +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
  1.6623 +                  uint8* dst_uv, int pix) {
  1.6624 +  __asm {
  1.6625 +    push       edi
  1.6626 +    mov        eax, [esp + 4 + 4]    // src_uv
  1.6627 +    mov        edx, [esp + 4 + 8]    // src_uv_stride
  1.6628 +    mov        edi, [esp + 4 + 12]   // dst_uv
  1.6629 +    mov        ecx, [esp + 4 + 16]   // pix
  1.6630 +    sub        edi, eax
  1.6631 +
  1.6632 +    align      4
  1.6633 +  convertloop:
  1.6634 +    movdqa     xmm0, [eax]
  1.6635 +    pavgb      xmm0, [eax + edx]
  1.6636 +    sub        ecx, 16
  1.6637 +    movdqa     [eax + edi], xmm0
  1.6638 +    lea        eax,  [eax + 16]
  1.6639 +    jg         convertloop
  1.6640 +    pop        edi
  1.6641 +    ret
  1.6642 +  }
  1.6643 +}
  1.6644 +
  1.6645 +#ifdef HAS_HALFROW_AVX2
  1.6646 +__declspec(naked) __declspec(align(16))
  1.6647 +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
  1.6648 +                  uint8* dst_uv, int pix) {
  1.6649 +  __asm {
  1.6650 +    push       edi
  1.6651 +    mov        eax, [esp + 4 + 4]    // src_uv
  1.6652 +    mov        edx, [esp + 4 + 8]    // src_uv_stride
  1.6653 +    mov        edi, [esp + 4 + 12]   // dst_uv
  1.6654 +    mov        ecx, [esp + 4 + 16]   // pix
  1.6655 +    sub        edi, eax
  1.6656 +
  1.6657 +    align      4
  1.6658 +  convertloop:
  1.6659 +    vmovdqu    ymm0, [eax]
  1.6660 +    vpavgb     ymm0, ymm0, [eax + edx]
  1.6661 +    sub        ecx, 32
  1.6662 +    vmovdqu    [eax + edi], ymm0
  1.6663 +    lea        eax,  [eax + 32]
  1.6664 +    jg         convertloop
  1.6665 +
  1.6666 +    pop        edi
  1.6667 +    vzeroupper
  1.6668 +    ret
  1.6669 +  }
  1.6670 +}
  1.6671 +#endif  // HAS_HALFROW_AVX2
  1.6672 +
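         +// Picks one byte from each ARGB pixel, using the 4 bytes of 'selector' as
         +// per-pixel byte offsets within each group of 4 pixels (via pshufb).
         +// Scalar sketch:
         +//   uint8 sel = ((const uint8*)&selector)[i & 3];
         +//   dst_bayer[i] = src_argb[(i & ~3) * 4 + sel];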
  1.6673 +__declspec(naked) __declspec(align(16))
  1.6674 +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
  1.6675 +                          uint32 selector, int pix) {
  1.6676 +  __asm {
  1.6677 +    mov        eax, [esp + 4]    // src_argb
  1.6678 +    mov        edx, [esp + 8]    // dst_bayer
  1.6679 +    movd       xmm5, [esp + 12]  // selector
  1.6680 +    mov        ecx, [esp + 16]   // pix
  1.6681 +    pshufd     xmm5, xmm5, 0
  1.6682 +
  1.6683 +    align      4
  1.6684 +  wloop:
  1.6685 +    movdqa     xmm0, [eax]
  1.6686 +    movdqa     xmm1, [eax + 16]
  1.6687 +    lea        eax, [eax + 32]
  1.6688 +    pshufb     xmm0, xmm5
  1.6689 +    pshufb     xmm1, xmm5
  1.6690 +    punpckldq  xmm0, xmm1
  1.6691 +    sub        ecx, 8
  1.6692 +    movq       qword ptr [edx], xmm0
  1.6693 +    lea        edx, [edx + 8]
  1.6694 +    jg         wloop
  1.6695 +    ret
  1.6696 +  }
  1.6697 +}
  1.6698 +
  1.6699 +// Specialized ARGB to Bayer that just isolates G channel.
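         +// Scalar sketch:
         +//   dst_bayer[i] = src_argb[i * 4 + 1];  // G of in-memory BGRA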
  1.6700 +__declspec(naked) __declspec(align(16))
  1.6701 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
  1.6702 +                           uint32 selector, int pix) {
  1.6703 +  __asm {
  1.6704 +    mov        eax, [esp + 4]    // src_argb
  1.6705 +    mov        edx, [esp + 8]    // dst_bayer
  1.6706 +                                 // selector
  1.6707 +    mov        ecx, [esp + 16]   // pix
  1.6708 +    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
  1.6709 +    psrld      xmm5, 24
  1.6710 +
  1.6711 +    align      4
  1.6712 +  wloop:
  1.6713 +    movdqa     xmm0, [eax]
  1.6714 +    movdqa     xmm1, [eax + 16]
  1.6715 +    lea        eax, [eax + 32]
  1.6716 +    psrld      xmm0, 8  // Move green to bottom.
  1.6717 +    psrld      xmm1, 8
  1.6718 +    pand       xmm0, xmm5
  1.6719 +    pand       xmm1, xmm5
  1.6720 +    packssdw   xmm0, xmm1
  1.6721 +    packuswb   xmm0, xmm1
  1.6722 +    sub        ecx, 8
  1.6723 +    movq       qword ptr [edx], xmm0
  1.6724 +    lea        edx, [edx + 8]
  1.6725 +    jg         wloop
  1.6726 +    ret
  1.6727 +  }
  1.6728 +}
  1.6729 +
  1.6730 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
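         +// Scalar sketch: each output byte j of pixel i is picked by the shuffler:
         +//   dst_argb[i * 4 + j] = src_argb[i * 4 + shuffler[j]];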
  1.6731 +__declspec(naked) __declspec(align(16))
  1.6732 +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  1.6733 +                          const uint8* shuffler, int pix) {
  1.6734 +  __asm {
  1.6735 +    mov        eax, [esp + 4]    // src_argb
  1.6736 +    mov        edx, [esp + 8]    // dst_argb
  1.6737 +    mov        ecx, [esp + 12]   // shuffler
  1.6738 +    movdqa     xmm5, [ecx]
  1.6739 +    mov        ecx, [esp + 16]   // pix
  1.6740 +
  1.6741 +    align      4
  1.6742 +  wloop:
  1.6743 +    movdqa     xmm0, [eax]
  1.6744 +    movdqa     xmm1, [eax + 16]
  1.6745 +    lea        eax, [eax + 32]
  1.6746 +    pshufb     xmm0, xmm5
  1.6747 +    pshufb     xmm1, xmm5
  1.6748 +    sub        ecx, 8
  1.6749 +    movdqa     [edx], xmm0
  1.6750 +    movdqa     [edx + 16], xmm1
  1.6751 +    lea        edx, [edx + 32]
  1.6752 +    jg         wloop
  1.6753 +    ret
  1.6754 +  }
  1.6755 +}
  1.6756 +
  1.6757 +__declspec(naked) __declspec(align(16))
  1.6758 +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
  1.6759 +                                    const uint8* shuffler, int pix) {
  1.6760 +  __asm {
  1.6761 +    mov        eax, [esp + 4]    // src_argb
  1.6762 +    mov        edx, [esp + 8]    // dst_argb
  1.6763 +    mov        ecx, [esp + 12]   // shuffler
  1.6764 +    movdqa     xmm5, [ecx]
  1.6765 +    mov        ecx, [esp + 16]   // pix
  1.6766 +
  1.6767 +    align      4
  1.6768 +  wloop:
  1.6769 +    movdqu     xmm0, [eax]
  1.6770 +    movdqu     xmm1, [eax + 16]
  1.6771 +    lea        eax, [eax + 32]
  1.6772 +    pshufb     xmm0, xmm5
  1.6773 +    pshufb     xmm1, xmm5
  1.6774 +    sub        ecx, 8
  1.6775 +    movdqu     [edx], xmm0
  1.6776 +    movdqu     [edx + 16], xmm1
  1.6777 +    lea        edx, [edx + 32]
  1.6778 +    jg         wloop
  1.6779 +    ret
  1.6780 +  }
  1.6781 +}
  1.6782 +
  1.6783 +#ifdef HAS_ARGBSHUFFLEROW_AVX2
  1.6784 +__declspec(naked) __declspec(align(16))
  1.6785 +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  1.6786 +                         const uint8* shuffler, int pix) {
  1.6787 +  __asm {
  1.6788 +    mov        eax, [esp + 4]     // src_argb
  1.6789 +    mov        edx, [esp + 8]     // dst_argb
  1.6790 +    mov        ecx, [esp + 12]    // shuffler
  1.6791 +    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
  1.6792 +    mov        ecx, [esp + 16]    // pix
  1.6793 +
  1.6794 +    align      4
  1.6795 +  wloop:
  1.6796 +    vmovdqu    ymm0, [eax]
  1.6797 +    vmovdqu    ymm1, [eax + 32]
  1.6798 +    lea        eax, [eax + 64]
  1.6799 +    vpshufb    ymm0, ymm0, ymm5
  1.6800 +    vpshufb    ymm1, ymm1, ymm5
  1.6801 +    sub        ecx, 16
  1.6802 +    vmovdqu    [edx], ymm0
  1.6803 +    vmovdqu    [edx + 32], ymm1
  1.6804 +    lea        edx, [edx + 64]
  1.6805 +    jg         wloop
  1.6806 +
  1.6807 +    vzeroupper
  1.6808 +    ret
  1.6809 +  }
  1.6810 +}
  1.6811 +#endif  // HAS_ARGBSHUFFLEROW_AVX2
  1.6812 +
  1.6813 +__declspec(naked) __declspec(align(16))
  1.6814 +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  1.6815 +                         const uint8* shuffler, int pix) {
  1.6816 +  __asm {
  1.6817 +    push       ebx
  1.6818 +    push       esi
  1.6819 +    mov        eax, [esp + 8 + 4]    // src_argb
  1.6820 +    mov        edx, [esp + 8 + 8]    // dst_argb
  1.6821 +    mov        esi, [esp + 8 + 12]   // shuffler
  1.6822 +    mov        ecx, [esp + 8 + 16]   // pix
  1.6823 +    pxor       xmm5, xmm5
  1.6824 +
  1.6825 +    mov        ebx, [esi]   // shuffler
  1.6826 +    cmp        ebx, 0x03000102
  1.6827 +    je         shuf_3012
  1.6828 +    cmp        ebx, 0x00010203
  1.6829 +    je         shuf_0123
  1.6830 +    cmp        ebx, 0x00030201
  1.6831 +    je         shuf_0321
  1.6832 +    cmp        ebx, 0x02010003
  1.6833 +    je         shuf_2103
  1.6834 +
  1.6835 +  // TODO(fbarchard): Use one source pointer and 3 offsets.
  1.6836 +  shuf_any1:
  1.6837 +    movzx      ebx, byte ptr [esi]
  1.6838 +    movzx      ebx, byte ptr [eax + ebx]
  1.6839 +    mov        [edx], bl
  1.6840 +    movzx      ebx, byte ptr [esi + 1]
  1.6841 +    movzx      ebx, byte ptr [eax + ebx]
  1.6842 +    mov        [edx + 1], bl
  1.6843 +    movzx      ebx, byte ptr [esi + 2]
  1.6844 +    movzx      ebx, byte ptr [eax + ebx]
  1.6845 +    mov        [edx + 2], bl
  1.6846 +    movzx      ebx, byte ptr [esi + 3]
  1.6847 +    movzx      ebx, byte ptr [eax + ebx]
  1.6848 +    mov        [edx + 3], bl
  1.6849 +    lea        eax, [eax + 4]
  1.6850 +    lea        edx, [edx + 4]
  1.6851 +    sub        ecx, 1
  1.6852 +    jg         shuf_any1
  1.6853 +    jmp        shuf99
  1.6854 +
  1.6855 +    align      4
  1.6856 +  shuf_0123:
  1.6857 +    movdqu     xmm0, [eax]
  1.6858 +    lea        eax, [eax + 16]
  1.6859 +    movdqa     xmm1, xmm0
  1.6860 +    punpcklbw  xmm0, xmm5
  1.6861 +    punpckhbw  xmm1, xmm5
  1.6862 +    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
  1.6863 +    pshuflw    xmm0, xmm0, 01Bh
  1.6864 +    pshufhw    xmm1, xmm1, 01Bh
  1.6865 +    pshuflw    xmm1, xmm1, 01Bh
  1.6866 +    packuswb   xmm0, xmm1
  1.6867 +    sub        ecx, 4
  1.6868 +    movdqu     [edx], xmm0
  1.6869 +    lea        edx, [edx + 16]
  1.6870 +    jg         shuf_0123
  1.6871 +    jmp        shuf99
  1.6872 +
  1.6873 +    align      4
  1.6874 +  shuf_0321:
  1.6875 +    movdqu     xmm0, [eax]
  1.6876 +    lea        eax, [eax + 16]
  1.6877 +    movdqa     xmm1, xmm0
  1.6878 +    punpcklbw  xmm0, xmm5
  1.6879 +    punpckhbw  xmm1, xmm5
  1.6880 +    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
  1.6881 +    pshuflw    xmm0, xmm0, 039h
  1.6882 +    pshufhw    xmm1, xmm1, 039h
  1.6883 +    pshuflw    xmm1, xmm1, 039h
  1.6884 +    packuswb   xmm0, xmm1
  1.6885 +    sub        ecx, 4
  1.6886 +    movdqu     [edx], xmm0
  1.6887 +    lea        edx, [edx + 16]
  1.6888 +    jg         shuf_0321
  1.6889 +    jmp        shuf99
  1.6890 +
  1.6891 +    align      4
  1.6892 +  shuf_2103:
  1.6893 +    movdqu     xmm0, [eax]
  1.6894 +    lea        eax, [eax + 16]
  1.6895 +    movdqa     xmm1, xmm0
  1.6896 +    punpcklbw  xmm0, xmm5
  1.6897 +    punpckhbw  xmm1, xmm5
  1.6898 +    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
  1.6899 +    pshuflw    xmm0, xmm0, 093h
  1.6900 +    pshufhw    xmm1, xmm1, 093h
  1.6901 +    pshuflw    xmm1, xmm1, 093h
  1.6902 +    packuswb   xmm0, xmm1
  1.6903 +    sub        ecx, 4
  1.6904 +    movdqu     [edx], xmm0
  1.6905 +    lea        edx, [edx + 16]
  1.6906 +    jg         shuf_2103
  1.6907 +    jmp        shuf99
  1.6908 +
  1.6909 +    align      4
  1.6910 +  shuf_3012:
  1.6911 +    movdqu     xmm0, [eax]
  1.6912 +    lea        eax, [eax + 16]
  1.6913 +    movdqa     xmm1, xmm0
  1.6914 +    punpcklbw  xmm0, xmm5
  1.6915 +    punpckhbw  xmm1, xmm5
  1.6916 +    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
  1.6917 +    pshuflw    xmm0, xmm0, 0C6h
  1.6918 +    pshufhw    xmm1, xmm1, 0C6h
  1.6919 +    pshuflw    xmm1, xmm1, 0C6h
  1.6920 +    packuswb   xmm0, xmm1
  1.6921 +    sub        ecx, 4
  1.6922 +    movdqu     [edx], xmm0
  1.6923 +    lea        edx, [edx + 16]
  1.6924 +    jg         shuf_3012
  1.6925 +
  1.6926 +  shuf99:
  1.6927 +    pop        esi
  1.6928 +    pop        ebx
  1.6929 +    ret
  1.6930 +  }
  1.6931 +}
  1.6932 +
  1.6933 +// YUY2 - Macro-pixel = 2 image pixels.
  1.6934 +// Y0U0Y1V0 Y2U2Y3V2 Y4U4Y5V4 ...
  1.6935 +
  1.6936 +// UYVY - Macro-pixel = 2 image pixels.
  1.6937 +// U0Y0V0Y1 U2Y2V2Y3 U4Y4V4Y5 ...
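         +// Scalar sketch of the YUY2 packing per macro-pixel (UYVY swaps the
         +// luma/chroma byte order):
         +//   dst_frame[0] = src_y[0]; dst_frame[1] = src_u[0];
         +//   dst_frame[2] = src_y[1]; dst_frame[3] = src_v[0];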
  1.6938 +
  1.6939 +__declspec(naked) __declspec(align(16))
  1.6940 +void I422ToYUY2Row_SSE2(const uint8* src_y,
  1.6941 +                        const uint8* src_u,
  1.6942 +                        const uint8* src_v,
  1.6943 +                        uint8* dst_frame, int width) {
  1.6944 +  __asm {
  1.6945 +    push       esi
  1.6946 +    push       edi
  1.6947 +    mov        eax, [esp + 8 + 4]    // src_y
  1.6948 +    mov        esi, [esp + 8 + 8]    // src_u
  1.6949 +    mov        edx, [esp + 8 + 12]   // src_v
  1.6950 +    mov        edi, [esp + 8 + 16]   // dst_frame
  1.6951 +    mov        ecx, [esp + 8 + 20]   // width
  1.6952 +    sub        edx, esi
  1.6953 +
  1.6954 +    align      4
  1.6955 +  convertloop:
  1.6956 +    movq       xmm2, qword ptr [esi] // U
  1.6957 +    movq       xmm3, qword ptr [esi + edx] // V
  1.6958 +    lea        esi, [esi + 8]
  1.6959 +    punpcklbw  xmm2, xmm3 // UV
  1.6960 +    movdqu     xmm0, [eax] // Y
  1.6961 +    lea        eax, [eax + 16]
  1.6962 +    movdqa     xmm1, xmm0
  1.6963 +    punpcklbw  xmm0, xmm2 // YUYV
  1.6964 +    punpckhbw  xmm1, xmm2
  1.6965 +    movdqu     [edi], xmm0
  1.6966 +    movdqu     [edi + 16], xmm1
  1.6967 +    lea        edi, [edi + 32]
  1.6968 +    sub        ecx, 16
  1.6969 +    jg         convertloop
  1.6970 +
  1.6971 +    pop        edi
  1.6972 +    pop        esi
  1.6973 +    ret
  1.6974 +  }
  1.6975 +}
  1.6976 +
  1.6977 +__declspec(naked) __declspec(align(16))
  1.6978 +void I422ToUYVYRow_SSE2(const uint8* src_y,
  1.6979 +                        const uint8* src_u,
  1.6980 +                        const uint8* src_v,
  1.6981 +                        uint8* dst_frame, int width) {
  1.6982 +  __asm {
  1.6983 +    push       esi
  1.6984 +    push       edi
  1.6985 +    mov        eax, [esp + 8 + 4]    // src_y
  1.6986 +    mov        esi, [esp + 8 + 8]    // src_u
  1.6987 +    mov        edx, [esp + 8 + 12]   // src_v
  1.6988 +    mov        edi, [esp + 8 + 16]   // dst_frame
  1.6989 +    mov        ecx, [esp + 8 + 20]   // width
  1.6990 +    sub        edx, esi
  1.6991 +
  1.6992 +    align      4
  1.6993 +  convertloop:
  1.6994 +    movq       xmm2, qword ptr [esi] // U
  1.6995 +    movq       xmm3, qword ptr [esi + edx] // V
  1.6996 +    lea        esi, [esi + 8]
  1.6997 +    punpcklbw  xmm2, xmm3 // UV
  1.6998 +    movdqu     xmm0, [eax] // Y
  1.6999 +    movdqa     xmm1, xmm2
  1.7000 +    lea        eax, [eax + 16]
  1.7001 +    punpcklbw  xmm1, xmm0 // UYVY
  1.7002 +    punpckhbw  xmm2, xmm0
  1.7003 +    movdqu     [edi], xmm1
  1.7004 +    movdqu     [edi + 16], xmm2
  1.7005 +    lea        edi, [edi + 32]
  1.7006 +    sub        ecx, 16
  1.7007 +    jg         convertloop
  1.7008 +
  1.7009 +    pop        edi
  1.7010 +    pop        esi
  1.7011 +    ret
  1.7012 +  }
  1.7013 +}
  1.7014 +
  1.7015 +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
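         +// Evaluates a cubic per channel: X' = C0 + C1*X + C2*X*X + C3*X*X*X, with
         +// coefficient vectors at poly[0], poly[4], poly[8] and poly[12]; the result
         +// is truncated to int and clamped to [0, 255].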
  1.7016 +__declspec(naked) __declspec(align(16))
  1.7017 +void ARGBPolynomialRow_SSE2(const uint8* src_argb,
  1.7018 +                            uint8* dst_argb, const float* poly,
  1.7019 +                            int width) {
  1.7020 +  __asm {
  1.7021 +    push       esi
  1.7022 +    mov        eax, [esp + 4 + 4]   /* src_argb */
  1.7023 +    mov        edx, [esp + 4 + 8]   /* dst_argb */
  1.7024 +    mov        esi, [esp + 4 + 12]  /* poly */
  1.7025 +    mov        ecx, [esp + 4 + 16]  /* width */
  1.7026 +    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
  1.7027 +
  1.7028 +    // 2 pixel loop.
  1.7029 +    align      4
  1.7030 + convertloop:
  1.7031 +    // pmovzxbd xmm0, dword ptr [eax]      // SSE4.1 could load and
  1.7032 +    // pmovzxbd xmm4, dword ptr [eax + 4]   // zero-extend a pixel directly.
  1.7033 +    movq       xmm0, qword ptr [eax]  // BGRABGRA
  1.7034 +    lea        eax, [eax + 8]
  1.7035 +    punpcklbw  xmm0, xmm3
  1.7036 +    movdqa     xmm4, xmm0
  1.7037 +    punpcklwd  xmm0, xmm3  // pixel 0
  1.7038 +    punpckhwd  xmm4, xmm3  // pixel 1
  1.7039 +    cvtdq2ps   xmm0, xmm0  // 4 floats
  1.7040 +    cvtdq2ps   xmm4, xmm4
  1.7041 +    movdqa     xmm1, xmm0  // X
  1.7042 +    movdqa     xmm5, xmm4
  1.7043 +    mulps      xmm0, [esi + 16]  // C1 * X
  1.7044 +    mulps      xmm4, [esi + 16]
  1.7045 +    addps      xmm0, [esi]  // result = C0 + C1 * X
  1.7046 +    addps      xmm4, [esi]
  1.7047 +    movdqa     xmm2, xmm1
  1.7048 +    movdqa     xmm6, xmm5
  1.7049 +    mulps      xmm2, xmm1  // X * X
  1.7050 +    mulps      xmm6, xmm5
  1.7051 +    mulps      xmm1, xmm2  // X * X * X
  1.7052 +    mulps      xmm5, xmm6
  1.7053 +    mulps      xmm2, [esi + 32]  // C2 * X * X
  1.7054 +    mulps      xmm6, [esi + 32]
  1.7055 +    mulps      xmm1, [esi + 48]  // C3 * X * X * X
  1.7056 +    mulps      xmm5, [esi + 48]
  1.7057 +    addps      xmm0, xmm2  // result += C2 * X * X
  1.7058 +    addps      xmm4, xmm6
  1.7059 +    addps      xmm0, xmm1  // result += C3 * X * X * X
  1.7060 +    addps      xmm4, xmm5
  1.7061 +    cvttps2dq  xmm0, xmm0
  1.7062 +    cvttps2dq  xmm4, xmm4
  1.7063 +    packuswb   xmm0, xmm4
  1.7064 +    packuswb   xmm0, xmm0
  1.7065 +    sub        ecx, 2
  1.7066 +    movq       qword ptr [edx], xmm0
  1.7067 +    lea        edx, [edx + 8]
  1.7068 +    jg         convertloop
  1.7069 +    pop        esi
  1.7070 +    ret
  1.7071 +  }
  1.7072 +}
  1.7073 +#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
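         +
         +// A hedged scalar sketch of the cubic evaluated above (the helper name
         +// and clamping style are illustrative). poly holds four coefficient
         +// vectors of four floats each, one float per channel, so channel c of
         +// a pixel maps x to C0[c] + C1[c]*x + C2[c]*x^2 + C3[c]*x^3, truncated
         +// and saturated to [0, 255] as cvttps2dq + packuswb do.
         +static void ARGBPolynomialRow_C_Sketch(const uint8* src_argb,
         +                                       uint8* dst_argb,
         +                                       const float* poly, int width) {
         +  int i, c;
         +  for (i = 0; i < width; ++i) {
         +    for (c = 0; c < 4; ++c) {
         +      float x = (float)src_argb[i * 4 + c];
         +      float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
         +                poly[c + 12] * x * x * x;
         +      if (r < 0.f) r = 0.f;      // saturate like packuswb.
         +      if (r > 255.f) r = 255.f;
         +      dst_argb[i * 4 + c] = (uint8)r;  // truncate like cvttps2dq.
         +    }
         +  }
         +}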
  1.7074 +
  1.7075 +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
  1.7076 +__declspec(naked) __declspec(align(16))
  1.7077 +void ARGBPolynomialRow_AVX2(const uint8* src_argb,
  1.7078 +                            uint8* dst_argb, const float* poly,
  1.7079 +                            int width) {
  1.7080 +  __asm {
  1.7081 +    mov        eax, [esp + 4]   /* src_argb */
  1.7082 +    mov        edx, [esp + 8]   /* dst_argb */
  1.7083 +    mov        ecx, [esp + 12]   /* poly */
  1.7084 +    vbroadcastf128 ymm4, [ecx]       // C0
  1.7085 +    vbroadcastf128 ymm5, [ecx + 16]  // C1
  1.7086 +    vbroadcastf128 ymm6, [ecx + 32]  // C2
  1.7087 +    vbroadcastf128 ymm7, [ecx + 48]  // C3
  1.7088 +    mov        ecx, [esp + 16]  /* width */
  1.7089 +
  1.7090 +    // 2 pixel loop.
  1.7091 +    align      4
  1.7092 +  convertloop:
  1.7093 +    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
  1.7094 +    lea         eax, [eax + 8]
  1.7095 +    vcvtdq2ps   ymm0, ymm0        // X 8 floats
  1.7096 +    vmulps      ymm2, ymm0, ymm0  // X * X
  1.7097 +    vmulps      ymm3, ymm0, ymm7  // C3 * X
  1.7098 +    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
  1.7099 +    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
  1.7100 +    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
  1.7101 +    vcvttps2dq  ymm0, ymm0
  1.7102 +    vpackusdw   ymm0, ymm0, ymm0  // dwords to words, per lane: p0 p0 | p1 p1
  1.7103 +    vpermq      ymm0, ymm0, 0xd8  // reorder qwords: p0 p1 p0 p1
  1.7104 +    vpackuswb   xmm0, xmm0, xmm0  // words to bytes: b0g0r0a0 b1g1r1a1 x2
  1.7105 +    sub         ecx, 2
  1.7106 +    vmovq       qword ptr [edx], xmm0
  1.7107 +    lea         edx, [edx + 8]
  1.7108 +    jg          convertloop
  1.7109 +    vzeroupper
  1.7110 +    ret
  1.7111 +  }
  1.7112 +}
  1.7113 +#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
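         +
         +// The same evaluation with AVX2/FMA intrinsics, as a hedged sketch of
         +// what the vfmadd sequence computes (the helper name is illustrative,
         +// and building it would need <immintrin.h> plus AVX2/FMA support, so
         +// it is left disabled): r = C0 + C1*x, r += C2*(x*x), r += (C3*x)*(x*x).
         +#if 0  // illustration only; not compiled.
         +static void ARGBPolynomialRow_FMA_Sketch(const uint8* src_argb,
         +                                         uint8* dst_argb,
         +                                         const float* poly, int width) {
         +  __m256 c0 = _mm256_broadcast_ps((const __m128*)(poly));
         +  __m256 c1 = _mm256_broadcast_ps((const __m128*)(poly + 4));
         +  __m256 c2 = _mm256_broadcast_ps((const __m128*)(poly + 8));
         +  __m256 c3 = _mm256_broadcast_ps((const __m128*)(poly + 12));
         +  int i;
         +  for (i = 0; i < width; i += 2) {
         +    __m128i p = _mm_loadl_epi64((const __m128i*)(src_argb + i * 4));
         +    __m256 x = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(p));
         +    __m256 xx = _mm256_mul_ps(x, x);        // x^2
         +    __m256 c3x = _mm256_mul_ps(x, c3);      // C3*x
         +    __m256 r = _mm256_fmadd_ps(x, c1, c0);  // C0 + C1*x
         +    r = _mm256_fmadd_ps(xx, c2, r);         // += C2*x^2
         +    r = _mm256_fmadd_ps(xx, c3x, r);        // += C3*x^3
         +    __m256i d = _mm256_cvttps_epi32(r);
         +    d = _mm256_packus_epi32(d, d);          // dwords to words per lane.
         +    d = _mm256_permute4x64_epi64(d, 0xd8);  // gather both pixels low.
         +    _mm_storel_epi64((__m128i*)(dst_argb + i * 4),
         +                     _mm_packus_epi16(_mm256_castsi256_si128(d),
         +                                      _mm256_castsi256_si128(d)));
         +  }
         +}
         +#endif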
  1.7114 +
  1.7115 +#ifdef HAS_ARGBCOLORTABLEROW_X86
  1.7116 +// Transform ARGB pixels with color table.
  1.7117 +__declspec(naked) __declspec(align(16))
  1.7118 +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
  1.7119 +                           int width) {
  1.7120 +  __asm {
  1.7121 +    push       esi
  1.7122 +    mov        eax, [esp + 4 + 4]   /* dst_argb */
  1.7123 +    mov        esi, [esp + 4 + 8]   /* table_argb */
  1.7124 +    mov        ecx, [esp + 4 + 12]  /* width */
  1.7125 +
  1.7126 +    // 1 pixel loop.
  1.7127 +    align      4
  1.7128 +  convertloop:
  1.7129 +    movzx      edx, byte ptr [eax]
  1.7130 +    lea        eax, [eax + 4]
  1.7131 +    movzx      edx, byte ptr [esi + edx * 4]
  1.7132 +    mov        byte ptr [eax - 4], dl
  1.7133 +    movzx      edx, byte ptr [eax - 4 + 1]
  1.7134 +    movzx      edx, byte ptr [esi + edx * 4 + 1]
  1.7135 +    mov        byte ptr [eax - 4 + 1], dl
  1.7136 +    movzx      edx, byte ptr [eax - 4 + 2]
  1.7137 +    movzx      edx, byte ptr [esi + edx * 4 + 2]
  1.7138 +    mov        byte ptr [eax - 4 + 2], dl
  1.7139 +    movzx      edx, byte ptr [eax - 4 + 3]
  1.7140 +    movzx      edx, byte ptr [esi + edx * 4 + 3]
  1.7141 +    mov        byte ptr [eax - 4 + 3], dl
  1.7142 +    dec        ecx
  1.7143 +    jg         convertloop
  1.7144 +    pop        esi
  1.7145 +    ret
  1.7146 +  }
  1.7147 +}
  1.7148 +#endif  // HAS_ARGBCOLORTABLEROW_X86
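         +
         +// A hedged scalar equivalent of the lookup above (helper name is
         +// illustrative): channel c of every pixel indexes its own column of a
         +// 256-entry, 4-bytes-per-entry table, so B, G, R and A each get an
         +// independent mapping. The RGB variant below is identical except that
         +// it leaves the alpha byte untouched.
         +static void ARGBColorTableRow_C_Sketch(uint8* dst_argb,
         +                                       const uint8* table_argb,
         +                                       int width) {
         +  int i;
         +  for (i = 0; i < width; ++i) {
         +    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
         +    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
         +    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
         +    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
         +    dst_argb += 4;
         +  }
         +}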
  1.7149 +
  1.7150 +#ifdef HAS_RGBCOLORTABLEROW_X86
  1.7151 +// Transform RGB pixels with color table (alpha is left unchanged).
  1.7152 +__declspec(naked) __declspec(align(16))
  1.7153 +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  1.7154 +  __asm {
  1.7155 +    push       esi
  1.7156 +    mov        eax, [esp + 4 + 4]   /* dst_argb */
  1.7157 +    mov        esi, [esp + 4 + 8]   /* table_argb */
  1.7158 +    mov        ecx, [esp + 4 + 12]  /* width */
  1.7159 +
  1.7160 +    // 1 pixel loop.
  1.7161 +    align      4
  1.7162 +  convertloop:
  1.7163 +    movzx      edx, byte ptr [eax]
  1.7164 +    lea        eax, [eax + 4]
  1.7165 +    movzx      edx, byte ptr [esi + edx * 4]
  1.7166 +    mov        byte ptr [eax - 4], dl
  1.7167 +    movzx      edx, byte ptr [eax - 4 + 1]
  1.7168 +    movzx      edx, byte ptr [esi + edx * 4 + 1]
  1.7169 +    mov        byte ptr [eax - 4 + 1], dl
  1.7170 +    movzx      edx, byte ptr [eax - 4 + 2]
  1.7171 +    movzx      edx, byte ptr [esi + edx * 4 + 2]
  1.7172 +    mov        byte ptr [eax - 4 + 2], dl
  1.7173 +    dec        ecx
  1.7174 +    jg         convertloop
  1.7175 +
  1.7176 +    pop        esi
  1.7177 +    ret
  1.7178 +  }
  1.7179 +}
  1.7180 +#endif  // HAS_RGBCOLORTABLEROW_X86
  1.7181 +
  1.7182 +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  1.7183 +// Transform ARGB pixels with luma table (alpha is copied through).
  1.7184 +__declspec(naked) __declspec(align(16))
  1.7185 +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  1.7186 +                                 int width,
  1.7187 +                                 const uint8* luma, uint32 lumacoeff) {
  1.7188 +  __asm {
  1.7189 +    push       esi
  1.7190 +    push       edi
  1.7191 +    mov        eax, [esp + 8 + 4]   /* src_argb */
  1.7192 +    mov        edi, [esp + 8 + 8]   /* dst_argb */
  1.7193 +    mov        ecx, [esp + 8 + 12]  /* width */
  1.7194 +    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
  1.7195 +    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
  1.7196 +    pshufd     xmm2, xmm2, 0
  1.7197 +    pshufd     xmm3, xmm3, 0
  1.7198 +    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
  1.7199 +    psllw      xmm4, 8
  1.7200 +    pxor       xmm5, xmm5
  1.7201 +
  1.7202 +    // 4 pixel loop.
  1.7203 +    align      4
  1.7204 +  convertloop:
  1.7205 +    movdqu     xmm0, xmmword ptr [eax]    // 4 ARGB pixels for luma calc
  1.7206 +    pmaddubsw  xmm0, xmm3
  1.7207 +    phaddw     xmm0, xmm0
  1.7208 +    pand       xmm0, xmm4  // mask out low bits
  1.7209 +    punpcklwd  xmm0, xmm5
  1.7210 +    paddd      xmm0, xmm2  // add table base
  1.7211 +    movd       esi, xmm0
  1.7212 +    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  1.7213 +
  1.7214 +    movzx      edx, byte ptr [eax]
  1.7215 +    movzx      edx, byte ptr [esi + edx]
  1.7216 +    mov        byte ptr [edi], dl
  1.7217 +    movzx      edx, byte ptr [eax + 1]
  1.7218 +    movzx      edx, byte ptr [esi + edx]
  1.7219 +    mov        byte ptr [edi + 1], dl
  1.7220 +    movzx      edx, byte ptr [eax + 2]
  1.7221 +    movzx      edx, byte ptr [esi + edx]
  1.7222 +    mov        byte ptr [edi + 2], dl
  1.7223 +    movzx      edx, byte ptr [eax + 3]  // copy alpha.
  1.7224 +    mov        byte ptr [edi + 3], dl
  1.7225 +
  1.7226 +    movd       esi, xmm0
  1.7227 +    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  1.7228 +
  1.7229 +    movzx      edx, byte ptr [eax + 4]
  1.7230 +    movzx      edx, byte ptr [esi + edx]
  1.7231 +    mov        byte ptr [edi + 4], dl
  1.7232 +    movzx      edx, byte ptr [eax + 5]
  1.7233 +    movzx      edx, byte ptr [esi + edx]
  1.7234 +    mov        byte ptr [edi + 5], dl
  1.7235 +    movzx      edx, byte ptr [eax + 6]
  1.7236 +    movzx      edx, byte ptr [esi + edx]
  1.7237 +    mov        byte ptr [edi + 6], dl
  1.7238 +    movzx      edx, byte ptr [eax + 7]  // copy alpha.
  1.7239 +    mov        byte ptr [edi + 7], dl
  1.7240 +
  1.7241 +    movd       esi, xmm0
  1.7242 +    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  1.7243 +
  1.7244 +    movzx      edx, byte ptr [eax + 8]
  1.7245 +    movzx      edx, byte ptr [esi + edx]
  1.7246 +    mov        byte ptr [edi + 8], dl
  1.7247 +    movzx      edx, byte ptr [eax + 9]
  1.7248 +    movzx      edx, byte ptr [esi + edx]
  1.7249 +    mov        byte ptr [edi + 9], dl
  1.7250 +    movzx      edx, byte ptr [eax + 10]
  1.7251 +    movzx      edx, byte ptr [esi + edx]
  1.7252 +    mov        byte ptr [edi + 10], dl
  1.7253 +    movzx      edx, byte ptr [eax + 11]  // copy alpha.
  1.7254 +    mov        byte ptr [edi + 11], dl
  1.7255 +
  1.7256 +    movd       esi, xmm0
  1.7257 +
  1.7258 +    movzx      edx, byte ptr [eax + 12]
  1.7259 +    movzx      edx, byte ptr [esi + edx]
  1.7260 +    mov        byte ptr [edi + 12], dl
  1.7261 +    movzx      edx, byte ptr [eax + 13]
  1.7262 +    movzx      edx, byte ptr [esi + edx]
  1.7263 +    mov        byte ptr [edi + 13], dl
  1.7264 +    movzx      edx, byte ptr [eax + 14]
  1.7265 +    movzx      edx, byte ptr [esi + edx]
  1.7266 +    mov        byte ptr [edi + 14], dl
  1.7267 +    movzx      edx, byte ptr [eax + 15]  // copy alpha.
  1.7268 +    mov        byte ptr [edi + 15], dl
  1.7269 +
  1.7270 +    sub        ecx, 4
  1.7271 +    lea        eax, [eax + 16]
  1.7272 +    lea        edi, [edi + 16]
  1.7273 +    jg         convertloop
  1.7274 +
  1.7275 +    pop        edi
  1.7276 +    pop        esi
  1.7277 +    ret
  1.7278 +  }
  1.7279 +}
  1.7280 +#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
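         +
         +// A hedged scalar sketch of the luma lookup above (helper name is
         +// illustrative; assumes little-endian byte order for the lumacoeff
         +// cast, matching x86). pmaddubsw/phaddw compute a signed-weighted sum
         +// of the four channels; masking it with 0xff00 selects one 256-byte
         +// page of the luma table, through which B, G and R are remapped while
         +// alpha is copied.
         +static void ARGBLumaColorTableRow_C_Sketch(const uint8* src_argb,
         +                                           uint8* dst_argb, int width,
         +                                           const uint8* luma,
         +                                           uint32 lumacoeff) {
         +  const int8* w = (const int8*)&lumacoeff;  // 4 signed channel weights.
         +  int i;
         +  for (i = 0; i < width; ++i) {
         +    int y = src_argb[0] * w[0] + src_argb[1] * w[1] +
         +            src_argb[2] * w[2] + src_argb[3] * w[3];
         +    const uint8* page = luma + (y & 0xff00);  // 256-byte table page.
         +    dst_argb[0] = page[src_argb[0]];
         +    dst_argb[1] = page[src_argb[1]];
         +    dst_argb[2] = page[src_argb[2]];
         +    dst_argb[3] = src_argb[3];  // alpha passes through.
         +    src_argb += 4;
         +    dst_argb += 4;
         +  }
         +}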
  1.7281 +
  1.7282 +#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  1.7283 +
  1.7284 +#ifdef __cplusplus
  1.7285 +}  // extern "C"
  1.7286 +}  // namespace libyuv
  1.7287 +#endif