media/libyuv/source/row_win.cc

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/row.h"
    13 #ifdef __cplusplus
    14 namespace libyuv {
    15 extern "C" {
    16 #endif
    18 // This module is for Visual C x86.
    19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
    21 #ifdef HAS_ARGBTOYROW_SSSE3
    23 // Constants for ARGB.
    24 static const vec8 kARGBToY = {
    25   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
    26 };
    28 // JPeg full range.
    29 static const vec8 kARGBToYJ = {
    30   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
    31 };
    33 static const vec8 kARGBToU = {
    34   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
    35 };
    37 static const vec8 kARGBToUJ = {
    38   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
    39 };
    41 static const vec8 kARGBToV = {
    42   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
    43 };
    45 static const vec8 kARGBToVJ = {
    46   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
    47 };
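
// For reference, a scalar sketch of what the coefficient tables above compute
// per pixel in the row functions below (bytes taken in B, G, R, A memory
// order; derived from the shifts and bias adds used in those routines).
// BT.601 video range:
//   Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128
// JPEG full range (the *J variants); rounding and bias are folded into the add:
//   YJ = (15 * B + 75 * G + 38 * R + 64) >> 7
//   UJ = (127 * B - 84 * G - 43 * R + 0x8080) >> 8
//   VJ = (-20 * B - 107 * G + 127 * R + 0x8080) >> 8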
    49 // vpermd for vphaddw + vpackuswb vpermd.
    50 static const lvec32 kPermdARGBToY_AVX = {
    51   0, 4, 1, 5, 2, 6, 3, 7
    52 };
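
// Note on kPermdARGBToY_AVX (a sketch based on the standard lane-wise
// behaviour of the AVX2 instructions involved): vphaddw and vpackuswb each
// operate on the two 128-bit lanes independently, so after the horizontal
// adds and the pack the 32 Y bytes come out with their 4-byte groups
// interleaved across lanes:
//   dwords = { y0..3, y8..11, y16..19, y24..27, y4..7, y12..15, y20..23, y28..31 }
// The vpermd with indices {0, 4, 1, 5, 2, 6, 3, 7} selects those dwords back
// into sequential order before the row is stored.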
    54 // vpshufb for vphaddw + vpackuswb packed to shorts.
    55 static const lvec8 kShufARGBToUV_AVX = {
    56   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    57   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    58 };
    60 // Constants for BGRA.
    61 static const vec8 kBGRAToY = {
    62   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
    63 };
    65 static const vec8 kBGRAToU = {
    66   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
    67 };
    69 static const vec8 kBGRAToV = {
    70   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
    71 };
    73 // Constants for ABGR.
    74 static const vec8 kABGRToY = {
    75   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
    76 };
    78 static const vec8 kABGRToU = {
    79   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
    80 };
    82 static const vec8 kABGRToV = {
    83   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
    84 };
    86 // Constants for RGBA.
    87 static const vec8 kRGBAToY = {
    88   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
    89 };
    91 static const vec8 kRGBAToU = {
    92   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
    93 };
    95 static const vec8 kRGBAToV = {
    96   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
    97 };
    99 static const uvec8 kAddY16 = {
   100   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
   101 };
   103 static const vec16 kAddYJ64 = {
   104   64, 64, 64, 64, 64, 64, 64, 64
   105 };
   107 static const uvec8 kAddUV128 = {
   108   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   109   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
   110 };
   112 static const uvec16 kAddUVJ128 = {
   113   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
   114 };
   116 // Shuffle table for converting RGB24 to ARGB.
   117 static const uvec8 kShuffleMaskRGB24ToARGB = {
   118   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
   119 };
   121 // Shuffle table for converting RAW to ARGB.
   122 static const uvec8 kShuffleMaskRAWToARGB = {
   123   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
   124 };
   126 // Shuffle table for converting ARGB to RGB24.
   127 static const uvec8 kShuffleMaskARGBToRGB24 = {
   128   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
   129 };
   131 // Shuffle table for converting ARGB to RAW.
   132 static const uvec8 kShuffleMaskARGBToRAW = {
   133   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
   134 };
   136 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
   137 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
   138   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
   139 };
   141 // Shuffle table for converting ARGBToRAW for I422ToRAW.  First 8 + next 4
   142 static const uvec8 kShuffleMaskARGBToRAW_0 = {
   143   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
   144 };
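
// A short worked example of how the shuffle tables above are applied: pshufb
// writes source byte mask[i] into destination byte i, and writes zero
// wherever the mask byte has its high bit set (the 128u entries). With
// kShuffleMaskRGB24ToARGB, the first output pixel is therefore
//   dst[0..3] = { src[0], src[1], src[2], src[12] }
// where src[12] is a don't-care byte that RGB24ToARGBRow_SSSE3 immediately
// forces to 0xff by ORing in the 0xff000000 alpha mask.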
   146 // Duplicates gray value 3 times and fills in alpha opaque.
   147 __declspec(naked) __declspec(align(16))
   148 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   149   __asm {
   150     mov        eax, [esp + 4]        // src_y
   151     mov        edx, [esp + 8]        // dst_argb
   152     mov        ecx, [esp + 12]       // pix
   153     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
   154     pslld      xmm5, 24
   156     align      4
   157   convertloop:
   158     movq       xmm0, qword ptr [eax]
   159     lea        eax,  [eax + 8]
   160     punpcklbw  xmm0, xmm0
   161     movdqa     xmm1, xmm0
   162     punpcklwd  xmm0, xmm0
   163     punpckhwd  xmm1, xmm1
   164     por        xmm0, xmm5
   165     por        xmm1, xmm5
   166     movdqa     [edx], xmm0
   167     movdqa     [edx + 16], xmm1
   168     lea        edx, [edx + 32]
   169     sub        ecx, 8
   170     jg         convertloop
   171     ret
   172   }
   173 }
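
// Illustrative scalar equivalent of the row above: each source luma byte y
// becomes one 32-bit pixel with the gray value replicated into all three
// color channels and an opaque alpha, i.e.
//   dst_argb[i] = 0xff000000u | (y << 16) | (y << 8) | y;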
   175 __declspec(naked) __declspec(align(16))
   176 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
   177                                   int pix) {
   178   __asm {
   179     mov        eax, [esp + 4]        // src_y
   180     mov        edx, [esp + 8]        // dst_argb
   181     mov        ecx, [esp + 12]       // pix
   182     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
   183     pslld      xmm5, 24
   185     align      4
   186   convertloop:
   187     movq       xmm0, qword ptr [eax]
   188     lea        eax,  [eax + 8]
   189     punpcklbw  xmm0, xmm0
   190     movdqa     xmm1, xmm0
   191     punpcklwd  xmm0, xmm0
   192     punpckhwd  xmm1, xmm1
   193     por        xmm0, xmm5
   194     por        xmm1, xmm5
   195     movdqu     [edx], xmm0
   196     movdqu     [edx + 16], xmm1
   197     lea        edx, [edx + 32]
   198     sub        ecx, 8
   199     jg         convertloop
   200     ret
   201   }
   202 }
   204 __declspec(naked) __declspec(align(16))
   205 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   206   __asm {
   207     mov       eax, [esp + 4]   // src_rgb24
   208     mov       edx, [esp + 8]   // dst_argb
   209     mov       ecx, [esp + 12]  // pix
   210     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
   211     pslld     xmm5, 24
   212     movdqa    xmm4, kShuffleMaskRGB24ToARGB
   214     align      4
   215  convertloop:
   216     movdqu    xmm0, [eax]
   217     movdqu    xmm1, [eax + 16]
   218     movdqu    xmm3, [eax + 32]
   219     lea       eax, [eax + 48]
   220     movdqa    xmm2, xmm3
   221     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
   222     pshufb    xmm2, xmm4
   223     por       xmm2, xmm5
   224     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
   225     pshufb    xmm0, xmm4
   226     movdqa    [edx + 32], xmm2
   227     por       xmm0, xmm5
   228     pshufb    xmm1, xmm4
   229     movdqa    [edx], xmm0
   230     por       xmm1, xmm5
   231     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
   232     pshufb    xmm3, xmm4
   233     movdqa    [edx + 16], xmm1
   234     por       xmm3, xmm5
   235     sub       ecx, 16
   236     movdqa    [edx + 48], xmm3
   237     lea       edx, [edx + 64]
   238     jg        convertloop
   239     ret
   240   }
   241 }
   243 __declspec(naked) __declspec(align(16))
   244 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
   245                         int pix) {
   246   __asm {
   247     mov       eax, [esp + 4]   // src_raw
   248     mov       edx, [esp + 8]   // dst_argb
   249     mov       ecx, [esp + 12]  // pix
   250     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
   251     pslld     xmm5, 24
   252     movdqa    xmm4, kShuffleMaskRAWToARGB
   254     align      4
   255  convertloop:
   256     movdqu    xmm0, [eax]
   257     movdqu    xmm1, [eax + 16]
   258     movdqu    xmm3, [eax + 32]
   259     lea       eax, [eax + 48]
   260     movdqa    xmm2, xmm3
   261     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
   262     pshufb    xmm2, xmm4
   263     por       xmm2, xmm5
   264     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
   265     pshufb    xmm0, xmm4
   266     movdqa    [edx + 32], xmm2
   267     por       xmm0, xmm5
   268     pshufb    xmm1, xmm4
   269     movdqa    [edx], xmm0
   270     por       xmm1, xmm5
   271     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
   272     pshufb    xmm3, xmm4
   273     movdqa    [edx + 16], xmm1
   274     por       xmm3, xmm5
   275     sub       ecx, 16
   276     movdqa    [edx + 48], xmm3
   277     lea       edx, [edx + 64]
   278     jg        convertloop
   279     ret
   280   }
   281 }
   283 // pmul method to replicate bits.
   284 // Math to replicate bits:
   285 // (v << 8) | (v << 3)
   286 // v * 256 + v * 8
   287 // v * (256 + 8)
   288 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
   289 // 20 instructions.
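
// Worked example of the replication above: with a 5-bit value v positioned in
// the top 5 bits of a 16-bit lane, pmulhuw by 0x0108 yields
//   (v * 0x0108) >> 5  ==  (v << 3) | (v >> 2)
// e.g. v = 31 -> 255 and v = 16 -> 132, the usual 5-to-8 bit expansion. The
// 6-bit green path does the same with 0x2080, giving (g << 2) | (g >> 4).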
   290 __declspec(naked) __declspec(align(16))
   291 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
   292                           int pix) {
   293   __asm {
   294     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
   295     movd      xmm5, eax
   296     pshufd    xmm5, xmm5, 0
   297     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
   298     movd      xmm6, eax
   299     pshufd    xmm6, xmm6, 0
   300     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
   301     psllw     xmm3, 11
   302     pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
   303     psllw     xmm4, 10
   304     psrlw     xmm4, 5
   305     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
   306     psllw     xmm7, 8
   308     mov       eax, [esp + 4]   // src_rgb565
   309     mov       edx, [esp + 8]   // dst_argb
   310     mov       ecx, [esp + 12]  // pix
   311     sub       edx, eax
   312     sub       edx, eax
   314     align      4
   315  convertloop:
   316     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
   317     movdqa    xmm1, xmm0
   318     movdqa    xmm2, xmm0
   319     pand      xmm1, xmm3    // R in upper 5 bits
   320     psllw     xmm2, 11      // B in upper 5 bits
   321     pmulhuw   xmm1, xmm5    // * (256 + 8)
   322     pmulhuw   xmm2, xmm5    // * (256 + 8)
   323     psllw     xmm1, 8
   324     por       xmm1, xmm2    // RB
   325     pand      xmm0, xmm4    // G in middle 6 bits
   326     pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
   327     por       xmm0, xmm7    // AG
   328     movdqa    xmm2, xmm1
   329     punpcklbw xmm1, xmm0
   330     punpckhbw xmm2, xmm0
   331     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
   332     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
   333     lea       eax, [eax + 16]
   334     sub       ecx, 8
   335     jg        convertloop
   336     ret
   337   }
   338 }
   340 // 24 instructions
   341 __declspec(naked) __declspec(align(16))
   342 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
   343                             int pix) {
   344   __asm {
   345     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
   346     movd      xmm5, eax
   347     pshufd    xmm5, xmm5, 0
   348     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
   349     movd      xmm6, eax
   350     pshufd    xmm6, xmm6, 0
   351     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
   352     psllw     xmm3, 11
   353     movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
   354     psrlw     xmm4, 6
   355     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
   356     psllw     xmm7, 8
   358     mov       eax, [esp + 4]   // src_argb1555
   359     mov       edx, [esp + 8]   // dst_argb
   360     mov       ecx, [esp + 12]  // pix
   361     sub       edx, eax
   362     sub       edx, eax
   364     align      4
   365  convertloop:
   366     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
   367     movdqa    xmm1, xmm0
   368     movdqa    xmm2, xmm0
   369     psllw     xmm1, 1       // R in upper 5 bits
   370     psllw     xmm2, 11      // B in upper 5 bits
   371     pand      xmm1, xmm3
   372     pmulhuw   xmm2, xmm5    // * (256 + 8)
   373     pmulhuw   xmm1, xmm5    // * (256 + 8)
   374     psllw     xmm1, 8
   375     por       xmm1, xmm2    // RB
   376     movdqa    xmm2, xmm0
   377     pand      xmm0, xmm4    // G in middle 5 bits
   378     psraw     xmm2, 8       // A
   379     pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
   380     pand      xmm2, xmm7
   381     por       xmm0, xmm2    // AG
   382     movdqa    xmm2, xmm1
   383     punpcklbw xmm1, xmm0
   384     punpckhbw xmm2, xmm0
   385     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
   386     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
   387     lea       eax, [eax + 16]
   388     sub       ecx, 8
   389     jg        convertloop
   390     ret
   391   }
   392 }
   394 // 18 instructions.
   395 __declspec(naked) __declspec(align(16))
   396 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
   397                             int pix) {
   398   __asm {
   399     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
   400     movd      xmm4, eax
   401     pshufd    xmm4, xmm4, 0
   402     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
   403     pslld     xmm5, 4
   404     mov       eax, [esp + 4]   // src_argb4444
   405     mov       edx, [esp + 8]   // dst_argb
   406     mov       ecx, [esp + 12]  // pix
   407     sub       edx, eax
   408     sub       edx, eax
   410     align      4
   411  convertloop:
   412     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
   413     movdqa    xmm2, xmm0
   414     pand      xmm0, xmm4    // mask low nibbles
   415     pand      xmm2, xmm5    // mask high nibbles
   416     movdqa    xmm1, xmm0
   417     movdqa    xmm3, xmm2
   418     psllw     xmm1, 4
   419     psrlw     xmm3, 4
   420     por       xmm0, xmm1
   421     por       xmm2, xmm3
   422     movdqa    xmm1, xmm0
   423     punpcklbw xmm0, xmm2
   424     punpckhbw xmm1, xmm2
   425     movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
   426     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
   427     lea       eax, [eax + 16]
   428     sub       ecx, 8
   429     jg        convertloop
   430     ret
   431   }
   432 }
   434 __declspec(naked) __declspec(align(16))
   435 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   436   __asm {
   437     mov       eax, [esp + 4]   // src_argb
   438     mov       edx, [esp + 8]   // dst_rgb
   439     mov       ecx, [esp + 12]  // pix
   440     movdqa    xmm6, kShuffleMaskARGBToRGB24
   442     align      4
   443  convertloop:
   444     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
   445     movdqu    xmm1, [eax + 16]
   446     movdqu    xmm2, [eax + 32]
   447     movdqu    xmm3, [eax + 48]
   448     lea       eax, [eax + 64]
   449     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
   450     pshufb    xmm1, xmm6
   451     pshufb    xmm2, xmm6
   452     pshufb    xmm3, xmm6
   453     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
   454     psrldq    xmm1, 4      // 8 bytes from 1
   455     pslldq    xmm4, 12     // 4 bytes from 1 for 0
   456     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
   457     por       xmm0, xmm4   // 4 bytes from 1 for 0
   458     pslldq    xmm5, 8      // 8 bytes from 2 for 1
   459     movdqu    [edx], xmm0  // store 0
   460     por       xmm1, xmm5   // 8 bytes from 2 for 1
   461     psrldq    xmm2, 8      // 4 bytes from 2
   462     pslldq    xmm3, 4      // 12 bytes from 3 for 2
   463     por       xmm2, xmm3   // 12 bytes from 3 for 2
   464     movdqu    [edx + 16], xmm1   // store 1
   465     movdqu    [edx + 32], xmm2   // store 2
   466     lea       edx, [edx + 48]
   467     sub       ecx, 16
   468     jg        convertloop
   469     ret
   470   }
   471 }
   473 __declspec(naked) __declspec(align(16))
   474 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   475   __asm {
   476     mov       eax, [esp + 4]   // src_argb
   477     mov       edx, [esp + 8]   // dst_rgb
   478     mov       ecx, [esp + 12]  // pix
   479     movdqa    xmm6, kShuffleMaskARGBToRAW
   481     align      4
   482  convertloop:
   483     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
   484     movdqu    xmm1, [eax + 16]
   485     movdqu    xmm2, [eax + 32]
   486     movdqu    xmm3, [eax + 48]
   487     lea       eax, [eax + 64]
   488     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
   489     pshufb    xmm1, xmm6
   490     pshufb    xmm2, xmm6
   491     pshufb    xmm3, xmm6
   492     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
   493     psrldq    xmm1, 4      // 8 bytes from 1
   494     pslldq    xmm4, 12     // 4 bytes from 1 for 0
   495     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
   496     por       xmm0, xmm4   // 4 bytes from 1 for 0
   497     pslldq    xmm5, 8      // 8 bytes from 2 for 1
   498     movdqu    [edx], xmm0  // store 0
   499     por       xmm1, xmm5   // 8 bytes from 2 for 1
   500     psrldq    xmm2, 8      // 4 bytes from 2
   501     pslldq    xmm3, 4      // 12 bytes from 3 for 2
   502     por       xmm2, xmm3   // 12 bytes from 3 for 2
   503     movdqu    [edx + 16], xmm1   // store 1
   504     movdqu    [edx + 32], xmm2   // store 2
   505     lea       edx, [edx + 48]
   506     sub       ecx, 16
   507     jg        convertloop
   508     ret
   509   }
   510 }
   512 __declspec(naked) __declspec(align(16))
   513 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   514   __asm {
   515     mov       eax, [esp + 4]   // src_argb
   516     mov       edx, [esp + 8]   // dst_rgb
   517     mov       ecx, [esp + 12]  // pix
   518     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
   519     psrld     xmm3, 27
   520     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
   521     psrld     xmm4, 26
   522     pslld     xmm4, 5
   523     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
   524     pslld     xmm5, 11
   526     align      4
   527  convertloop:
   528     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   529     movdqa    xmm1, xmm0    // B
   530     movdqa    xmm2, xmm0    // G
   531     pslld     xmm0, 8       // R
   532     psrld     xmm1, 3       // B
   533     psrld     xmm2, 5       // G
   534     psrad     xmm0, 16      // R
   535     pand      xmm1, xmm3    // B
   536     pand      xmm2, xmm4    // G
   537     pand      xmm0, xmm5    // R
   538     por       xmm1, xmm2    // BG
   539     por       xmm0, xmm1    // BGR
   540     packssdw  xmm0, xmm0
   541     lea       eax, [eax + 16]
   542     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
   543     lea       edx, [edx + 8]
   544     sub       ecx, 4
   545     jg        convertloop
   546     ret
   547   }
   548 }
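
// Scalar sketch of the packing above (per pixel, with B, G, R the low three
// bytes of the ARGB dword):
//   rgb565 = (B >> 3) | ((G >> 2) << 5) | ((R >> 3) << 11);
// The ARGB1555 and ARGB4444 rows below follow the same pattern with 5/5/5/1
// and 4/4/4/4 bit fields.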
   550 // TODO(fbarchard): Improve sign extension/packing.
   551 __declspec(naked) __declspec(align(16))
   552 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   553   __asm {
   554     mov       eax, [esp + 4]   // src_argb
   555     mov       edx, [esp + 8]   // dst_rgb
   556     mov       ecx, [esp + 12]  // pix
   557     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
   558     psrld     xmm4, 27
   559     movdqa    xmm5, xmm4       // generate mask 0x000003e0
   560     pslld     xmm5, 5
   561     movdqa    xmm6, xmm4       // generate mask 0x00007c00
   562     pslld     xmm6, 10
   563     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
   564     pslld     xmm7, 15
   566     align      4
   567  convertloop:
   568     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   569     movdqa    xmm1, xmm0    // B
   570     movdqa    xmm2, xmm0    // G
   571     movdqa    xmm3, xmm0    // R
   572     psrad     xmm0, 16      // A
   573     psrld     xmm1, 3       // B
   574     psrld     xmm2, 6       // G
   575     psrld     xmm3, 9       // R
   576     pand      xmm0, xmm7    // A
   577     pand      xmm1, xmm4    // B
   578     pand      xmm2, xmm5    // G
   579     pand      xmm3, xmm6    // R
   580     por       xmm0, xmm1    // BA
   581     por       xmm2, xmm3    // GR
   582     por       xmm0, xmm2    // BGRA
   583     packssdw  xmm0, xmm0
   584     lea       eax, [eax + 16]
   585     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
   586     lea       edx, [edx + 8]
   587     sub       ecx, 4
   588     jg        convertloop
   589     ret
   590   }
   591 }
   593 __declspec(naked) __declspec(align(16))
   594 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   595   __asm {
   596     mov       eax, [esp + 4]   // src_argb
   597     mov       edx, [esp + 8]   // dst_rgb
   598     mov       ecx, [esp + 12]  // pix
   599     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
   600     psllw     xmm4, 12
   601     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
   602     psrlw     xmm3, 8
   604     align      4
   605  convertloop:
   606     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
   607     movdqa    xmm1, xmm0
   608     pand      xmm0, xmm3    // low nibble
   609     pand      xmm1, xmm4    // high nibble
    610     psrlw     xmm0, 4
    611     psrlw     xmm1, 8
   612     por       xmm0, xmm1
   613     packuswb  xmm0, xmm0
   614     lea       eax, [eax + 16]
   615     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
   616     lea       edx, [edx + 8]
   617     sub       ecx, 4
   618     jg        convertloop
   619     ret
   620   }
   621 }
   623 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
   624 __declspec(naked) __declspec(align(16))
   625 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   626   __asm {
   627     mov        eax, [esp + 4]   /* src_argb */
   628     mov        edx, [esp + 8]   /* dst_y */
   629     mov        ecx, [esp + 12]  /* pix */
   630     movdqa     xmm5, kAddY16
   631     movdqa     xmm4, kARGBToY
   633     align      4
   634  convertloop:
   635     movdqa     xmm0, [eax]
   636     movdqa     xmm1, [eax + 16]
   637     movdqa     xmm2, [eax + 32]
   638     movdqa     xmm3, [eax + 48]
   639     pmaddubsw  xmm0, xmm4
   640     pmaddubsw  xmm1, xmm4
   641     pmaddubsw  xmm2, xmm4
   642     pmaddubsw  xmm3, xmm4
   643     lea        eax, [eax + 64]
   644     phaddw     xmm0, xmm1
   645     phaddw     xmm2, xmm3
   646     psrlw      xmm0, 7
   647     psrlw      xmm2, 7
   648     packuswb   xmm0, xmm2
   649     paddb      xmm0, xmm5
   650     sub        ecx, 16
   651     movdqa     [edx], xmm0
   652     lea        edx, [edx + 16]
   653     jg         convertloop
   654     ret
   655   }
   656 }
   658 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
   659 __declspec(naked) __declspec(align(16))
   660 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   661   __asm {
   662     mov        eax, [esp + 4]   /* src_argb */
   663     mov        edx, [esp + 8]   /* dst_y */
   664     mov        ecx, [esp + 12]  /* pix */
   665     movdqa     xmm4, kARGBToYJ
   666     movdqa     xmm5, kAddYJ64
   668     align      4
   669  convertloop:
   670     movdqa     xmm0, [eax]
   671     movdqa     xmm1, [eax + 16]
   672     movdqa     xmm2, [eax + 32]
   673     movdqa     xmm3, [eax + 48]
   674     pmaddubsw  xmm0, xmm4
   675     pmaddubsw  xmm1, xmm4
   676     pmaddubsw  xmm2, xmm4
   677     pmaddubsw  xmm3, xmm4
   678     lea        eax, [eax + 64]
   679     phaddw     xmm0, xmm1
   680     phaddw     xmm2, xmm3
   681     paddw      xmm0, xmm5  // Add .5 for rounding.
   682     paddw      xmm2, xmm5
   683     psrlw      xmm0, 7
   684     psrlw      xmm2, 7
   685     packuswb   xmm0, xmm2
   686     sub        ecx, 16
   687     movdqa     [edx], xmm0
   688     lea        edx, [edx + 16]
   689     jg         convertloop
   690     ret
   691   }
   692 }
   694 #ifdef HAS_ARGBTOYROW_AVX2
   695 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   696 __declspec(naked) __declspec(align(32))
   697 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   698   __asm {
   699     mov        eax, [esp + 4]   /* src_argb */
   700     mov        edx, [esp + 8]   /* dst_y */
   701     mov        ecx, [esp + 12]  /* pix */
   702     vbroadcastf128 ymm4, kARGBToY
   703     vbroadcastf128 ymm5, kAddY16
   704     vmovdqa    ymm6, kPermdARGBToY_AVX
   706     align      4
   707  convertloop:
   708     vmovdqu    ymm0, [eax]
   709     vmovdqu    ymm1, [eax + 32]
   710     vmovdqu    ymm2, [eax + 64]
   711     vmovdqu    ymm3, [eax + 96]
   712     vpmaddubsw ymm0, ymm0, ymm4
   713     vpmaddubsw ymm1, ymm1, ymm4
   714     vpmaddubsw ymm2, ymm2, ymm4
   715     vpmaddubsw ymm3, ymm3, ymm4
   716     lea        eax, [eax + 128]
   717     vphaddw    ymm0, ymm0, ymm1  // mutates.
   718     vphaddw    ymm2, ymm2, ymm3
   719     vpsrlw     ymm0, ymm0, 7
   720     vpsrlw     ymm2, ymm2, 7
   721     vpackuswb  ymm0, ymm0, ymm2  // mutates.
   722     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   723     vpaddb     ymm0, ymm0, ymm5
   724     sub        ecx, 32
   725     vmovdqu    [edx], ymm0
   726     lea        edx, [edx + 32]
   727     jg         convertloop
   728     vzeroupper
   729     ret
   730   }
   731 }
   732 #endif  //  HAS_ARGBTOYROW_AVX2
   734 #ifdef HAS_ARGBTOYJROW_AVX2
   735 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
   736 __declspec(naked) __declspec(align(32))
   737 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   738   __asm {
   739     mov        eax, [esp + 4]   /* src_argb */
   740     mov        edx, [esp + 8]   /* dst_y */
   741     mov        ecx, [esp + 12]  /* pix */
   742     vbroadcastf128 ymm4, kARGBToYJ
   743     vbroadcastf128 ymm5, kAddYJ64
   744     vmovdqa    ymm6, kPermdARGBToY_AVX
   746     align      4
   747  convertloop:
   748     vmovdqu    ymm0, [eax]
   749     vmovdqu    ymm1, [eax + 32]
   750     vmovdqu    ymm2, [eax + 64]
   751     vmovdqu    ymm3, [eax + 96]
   752     vpmaddubsw ymm0, ymm0, ymm4
   753     vpmaddubsw ymm1, ymm1, ymm4
   754     vpmaddubsw ymm2, ymm2, ymm4
   755     vpmaddubsw ymm3, ymm3, ymm4
   756     lea        eax, [eax + 128]
   757     vphaddw    ymm0, ymm0, ymm1  // mutates.
   758     vphaddw    ymm2, ymm2, ymm3
   759     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
   760     vpaddw     ymm2, ymm2, ymm5
   761     vpsrlw     ymm0, ymm0, 7
   762     vpsrlw     ymm2, ymm2, 7
   763     vpackuswb  ymm0, ymm0, ymm2  // mutates.
   764     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
   765     sub        ecx, 32
   766     vmovdqu    [edx], ymm0
   767     lea        edx, [edx + 32]
   768     jg         convertloop
   770     vzeroupper
   771     ret
   772   }
   773 }
   774 #endif  //  HAS_ARGBTOYJROW_AVX2
   776 __declspec(naked) __declspec(align(16))
   777 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   778   __asm {
   779     mov        eax, [esp + 4]   /* src_argb */
   780     mov        edx, [esp + 8]   /* dst_y */
   781     mov        ecx, [esp + 12]  /* pix */
   782     movdqa     xmm5, kAddY16
   783     movdqa     xmm4, kARGBToY
   785     align      4
   786  convertloop:
   787     movdqu     xmm0, [eax]
   788     movdqu     xmm1, [eax + 16]
   789     movdqu     xmm2, [eax + 32]
   790     movdqu     xmm3, [eax + 48]
   791     pmaddubsw  xmm0, xmm4
   792     pmaddubsw  xmm1, xmm4
   793     pmaddubsw  xmm2, xmm4
   794     pmaddubsw  xmm3, xmm4
   795     lea        eax, [eax + 64]
   796     phaddw     xmm0, xmm1
   797     phaddw     xmm2, xmm3
   798     psrlw      xmm0, 7
   799     psrlw      xmm2, 7
   800     packuswb   xmm0, xmm2
   801     paddb      xmm0, xmm5
   802     sub        ecx, 16
   803     movdqu     [edx], xmm0
   804     lea        edx, [edx + 16]
   805     jg         convertloop
   806     ret
   807   }
   808 }
   810 __declspec(naked) __declspec(align(16))
   811 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   812   __asm {
   813     mov        eax, [esp + 4]   /* src_argb */
   814     mov        edx, [esp + 8]   /* dst_y */
   815     mov        ecx, [esp + 12]  /* pix */
   816     movdqa     xmm4, kARGBToYJ
   817     movdqa     xmm5, kAddYJ64
   819     align      4
   820  convertloop:
   821     movdqu     xmm0, [eax]
   822     movdqu     xmm1, [eax + 16]
   823     movdqu     xmm2, [eax + 32]
   824     movdqu     xmm3, [eax + 48]
   825     pmaddubsw  xmm0, xmm4
   826     pmaddubsw  xmm1, xmm4
   827     pmaddubsw  xmm2, xmm4
   828     pmaddubsw  xmm3, xmm4
   829     lea        eax, [eax + 64]
   830     phaddw     xmm0, xmm1
   831     phaddw     xmm2, xmm3
   832     paddw      xmm0, xmm5
   833     paddw      xmm2, xmm5
   834     psrlw      xmm0, 7
   835     psrlw      xmm2, 7
   836     packuswb   xmm0, xmm2
   837     sub        ecx, 16
   838     movdqu     [edx], xmm0
   839     lea        edx, [edx + 16]
   840     jg         convertloop
   841     ret
   842   }
   843 }
   845 __declspec(naked) __declspec(align(16))
   846 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   847   __asm {
   848     mov        eax, [esp + 4]   /* src_argb */
   849     mov        edx, [esp + 8]   /* dst_y */
   850     mov        ecx, [esp + 12]  /* pix */
   851     movdqa     xmm5, kAddY16
   852     movdqa     xmm4, kBGRAToY
   854     align      4
   855  convertloop:
   856     movdqa     xmm0, [eax]
   857     movdqa     xmm1, [eax + 16]
   858     movdqa     xmm2, [eax + 32]
   859     movdqa     xmm3, [eax + 48]
   860     pmaddubsw  xmm0, xmm4
   861     pmaddubsw  xmm1, xmm4
   862     pmaddubsw  xmm2, xmm4
   863     pmaddubsw  xmm3, xmm4
   864     lea        eax, [eax + 64]
   865     phaddw     xmm0, xmm1
   866     phaddw     xmm2, xmm3
   867     psrlw      xmm0, 7
   868     psrlw      xmm2, 7
   869     packuswb   xmm0, xmm2
   870     paddb      xmm0, xmm5
   871     sub        ecx, 16
   872     movdqa     [edx], xmm0
   873     lea        edx, [edx + 16]
   874     jg         convertloop
   875     ret
   876   }
   877 }
   879 __declspec(naked) __declspec(align(16))
   880 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   881   __asm {
   882     mov        eax, [esp + 4]   /* src_argb */
   883     mov        edx, [esp + 8]   /* dst_y */
   884     mov        ecx, [esp + 12]  /* pix */
   885     movdqa     xmm5, kAddY16
   886     movdqa     xmm4, kBGRAToY
   888     align      4
   889  convertloop:
   890     movdqu     xmm0, [eax]
   891     movdqu     xmm1, [eax + 16]
   892     movdqu     xmm2, [eax + 32]
   893     movdqu     xmm3, [eax + 48]
   894     pmaddubsw  xmm0, xmm4
   895     pmaddubsw  xmm1, xmm4
   896     pmaddubsw  xmm2, xmm4
   897     pmaddubsw  xmm3, xmm4
   898     lea        eax, [eax + 64]
   899     phaddw     xmm0, xmm1
   900     phaddw     xmm2, xmm3
   901     psrlw      xmm0, 7
   902     psrlw      xmm2, 7
   903     packuswb   xmm0, xmm2
   904     paddb      xmm0, xmm5
   905     sub        ecx, 16
   906     movdqu     [edx], xmm0
   907     lea        edx, [edx + 16]
   908     jg         convertloop
   909     ret
   910   }
   911 }
   913 __declspec(naked) __declspec(align(16))
   914 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   915   __asm {
   916     mov        eax, [esp + 4]   /* src_argb */
   917     mov        edx, [esp + 8]   /* dst_y */
   918     mov        ecx, [esp + 12]  /* pix */
   919     movdqa     xmm5, kAddY16
   920     movdqa     xmm4, kABGRToY
   922     align      4
   923  convertloop:
   924     movdqa     xmm0, [eax]
   925     movdqa     xmm1, [eax + 16]
   926     movdqa     xmm2, [eax + 32]
   927     movdqa     xmm3, [eax + 48]
   928     pmaddubsw  xmm0, xmm4
   929     pmaddubsw  xmm1, xmm4
   930     pmaddubsw  xmm2, xmm4
   931     pmaddubsw  xmm3, xmm4
   932     lea        eax, [eax + 64]
   933     phaddw     xmm0, xmm1
   934     phaddw     xmm2, xmm3
   935     psrlw      xmm0, 7
   936     psrlw      xmm2, 7
   937     packuswb   xmm0, xmm2
   938     paddb      xmm0, xmm5
   939     sub        ecx, 16
   940     movdqa     [edx], xmm0
   941     lea        edx, [edx + 16]
   942     jg         convertloop
   943     ret
   944   }
   945 }
   947 __declspec(naked) __declspec(align(16))
   948 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   949   __asm {
   950     mov        eax, [esp + 4]   /* src_argb */
   951     mov        edx, [esp + 8]   /* dst_y */
   952     mov        ecx, [esp + 12]  /* pix */
   953     movdqa     xmm5, kAddY16
   954     movdqa     xmm4, kABGRToY
   956     align      4
   957  convertloop:
   958     movdqu     xmm0, [eax]
   959     movdqu     xmm1, [eax + 16]
   960     movdqu     xmm2, [eax + 32]
   961     movdqu     xmm3, [eax + 48]
   962     pmaddubsw  xmm0, xmm4
   963     pmaddubsw  xmm1, xmm4
   964     pmaddubsw  xmm2, xmm4
   965     pmaddubsw  xmm3, xmm4
   966     lea        eax, [eax + 64]
   967     phaddw     xmm0, xmm1
   968     phaddw     xmm2, xmm3
   969     psrlw      xmm0, 7
   970     psrlw      xmm2, 7
   971     packuswb   xmm0, xmm2
   972     paddb      xmm0, xmm5
   973     sub        ecx, 16
   974     movdqu     [edx], xmm0
   975     lea        edx, [edx + 16]
   976     jg         convertloop
   977     ret
   978   }
   979 }
   981 __declspec(naked) __declspec(align(16))
   982 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   983   __asm {
   984     mov        eax, [esp + 4]   /* src_argb */
   985     mov        edx, [esp + 8]   /* dst_y */
   986     mov        ecx, [esp + 12]  /* pix */
   987     movdqa     xmm5, kAddY16
   988     movdqa     xmm4, kRGBAToY
   990     align      4
   991  convertloop:
   992     movdqa     xmm0, [eax]
   993     movdqa     xmm1, [eax + 16]
   994     movdqa     xmm2, [eax + 32]
   995     movdqa     xmm3, [eax + 48]
   996     pmaddubsw  xmm0, xmm4
   997     pmaddubsw  xmm1, xmm4
   998     pmaddubsw  xmm2, xmm4
   999     pmaddubsw  xmm3, xmm4
  1000     lea        eax, [eax + 64]
  1001     phaddw     xmm0, xmm1
  1002     phaddw     xmm2, xmm3
  1003     psrlw      xmm0, 7
  1004     psrlw      xmm2, 7
  1005     packuswb   xmm0, xmm2
  1006     paddb      xmm0, xmm5
  1007     sub        ecx, 16
  1008     movdqa     [edx], xmm0
  1009     lea        edx, [edx + 16]
  1010     jg         convertloop
   1011     ret
   1012   }
   1013 }
  1015 __declspec(naked) __declspec(align(16))
  1016 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  1017   __asm {
  1018     mov        eax, [esp + 4]   /* src_argb */
  1019     mov        edx, [esp + 8]   /* dst_y */
  1020     mov        ecx, [esp + 12]  /* pix */
  1021     movdqa     xmm5, kAddY16
  1022     movdqa     xmm4, kRGBAToY
  1024     align      4
  1025  convertloop:
  1026     movdqu     xmm0, [eax]
  1027     movdqu     xmm1, [eax + 16]
  1028     movdqu     xmm2, [eax + 32]
  1029     movdqu     xmm3, [eax + 48]
  1030     pmaddubsw  xmm0, xmm4
  1031     pmaddubsw  xmm1, xmm4
  1032     pmaddubsw  xmm2, xmm4
  1033     pmaddubsw  xmm3, xmm4
  1034     lea        eax, [eax + 64]
  1035     phaddw     xmm0, xmm1
  1036     phaddw     xmm2, xmm3
  1037     psrlw      xmm0, 7
  1038     psrlw      xmm2, 7
  1039     packuswb   xmm0, xmm2
  1040     paddb      xmm0, xmm5
  1041     sub        ecx, 16
  1042     movdqu     [edx], xmm0
  1043     lea        edx, [edx + 16]
  1044     jg         convertloop
   1045     ret
   1046   }
   1047 }
  1049 __declspec(naked) __declspec(align(16))
  1050 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1051                        uint8* dst_u, uint8* dst_v, int width) {
  1052   __asm {
  1053     push       esi
  1054     push       edi
  1055     mov        eax, [esp + 8 + 4]   // src_argb
  1056     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1057     mov        edx, [esp + 8 + 12]  // dst_u
  1058     mov        edi, [esp + 8 + 16]  // dst_v
  1059     mov        ecx, [esp + 8 + 20]  // pix
  1060     movdqa     xmm7, kARGBToU
  1061     movdqa     xmm6, kARGBToV
  1062     movdqa     xmm5, kAddUV128
  1063     sub        edi, edx             // stride from u to v
  1065     align      4
  1066  convertloop:
  1067     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1068     movdqa     xmm0, [eax]
  1069     movdqa     xmm1, [eax + 16]
  1070     movdqa     xmm2, [eax + 32]
  1071     movdqa     xmm3, [eax + 48]
  1072     pavgb      xmm0, [eax + esi]
  1073     pavgb      xmm1, [eax + esi + 16]
  1074     pavgb      xmm2, [eax + esi + 32]
  1075     pavgb      xmm3, [eax + esi + 48]
  1076     lea        eax,  [eax + 64]
  1077     movdqa     xmm4, xmm0
  1078     shufps     xmm0, xmm1, 0x88
  1079     shufps     xmm4, xmm1, 0xdd
  1080     pavgb      xmm0, xmm4
  1081     movdqa     xmm4, xmm2
  1082     shufps     xmm2, xmm3, 0x88
  1083     shufps     xmm4, xmm3, 0xdd
  1084     pavgb      xmm2, xmm4
  1086     // step 2 - convert to U and V
  1087     // from here down is very similar to Y code except
   1088     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1089     movdqa     xmm1, xmm0
  1090     movdqa     xmm3, xmm2
  1091     pmaddubsw  xmm0, xmm7  // U
  1092     pmaddubsw  xmm2, xmm7
  1093     pmaddubsw  xmm1, xmm6  // V
  1094     pmaddubsw  xmm3, xmm6
  1095     phaddw     xmm0, xmm2
  1096     phaddw     xmm1, xmm3
  1097     psraw      xmm0, 8
  1098     psraw      xmm1, 8
  1099     packsswb   xmm0, xmm1
  1100     paddb      xmm0, xmm5            // -> unsigned
  1102     // step 3 - store 8 U and 8 V values
  1103     sub        ecx, 16
  1104     movlps     qword ptr [edx], xmm0 // U
  1105     movhps     qword ptr [edx + edi], xmm0 // V
  1106     lea        edx, [edx + 8]
  1107     jg         convertloop
  1109     pop        edi
  1110     pop        esi
   1111     ret
   1112   }
   1113 }
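
// A note on the subsampling in the UV rows above and below: the pavgb against
// [eax + esi] averages each pixel with the pixel directly below it (the next
// row), and the shufps 0x88 / 0xdd pair splits even and odd pixels so the
// following pavgb averages each pixel with its horizontal neighbour.
// Together this approximates a 2x2 box filter per channel, roughly
//   b = (b00 + b01 + b10 + b11) / 4
// (pavgb rounds each intermediate average up, so this is a slightly rounded
// approximation of the exact mean) before the U/V coefficients and bias are
// applied.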
  1115 __declspec(naked) __declspec(align(16))
  1116 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1117                         uint8* dst_u, uint8* dst_v, int width) {
  1118   __asm {
  1119     push       esi
  1120     push       edi
  1121     mov        eax, [esp + 8 + 4]   // src_argb
  1122     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1123     mov        edx, [esp + 8 + 12]  // dst_u
  1124     mov        edi, [esp + 8 + 16]  // dst_v
  1125     mov        ecx, [esp + 8 + 20]  // pix
  1126     movdqa     xmm7, kARGBToUJ
  1127     movdqa     xmm6, kARGBToVJ
  1128     movdqa     xmm5, kAddUVJ128
  1129     sub        edi, edx             // stride from u to v
  1131     align      4
  1132  convertloop:
  1133     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1134     movdqa     xmm0, [eax]
  1135     movdqa     xmm1, [eax + 16]
  1136     movdqa     xmm2, [eax + 32]
  1137     movdqa     xmm3, [eax + 48]
  1138     pavgb      xmm0, [eax + esi]
  1139     pavgb      xmm1, [eax + esi + 16]
  1140     pavgb      xmm2, [eax + esi + 32]
  1141     pavgb      xmm3, [eax + esi + 48]
  1142     lea        eax,  [eax + 64]
  1143     movdqa     xmm4, xmm0
  1144     shufps     xmm0, xmm1, 0x88
  1145     shufps     xmm4, xmm1, 0xdd
  1146     pavgb      xmm0, xmm4
  1147     movdqa     xmm4, xmm2
  1148     shufps     xmm2, xmm3, 0x88
  1149     shufps     xmm4, xmm3, 0xdd
  1150     pavgb      xmm2, xmm4
  1152     // step 2 - convert to U and V
  1153     // from here down is very similar to Y code except
   1154     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1155     movdqa     xmm1, xmm0
  1156     movdqa     xmm3, xmm2
  1157     pmaddubsw  xmm0, xmm7  // U
  1158     pmaddubsw  xmm2, xmm7
  1159     pmaddubsw  xmm1, xmm6  // V
  1160     pmaddubsw  xmm3, xmm6
  1161     phaddw     xmm0, xmm2
  1162     phaddw     xmm1, xmm3
  1163     paddw      xmm0, xmm5            // +.5 rounding -> unsigned
  1164     paddw      xmm1, xmm5
  1165     psraw      xmm0, 8
  1166     psraw      xmm1, 8
  1167     packsswb   xmm0, xmm1
  1169     // step 3 - store 8 U and 8 V values
  1170     sub        ecx, 16
  1171     movlps     qword ptr [edx], xmm0 // U
  1172     movhps     qword ptr [edx + edi], xmm0 // V
  1173     lea        edx, [edx + 8]
  1174     jg         convertloop
  1176     pop        edi
  1177     pop        esi
   1178     ret
   1179   }
   1180 }
  1182 #ifdef HAS_ARGBTOUVROW_AVX2
  1183 __declspec(naked) __declspec(align(32))
  1184 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  1185                       uint8* dst_u, uint8* dst_v, int width) {
  1186   __asm {
  1187     push       esi
  1188     push       edi
  1189     mov        eax, [esp + 8 + 4]   // src_argb
  1190     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1191     mov        edx, [esp + 8 + 12]  // dst_u
  1192     mov        edi, [esp + 8 + 16]  // dst_v
  1193     mov        ecx, [esp + 8 + 20]  // pix
  1194     vbroadcastf128 ymm5, kAddUV128
  1195     vbroadcastf128 ymm6, kARGBToV
  1196     vbroadcastf128 ymm7, kARGBToU
  1197     sub        edi, edx             // stride from u to v
  1199     align      4
  1200  convertloop:
  1201     /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1202     vmovdqu    ymm0, [eax]
  1203     vmovdqu    ymm1, [eax + 32]
  1204     vmovdqu    ymm2, [eax + 64]
  1205     vmovdqu    ymm3, [eax + 96]
  1206     vpavgb     ymm0, ymm0, [eax + esi]
  1207     vpavgb     ymm1, ymm1, [eax + esi + 32]
  1208     vpavgb     ymm2, ymm2, [eax + esi + 64]
  1209     vpavgb     ymm3, ymm3, [eax + esi + 96]
  1210     lea        eax,  [eax + 128]
  1211     vshufps    ymm4, ymm0, ymm1, 0x88
  1212     vshufps    ymm0, ymm0, ymm1, 0xdd
  1213     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
  1214     vshufps    ymm4, ymm2, ymm3, 0x88
  1215     vshufps    ymm2, ymm2, ymm3, 0xdd
  1216     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
  1218     // step 2 - convert to U and V
  1219     // from here down is very similar to Y code except
   1220     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1221     vpmaddubsw ymm1, ymm0, ymm7  // U
  1222     vpmaddubsw ymm3, ymm2, ymm7
  1223     vpmaddubsw ymm0, ymm0, ymm6  // V
  1224     vpmaddubsw ymm2, ymm2, ymm6
  1225     vphaddw    ymm1, ymm1, ymm3  // mutates
  1226     vphaddw    ymm0, ymm0, ymm2
  1227     vpsraw     ymm1, ymm1, 8
  1228     vpsraw     ymm0, ymm0, 8
  1229     vpacksswb  ymm0, ymm1, ymm0  // mutates
  1230     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
  1231     vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
  1232     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
  1234     // step 3 - store 16 U and 16 V values
  1235     sub         ecx, 32
  1236     vextractf128 [edx], ymm0, 0 // U
  1237     vextractf128 [edx + edi], ymm0, 1 // V
  1238     lea        edx, [edx + 16]
  1239     jg         convertloop
  1241     pop        edi
  1242     pop        esi
  1243     vzeroupper
   1244     ret
   1245   }
   1246 }
  1247 #endif  // HAS_ARGBTOUVROW_AVX2
  1249 __declspec(naked) __declspec(align(16))
  1250 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1251                                  uint8* dst_u, uint8* dst_v, int width) {
  1252   __asm {
  1253     push       esi
  1254     push       edi
  1255     mov        eax, [esp + 8 + 4]   // src_argb
  1256     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1257     mov        edx, [esp + 8 + 12]  // dst_u
  1258     mov        edi, [esp + 8 + 16]  // dst_v
  1259     mov        ecx, [esp + 8 + 20]  // pix
  1260     movdqa     xmm7, kARGBToU
  1261     movdqa     xmm6, kARGBToV
  1262     movdqa     xmm5, kAddUV128
  1263     sub        edi, edx             // stride from u to v
  1265     align      4
  1266  convertloop:
  1267     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1268     movdqu     xmm0, [eax]
  1269     movdqu     xmm1, [eax + 16]
  1270     movdqu     xmm2, [eax + 32]
  1271     movdqu     xmm3, [eax + 48]
  1272     movdqu     xmm4, [eax + esi]
  1273     pavgb      xmm0, xmm4
  1274     movdqu     xmm4, [eax + esi + 16]
  1275     pavgb      xmm1, xmm4
  1276     movdqu     xmm4, [eax + esi + 32]
  1277     pavgb      xmm2, xmm4
  1278     movdqu     xmm4, [eax + esi + 48]
  1279     pavgb      xmm3, xmm4
  1280     lea        eax,  [eax + 64]
  1281     movdqa     xmm4, xmm0
  1282     shufps     xmm0, xmm1, 0x88
  1283     shufps     xmm4, xmm1, 0xdd
  1284     pavgb      xmm0, xmm4
  1285     movdqa     xmm4, xmm2
  1286     shufps     xmm2, xmm3, 0x88
  1287     shufps     xmm4, xmm3, 0xdd
  1288     pavgb      xmm2, xmm4
  1290     // step 2 - convert to U and V
  1291     // from here down is very similar to Y code except
   1292     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1293     movdqa     xmm1, xmm0
  1294     movdqa     xmm3, xmm2
  1295     pmaddubsw  xmm0, xmm7  // U
  1296     pmaddubsw  xmm2, xmm7
  1297     pmaddubsw  xmm1, xmm6  // V
  1298     pmaddubsw  xmm3, xmm6
  1299     phaddw     xmm0, xmm2
  1300     phaddw     xmm1, xmm3
  1301     psraw      xmm0, 8
  1302     psraw      xmm1, 8
  1303     packsswb   xmm0, xmm1
  1304     paddb      xmm0, xmm5            // -> unsigned
  1306     // step 3 - store 8 U and 8 V values
  1307     sub        ecx, 16
  1308     movlps     qword ptr [edx], xmm0 // U
  1309     movhps     qword ptr [edx + edi], xmm0 // V
  1310     lea        edx, [edx + 8]
  1311     jg         convertloop
  1313     pop        edi
  1314     pop        esi
   1315     ret
   1316   }
   1317 }
  1319 __declspec(naked) __declspec(align(16))
  1320 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1321                                  uint8* dst_u, uint8* dst_v, int width) {
  1322   __asm {
  1323     push       esi
  1324     push       edi
  1325     mov        eax, [esp + 8 + 4]   // src_argb
  1326     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1327     mov        edx, [esp + 8 + 12]  // dst_u
  1328     mov        edi, [esp + 8 + 16]  // dst_v
  1329     mov        ecx, [esp + 8 + 20]  // pix
  1330     movdqa     xmm7, kARGBToUJ
  1331     movdqa     xmm6, kARGBToVJ
  1332     movdqa     xmm5, kAddUVJ128
  1333     sub        edi, edx             // stride from u to v
  1335     align      4
  1336  convertloop:
  1337     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1338     movdqu     xmm0, [eax]
  1339     movdqu     xmm1, [eax + 16]
  1340     movdqu     xmm2, [eax + 32]
  1341     movdqu     xmm3, [eax + 48]
  1342     movdqu     xmm4, [eax + esi]
  1343     pavgb      xmm0, xmm4
  1344     movdqu     xmm4, [eax + esi + 16]
  1345     pavgb      xmm1, xmm4
  1346     movdqu     xmm4, [eax + esi + 32]
  1347     pavgb      xmm2, xmm4
  1348     movdqu     xmm4, [eax + esi + 48]
  1349     pavgb      xmm3, xmm4
  1350     lea        eax,  [eax + 64]
  1351     movdqa     xmm4, xmm0
  1352     shufps     xmm0, xmm1, 0x88
  1353     shufps     xmm4, xmm1, 0xdd
  1354     pavgb      xmm0, xmm4
  1355     movdqa     xmm4, xmm2
  1356     shufps     xmm2, xmm3, 0x88
  1357     shufps     xmm4, xmm3, 0xdd
  1358     pavgb      xmm2, xmm4
  1360     // step 2 - convert to U and V
  1361     // from here down is very similar to Y code except
   1362     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1363     movdqa     xmm1, xmm0
  1364     movdqa     xmm3, xmm2
  1365     pmaddubsw  xmm0, xmm7  // U
  1366     pmaddubsw  xmm2, xmm7
  1367     pmaddubsw  xmm1, xmm6  // V
  1368     pmaddubsw  xmm3, xmm6
  1369     phaddw     xmm0, xmm2
  1370     phaddw     xmm1, xmm3
  1371     paddw      xmm0, xmm5            // +.5 rounding -> unsigned
  1372     paddw      xmm1, xmm5
  1373     psraw      xmm0, 8
  1374     psraw      xmm1, 8
  1375     packsswb   xmm0, xmm1
  1377     // step 3 - store 8 U and 8 V values
  1378     sub        ecx, 16
  1379     movlps     qword ptr [edx], xmm0 // U
  1380     movhps     qword ptr [edx + edi], xmm0 // V
  1381     lea        edx, [edx + 8]
  1382     jg         convertloop
  1384     pop        edi
  1385     pop        esi
   1386     ret
   1387   }
   1388 }
  1390 __declspec(naked) __declspec(align(16))
  1391 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
  1392                           uint8* dst_u, uint8* dst_v, int width) {
  1393   __asm {
  1394     push       edi
  1395     mov        eax, [esp + 4 + 4]   // src_argb
  1396     mov        edx, [esp + 4 + 8]   // dst_u
  1397     mov        edi, [esp + 4 + 12]  // dst_v
  1398     mov        ecx, [esp + 4 + 16]  // pix
  1399     movdqa     xmm7, kARGBToU
  1400     movdqa     xmm6, kARGBToV
  1401     movdqa     xmm5, kAddUV128
  1402     sub        edi, edx             // stride from u to v
  1404     align      4
  1405  convertloop:
  1406     /* convert to U and V */
  1407     movdqa     xmm0, [eax]          // U
  1408     movdqa     xmm1, [eax + 16]
  1409     movdqa     xmm2, [eax + 32]
  1410     movdqa     xmm3, [eax + 48]
  1411     pmaddubsw  xmm0, xmm7
  1412     pmaddubsw  xmm1, xmm7
  1413     pmaddubsw  xmm2, xmm7
  1414     pmaddubsw  xmm3, xmm7
  1415     phaddw     xmm0, xmm1
  1416     phaddw     xmm2, xmm3
  1417     psraw      xmm0, 8
  1418     psraw      xmm2, 8
  1419     packsswb   xmm0, xmm2
  1420     paddb      xmm0, xmm5
  1421     sub        ecx,  16
  1422     movdqa     [edx], xmm0
  1424     movdqa     xmm0, [eax]          // V
  1425     movdqa     xmm1, [eax + 16]
  1426     movdqa     xmm2, [eax + 32]
  1427     movdqa     xmm3, [eax + 48]
  1428     pmaddubsw  xmm0, xmm6
  1429     pmaddubsw  xmm1, xmm6
  1430     pmaddubsw  xmm2, xmm6
  1431     pmaddubsw  xmm3, xmm6
  1432     phaddw     xmm0, xmm1
  1433     phaddw     xmm2, xmm3
  1434     psraw      xmm0, 8
  1435     psraw      xmm2, 8
  1436     packsswb   xmm0, xmm2
  1437     paddb      xmm0, xmm5
  1438     lea        eax,  [eax + 64]
  1439     movdqa     [edx + edi], xmm0
  1440     lea        edx,  [edx + 16]
  1441     jg         convertloop
  1443     pop        edi
   1444     ret
   1445   }
   1446 }
  1448 __declspec(naked) __declspec(align(16))
  1449 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
  1450                                     uint8* dst_u, uint8* dst_v, int width) {
  1451   __asm {
  1452     push       edi
  1453     mov        eax, [esp + 4 + 4]   // src_argb
  1454     mov        edx, [esp + 4 + 8]   // dst_u
  1455     mov        edi, [esp + 4 + 12]  // dst_v
  1456     mov        ecx, [esp + 4 + 16]  // pix
  1457     movdqa     xmm7, kARGBToU
  1458     movdqa     xmm6, kARGBToV
  1459     movdqa     xmm5, kAddUV128
  1460     sub        edi, edx             // stride from u to v
  1462     align      4
  1463  convertloop:
  1464     /* convert to U and V */
  1465     movdqu     xmm0, [eax]          // U
  1466     movdqu     xmm1, [eax + 16]
  1467     movdqu     xmm2, [eax + 32]
  1468     movdqu     xmm3, [eax + 48]
  1469     pmaddubsw  xmm0, xmm7
  1470     pmaddubsw  xmm1, xmm7
  1471     pmaddubsw  xmm2, xmm7
  1472     pmaddubsw  xmm3, xmm7
  1473     phaddw     xmm0, xmm1
  1474     phaddw     xmm2, xmm3
  1475     psraw      xmm0, 8
  1476     psraw      xmm2, 8
  1477     packsswb   xmm0, xmm2
  1478     paddb      xmm0, xmm5
  1479     sub        ecx,  16
  1480     movdqu     [edx], xmm0
  1482     movdqu     xmm0, [eax]          // V
  1483     movdqu     xmm1, [eax + 16]
  1484     movdqu     xmm2, [eax + 32]
  1485     movdqu     xmm3, [eax + 48]
  1486     pmaddubsw  xmm0, xmm6
  1487     pmaddubsw  xmm1, xmm6
  1488     pmaddubsw  xmm2, xmm6
  1489     pmaddubsw  xmm3, xmm6
  1490     phaddw     xmm0, xmm1
  1491     phaddw     xmm2, xmm3
  1492     psraw      xmm0, 8
  1493     psraw      xmm2, 8
  1494     packsswb   xmm0, xmm2
  1495     paddb      xmm0, xmm5
  1496     lea        eax,  [eax + 64]
  1497     movdqu     [edx + edi], xmm0
  1498     lea        edx,  [edx + 16]
  1499     jg         convertloop
  1501     pop        edi
   1502     ret
   1503   }
   1504 }
  1506 __declspec(naked) __declspec(align(16))
  1507 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
  1508                           uint8* dst_u, uint8* dst_v, int width) {
  1509   __asm {
  1510     push       edi
  1511     mov        eax, [esp + 4 + 4]   // src_argb
  1512     mov        edx, [esp + 4 + 8]   // dst_u
  1513     mov        edi, [esp + 4 + 12]  // dst_v
  1514     mov        ecx, [esp + 4 + 16]  // pix
  1515     movdqa     xmm7, kARGBToU
  1516     movdqa     xmm6, kARGBToV
  1517     movdqa     xmm5, kAddUV128
  1518     sub        edi, edx             // stride from u to v
  1520     align      4
  1521  convertloop:
  1522     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1523     movdqa     xmm0, [eax]
  1524     movdqa     xmm1, [eax + 16]
  1525     movdqa     xmm2, [eax + 32]
  1526     movdqa     xmm3, [eax + 48]
  1527     lea        eax,  [eax + 64]
  1528     movdqa     xmm4, xmm0
  1529     shufps     xmm0, xmm1, 0x88
  1530     shufps     xmm4, xmm1, 0xdd
  1531     pavgb      xmm0, xmm4
  1532     movdqa     xmm4, xmm2
  1533     shufps     xmm2, xmm3, 0x88
  1534     shufps     xmm4, xmm3, 0xdd
  1535     pavgb      xmm2, xmm4
  1537     // step 2 - convert to U and V
  1538     // from here down is very similar to Y code except
   1539     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1540     movdqa     xmm1, xmm0
  1541     movdqa     xmm3, xmm2
  1542     pmaddubsw  xmm0, xmm7  // U
  1543     pmaddubsw  xmm2, xmm7
  1544     pmaddubsw  xmm1, xmm6  // V
  1545     pmaddubsw  xmm3, xmm6
  1546     phaddw     xmm0, xmm2
  1547     phaddw     xmm1, xmm3
  1548     psraw      xmm0, 8
  1549     psraw      xmm1, 8
  1550     packsswb   xmm0, xmm1
  1551     paddb      xmm0, xmm5            // -> unsigned
  1553     // step 3 - store 8 U and 8 V values
  1554     sub        ecx, 16
  1555     movlps     qword ptr [edx], xmm0 // U
  1556     movhps     qword ptr [edx + edi], xmm0 // V
  1557     lea        edx, [edx + 8]
  1558     jg         convertloop
  1560     pop        edi
   1561     ret
   1562   }
   1563 }
  1565 __declspec(naked) __declspec(align(16))
  1566 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
  1567                                     uint8* dst_u, uint8* dst_v, int width) {
  1568   __asm {
  1569     push       edi
  1570     mov        eax, [esp + 4 + 4]   // src_argb
  1571     mov        edx, [esp + 4 + 8]   // dst_u
  1572     mov        edi, [esp + 4 + 12]  // dst_v
  1573     mov        ecx, [esp + 4 + 16]  // pix
  1574     movdqa     xmm7, kARGBToU
  1575     movdqa     xmm6, kARGBToV
  1576     movdqa     xmm5, kAddUV128
  1577     sub        edi, edx             // stride from u to v
  1579     align      4
  1580  convertloop:
  1581     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1582     movdqu     xmm0, [eax]
  1583     movdqu     xmm1, [eax + 16]
  1584     movdqu     xmm2, [eax + 32]
  1585     movdqu     xmm3, [eax + 48]
  1586     lea        eax,  [eax + 64]
  1587     movdqa     xmm4, xmm0
  1588     shufps     xmm0, xmm1, 0x88
  1589     shufps     xmm4, xmm1, 0xdd
  1590     pavgb      xmm0, xmm4
  1591     movdqa     xmm4, xmm2
  1592     shufps     xmm2, xmm3, 0x88
  1593     shufps     xmm4, xmm3, 0xdd
  1594     pavgb      xmm2, xmm4
  1596     // step 2 - convert to U and V
  1597     // from here down is very similar to Y code except
  1598     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1599     movdqa     xmm1, xmm0
  1600     movdqa     xmm3, xmm2
  1601     pmaddubsw  xmm0, xmm7  // U
  1602     pmaddubsw  xmm2, xmm7
  1603     pmaddubsw  xmm1, xmm6  // V
  1604     pmaddubsw  xmm3, xmm6
  1605     phaddw     xmm0, xmm2
  1606     phaddw     xmm1, xmm3
  1607     psraw      xmm0, 8
  1608     psraw      xmm1, 8
  1609     packsswb   xmm0, xmm1
  1610     paddb      xmm0, xmm5            // -> unsigned
  1612     // step 3 - store 8 U and 8 V values
  1613     sub        ecx, 16
  1614     movlps     qword ptr [edx], xmm0 // U
  1615     movhps     qword ptr [edx + edi], xmm0 // V
  1616     lea        edx, [edx + 8]
  1617     jg         convertloop
  1619     pop        edi
  1620     ret
  1624 __declspec(naked) __declspec(align(16))
  1625 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1626                        uint8* dst_u, uint8* dst_v, int width) {
  1627   __asm {
  1628     push       esi
  1629     push       edi
  1630     mov        eax, [esp + 8 + 4]   // src_argb
  1631     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1632     mov        edx, [esp + 8 + 12]  // dst_u
  1633     mov        edi, [esp + 8 + 16]  // dst_v
  1634     mov        ecx, [esp + 8 + 20]  // pix
  1635     movdqa     xmm7, kBGRAToU
  1636     movdqa     xmm6, kBGRAToV
  1637     movdqa     xmm5, kAddUV128
  1638     sub        edi, edx             // stride from u to v
  1640     align      4
  1641  convertloop:
  1642     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1643     movdqa     xmm0, [eax]
  1644     movdqa     xmm1, [eax + 16]
  1645     movdqa     xmm2, [eax + 32]
  1646     movdqa     xmm3, [eax + 48]
  1647     pavgb      xmm0, [eax + esi]
  1648     pavgb      xmm1, [eax + esi + 16]
  1649     pavgb      xmm2, [eax + esi + 32]
  1650     pavgb      xmm3, [eax + esi + 48]
  1651     lea        eax,  [eax + 64]
  1652     movdqa     xmm4, xmm0
  1653     shufps     xmm0, xmm1, 0x88
  1654     shufps     xmm4, xmm1, 0xdd
  1655     pavgb      xmm0, xmm4
  1656     movdqa     xmm4, xmm2
  1657     shufps     xmm2, xmm3, 0x88
  1658     shufps     xmm4, xmm3, 0xdd
  1659     pavgb      xmm2, xmm4
  1661     // step 2 - convert to U and V
  1662     // from here down is very similar to Y code except
  1663     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1664     movdqa     xmm1, xmm0
  1665     movdqa     xmm3, xmm2
  1666     pmaddubsw  xmm0, xmm7  // U
  1667     pmaddubsw  xmm2, xmm7
  1668     pmaddubsw  xmm1, xmm6  // V
  1669     pmaddubsw  xmm3, xmm6
  1670     phaddw     xmm0, xmm2
  1671     phaddw     xmm1, xmm3
  1672     psraw      xmm0, 8
  1673     psraw      xmm1, 8
  1674     packsswb   xmm0, xmm1
  1675     paddb      xmm0, xmm5            // -> unsigned
  1677     // step 3 - store 8 U and 8 V values
  1678     sub        ecx, 16
  1679     movlps     qword ptr [edx], xmm0 // U
  1680     movhps     qword ptr [edx + edi], xmm0 // V
  1681     lea        edx, [edx + 8]
  1682     jg         convertloop
  1684     pop        edi
  1685     pop        esi
  1686     ret
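// BGRAToUVRow_SSSE3 above (and the ABGR/RGBA variants below) differ from the
// ARGBToUV422 rows in two ways only: step 1 adds a pavgb with [eax + esi]
// (esi holds src_stride_argb), so each U/V sample averages a 2x2 block
// (4:2:0) instead of a 1x2 pair, and the kBGRAToU/kBGRAToV tables reorder
// the coefficients for the A,R,G,B byte layout of BGRA.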
  1690 __declspec(naked) __declspec(align(16))
  1691 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1692                                  uint8* dst_u, uint8* dst_v, int width) {
  1693   __asm {
  1694     push       esi
  1695     push       edi
  1696     mov        eax, [esp + 8 + 4]   // src_argb
  1697     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1698     mov        edx, [esp + 8 + 12]  // dst_u
  1699     mov        edi, [esp + 8 + 16]  // dst_v
  1700     mov        ecx, [esp + 8 + 20]  // pix
  1701     movdqa     xmm7, kBGRAToU
  1702     movdqa     xmm6, kBGRAToV
  1703     movdqa     xmm5, kAddUV128
  1704     sub        edi, edx             // stride from u to v
  1706     align      4
  1707  convertloop:
  1708     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1709     movdqu     xmm0, [eax]
  1710     movdqu     xmm1, [eax + 16]
  1711     movdqu     xmm2, [eax + 32]
  1712     movdqu     xmm3, [eax + 48]
  1713     movdqu     xmm4, [eax + esi]
  1714     pavgb      xmm0, xmm4
  1715     movdqu     xmm4, [eax + esi + 16]
  1716     pavgb      xmm1, xmm4
  1717     movdqu     xmm4, [eax + esi + 32]
  1718     pavgb      xmm2, xmm4
  1719     movdqu     xmm4, [eax + esi + 48]
  1720     pavgb      xmm3, xmm4
  1721     lea        eax,  [eax + 64]
  1722     movdqa     xmm4, xmm0
  1723     shufps     xmm0, xmm1, 0x88
  1724     shufps     xmm4, xmm1, 0xdd
  1725     pavgb      xmm0, xmm4
  1726     movdqa     xmm4, xmm2
  1727     shufps     xmm2, xmm3, 0x88
  1728     shufps     xmm4, xmm3, 0xdd
  1729     pavgb      xmm2, xmm4
  1731     // step 2 - convert to U and V
  1732     // from here down is very similar to Y code except
  1733     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1734     movdqa     xmm1, xmm0
  1735     movdqa     xmm3, xmm2
  1736     pmaddubsw  xmm0, xmm7  // U
  1737     pmaddubsw  xmm2, xmm7
  1738     pmaddubsw  xmm1, xmm6  // V
  1739     pmaddubsw  xmm3, xmm6
  1740     phaddw     xmm0, xmm2
  1741     phaddw     xmm1, xmm3
  1742     psraw      xmm0, 8
  1743     psraw      xmm1, 8
  1744     packsswb   xmm0, xmm1
  1745     paddb      xmm0, xmm5            // -> unsigned
  1747     // step 3 - store 8 U and 8 V values
  1748     sub        ecx, 16
  1749     movlps     qword ptr [edx], xmm0 // U
  1750     movhps     qword ptr [edx + edi], xmm0 // V
  1751     lea        edx, [edx + 8]
  1752     jg         convertloop
  1754     pop        edi
  1755     pop        esi
  1756     ret
  1760 __declspec(naked) __declspec(align(16))
  1761 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1762                        uint8* dst_u, uint8* dst_v, int width) {
  1763   __asm {
  1764     push       esi
  1765     push       edi
  1766     mov        eax, [esp + 8 + 4]   // src_argb
  1767     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1768     mov        edx, [esp + 8 + 12]  // dst_u
  1769     mov        edi, [esp + 8 + 16]  // dst_v
  1770     mov        ecx, [esp + 8 + 20]  // pix
  1771     movdqa     xmm7, kABGRToU
  1772     movdqa     xmm6, kABGRToV
  1773     movdqa     xmm5, kAddUV128
  1774     sub        edi, edx             // stride from u to v
  1776     align      4
  1777  convertloop:
  1778     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1779     movdqa     xmm0, [eax]
  1780     movdqa     xmm1, [eax + 16]
  1781     movdqa     xmm2, [eax + 32]
  1782     movdqa     xmm3, [eax + 48]
  1783     pavgb      xmm0, [eax + esi]
  1784     pavgb      xmm1, [eax + esi + 16]
  1785     pavgb      xmm2, [eax + esi + 32]
  1786     pavgb      xmm3, [eax + esi + 48]
  1787     lea        eax,  [eax + 64]
  1788     movdqa     xmm4, xmm0
  1789     shufps     xmm0, xmm1, 0x88
  1790     shufps     xmm4, xmm1, 0xdd
  1791     pavgb      xmm0, xmm4
  1792     movdqa     xmm4, xmm2
  1793     shufps     xmm2, xmm3, 0x88
  1794     shufps     xmm4, xmm3, 0xdd
  1795     pavgb      xmm2, xmm4
  1797     // step 2 - convert to U and V
  1798     // from here down is very similar to Y code except
  1799     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1800     movdqa     xmm1, xmm0
  1801     movdqa     xmm3, xmm2
  1802     pmaddubsw  xmm0, xmm7  // U
  1803     pmaddubsw  xmm2, xmm7
  1804     pmaddubsw  xmm1, xmm6  // V
  1805     pmaddubsw  xmm3, xmm6
  1806     phaddw     xmm0, xmm2
  1807     phaddw     xmm1, xmm3
  1808     psraw      xmm0, 8
  1809     psraw      xmm1, 8
  1810     packsswb   xmm0, xmm1
  1811     paddb      xmm0, xmm5            // -> unsigned
  1813     // step 3 - store 8 U and 8 V values
  1814     sub        ecx, 16
  1815     movlps     qword ptr [edx], xmm0 // U
  1816     movhps     qword ptr [edx + edi], xmm0 // V
  1817     lea        edx, [edx + 8]
  1818     jg         convertloop
  1820     pop        edi
  1821     pop        esi
  1822     ret
  1826 __declspec(naked) __declspec(align(16))
  1827 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1828                                  uint8* dst_u, uint8* dst_v, int width) {
  1829   __asm {
  1830     push       esi
  1831     push       edi
  1832     mov        eax, [esp + 8 + 4]   // src_argb
  1833     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1834     mov        edx, [esp + 8 + 12]  // dst_u
  1835     mov        edi, [esp + 8 + 16]  // dst_v
  1836     mov        ecx, [esp + 8 + 20]  // pix
  1837     movdqa     xmm7, kABGRToU
  1838     movdqa     xmm6, kABGRToV
  1839     movdqa     xmm5, kAddUV128
  1840     sub        edi, edx             // stride from u to v
  1842     align      4
  1843  convertloop:
  1844     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1845     movdqu     xmm0, [eax]
  1846     movdqu     xmm1, [eax + 16]
  1847     movdqu     xmm2, [eax + 32]
  1848     movdqu     xmm3, [eax + 48]
  1849     movdqu     xmm4, [eax + esi]
  1850     pavgb      xmm0, xmm4
  1851     movdqu     xmm4, [eax + esi + 16]
  1852     pavgb      xmm1, xmm4
  1853     movdqu     xmm4, [eax + esi + 32]
  1854     pavgb      xmm2, xmm4
  1855     movdqu     xmm4, [eax + esi + 48]
  1856     pavgb      xmm3, xmm4
  1857     lea        eax,  [eax + 64]
  1858     movdqa     xmm4, xmm0
  1859     shufps     xmm0, xmm1, 0x88
  1860     shufps     xmm4, xmm1, 0xdd
  1861     pavgb      xmm0, xmm4
  1862     movdqa     xmm4, xmm2
  1863     shufps     xmm2, xmm3, 0x88
  1864     shufps     xmm4, xmm3, 0xdd
  1865     pavgb      xmm2, xmm4
  1867     // step 2 - convert to U and V
  1868     // from here down is very similar to Y code except
  1869     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1870     movdqa     xmm1, xmm0
  1871     movdqa     xmm3, xmm2
  1872     pmaddubsw  xmm0, xmm7  // U
  1873     pmaddubsw  xmm2, xmm7
  1874     pmaddubsw  xmm1, xmm6  // V
  1875     pmaddubsw  xmm3, xmm6
  1876     phaddw     xmm0, xmm2
  1877     phaddw     xmm1, xmm3
  1878     psraw      xmm0, 8
  1879     psraw      xmm1, 8
  1880     packsswb   xmm0, xmm1
  1881     paddb      xmm0, xmm5            // -> unsigned
  1883     // step 3 - store 8 U and 8 V values
  1884     sub        ecx, 16
  1885     movlps     qword ptr [edx], xmm0 // U
  1886     movhps     qword ptr [edx + edi], xmm0 // V
  1887     lea        edx, [edx + 8]
  1888     jg         convertloop
  1890     pop        edi
  1891     pop        esi
  1892     ret
  1896 __declspec(naked) __declspec(align(16))
  1897 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1898                        uint8* dst_u, uint8* dst_v, int width) {
  1899   __asm {
  1900     push       esi
  1901     push       edi
  1902     mov        eax, [esp + 8 + 4]   // src_argb
  1903     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1904     mov        edx, [esp + 8 + 12]  // dst_u
  1905     mov        edi, [esp + 8 + 16]  // dst_v
  1906     mov        ecx, [esp + 8 + 20]  // pix
  1907     movdqa     xmm7, kRGBAToU
  1908     movdqa     xmm6, kRGBAToV
  1909     movdqa     xmm5, kAddUV128
  1910     sub        edi, edx             // stride from u to v
  1912     align      4
  1913  convertloop:
  1914     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1915     movdqa     xmm0, [eax]
  1916     movdqa     xmm1, [eax + 16]
  1917     movdqa     xmm2, [eax + 32]
  1918     movdqa     xmm3, [eax + 48]
  1919     pavgb      xmm0, [eax + esi]
  1920     pavgb      xmm1, [eax + esi + 16]
  1921     pavgb      xmm2, [eax + esi + 32]
  1922     pavgb      xmm3, [eax + esi + 48]
  1923     lea        eax,  [eax + 64]
  1924     movdqa     xmm4, xmm0
  1925     shufps     xmm0, xmm1, 0x88
  1926     shufps     xmm4, xmm1, 0xdd
  1927     pavgb      xmm0, xmm4
  1928     movdqa     xmm4, xmm2
  1929     shufps     xmm2, xmm3, 0x88
  1930     shufps     xmm4, xmm3, 0xdd
  1931     pavgb      xmm2, xmm4
  1933     // step 2 - convert to U and V
  1934     // from here down is very similar to Y code except
  1935     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1936     movdqa     xmm1, xmm0
  1937     movdqa     xmm3, xmm2
  1938     pmaddubsw  xmm0, xmm7  // U
  1939     pmaddubsw  xmm2, xmm7
  1940     pmaddubsw  xmm1, xmm6  // V
  1941     pmaddubsw  xmm3, xmm6
  1942     phaddw     xmm0, xmm2
  1943     phaddw     xmm1, xmm3
  1944     psraw      xmm0, 8
  1945     psraw      xmm1, 8
  1946     packsswb   xmm0, xmm1
  1947     paddb      xmm0, xmm5            // -> unsigned
  1949     // step 3 - store 8 U and 8 V values
  1950     sub        ecx, 16
  1951     movlps     qword ptr [edx], xmm0 // U
  1952     movhps     qword ptr [edx + edi], xmm0 // V
  1953     lea        edx, [edx + 8]
  1954     jg         convertloop
  1956     pop        edi
  1957     pop        esi
  1958     ret
  1962 __declspec(naked) __declspec(align(16))
  1963 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1964                                  uint8* dst_u, uint8* dst_v, int width) {
  1965   __asm {
  1966     push       esi
  1967     push       edi
  1968     mov        eax, [esp + 8 + 4]   // src_argb
  1969     mov        esi, [esp + 8 + 8]   // src_stride_argb
  1970     mov        edx, [esp + 8 + 12]  // dst_u
  1971     mov        edi, [esp + 8 + 16]  // dst_v
  1972     mov        ecx, [esp + 8 + 20]  // pix
  1973     movdqa     xmm7, kRGBAToU
  1974     movdqa     xmm6, kRGBAToV
  1975     movdqa     xmm5, kAddUV128
  1976     sub        edi, edx             // stride from u to v
  1978     align      4
  1979  convertloop:
  1980     /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1981     movdqu     xmm0, [eax]
  1982     movdqu     xmm1, [eax + 16]
  1983     movdqu     xmm2, [eax + 32]
  1984     movdqu     xmm3, [eax + 48]
  1985     movdqu     xmm4, [eax + esi]
  1986     pavgb      xmm0, xmm4
  1987     movdqu     xmm4, [eax + esi + 16]
  1988     pavgb      xmm1, xmm4
  1989     movdqu     xmm4, [eax + esi + 32]
  1990     pavgb      xmm2, xmm4
  1991     movdqu     xmm4, [eax + esi + 48]
  1992     pavgb      xmm3, xmm4
  1993     lea        eax,  [eax + 64]
  1994     movdqa     xmm4, xmm0
  1995     shufps     xmm0, xmm1, 0x88
  1996     shufps     xmm4, xmm1, 0xdd
  1997     pavgb      xmm0, xmm4
  1998     movdqa     xmm4, xmm2
  1999     shufps     xmm2, xmm3, 0x88
  2000     shufps     xmm4, xmm3, 0xdd
  2001     pavgb      xmm2, xmm4
  2003     // step 2 - convert to U and V
  2004     // from here down is very similar to Y code except
  2005     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  2006     movdqa     xmm1, xmm0
  2007     movdqa     xmm3, xmm2
  2008     pmaddubsw  xmm0, xmm7  // U
  2009     pmaddubsw  xmm2, xmm7
  2010     pmaddubsw  xmm1, xmm6  // V
  2011     pmaddubsw  xmm3, xmm6
  2012     phaddw     xmm0, xmm2
  2013     phaddw     xmm1, xmm3
  2014     psraw      xmm0, 8
  2015     psraw      xmm1, 8
  2016     packsswb   xmm0, xmm1
  2017     paddb      xmm0, xmm5            // -> unsigned
  2019     // step 3 - store 8 U and 8 V values
  2020     sub        ecx, 16
  2021     movlps     qword ptr [edx], xmm0 // U
  2022     movhps     qword ptr [edx + edi], xmm0 // V
  2023     lea        edx, [edx + 8]
  2024     jg         convertloop
  2026     pop        edi
  2027     pop        esi
  2028     ret
  2031 #endif  // HAS_ARGBTOYROW_SSSE3
  2033 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
  2035 #define UB 127 /* (int8)(2.018 * 64) = 129, clamped to int8 max 127 */
  2036 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
  2037 #define UR 0
  2039 #define VB 0
  2040 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
  2041 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
  2043 // Bias
  2044 #define BB (UB * 128 + VB * 128)
  2045 #define BG (UG * 128 + VG * 128)
  2046 #define BR (UR * 128 + VR * 128)
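// These constants implement the BT.601 YUV-to-RGB conversion in 6-bit fixed
// point: each coefficient is scaled by 64 (and clipped to int8 range), so the
// Y + UV sum is shifted right by 6 before saturating to a byte. BB/BG/BR
// pre-bias the pmaddubsw result so that subtracting them turns the unsigned
// U/V bytes into the signed (U - 128)/(V - 128) terms of the usual formulas.
// A minimal scalar sketch of the per-pixel math used by YUVTORGB/YVUTORGB
// further below; illustrative only (Clamp8 and YuvPixelReference are made-up
// names, not libyuv functions, and the block is not compiled in).
#if 0
static inline uint8 Clamp8(int v) {  // packuswb-style saturation to [0, 255]
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                                     uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // luma term, scaled by 64 (psubsw + pmullw)
  *b = Clamp8((y1 + UB * (u - 128) + VB * (v - 128)) >> 6);
  *g = Clamp8((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = Clamp8((y1 + UR * (u - 128) + VR * (v - 128)) >> 6);
}
#endif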
  2048 #ifdef HAS_I422TOARGBROW_AVX2
  2050 static const lvec8 kUVToB_AVX = {
  2051   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  2052   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
  2053 };
  2054 static const lvec8 kUVToR_AVX = {
  2055   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  2056   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
  2057 };
  2058 static const lvec8 kUVToG_AVX = {
  2059   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  2060   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
  2061 };
  2062 static const lvec16 kYToRgb_AVX = {
  2063   YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
  2064 };
  2065 static const lvec16 kYSub16_AVX = {
  2066   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
  2067 };
  2068 static const lvec16 kUVBiasB_AVX = {
  2069   BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
  2070 };
  2071 static const lvec16 kUVBiasG_AVX = {
  2072   BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
  2073 };
  2074 static const lvec16 kUVBiasR_AVX = {
  2075   BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
  2076 };
  2078 // 16 pixels
  2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2080 __declspec(naked) __declspec(align(16))
  2081 void I422ToARGBRow_AVX2(const uint8* y_buf,
  2082                          const uint8* u_buf,
  2083                          const uint8* v_buf,
  2084                          uint8* dst_argb,
  2085                          int width) {
  2086   __asm {
  2087     push       esi
  2088     push       edi
  2089     mov        eax, [esp + 8 + 4]   // Y
  2090     mov        esi, [esp + 8 + 8]   // U
  2091     mov        edi, [esp + 8 + 12]  // V
  2092     mov        edx, [esp + 8 + 16]  // argb
  2093     mov        ecx, [esp + 8 + 20]  // width
  2094     sub        edi, esi
  2095     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
  2096     vpxor      ymm4, ymm4, ymm4
  2098     align      4
  2099  convertloop:
  2100     vmovq      xmm0, qword ptr [esi]          //  U
  2101     vmovq      xmm1, qword ptr [esi + edi]    //  V
  2102     lea        esi,  [esi + 8]
  2103     vpunpcklbw ymm0, ymm0, ymm1               // UV
  2104     vpermq     ymm0, ymm0, 0xd8
  2105     vpunpcklwd ymm0, ymm0, ymm0              // UVUV
  2106     vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
  2107     vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
  2108     vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
  2109     vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
  2110     vpsubw     ymm1, ymm1, kUVBiasG_AVX
  2111     vpsubw     ymm0, ymm0, kUVBiasR_AVX
  2113     // Step 2: Find Y contribution to 16 R,G,B values
  2114     vmovdqu    xmm3, [eax]                  // NOLINT
  2115     lea        eax, [eax + 16]
  2116     vpermq     ymm3, ymm3, 0xd8
  2117     vpunpcklbw ymm3, ymm3, ymm4
  2118     vpsubsw    ymm3, ymm3, kYSub16_AVX
  2119     vpmullw    ymm3, ymm3, kYToRgb_AVX
  2120     vpaddsw    ymm2, ymm2, ymm3           // B += Y
  2121     vpaddsw    ymm1, ymm1, ymm3           // G += Y
  2122     vpaddsw    ymm0, ymm0, ymm3           // R += Y
  2123     vpsraw     ymm2, ymm2, 6
  2124     vpsraw     ymm1, ymm1, 6
  2125     vpsraw     ymm0, ymm0, 6
  2126     vpackuswb  ymm2, ymm2, ymm2           // B
  2127     vpackuswb  ymm1, ymm1, ymm1           // G
  2128     vpackuswb  ymm0, ymm0, ymm0           // R
  2130     // Step 3: Weave into ARGB
  2131     vpunpcklbw ymm2, ymm2, ymm1           // BG
  2132     vpermq     ymm2, ymm2, 0xd8
  2133     vpunpcklbw ymm0, ymm0, ymm5           // RA
  2134     vpermq     ymm0, ymm0, 0xd8
  2135     vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
  2136     vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
  2137     vmovdqu    [edx], ymm1
  2138     vmovdqu    [edx + 32], ymm2
  2139     lea        edx,  [edx + 64]
  2140     sub        ecx, 16
  2141     jg         convertloop
  2142     vzeroupper
  2144     pop        edi
  2145     pop        esi
  2146     ret
  2149 #endif  // HAS_I422TOARGBROW_AVX2
  2151 #ifdef HAS_I422TOARGBROW_SSSE3
  2153 static const vec8 kUVToB = {
  2154   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
  2155 };
  2157 static const vec8 kUVToR = {
  2158   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
  2159 };
  2161 static const vec8 kUVToG = {
  2162   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
  2163 };
  2165 static const vec8 kVUToB = {
  2166   VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
  2167 };
  2169 static const vec8 kVUToR = {
  2170   VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
  2171 };
  2173 static const vec8 kVUToG = {
  2174   VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
  2175 };
  2177 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
  2178 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
  2179 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
  2180 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
  2181 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
  2183 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
  2185 // Read 8 UV from 444.
  2186 #define READYUV444 __asm {                                                     \
  2187     __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
  2188     __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
  2189     __asm lea        esi,  [esi + 8]                                           \
  2190     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  2193 // Read 4 UV from 422, upsample to 8 UV.
  2194 #define READYUV422 __asm {                                                     \
  2195     __asm movd       xmm0, [esi]          /* U */                              \
  2196     __asm movd       xmm1, [esi + edi]    /* V */                              \
  2197     __asm lea        esi,  [esi + 4]                                           \
  2198     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  2199     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  2202 // Read 2 UV from 411, upsample to 8 UV.
  2203 #define READYUV411 __asm {                                                     \
  2204     __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
  2205     __asm movd       xmm0, ebx                                                 \
  2206     __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
  2207     __asm movd       xmm1, ebx                                                 \
  2208     __asm lea        esi,  [esi + 2]                                           \
  2209     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  2210     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  2211     __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  2214 // Read 4 UV from NV12, upsample to 8 UV.
  2215 #define READNV12 __asm {                                                       \
  2216     __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
  2217     __asm lea        esi,  [esi + 8]                                           \
  2218     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  2221 // Convert 8 pixels: 8 UV and 8 Y.
  2222 #define YUVTORGB __asm {                                                       \
  2223     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
  2224     __asm movdqa     xmm1, xmm0                                                \
  2225     __asm movdqa     xmm2, xmm0                                                \
  2226     __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
  2227     __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
  2228     __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
  2229     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
  2230     __asm psubw      xmm1, kUVBiasG                                            \
  2231     __asm psubw      xmm2, kUVBiasR                                            \
  2232     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
  2233     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
  2234     __asm lea        eax, [eax + 8]                                            \
  2235     __asm punpcklbw  xmm3, xmm4                                                \
  2236     __asm psubsw     xmm3, kYSub16                                             \
  2237     __asm pmullw     xmm3, kYToRgb                                             \
  2238     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
  2239     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
  2240     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
  2241     __asm psraw      xmm0, 6                                                   \
  2242     __asm psraw      xmm1, 6                                                   \
  2243     __asm psraw      xmm2, 6                                                   \
  2244     __asm packuswb   xmm0, xmm0           /* B */                              \
  2245     __asm packuswb   xmm1, xmm1           /* G */                              \
  2246     __asm packuswb   xmm2, xmm2           /* R */                              \
  2249 // Convert 8 pixels: 8 VU and 8 Y.
  2250 #define YVUTORGB __asm {                                                       \
  2251     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
  2252     __asm movdqa     xmm1, xmm0                                                \
  2253     __asm movdqa     xmm2, xmm0                                                \
  2254     __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
  2255     __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
  2256     __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
  2257     __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
  2258     __asm psubw      xmm1, kUVBiasG                                            \
  2259     __asm psubw      xmm2, kUVBiasR                                            \
  2260     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
  2261     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
  2262     __asm lea        eax, [eax + 8]                                            \
  2263     __asm punpcklbw  xmm3, xmm4                                                \
  2264     __asm psubsw     xmm3, kYSub16                                             \
  2265     __asm pmullw     xmm3, kYToRgb                                             \
  2266     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
  2267     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
  2268     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
  2269     __asm psraw      xmm0, 6                                                   \
  2270     __asm psraw      xmm1, 6                                                   \
  2271     __asm psraw      xmm2, 6                                                   \
  2272     __asm packuswb   xmm0, xmm0           /* B */                              \
  2273     __asm packuswb   xmm1, xmm1           /* G */                              \
  2274     __asm packuswb   xmm2, xmm2           /* R */                              \
  2277 // 8 pixels, dest aligned 16.
  2278 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  2279 __declspec(naked) __declspec(align(16))
  2280 void I444ToARGBRow_SSSE3(const uint8* y_buf,
  2281                          const uint8* u_buf,
  2282                          const uint8* v_buf,
  2283                          uint8* dst_argb,
  2284                          int width) {
  2285   __asm {
  2286     push       esi
  2287     push       edi
  2288     mov        eax, [esp + 8 + 4]   // Y
  2289     mov        esi, [esp + 8 + 8]   // U
  2290     mov        edi, [esp + 8 + 12]  // V
  2291     mov        edx, [esp + 8 + 16]  // argb
  2292     mov        ecx, [esp + 8 + 20]  // width
  2293     sub        edi, esi
  2294     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2295     pxor       xmm4, xmm4
  2297     align      4
  2298  convertloop:
  2299     READYUV444
  2300     YUVTORGB
  2302     // Step 3: Weave into ARGB
  2303     punpcklbw  xmm0, xmm1           // BG
  2304     punpcklbw  xmm2, xmm5           // RA
  2305     movdqa     xmm1, xmm0
  2306     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2307     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2308     movdqa     [edx], xmm0
  2309     movdqa     [edx + 16], xmm1
  2310     lea        edx,  [edx + 32]
  2311     sub        ecx, 8
  2312     jg         convertloop
  2314     pop        edi
  2315     pop        esi
  2316     ret
  2320 // 8 pixels, dest unaligned.
  2321 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  2322 __declspec(naked) __declspec(align(16))
  2323 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
  2324                           const uint8* u_buf,
  2325                           const uint8* v_buf,
  2326                           uint8* dst_rgb24,
  2327                           int width) {
  2328   __asm {
  2329     push       esi
  2330     push       edi
  2331     mov        eax, [esp + 8 + 4]   // Y
  2332     mov        esi, [esp + 8 + 8]   // U
  2333     mov        edi, [esp + 8 + 12]  // V
  2334     mov        edx, [esp + 8 + 16]  // rgb24
  2335     mov        ecx, [esp + 8 + 20]  // width
  2336     sub        edi, esi
  2337     pxor       xmm4, xmm4
  2338     movdqa     xmm5, kShuffleMaskARGBToRGB24_0
  2339     movdqa     xmm6, kShuffleMaskARGBToRGB24
  2341     align      4
  2342  convertloop:
  2343     READYUV422
  2344     YUVTORGB
  2346     // Step 3: Weave into RRGB
  2347     punpcklbw  xmm0, xmm1           // BG
  2348     punpcklbw  xmm2, xmm2           // RR
  2349     movdqa     xmm1, xmm0
  2350     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  2351     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  2352     pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
  2353     pshufb     xmm1, xmm6           // Pack into first 12 bytes.
  2354     palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
  2355     movq       qword ptr [edx], xmm0  // First 8 bytes
  2356     movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
  2357     lea        edx,  [edx + 24]
  2358     sub        ecx, 8
  2359     jg         convertloop
  2361     pop        edi
  2362     pop        esi
  2363     ret
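// The pshufb/palignr sequence above only drops every fourth byte: the woven
// B,G,R,R dwords become 24 contiguous B,G,R bytes (I422ToRAWRow below is the
// same with R,G,B order). A scalar sketch of that packing step, illustrative
// only (made-up name, not compiled into libyuv):
#if 0
static void PackToRGB24Reference(const uint8* src_bgrx, uint8* dst_rgb24,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_bgrx[0];  // B
    dst_rgb24[1] = src_bgrx[1];  // G
    dst_rgb24[2] = src_bgrx[2];  // R
    src_bgrx += 4;               // skip the duplicated R in the 4th byte
    dst_rgb24 += 3;
  }
}
#endif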
  2367 // 8 pixels, dest unaligned.
  2368 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
  2369 __declspec(naked) __declspec(align(16))
  2370 void I422ToRAWRow_SSSE3(const uint8* y_buf,
  2371                         const uint8* u_buf,
  2372                         const uint8* v_buf,
  2373                         uint8* dst_raw,
  2374                         int width) {
  2375   __asm {
  2376     push       esi
  2377     push       edi
  2378     mov        eax, [esp + 8 + 4]   // Y
  2379     mov        esi, [esp + 8 + 8]   // U
  2380     mov        edi, [esp + 8 + 12]  // V
  2381     mov        edx, [esp + 8 + 16]  // raw
  2382     mov        ecx, [esp + 8 + 20]  // width
  2383     sub        edi, esi
  2384     pxor       xmm4, xmm4
  2385     movdqa     xmm5, kShuffleMaskARGBToRAW_0
  2386     movdqa     xmm6, kShuffleMaskARGBToRAW
  2388     align      4
  2389  convertloop:
  2390     READYUV422
  2391     YUVTORGB
  2393     // Step 3: Weave into RRGB
  2394     punpcklbw  xmm0, xmm1           // BG
  2395     punpcklbw  xmm2, xmm2           // RR
  2396     movdqa     xmm1, xmm0
  2397     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  2398     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  2399     pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
  2400     pshufb     xmm1, xmm6           // Pack into first 12 bytes.
  2401     palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
  2402     movq       qword ptr [edx], xmm0  // First 8 bytes
  2403     movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
  2404     lea        edx,  [edx + 24]
  2405     sub        ecx, 8
  2406     jg         convertloop
  2408     pop        edi
  2409     pop        esi
  2410     ret
  2414 // 8 pixels, dest unaligned.
  2415 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  2416 __declspec(naked) __declspec(align(16))
  2417 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
  2418                            const uint8* u_buf,
  2419                            const uint8* v_buf,
  2420                            uint8* rgb565_buf,
  2421                            int width) {
  2422   __asm {
  2423     push       esi
  2424     push       edi
  2425     mov        eax, [esp + 8 + 4]   // Y
  2426     mov        esi, [esp + 8 + 8]   // U
  2427     mov        edi, [esp + 8 + 12]  // V
  2428     mov        edx, [esp + 8 + 16]  // rgb565
  2429     mov        ecx, [esp + 8 + 20]  // width
  2430     sub        edi, esi
  2431     pxor       xmm4, xmm4
  2432     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
  2433     psrld      xmm5, 27
  2434     pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
  2435     psrld      xmm6, 26
  2436     pslld      xmm6, 5
  2437     pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
  2438     pslld      xmm7, 11
  2440     align      4
  2441  convertloop:
  2442     READYUV422
  2443     YUVTORGB
  2445     // Step 3: Weave into RRGB
  2446     punpcklbw  xmm0, xmm1           // BG
  2447     punpcklbw  xmm2, xmm2           // RR
  2448     movdqa     xmm1, xmm0
  2449     punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
  2450     punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
  2452     // Step 3b: RRGB -> RGB565
  2453     movdqa     xmm3, xmm0    // B  first 4 pixels of argb
  2454     movdqa     xmm2, xmm0    // G
  2455     pslld      xmm0, 8       // R
  2456     psrld      xmm3, 3       // B
  2457     psrld      xmm2, 5       // G
  2458     psrad      xmm0, 16      // R
  2459     pand       xmm3, xmm5    // B
  2460     pand       xmm2, xmm6    // G
  2461     pand       xmm0, xmm7    // R
  2462     por        xmm3, xmm2    // BG
  2463     por        xmm0, xmm3    // BGR
  2464     movdqa     xmm3, xmm1    // B  next 4 pixels of argb
  2465     movdqa     xmm2, xmm1    // G
  2466     pslld      xmm1, 8       // R
  2467     psrld      xmm3, 3       // B
  2468     psrld      xmm2, 5       // G
  2469     psrad      xmm1, 16      // R
  2470     pand       xmm3, xmm5    // B
  2471     pand       xmm2, xmm6    // G
  2472     pand       xmm1, xmm7    // R
  2473     por        xmm3, xmm2    // BG
  2474     por        xmm1, xmm3    // BGR
  2475     packssdw   xmm0, xmm1
  2476     sub        ecx, 8
  2477     movdqu     [edx], xmm0   // store 8 pixels of RGB565
  2478     lea        edx, [edx + 16]
  2479     jg         convertloop
  2481     pop        edi
  2482     pop        esi
  2483     ret
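// Step 3b above is the standard 5:6:5 packing done with dword shifts and the
// three masks built at function entry (0x0000001f, 0x000007e0, 0xfffff800).
// Per pixel it is equivalent to this scalar sketch (illustrative only,
// made-up name, not compiled into libyuv):
#if 0
static inline uint16 PackRGB565Reference(uint8 r, uint8 g, uint8 b) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif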
  2487 // 8 pixels, dest aligned 16.
  2488 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2489 __declspec(naked) __declspec(align(16))
  2490 void I422ToARGBRow_SSSE3(const uint8* y_buf,
  2491                          const uint8* u_buf,
  2492                          const uint8* v_buf,
  2493                          uint8* dst_argb,
  2494                          int width) {
  2495   __asm {
  2496     push       esi
  2497     push       edi
  2498     mov        eax, [esp + 8 + 4]   // Y
  2499     mov        esi, [esp + 8 + 8]   // U
  2500     mov        edi, [esp + 8 + 12]  // V
  2501     mov        edx, [esp + 8 + 16]  // argb
  2502     mov        ecx, [esp + 8 + 20]  // width
  2503     sub        edi, esi
  2504     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2505     pxor       xmm4, xmm4
  2507     align      4
  2508  convertloop:
  2509     READYUV422
  2510     YUVTORGB
  2512     // Step 3: Weave into ARGB
  2513     punpcklbw  xmm0, xmm1           // BG
  2514     punpcklbw  xmm2, xmm5           // RA
  2515     movdqa     xmm1, xmm0
  2516     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2517     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2518     movdqa     [edx], xmm0
  2519     movdqa     [edx + 16], xmm1
  2520     lea        edx,  [edx + 32]
  2521     sub        ecx, 8
  2522     jg         convertloop
  2524     pop        edi
  2525     pop        esi
  2526     ret
  2530 // 8 pixels, dest aligned 16.
  2531 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2532 // Similar to I420 but duplicate UV once more.
  2533 __declspec(naked) __declspec(align(16))
  2534 void I411ToARGBRow_SSSE3(const uint8* y_buf,
  2535                          const uint8* u_buf,
  2536                          const uint8* v_buf,
  2537                          uint8* dst_argb,
  2538                          int width) {
  2539   __asm {
  2540     push       ebx
  2541     push       esi
  2542     push       edi
  2543     mov        eax, [esp + 12 + 4]   // Y
  2544     mov        esi, [esp + 12 + 8]   // U
  2545     mov        edi, [esp + 12 + 12]  // V
  2546     mov        edx, [esp + 12 + 16]  // argb
  2547     mov        ecx, [esp + 12 + 20]  // width
  2548     sub        edi, esi
  2549     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2550     pxor       xmm4, xmm4
  2552     align      4
  2553  convertloop:
  2554     READYUV411  // modifies EBX
  2555     YUVTORGB
  2557     // Step 3: Weave into ARGB
  2558     punpcklbw  xmm0, xmm1           // BG
  2559     punpcklbw  xmm2, xmm5           // RA
  2560     movdqa     xmm1, xmm0
  2561     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2562     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2563     movdqa     [edx], xmm0
  2564     movdqa     [edx + 16], xmm1
  2565     lea        edx,  [edx + 32]
  2566     sub        ecx, 8
  2567     jg         convertloop
  2569     pop        edi
  2570     pop        esi
  2571     pop        ebx
  2572     ret
  2576 // 8 pixels, dest aligned 16.
  2577 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2578 __declspec(naked) __declspec(align(16))
  2579 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
  2580                          const uint8* uv_buf,
  2581                          uint8* dst_argb,
  2582                          int width) {
  2583   __asm {
  2584     push       esi
  2585     mov        eax, [esp + 4 + 4]   // Y
  2586     mov        esi, [esp + 4 + 8]   // UV
  2587     mov        edx, [esp + 4 + 12]  // argb
  2588     mov        ecx, [esp + 4 + 16]  // width
  2589     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2590     pxor       xmm4, xmm4
  2592     align      4
  2593  convertloop:
  2594     READNV12
  2595     YUVTORGB
  2597     // Step 3: Weave into ARGB
  2598     punpcklbw  xmm0, xmm1           // BG
  2599     punpcklbw  xmm2, xmm5           // RA
  2600     movdqa     xmm1, xmm0
  2601     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2602     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2603     movdqa     [edx], xmm0
  2604     movdqa     [edx + 16], xmm1
  2605     lea        edx,  [edx + 32]
  2606     sub        ecx, 8
  2607     jg         convertloop
  2609     pop        esi
  2610     ret
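// NV21ToARGBRow_SSSE3 below is identical to NV12ToARGBRow_SSSE3 above except
// that the interleaved chroma plane is ordered V,U instead of U,V, so the
// same READNV12 load is paired with YVUTORGB, which uses the kVUToB/kVUToG/
// kVUToR tables with the two coefficients of each pair swapped.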
  2614 // 8 pixels, dest aligned 16.
  2615 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2616 __declspec(naked) __declspec(align(16))
  2617 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
  2618                          const uint8* uv_buf,
  2619                          uint8* dst_argb,
  2620                          int width) {
  2621   __asm {
  2622     push       esi
  2623     mov        eax, [esp + 4 + 4]   // Y
  2624     mov        esi, [esp + 4 + 8]   // VU
  2625     mov        edx, [esp + 4 + 12]  // argb
  2626     mov        ecx, [esp + 4 + 16]  // width
  2627     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2628     pxor       xmm4, xmm4
  2630     align      4
  2631  convertloop:
  2632     READNV12
  2633     YVUTORGB
  2635     // Step 3: Weave into ARGB
  2636     punpcklbw  xmm0, xmm1           // BG
  2637     punpcklbw  xmm2, xmm5           // RA
  2638     movdqa     xmm1, xmm0
  2639     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2640     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2641     movdqa     [edx], xmm0
  2642     movdqa     [edx + 16], xmm1
  2643     lea        edx,  [edx + 32]
  2644     sub        ecx, 8
  2645     jg         convertloop
  2647     pop        esi
  2648     ret
  2652 // 8 pixels, unaligned.
  2653 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  2654 __declspec(naked) __declspec(align(16))
  2655 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2656                                    const uint8* u_buf,
  2657                                    const uint8* v_buf,
  2658                                    uint8* dst_argb,
  2659                                    int width) {
  2660   __asm {
  2661     push       esi
  2662     push       edi
  2663     mov        eax, [esp + 8 + 4]   // Y
  2664     mov        esi, [esp + 8 + 8]   // U
  2665     mov        edi, [esp + 8 + 12]  // V
  2666     mov        edx, [esp + 8 + 16]  // argb
  2667     mov        ecx, [esp + 8 + 20]  // width
  2668     sub        edi, esi
  2669     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2670     pxor       xmm4, xmm4
  2672     align      4
  2673  convertloop:
  2674     READYUV444
  2675     YUVTORGB
  2677     // Step 3: Weave into ARGB
  2678     punpcklbw  xmm0, xmm1           // BG
  2679     punpcklbw  xmm2, xmm5           // RA
  2680     movdqa     xmm1, xmm0
  2681     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2682     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2683     movdqu     [edx], xmm0
  2684     movdqu     [edx + 16], xmm1
  2685     lea        edx,  [edx + 32]
  2686     sub        ecx, 8
  2687     jg         convertloop
  2689     pop        edi
  2690     pop        esi
  2691     ret
  2695 // 8 pixels, unaligned.
  2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2697 __declspec(naked) __declspec(align(16))
  2698 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2699                                    const uint8* u_buf,
  2700                                    const uint8* v_buf,
  2701                                    uint8* dst_argb,
  2702                                    int width) {
  2703   __asm {
  2704     push       esi
  2705     push       edi
  2706     mov        eax, [esp + 8 + 4]   // Y
  2707     mov        esi, [esp + 8 + 8]   // U
  2708     mov        edi, [esp + 8 + 12]  // V
  2709     mov        edx, [esp + 8 + 16]  // argb
  2710     mov        ecx, [esp + 8 + 20]  // width
  2711     sub        edi, esi
  2712     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2713     pxor       xmm4, xmm4
  2715     align      4
  2716  convertloop:
  2717     READYUV422
  2718     YUVTORGB
  2720     // Step 3: Weave into ARGB
  2721     punpcklbw  xmm0, xmm1           // BG
  2722     punpcklbw  xmm2, xmm5           // RA
  2723     movdqa     xmm1, xmm0
  2724     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2725     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2726     movdqu     [edx], xmm0
  2727     movdqu     [edx + 16], xmm1
  2728     lea        edx,  [edx + 32]
  2729     sub        ecx, 8
  2730     jg         convertloop
  2732     pop        edi
  2733     pop        esi
  2734     ret
  2738 // 8 pixels, unaligned.
  2739 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2740 // Similar to I420 but duplicate UV once more.
  2741 __declspec(naked) __declspec(align(16))
  2742 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2743                                    const uint8* u_buf,
  2744                                    const uint8* v_buf,
  2745                                    uint8* dst_argb,
  2746                                    int width) {
  2747   __asm {
  2748     push       ebx
  2749     push       esi
  2750     push       edi
  2751     mov        eax, [esp + 12 + 4]   // Y
  2752     mov        esi, [esp + 12 + 8]   // U
  2753     mov        edi, [esp + 12 + 12]  // V
  2754     mov        edx, [esp + 12 + 16]  // argb
  2755     mov        ecx, [esp + 12 + 20]  // width
  2756     sub        edi, esi
  2757     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2758     pxor       xmm4, xmm4
  2760     align      4
  2761  convertloop:
  2762     READYUV411  // modifies EBX
  2763     YUVTORGB
  2765     // Step 3: Weave into ARGB
  2766     punpcklbw  xmm0, xmm1           // BG
  2767     punpcklbw  xmm2, xmm5           // RA
  2768     movdqa     xmm1, xmm0
  2769     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2770     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2771     movdqu     [edx], xmm0
  2772     movdqu     [edx + 16], xmm1
  2773     lea        edx,  [edx + 32]
  2774     sub        ecx, 8
  2775     jg         convertloop
  2777     pop        edi
  2778     pop        esi
  2779     pop        ebx
  2780     ret
  2784 // 8 pixels, dest unaligned.
  2785 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2786 __declspec(naked) __declspec(align(16))
  2787 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2788                                    const uint8* uv_buf,
  2789                                    uint8* dst_argb,
  2790                                    int width) {
  2791   __asm {
  2792     push       esi
  2793     mov        eax, [esp + 4 + 4]   // Y
  2794     mov        esi, [esp + 4 + 8]   // UV
  2795     mov        edx, [esp + 4 + 12]  // argb
  2796     mov        ecx, [esp + 4 + 16]  // width
  2797     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2798     pxor       xmm4, xmm4
  2800     align      4
  2801  convertloop:
  2802     READNV12
  2803     YUVTORGB
  2805     // Step 3: Weave into ARGB
  2806     punpcklbw  xmm0, xmm1           // BG
  2807     punpcklbw  xmm2, xmm5           // RA
  2808     movdqa     xmm1, xmm0
  2809     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2810     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2811     movdqu     [edx], xmm0
  2812     movdqu     [edx + 16], xmm1
  2813     lea        edx,  [edx + 32]
  2814     sub        ecx, 8
  2815     jg         convertloop
  2817     pop        esi
  2818     ret
  2822 // 8 pixels, dest unaligned.
  2823 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2824 __declspec(naked) __declspec(align(16))
  2825 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
  2826                                    const uint8* uv_buf,
  2827                                    uint8* dst_argb,
  2828                                    int width) {
  2829   __asm {
  2830     push       esi
  2831     mov        eax, [esp + 4 + 4]   // Y
  2832     mov        esi, [esp + 4 + 8]   // VU
  2833     mov        edx, [esp + 4 + 12]  // argb
  2834     mov        ecx, [esp + 4 + 16]  // width
  2835     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2836     pxor       xmm4, xmm4
  2838     align      4
  2839  convertloop:
  2840     READNV12
  2841     YVUTORGB
  2843     // Step 3: Weave into ARGB
  2844     punpcklbw  xmm0, xmm1           // BG
  2845     punpcklbw  xmm2, xmm5           // RA
  2846     movdqa     xmm1, xmm0
  2847     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
  2848     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
  2849     movdqu     [edx], xmm0
  2850     movdqu     [edx + 16], xmm1
  2851     lea        edx,  [edx + 32]
  2852     sub        ecx, 8
  2853     jg         convertloop
  2855     pop        esi
  2856     ret
  2860 __declspec(naked) __declspec(align(16))
  2861 void I422ToBGRARow_SSSE3(const uint8* y_buf,
  2862                          const uint8* u_buf,
  2863                          const uint8* v_buf,
  2864                          uint8* dst_bgra,
  2865                          int width) {
  2866   __asm {
  2867     push       esi
  2868     push       edi
  2869     mov        eax, [esp + 8 + 4]   // Y
  2870     mov        esi, [esp + 8 + 8]   // U
  2871     mov        edi, [esp + 8 + 12]  // V
  2872     mov        edx, [esp + 8 + 16]  // bgra
  2873     mov        ecx, [esp + 8 + 20]  // width
  2874     sub        edi, esi
  2875     pxor       xmm4, xmm4
  2877     align      4
  2878  convertloop:
  2879     READYUV422
  2880     YUVTORGB
  2882     // Step 3: Weave into BGRA
  2883     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2884     punpcklbw  xmm1, xmm0           // GB
  2885     punpcklbw  xmm5, xmm2           // AR
  2886     movdqa     xmm0, xmm5
  2887     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
  2888     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
  2889     movdqa     [edx], xmm5
  2890     movdqa     [edx + 16], xmm0
  2891     lea        edx,  [edx + 32]
  2892     sub        ecx, 8
  2893     jg         convertloop
  2895     pop        edi
  2896     pop        esi
  2897     ret
  2901 __declspec(naked) __declspec(align(16))
  2902 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
  2903                                    const uint8* u_buf,
  2904                                    const uint8* v_buf,
  2905                                    uint8* dst_bgra,
  2906                                    int width) {
  2907   __asm {
  2908     push       esi
  2909     push       edi
  2910     mov        eax, [esp + 8 + 4]   // Y
  2911     mov        esi, [esp + 8 + 8]   // U
  2912     mov        edi, [esp + 8 + 12]  // V
  2913     mov        edx, [esp + 8 + 16]  // bgra
  2914     mov        ecx, [esp + 8 + 20]  // width
  2915     sub        edi, esi
  2916     pxor       xmm4, xmm4
  2918     align      4
  2919  convertloop:
  2920     READYUV422
  2921     YUVTORGB
  2923     // Step 3: Weave into BGRA
  2924     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2925     punpcklbw  xmm1, xmm0           // GB
  2926     punpcklbw  xmm5, xmm2           // AR
  2927     movdqa     xmm0, xmm5
  2928     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
  2929     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
  2930     movdqu     [edx], xmm5
  2931     movdqu     [edx + 16], xmm0
  2932     lea        edx,  [edx + 32]
  2933     sub        ecx, 8
  2934     jg         convertloop
  2936     pop        edi
  2937     pop        esi
  2938     ret
  2942 __declspec(naked) __declspec(align(16))
  2943 void I422ToABGRRow_SSSE3(const uint8* y_buf,
  2944                          const uint8* u_buf,
  2945                          const uint8* v_buf,
  2946                          uint8* dst_abgr,
  2947                          int width) {
  2948   __asm {
  2949     push       esi
  2950     push       edi
  2951     mov        eax, [esp + 8 + 4]   // Y
  2952     mov        esi, [esp + 8 + 8]   // U
  2953     mov        edi, [esp + 8 + 12]  // V
  2954     mov        edx, [esp + 8 + 16]  // abgr
  2955     mov        ecx, [esp + 8 + 20]  // width
  2956     sub        edi, esi
  2957     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2958     pxor       xmm4, xmm4
  2960     align      4
  2961  convertloop:
  2962     READYUV422
  2963     YUVTORGB
  2965     // Step 3: Weave into ARGB
  2966     punpcklbw  xmm2, xmm1           // RG
  2967     punpcklbw  xmm0, xmm5           // BA
  2968     movdqa     xmm1, xmm2
  2969     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
  2970     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
  2971     movdqa     [edx], xmm2
  2972     movdqa     [edx + 16], xmm1
  2973     lea        edx,  [edx + 32]
  2974     sub        ecx, 8
  2975     jg         convertloop
  2977     pop        edi
  2978     pop        esi
  2979     ret
  2983 __declspec(naked) __declspec(align(16))
  2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
  2985                                    const uint8* u_buf,
  2986                                    const uint8* v_buf,
  2987                                    uint8* dst_abgr,
  2988                                    int width) {
  2989   __asm {
  2990     push       esi
  2991     push       edi
  2992     mov        eax, [esp + 8 + 4]   // Y
  2993     mov        esi, [esp + 8 + 8]   // U
  2994     mov        edi, [esp + 8 + 12]  // V
  2995     mov        edx, [esp + 8 + 16]  // abgr
  2996     mov        ecx, [esp + 8 + 20]  // width
  2997     sub        edi, esi
  2998     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  2999     pxor       xmm4, xmm4
  3001     align      4
  3002  convertloop:
  3003     READYUV422
  3004     YUVTORGB
  3006     // Step 3: Weave into ARGB
  3007     punpcklbw  xmm2, xmm1           // RG
  3008     punpcklbw  xmm0, xmm5           // BA
  3009     movdqa     xmm1, xmm2
  3010     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
  3011     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
  3012     movdqu     [edx], xmm2
  3013     movdqu     [edx + 16], xmm1
  3014     lea        edx,  [edx + 32]
  3015     sub        ecx, 8
  3016     jg         convertloop
  3018     pop        edi
  3019     pop        esi
  3020     ret
  3024 __declspec(naked) __declspec(align(16))
  3025 void I422ToRGBARow_SSSE3(const uint8* y_buf,
  3026                          const uint8* u_buf,
  3027                          const uint8* v_buf,
  3028                          uint8* dst_rgba,
  3029                          int width) {
  3030   __asm {
  3031     push       esi
  3032     push       edi
  3033     mov        eax, [esp + 8 + 4]   // Y
  3034     mov        esi, [esp + 8 + 8]   // U
  3035     mov        edi, [esp + 8 + 12]  // V
  3036     mov        edx, [esp + 8 + 16]  // rgba
  3037     mov        ecx, [esp + 8 + 20]  // width
  3038     sub        edi, esi
  3039     pxor       xmm4, xmm4
  3041     align      4
  3042  convertloop:
  3043     READYUV422
  3044     YUVTORGB
  3046     // Step 3: Weave into RGBA
  3047     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  3048     punpcklbw  xmm1, xmm2           // GR
  3049     punpcklbw  xmm5, xmm0           // AB
  3050     movdqa     xmm0, xmm5
  3051     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
  3052     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
  3053     movdqa     [edx], xmm5
  3054     movdqa     [edx + 16], xmm0
  3055     lea        edx,  [edx + 32]
  3056     sub        ecx, 8
  3057     jg         convertloop
  3059     pop        edi
  3060     pop        esi
  3061     ret
  3065 __declspec(naked) __declspec(align(16))
  3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
  3067                                    const uint8* u_buf,
  3068                                    const uint8* v_buf,
  3069                                    uint8* dst_rgba,
  3070                                    int width) {
  3071   __asm {
  3072     push       esi
  3073     push       edi
  3074     mov        eax, [esp + 8 + 4]   // Y
  3075     mov        esi, [esp + 8 + 8]   // U
  3076     mov        edi, [esp + 8 + 12]  // V
  3077     mov        edx, [esp + 8 + 16]  // rgba
  3078     mov        ecx, [esp + 8 + 20]  // width
  3079     sub        edi, esi
  3080     pxor       xmm4, xmm4
  3082     align      4
  3083  convertloop:
  3084     READYUV422
  3085     YUVTORGB
  3087     // Step 3: Weave into RGBA
  3088     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
  3089     punpcklbw  xmm1, xmm2           // GR
  3090     punpcklbw  xmm5, xmm0           // AB
  3091     movdqa     xmm0, xmm5
  3092     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
  3093     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
  3094     movdqu     [edx], xmm5
  3095     movdqu     [edx + 16], xmm0
  3096     lea        edx,  [edx + 32]
  3097     sub        ecx, 8
  3098     jg         convertloop
  3100     pop        edi
  3101     pop        esi
  3102     ret
  3106 #endif  // HAS_I422TOARGBROW_SSSE3
  3108 #ifdef HAS_YTOARGBROW_SSE2
  3109 __declspec(naked) __declspec(align(16))
  3110 void YToARGBRow_SSE2(const uint8* y_buf,
  3111                      uint8* rgb_buf,
  3112                      int width) {
  3113   __asm {
  3114     pxor       xmm5, xmm5
  3115     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
  3116     pslld      xmm4, 24
  3117     mov        eax, 0x00100010
  3118     movd       xmm3, eax
  3119     pshufd     xmm3, xmm3, 0
  3120     mov        eax, 0x004a004a       // 74
  3121     movd       xmm2, eax
  3122     pshufd     xmm2, xmm2,0
  3123     mov        eax, [esp + 4]       // Y
  3124     mov        edx, [esp + 8]       // rgb
  3125     mov        ecx, [esp + 12]      // width
  3127     align      4
  3128  convertloop:
  3129     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  3130     movq       xmm0, qword ptr [eax]
  3131     lea        eax, [eax + 8]
  3132     punpcklbw  xmm0, xmm5           // 0.Y
  3133     psubusw    xmm0, xmm3
  3134     pmullw     xmm0, xmm2
  3135     psrlw      xmm0, 6
  3136     packuswb   xmm0, xmm0           // G
  3138     // Step 2: Weave into ARGB
  3139     punpcklbw  xmm0, xmm0           // GG
  3140     movdqa     xmm1, xmm0
  3141     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
  3142     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
  3143     por        xmm0, xmm4
  3144     por        xmm1, xmm4
  3145     movdqa     [edx], xmm0
  3146     movdqa     [edx + 16], xmm1
  3147     lea        edx,  [edx + 32]
  3148     sub        ecx, 8
  3149     jg         convertloop
  3151     ret
  3154 #endif  // HAS_YTOARGBROW_SSE2
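// The block below is an illustrative scalar sketch (not part of the library)
// of what YToARGBRow_SSE2 computes: Y is expanded with the same fixed-point
// gain used above ((y - 16) * 74 >> 6, roughly 1.164) and written as a gray
// BGRA pixel with alpha forced to 255.
static void YToARGBRow_Sketch_C(const uint8* y_buf, uint8* rgb_buf,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = y_buf[x] - 16;
    if (y < 0) y = 0;                // psubusw saturates at zero.
    y = (y * 74) >> 6;               // matches the 0x004a multiplier and psrlw 6.
    if (y > 255) y = 255;            // packuswb saturates at 255.
    rgb_buf[4 * x + 0] = (uint8)y;   // B
    rgb_buf[4 * x + 1] = (uint8)y;   // G
    rgb_buf[4 * x + 2] = (uint8)y;   // R
    rgb_buf[4 * x + 3] = 255;        // A, the 0xff000000 mask.
  }
}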
  3156 #ifdef HAS_MIRRORROW_SSSE3
  3157 // Shuffle table for reversing the bytes.
  3158 static const uvec8 kShuffleMirror = {
  3159   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  3160 };
  3162 __declspec(naked) __declspec(align(16))
  3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  3164   __asm {
  3165     mov       eax, [esp + 4]   // src
  3166     mov       edx, [esp + 8]   // dst
  3167     mov       ecx, [esp + 12]  // width
  3168     movdqa    xmm5, kShuffleMirror
  3169     lea       eax, [eax - 16]
  3171     align      4
  3172  convertloop:
  3173     movdqa    xmm0, [eax + ecx]
  3174     pshufb    xmm0, xmm5
  3175     sub       ecx, 16
  3176     movdqa    [edx], xmm0
  3177     lea       edx, [edx + 16]
  3178     jg        convertloop
  3179     ret
  3182 #endif  // HAS_MIRRORROW_SSSE3
  3184 #ifdef HAS_MIRRORROW_AVX2
  3185 // Shuffle table for reversing the bytes.
  3186 static const ulvec8 kShuffleMirror_AVX2 = {
  3187   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
  3188   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  3189 };
  3191 __declspec(naked) __declspec(align(16))
  3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  3193   __asm {
  3194     mov       eax, [esp + 4]   // src
  3195     mov       edx, [esp + 8]   // dst
  3196     mov       ecx, [esp + 12]  // width
  3197     vmovdqa   ymm5, kShuffleMirror_AVX2
  3198     lea       eax, [eax - 32]
  3200     align      4
  3201  convertloop:
  3202     vmovdqu   ymm0, [eax + ecx]
  3203     vpshufb   ymm0, ymm0, ymm5
  3204     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
  3205     sub       ecx, 32
  3206     vmovdqu   [edx], ymm0
  3207     lea       edx, [edx + 32]
  3208     jg        convertloop
  3209     vzeroupper
  3210     ret
  3213 #endif  // HAS_MIRRORROW_AVX2
  3215 #ifdef HAS_MIRRORROW_SSE2
  3216 // SSE2 version uses movdqu so it can be used on unaligned buffers when the
  3217 // SSSE3 version cannot.
  3218 __declspec(naked) __declspec(align(16))
  3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  3220   __asm {
  3221     mov       eax, [esp + 4]   // src
  3222     mov       edx, [esp + 8]   // dst
  3223     mov       ecx, [esp + 12]  // width
  3224     lea       eax, [eax - 16]
  3226     align      4
  3227  convertloop:
  3228     movdqu    xmm0, [eax + ecx]
  3229     movdqa    xmm1, xmm0        // swap bytes
  3230     psllw     xmm0, 8
  3231     psrlw     xmm1, 8
  3232     por       xmm0, xmm1
  3233     pshuflw   xmm0, xmm0, 0x1b  // swap words
  3234     pshufhw   xmm0, xmm0, 0x1b
  3235     pshufd    xmm0, xmm0, 0x4e  // swap qwords
  3236     sub       ecx, 16
  3237     movdqu    [edx], xmm0
  3238     lea       edx, [edx + 16]
  3239     jg        convertloop
  3240     ret
  3243 #endif  // HAS_MIRRORROW_SSE2
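// Illustrative scalar sketch (not part of the library) of what both MirrorRow
// variants above compute: the row is rewritten with its bytes in reverse
// order; the SSSE3 version does it with one pshufb, the SSE2 version with
// shifts and word/dword shuffles.
static void MirrorRow_Sketch_C(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}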
  3245 #ifdef HAS_MIRRORROW_UV_SSSE3
  3246 // Shuffle table for reversing the bytes of UV channels.
  3247 static const uvec8 kShuffleMirrorUV = {
  3248   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  3249 };
  3251 __declspec(naked) __declspec(align(16))
  3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  3253                        int width) {
  3254   __asm {
  3255     push      edi
  3256     mov       eax, [esp + 4 + 4]   // src
  3257     mov       edx, [esp + 4 + 8]   // dst_u
  3258     mov       edi, [esp + 4 + 12]  // dst_v
  3259     mov       ecx, [esp + 4 + 16]  // width
  3260     movdqa    xmm1, kShuffleMirrorUV
  3261     lea       eax, [eax + ecx * 2 - 16]
  3262     sub       edi, edx
  3264     align      4
  3265  convertloop:
  3266     movdqa    xmm0, [eax]
  3267     lea       eax, [eax - 16]
  3268     pshufb    xmm0, xmm1
  3269     sub       ecx, 8
  3270     movlpd    qword ptr [edx], xmm0
  3271     movhpd    qword ptr [edx + edi], xmm0
  3272     lea       edx, [edx + 8]
  3273     jg        convertloop
  3275     pop       edi
  3276     ret
  3279 #endif  // HAS_MIRRORROW_UV_SSSE3
  3281 #ifdef HAS_ARGBMIRRORROW_SSSE3
  3282 // Shuffle table for reversing the ARGB pixels (4 bytes at a time).
  3283 static const uvec8 kARGBShuffleMirror = {
  3284   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
  3285 };
  3287 __declspec(naked) __declspec(align(16))
  3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  3289   __asm {
  3290     mov       eax, [esp + 4]   // src
  3291     mov       edx, [esp + 8]   // dst
  3292     mov       ecx, [esp + 12]  // width
  3293     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
  3294     movdqa    xmm5, kARGBShuffleMirror
  3296     align      4
  3297  convertloop:
  3298     movdqa    xmm0, [eax]
  3299     lea       eax, [eax - 16]
  3300     pshufb    xmm0, xmm5
  3301     sub       ecx, 4
  3302     movdqa    [edx], xmm0
  3303     lea       edx, [edx + 16]
  3304     jg        convertloop
  3305     ret
  3308 #endif  // HAS_ARGBMIRRORROW_SSSE3
  3310 #ifdef HAS_ARGBMIRRORROW_AVX2
  3311 // Permute table for reversing the ARGB pixels (one dword each).
  3312 static const ulvec32 kARGBShuffleMirror_AVX2 = {
  3313   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  3314 };
  3316 __declspec(naked) __declspec(align(16))
  3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  3318   __asm {
  3319     mov       eax, [esp + 4]   // src
  3320     mov       edx, [esp + 8]   // dst
  3321     mov       ecx, [esp + 12]  // width
  3322     lea       eax, [eax - 32]
  3323     vmovdqa   ymm5, kARGBShuffleMirror_AVX2
  3325     align      4
  3326  convertloop:
  3327     vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
  3328     sub       ecx, 8
  3329     vmovdqu   [edx], ymm0
  3330     lea       edx, [edx + 32]
  3331     jg        convertloop
  3332     vzeroupper
  3333     ret
  3336 #endif  // HAS_ARGBMIRRORROW_AVX2
  3338 #ifdef HAS_SPLITUVROW_SSE2
  3339 __declspec(naked) __declspec(align(16))
  3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  3341   __asm {
  3342     push       edi
  3343     mov        eax, [esp + 4 + 4]    // src_uv
  3344     mov        edx, [esp + 4 + 8]    // dst_u
  3345     mov        edi, [esp + 4 + 12]   // dst_v
  3346     mov        ecx, [esp + 4 + 16]   // pix
  3347     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  3348     psrlw      xmm5, 8
  3349     sub        edi, edx
  3351     align      4
  3352   convertloop:
  3353     movdqa     xmm0, [eax]
  3354     movdqa     xmm1, [eax + 16]
  3355     lea        eax,  [eax + 32]
  3356     movdqa     xmm2, xmm0
  3357     movdqa     xmm3, xmm1
  3358     pand       xmm0, xmm5   // even bytes
  3359     pand       xmm1, xmm5
  3360     packuswb   xmm0, xmm1
  3361     psrlw      xmm2, 8      // odd bytes
  3362     psrlw      xmm3, 8
  3363     packuswb   xmm2, xmm3
  3364     movdqa     [edx], xmm0
  3365     movdqa     [edx + edi], xmm2
  3366     lea        edx, [edx + 16]
  3367     sub        ecx, 16
  3368     jg         convertloop
  3370     pop        edi
  3371     ret
  3375 __declspec(naked) __declspec(align(16))
  3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  3377                                int pix) {
  3378   __asm {
  3379     push       edi
  3380     mov        eax, [esp + 4 + 4]    // src_uv
  3381     mov        edx, [esp + 4 + 8]    // dst_u
  3382     mov        edi, [esp + 4 + 12]   // dst_v
  3383     mov        ecx, [esp + 4 + 16]   // pix
  3384     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  3385     psrlw      xmm5, 8
  3386     sub        edi, edx
  3388     align      4
  3389   convertloop:
  3390     movdqu     xmm0, [eax]
  3391     movdqu     xmm1, [eax + 16]
  3392     lea        eax,  [eax + 32]
  3393     movdqa     xmm2, xmm0
  3394     movdqa     xmm3, xmm1
  3395     pand       xmm0, xmm5   // even bytes
  3396     pand       xmm1, xmm5
  3397     packuswb   xmm0, xmm1
  3398     psrlw      xmm2, 8      // odd bytes
  3399     psrlw      xmm3, 8
  3400     packuswb   xmm2, xmm3
  3401     movdqu     [edx], xmm0
  3402     movdqu     [edx + edi], xmm2
  3403     lea        edx, [edx + 16]
  3404     sub        ecx, 16
  3405     jg         convertloop
  3407     pop        edi
  3408     ret
  3411 #endif  // HAS_SPLITUVROW_SSE2
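// Illustrative scalar sketch (not part of the library) of SplitUVRow: the
// even bytes of the interleaved UV row form the U plane and the odd bytes the
// V plane, which is what the 0x00ff00ff mask and the 8 bit shift select above.
static void SplitUVRow_Sketch_C(const uint8* src_uv, uint8* dst_u,
                                uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even byte -> U
    dst_v[x] = src_uv[2 * x + 1];  // odd byte  -> V
  }
}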
  3413 #ifdef HAS_SPLITUVROW_AVX2
  3414 __declspec(naked) __declspec(align(16))
  3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  3416   __asm {
  3417     push       edi
  3418     mov        eax, [esp + 4 + 4]    // src_uv
  3419     mov        edx, [esp + 4 + 8]    // dst_u
  3420     mov        edi, [esp + 4 + 12]   // dst_v
  3421     mov        ecx, [esp + 4 + 16]   // pix
  3422     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  3423     vpsrlw     ymm5, ymm5, 8
  3424     sub        edi, edx
  3426     align      4
  3427   convertloop:
  3428     vmovdqu    ymm0, [eax]
  3429     vmovdqu    ymm1, [eax + 32]
  3430     lea        eax,  [eax + 64]
  3431     vpsrlw     ymm2, ymm0, 8      // odd bytes
  3432     vpsrlw     ymm3, ymm1, 8
  3433     vpand      ymm0, ymm0, ymm5   // even bytes
  3434     vpand      ymm1, ymm1, ymm5
  3435     vpackuswb  ymm0, ymm0, ymm1
  3436     vpackuswb  ymm2, ymm2, ymm3
  3437     vpermq     ymm0, ymm0, 0xd8
  3438     vpermq     ymm2, ymm2, 0xd8
  3439     vmovdqu    [edx], ymm0
  3440     vmovdqu    [edx + edi], ymm2
  3441     lea        edx, [edx + 32]
  3442     sub        ecx, 32
  3443     jg         convertloop
  3445     pop        edi
  3446     vzeroupper
  3447     ret
  3450 #endif  // HAS_SPLITUVROW_AVX2
  3452 #ifdef HAS_MERGEUVROW_SSE2
  3453 __declspec(naked) __declspec(align(16))
  3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  3455                      int width) {
  3456   __asm {
  3457     push       edi
  3458     mov        eax, [esp + 4 + 4]    // src_u
  3459     mov        edx, [esp + 4 + 8]    // src_v
  3460     mov        edi, [esp + 4 + 12]   // dst_uv
  3461     mov        ecx, [esp + 4 + 16]   // width
  3462     sub        edx, eax
  3464     align      4
  3465   convertloop:
  3466     movdqa     xmm0, [eax]      // read 16 U's
  3467     movdqa     xmm1, [eax + edx]  // and 16 V's
  3468     lea        eax,  [eax + 16]
  3469     movdqa     xmm2, xmm0
  3470     punpcklbw  xmm0, xmm1       // first 8 UV pairs
  3471     punpckhbw  xmm2, xmm1       // next 8 UV pairs
  3472     movdqa     [edi], xmm0
  3473     movdqa     [edi + 16], xmm2
  3474     lea        edi, [edi + 32]
  3475     sub        ecx, 16
  3476     jg         convertloop
  3478     pop        edi
  3479     ret
  3483 __declspec(naked) __declspec(align(16))
  3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
  3485                                uint8* dst_uv, int width) {
  3486   __asm {
  3487     push       edi
  3488     mov        eax, [esp + 4 + 4]    // src_u
  3489     mov        edx, [esp + 4 + 8]    // src_v
  3490     mov        edi, [esp + 4 + 12]   // dst_uv
  3491     mov        ecx, [esp + 4 + 16]   // width
  3492     sub        edx, eax
  3494     align      4
  3495   convertloop:
  3496     movdqu     xmm0, [eax]      // read 16 U's
  3497     movdqu     xmm1, [eax + edx]  // and 16 V's
  3498     lea        eax,  [eax + 16]
  3499     movdqa     xmm2, xmm0
  3500     punpcklbw  xmm0, xmm1       // first 8 UV pairs
  3501     punpckhbw  xmm2, xmm1       // next 8 UV pairs
  3502     movdqu     [edi], xmm0
  3503     movdqu     [edi + 16], xmm2
  3504     lea        edi, [edi + 32]
  3505     sub        ecx, 16
  3506     jg         convertloop
  3508     pop        edi
  3509     ret
  3512 #endif  //  HAS_MERGEUVROW_SSE2
  3514 #ifdef HAS_MERGEUVROW_AVX2
  3515 __declspec(naked) __declspec(align(16))
  3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  3517                      int width) {
  3518   __asm {
  3519     push       edi
  3520     mov        eax, [esp + 4 + 4]    // src_u
  3521     mov        edx, [esp + 4 + 8]    // src_v
  3522     mov        edi, [esp + 4 + 12]   // dst_uv
  3523     mov        ecx, [esp + 4 + 16]   // width
  3524     sub        edx, eax
  3526     align      4
  3527   convertloop:
  3528     vmovdqu    ymm0, [eax]           // read 32 U's
  3529     vmovdqu    ymm1, [eax + edx]     // and 32 V's
  3530     lea        eax,  [eax + 32]
  3531     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
  3532     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
  3533     vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
  3534     vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
  3535     vmovdqu    [edi], ymm1
  3536     vmovdqu    [edi + 32], ymm2
  3537     lea        edi, [edi + 64]
  3538     sub        ecx, 32
  3539     jg         convertloop
  3541     pop        edi
  3542     vzeroupper
  3543     ret
  3546 #endif  //  HAS_MERGEUVROW_AVX2
  3548 #ifdef HAS_COPYROW_SSE2
  3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
  3550 __declspec(naked) __declspec(align(16))
  3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  3552   __asm {
  3553     mov        eax, [esp + 4]   // src
  3554     mov        edx, [esp + 8]   // dst
  3555     mov        ecx, [esp + 12]  // count
  3557     align      4
  3558   convertloop:
  3559     movdqa     xmm0, [eax]
  3560     movdqa     xmm1, [eax + 16]
  3561     lea        eax, [eax + 32]
  3562     movdqa     [edx], xmm0
  3563     movdqa     [edx + 16], xmm1
  3564     lea        edx, [edx + 32]
  3565     sub        ecx, 32
  3566     jg         convertloop
  3567     ret
  3570 #endif  // HAS_COPYROW_SSE2
  3572 // Copy with rep movsb: handles unaligned buffers and any count (multiple of 1).
  3573 __declspec(naked) __declspec(align(16))
  3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  3575   __asm {
  3576     mov        eax, esi
  3577     mov        edx, edi
  3578     mov        esi, [esp + 4]   // src
  3579     mov        edi, [esp + 8]   // dst
  3580     mov        ecx, [esp + 12]  // count
  3581     rep movsb
  3582     mov        edi, edx
  3583     mov        esi, eax
  3584     ret
  3588 #ifdef HAS_COPYROW_X86
  3589 __declspec(naked) __declspec(align(16))
  3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  3591   __asm {
  3592     mov        eax, esi
  3593     mov        edx, edi
  3594     mov        esi, [esp + 4]   // src
  3595     mov        edi, [esp + 8]   // dst
  3596     mov        ecx, [esp + 12]  // count
  3597     shr        ecx, 2
  3598     rep movsd
  3599     mov        edi, edx
  3600     mov        esi, eax
  3601     ret
  3604 #endif  // HAS_COPYROW_X86
  3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3607 // width in pixels
  3608 __declspec(naked) __declspec(align(16))
  3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3610   __asm {
  3611     mov        eax, [esp + 4]   // src
  3612     mov        edx, [esp + 8]   // dst
  3613     mov        ecx, [esp + 12]  // count
  3614     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
  3615     pslld      xmm0, 24
  3616     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
  3617     psrld      xmm1, 8
  3619     align      4
  3620   convertloop:
  3621     movdqa     xmm2, [eax]
  3622     movdqa     xmm3, [eax + 16]
  3623     lea        eax, [eax + 32]
  3624     movdqa     xmm4, [edx]
  3625     movdqa     xmm5, [edx + 16]
  3626     pand       xmm2, xmm0
  3627     pand       xmm3, xmm0
  3628     pand       xmm4, xmm1
  3629     pand       xmm5, xmm1
  3630     por        xmm2, xmm4
  3631     por        xmm3, xmm5
  3632     movdqa     [edx], xmm2
  3633     movdqa     [edx + 16], xmm3
  3634     lea        edx, [edx + 32]
  3635     sub        ecx, 8
  3636     jg         convertloop
  3638     ret
  3641 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
  3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3644 // width in pixels
  3645 __declspec(naked) __declspec(align(16))
  3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3647   __asm {
  3648     mov        eax, [esp + 4]   // src
  3649     mov        edx, [esp + 8]   // dst
  3650     mov        ecx, [esp + 12]  // count
  3651     vpcmpeqb   ymm0, ymm0, ymm0
  3652     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
  3654     align      4
  3655   convertloop:
  3656     vmovdqu    ymm1, [eax]
  3657     vmovdqu    ymm2, [eax + 32]
  3658     lea        eax, [eax + 64]
  3659     vpblendvb  ymm1, ymm1, [edx], ymm0
  3660     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
  3661     vmovdqu    [edx], ymm1
  3662     vmovdqu    [edx + 32], ymm2
  3663     lea        edx, [edx + 64]
  3664     sub        ecx, 16
  3665     jg         convertloop
  3667     vzeroupper
  3668     ret
  3671 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
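// Illustrative scalar sketch (not part of the library) of the per pixel
// effect of ARGBCopyAlphaRow: dst = (src & 0xff000000) | (dst & 0x00ffffff),
// i.e. only the alpha byte is taken from the source.  The ...YToAlphaRow
// variants below do the same but take the new alpha from a Y plane.
static void ARGBCopyAlphaRow_Sketch_C(const uint8* src, uint8* dst,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[4 * x + 3];  // copy A; B, G and R stay untouched.
  }
}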
  3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3674 // width in pixels
  3675 __declspec(naked) __declspec(align(16))
  3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  3677   __asm {
  3678     mov        eax, [esp + 4]   // src
  3679     mov        edx, [esp + 8]   // dst
  3680     mov        ecx, [esp + 12]  // count
  3681     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
  3682     pslld      xmm0, 24
  3683     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
  3684     psrld      xmm1, 8
  3686     align      4
  3687   convertloop:
  3688     movq       xmm2, qword ptr [eax]  // 8 Y's
  3689     lea        eax, [eax + 8]
  3690     punpcklbw  xmm2, xmm2
  3691     punpckhwd  xmm3, xmm2
  3692     punpcklwd  xmm2, xmm2
  3693     movdqa     xmm4, [edx]
  3694     movdqa     xmm5, [edx + 16]
  3695     pand       xmm2, xmm0
  3696     pand       xmm3, xmm0
  3697     pand       xmm4, xmm1
  3698     pand       xmm5, xmm1
  3699     por        xmm2, xmm4
  3700     por        xmm3, xmm5
  3701     movdqa     [edx], xmm2
  3702     movdqa     [edx + 16], xmm3
  3703     lea        edx, [edx + 32]
  3704     sub        ecx, 8
  3705     jg         convertloop
  3707     ret
  3710 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3713 // width in pixels
  3714 __declspec(naked) __declspec(align(16))
  3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  3716   __asm {
  3717     mov        eax, [esp + 4]   // src
  3718     mov        edx, [esp + 8]   // dst
  3719     mov        ecx, [esp + 12]  // count
  3720     vpcmpeqb   ymm0, ymm0, ymm0
  3721     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
  3723     align      4
  3724   convertloop:
  3725     vpmovzxbd  ymm1, qword ptr [eax]
  3726     vpmovzxbd  ymm2, qword ptr [eax + 8]
  3727     lea        eax, [eax + 16]
  3728     vpslld     ymm1, ymm1, 24
  3729     vpslld     ymm2, ymm2, 24
  3730     vpblendvb  ymm1, ymm1, [edx], ymm0
  3731     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
  3732     vmovdqu    [edx], ymm1
  3733     vmovdqu    [edx + 32], ymm2
  3734     lea        edx, [edx + 64]
  3735     sub        ecx, 16
  3736     jg         convertloop
  3738     vzeroupper
  3739     ret
  3742 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3744 #ifdef HAS_SETROW_X86
  3745 // SetRow writes 'count' bytes using a 32 bit value repeated.
  3746 __declspec(naked) __declspec(align(16))
  3747 void SetRow_X86(uint8* dst, uint32 v32, int count) {
  3748   __asm {
  3749     mov        edx, edi
  3750     mov        edi, [esp + 4]   // dst
  3751     mov        eax, [esp + 8]   // v32
  3752     mov        ecx, [esp + 12]  // count
  3753     shr        ecx, 2
  3754     rep stosd
  3755     mov        edi, edx
  3756     ret
  3760 // ARGBSetRows writes 'width' dwords per row, over 'height' rows, using a 32 bit value repeated.
  3761 __declspec(naked) __declspec(align(16))
  3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
  3763                    int dst_stride, int height) {
  3764   __asm {
  3765     push       esi
  3766     push       edi
  3767     push       ebp
  3768     mov        edi, [esp + 12 + 4]   // dst
  3769     mov        eax, [esp + 12 + 8]   // v32
  3770     mov        ebp, [esp + 12 + 12]  // width
  3771     mov        edx, [esp + 12 + 16]  // dst_stride
  3772     mov        esi, [esp + 12 + 20]  // height
  3773     lea        ecx, [ebp * 4]
  3774     sub        edx, ecx             // stride - width * 4
  3776     align      4
  3777   convertloop:
  3778     mov        ecx, ebp
  3779     rep stosd
  3780     add        edi, edx
  3781     sub        esi, 1
  3782     jg         convertloop
  3784     pop        ebp
  3785     pop        edi
  3786     pop        esi
  3787     ret
  3790 #endif  // HAS_SETROW_X86
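// Illustrative scalar sketch (not part of the library) of the rep stosd based
// setters above: SetRow_X86 stores the 32 bit value over count / 4 dwords;
// ARGBSetRows_X86 repeats that for 'width' pixels on each of 'height' rows
// separated by 'dst_stride' bytes.
static void SetRow_Sketch_C(uint8* dst, uint32 v32, int count) {
  int x;
  for (x = 0; x < count / 4; ++x) {
    ((uint32*)dst)[x] = v32;  // one dword (e.g. one ARGB pixel) at a time.
  }
}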
  3792 #ifdef HAS_YUY2TOYROW_AVX2
  3793 __declspec(naked) __declspec(align(16))
  3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
  3795                      uint8* dst_y, int pix) {
  3796   __asm {
  3797     mov        eax, [esp + 4]    // src_yuy2
  3798     mov        edx, [esp + 8]    // dst_y
  3799     mov        ecx, [esp + 12]   // pix
  3800     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
  3801     vpsrlw     ymm5, ymm5, 8
  3803     align      4
  3804   convertloop:
  3805     vmovdqu    ymm0, [eax]
  3806     vmovdqu    ymm1, [eax + 32]
  3807     lea        eax,  [eax + 64]
  3808     vpand      ymm0, ymm0, ymm5   // even bytes are Y
  3809     vpand      ymm1, ymm1, ymm5
  3810     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3811     vpermq     ymm0, ymm0, 0xd8
  3812     sub        ecx, 32
  3813     vmovdqu    [edx], ymm0
  3814     lea        edx, [edx + 32]
  3815     jg         convertloop
  3816     vzeroupper
  3817     ret
  3821 __declspec(naked) __declspec(align(16))
  3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  3823                       uint8* dst_u, uint8* dst_v, int pix) {
  3824   __asm {
  3825     push       esi
  3826     push       edi
  3827     mov        eax, [esp + 8 + 4]    // src_yuy2
  3828     mov        esi, [esp + 8 + 8]    // stride_yuy2
  3829     mov        edx, [esp + 8 + 12]   // dst_u
  3830     mov        edi, [esp + 8 + 16]   // dst_v
  3831     mov        ecx, [esp + 8 + 20]   // pix
  3832     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  3833     vpsrlw     ymm5, ymm5, 8
  3834     sub        edi, edx
  3836     align      4
  3837   convertloop:
  3838     vmovdqu    ymm0, [eax]
  3839     vmovdqu    ymm1, [eax + 32]
  3840     vpavgb     ymm0, ymm0, [eax + esi]
  3841     vpavgb     ymm1, ymm1, [eax + esi + 32]
  3842     lea        eax,  [eax + 64]
  3843     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
  3844     vpsrlw     ymm1, ymm1, 8
  3845     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3846     vpermq     ymm0, ymm0, 0xd8
  3847     vpand      ymm1, ymm0, ymm5  // U
  3848     vpsrlw     ymm0, ymm0, 8     // V
  3849     vpackuswb  ymm1, ymm1, ymm1  // mutates.
  3850     vpackuswb  ymm0, ymm0, ymm0  // mutates.
  3851     vpermq     ymm1, ymm1, 0xd8
  3852     vpermq     ymm0, ymm0, 0xd8
  3853     vextractf128 [edx], ymm1, 0  // U
  3854     vextractf128 [edx + edi], ymm0, 0 // V
  3855     lea        edx, [edx + 16]
  3856     sub        ecx, 32
  3857     jg         convertloop
  3859     pop        edi
  3860     pop        esi
  3861     vzeroupper
  3862     ret
  3866 __declspec(naked) __declspec(align(16))
  3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3868                          uint8* dst_u, uint8* dst_v, int pix) {
  3869   __asm {
  3870     push       edi
  3871     mov        eax, [esp + 4 + 4]    // src_yuy2
  3872     mov        edx, [esp + 4 + 8]    // dst_u
  3873     mov        edi, [esp + 4 + 12]   // dst_v
  3874     mov        ecx, [esp + 4 + 16]   // pix
  3875     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  3876     vpsrlw     ymm5, ymm5, 8
  3877     sub        edi, edx
  3879     align      4
  3880   convertloop:
  3881     vmovdqu    ymm0, [eax]
  3882     vmovdqu    ymm1, [eax + 32]
  3883     lea        eax,  [eax + 64]
  3884     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
  3885     vpsrlw     ymm1, ymm1, 8
  3886     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3887     vpermq     ymm0, ymm0, 0xd8
  3888     vpand      ymm1, ymm0, ymm5  // U
  3889     vpsrlw     ymm0, ymm0, 8     // V
  3890     vpackuswb  ymm1, ymm1, ymm1  // mutates.
  3891     vpackuswb  ymm0, ymm0, ymm0  // mutates.
  3892     vpermq     ymm1, ymm1, 0xd8
  3893     vpermq     ymm0, ymm0, 0xd8
  3894     vextractf128 [edx], ymm1, 0  // U
  3895     vextractf128 [edx + edi], ymm0, 0 // V
  3896     lea        edx, [edx + 16]
  3897     sub        ecx, 32
  3898     jg         convertloop
  3900     pop        edi
  3901     vzeroupper
  3902     ret
  3906 __declspec(naked) __declspec(align(16))
  3907 void UYVYToYRow_AVX2(const uint8* src_uyvy,
  3908                      uint8* dst_y, int pix) {
  3909   __asm {
  3910     mov        eax, [esp + 4]    // src_uyvy
  3911     mov        edx, [esp + 8]    // dst_y
  3912     mov        ecx, [esp + 12]   // pix
  3914     align      4
  3915   convertloop:
  3916     vmovdqu    ymm0, [eax]
  3917     vmovdqu    ymm1, [eax + 32]
  3918     lea        eax,  [eax + 64]
  3919     vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
  3920     vpsrlw     ymm1, ymm1, 8
  3921     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3922     vpermq     ymm0, ymm0, 0xd8
  3923     sub        ecx, 32
  3924     vmovdqu    [edx], ymm0
  3925     lea        edx, [edx + 32]
  3926     jg         convertloop
  3927     vzeroupper
  3928     ret
  3932 __declspec(naked) __declspec(align(16))
  3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  3934                       uint8* dst_u, uint8* dst_v, int pix) {
  3935   __asm {
  3936     push       esi
  3937     push       edi
  3938     mov        eax, [esp + 8 + 4]    // src_uyvy
  3939     mov        esi, [esp + 8 + 8]    // stride_uyvy
  3940     mov        edx, [esp + 8 + 12]   // dst_u
  3941     mov        edi, [esp + 8 + 16]   // dst_v
  3942     mov        ecx, [esp + 8 + 20]   // pix
  3943     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  3944     vpsrlw     ymm5, ymm5, 8
  3945     sub        edi, edx
  3947     align      4
  3948   convertloop:
  3949     vmovdqu    ymm0, [eax]
  3950     vmovdqu    ymm1, [eax + 32]
  3951     vpavgb     ymm0, ymm0, [eax + esi]
  3952     vpavgb     ymm1, ymm1, [eax + esi + 32]
  3953     lea        eax,  [eax + 64]
  3954     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
  3955     vpand      ymm1, ymm1, ymm5
  3956     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3957     vpermq     ymm0, ymm0, 0xd8
  3958     vpand      ymm1, ymm0, ymm5  // U
  3959     vpsrlw     ymm0, ymm0, 8     // V
  3960     vpackuswb  ymm1, ymm1, ymm1  // mutates.
  3961     vpackuswb  ymm0, ymm0, ymm0  // mutates.
  3962     vpermq     ymm1, ymm1, 0xd8
  3963     vpermq     ymm0, ymm0, 0xd8
  3964     vextractf128 [edx], ymm1, 0  // U
  3965     vextractf128 [edx + edi], ymm0, 0 // V
  3966     lea        edx, [edx + 16]
  3967     sub        ecx, 32
  3968     jg         convertloop
  3970     pop        edi
  3971     pop        esi
  3972     vzeroupper
  3973     ret
  3977 __declspec(naked) __declspec(align(16))
  3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3979                          uint8* dst_u, uint8* dst_v, int pix) {
  3980   __asm {
  3981     push       edi
  3982     mov        eax, [esp + 4 + 4]    // src_uyvy
  3983     mov        edx, [esp + 4 + 8]    // dst_u
  3984     mov        edi, [esp + 4 + 12]   // dst_v
  3985     mov        ecx, [esp + 4 + 16]   // pix
  3986     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
  3987     vpsrlw     ymm5, ymm5, 8
  3988     sub        edi, edx
  3990     align      4
  3991   convertloop:
  3992     vmovdqu    ymm0, [eax]
  3993     vmovdqu    ymm1, [eax + 32]
  3994     lea        eax,  [eax + 64]
  3995     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
  3996     vpand      ymm1, ymm1, ymm5
  3997     vpackuswb  ymm0, ymm0, ymm1   // mutates.
  3998     vpermq     ymm0, ymm0, 0xd8
  3999     vpand      ymm1, ymm0, ymm5  // U
  4000     vpsrlw     ymm0, ymm0, 8     // V
  4001     vpackuswb  ymm1, ymm1, ymm1  // mutates.
  4002     vpackuswb  ymm0, ymm0, ymm0  // mutates.
  4003     vpermq     ymm1, ymm1, 0xd8
  4004     vpermq     ymm0, ymm0, 0xd8
  4005     vextractf128 [edx], ymm1, 0  // U
  4006     vextractf128 [edx + edi], ymm0, 0 // V
  4007     lea        edx, [edx + 16]
  4008     sub        ecx, 32
  4009     jg         convertloop
  4011     pop        edi
  4012     vzeroupper
  4013     ret
  4016 #endif  // HAS_YUY2TOYROW_AVX2
  4018 #ifdef HAS_YUY2TOYROW_SSE2
  4019 __declspec(naked) __declspec(align(16))
  4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
  4021                      uint8* dst_y, int pix) {
  4022   __asm {
  4023     mov        eax, [esp + 4]    // src_yuy2
  4024     mov        edx, [esp + 8]    // dst_y
  4025     mov        ecx, [esp + 12]   // pix
  4026     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
  4027     psrlw      xmm5, 8
  4029     align      4
  4030   convertloop:
  4031     movdqa     xmm0, [eax]
  4032     movdqa     xmm1, [eax + 16]
  4033     lea        eax,  [eax + 32]
  4034     pand       xmm0, xmm5   // even bytes are Y
  4035     pand       xmm1, xmm5
  4036     packuswb   xmm0, xmm1
  4037     sub        ecx, 16
  4038     movdqa     [edx], xmm0
  4039     lea        edx, [edx + 16]
  4040     jg         convertloop
  4041     ret
  4045 __declspec(naked) __declspec(align(16))
  4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  4047                       uint8* dst_u, uint8* dst_v, int pix) {
  4048   __asm {
  4049     push       esi
  4050     push       edi
  4051     mov        eax, [esp + 8 + 4]    // src_yuy2
  4052     mov        esi, [esp + 8 + 8]    // stride_yuy2
  4053     mov        edx, [esp + 8 + 12]   // dst_u
  4054     mov        edi, [esp + 8 + 16]   // dst_v
  4055     mov        ecx, [esp + 8 + 20]   // pix
  4056     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4057     psrlw      xmm5, 8
  4058     sub        edi, edx
  4060     align      4
  4061   convertloop:
  4062     movdqa     xmm0, [eax]
  4063     movdqa     xmm1, [eax + 16]
  4064     movdqa     xmm2, [eax + esi]
  4065     movdqa     xmm3, [eax + esi + 16]
  4066     lea        eax,  [eax + 32]
  4067     pavgb      xmm0, xmm2
  4068     pavgb      xmm1, xmm3
  4069     psrlw      xmm0, 8      // YUYV -> UVUV
  4070     psrlw      xmm1, 8
  4071     packuswb   xmm0, xmm1
  4072     movdqa     xmm1, xmm0
  4073     pand       xmm0, xmm5  // U
  4074     packuswb   xmm0, xmm0
  4075     psrlw      xmm1, 8     // V
  4076     packuswb   xmm1, xmm1
  4077     movq       qword ptr [edx], xmm0
  4078     movq       qword ptr [edx + edi], xmm1
  4079     lea        edx, [edx + 8]
  4080     sub        ecx, 16
  4081     jg         convertloop
  4083     pop        edi
  4084     pop        esi
  4085     ret
  4089 __declspec(naked) __declspec(align(16))
  4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  4091                          uint8* dst_u, uint8* dst_v, int pix) {
  4092   __asm {
  4093     push       edi
  4094     mov        eax, [esp + 4 + 4]    // src_yuy2
  4095     mov        edx, [esp + 4 + 8]    // dst_u
  4096     mov        edi, [esp + 4 + 12]   // dst_v
  4097     mov        ecx, [esp + 4 + 16]   // pix
  4098     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4099     psrlw      xmm5, 8
  4100     sub        edi, edx
  4102     align      4
  4103   convertloop:
  4104     movdqa     xmm0, [eax]
  4105     movdqa     xmm1, [eax + 16]
  4106     lea        eax,  [eax + 32]
  4107     psrlw      xmm0, 8      // YUYV -> UVUV
  4108     psrlw      xmm1, 8
  4109     packuswb   xmm0, xmm1
  4110     movdqa     xmm1, xmm0
  4111     pand       xmm0, xmm5  // U
  4112     packuswb   xmm0, xmm0
  4113     psrlw      xmm1, 8     // V
  4114     packuswb   xmm1, xmm1
  4115     movq       qword ptr [edx], xmm0
  4116     movq       qword ptr [edx + edi], xmm1
  4117     lea        edx, [edx + 8]
  4118     sub        ecx, 16
  4119     jg         convertloop
  4121     pop        edi
  4122     ret
  4126 __declspec(naked) __declspec(align(16))
  4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
  4128                                uint8* dst_y, int pix) {
  4129   __asm {
  4130     mov        eax, [esp + 4]    // src_yuy2
  4131     mov        edx, [esp + 8]    // dst_y
  4132     mov        ecx, [esp + 12]   // pix
  4133     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
  4134     psrlw      xmm5, 8
  4136     align      4
  4137   convertloop:
  4138     movdqu     xmm0, [eax]
  4139     movdqu     xmm1, [eax + 16]
  4140     lea        eax,  [eax + 32]
  4141     pand       xmm0, xmm5   // even bytes are Y
  4142     pand       xmm1, xmm5
  4143     packuswb   xmm0, xmm1
  4144     sub        ecx, 16
  4145     movdqu     [edx], xmm0
  4146     lea        edx, [edx + 16]
  4147     jg         convertloop
  4148     ret
  4152 __declspec(naked) __declspec(align(16))
  4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
  4154                                 uint8* dst_u, uint8* dst_v, int pix) {
  4155   __asm {
  4156     push       esi
  4157     push       edi
  4158     mov        eax, [esp + 8 + 4]    // src_yuy2
  4159     mov        esi, [esp + 8 + 8]    // stride_yuy2
  4160     mov        edx, [esp + 8 + 12]   // dst_u
  4161     mov        edi, [esp + 8 + 16]   // dst_v
  4162     mov        ecx, [esp + 8 + 20]   // pix
  4163     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4164     psrlw      xmm5, 8
  4165     sub        edi, edx
  4167     align      4
  4168   convertloop:
  4169     movdqu     xmm0, [eax]
  4170     movdqu     xmm1, [eax + 16]
  4171     movdqu     xmm2, [eax + esi]
  4172     movdqu     xmm3, [eax + esi + 16]
  4173     lea        eax,  [eax + 32]
  4174     pavgb      xmm0, xmm2
  4175     pavgb      xmm1, xmm3
  4176     psrlw      xmm0, 8      // YUYV -> UVUV
  4177     psrlw      xmm1, 8
  4178     packuswb   xmm0, xmm1
  4179     movdqa     xmm1, xmm0
  4180     pand       xmm0, xmm5  // U
  4181     packuswb   xmm0, xmm0
  4182     psrlw      xmm1, 8     // V
  4183     packuswb   xmm1, xmm1
  4184     movq       qword ptr [edx], xmm0
  4185     movq       qword ptr [edx + edi], xmm1
  4186     lea        edx, [edx + 8]
  4187     sub        ecx, 16
  4188     jg         convertloop
  4190     pop        edi
  4191     pop        esi
  4192     ret
  4196 __declspec(naked) __declspec(align(16))
  4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
  4198                                    uint8* dst_u, uint8* dst_v, int pix) {
  4199   __asm {
  4200     push       edi
  4201     mov        eax, [esp + 4 + 4]    // src_yuy2
  4202     mov        edx, [esp + 4 + 8]    // dst_u
  4203     mov        edi, [esp + 4 + 12]   // dst_v
  4204     mov        ecx, [esp + 4 + 16]   // pix
  4205     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4206     psrlw      xmm5, 8
  4207     sub        edi, edx
  4209     align      4
  4210   convertloop:
  4211     movdqu     xmm0, [eax]
  4212     movdqu     xmm1, [eax + 16]
  4213     lea        eax,  [eax + 32]
  4214     psrlw      xmm0, 8      // YUYV -> UVUV
  4215     psrlw      xmm1, 8
  4216     packuswb   xmm0, xmm1
  4217     movdqa     xmm1, xmm0
  4218     pand       xmm0, xmm5  // U
  4219     packuswb   xmm0, xmm0
  4220     psrlw      xmm1, 8     // V
  4221     packuswb   xmm1, xmm1
  4222     movq       qword ptr [edx], xmm0
  4223     movq       qword ptr [edx + edi], xmm1
  4224     lea        edx, [edx + 8]
  4225     sub        ecx, 16
  4226     jg         convertloop
  4228     pop        edi
  4229     ret
  4233 __declspec(naked) __declspec(align(16))
  4234 void UYVYToYRow_SSE2(const uint8* src_uyvy,
  4235                      uint8* dst_y, int pix) {
  4236   __asm {
  4237     mov        eax, [esp + 4]    // src_uyvy
  4238     mov        edx, [esp + 8]    // dst_y
  4239     mov        ecx, [esp + 12]   // pix
  4241     align      4
  4242   convertloop:
  4243     movdqa     xmm0, [eax]
  4244     movdqa     xmm1, [eax + 16]
  4245     lea        eax,  [eax + 32]
  4246     psrlw      xmm0, 8    // odd bytes are Y
  4247     psrlw      xmm1, 8
  4248     packuswb   xmm0, xmm1
  4249     sub        ecx, 16
  4250     movdqa     [edx], xmm0
  4251     lea        edx, [edx + 16]
  4252     jg         convertloop
  4253     ret
  4257 __declspec(naked) __declspec(align(16))
  4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  4259                       uint8* dst_u, uint8* dst_v, int pix) {
  4260   __asm {
  4261     push       esi
  4262     push       edi
  4263     mov        eax, [esp + 8 + 4]    // src_uyvy
  4264     mov        esi, [esp + 8 + 8]    // stride_uyvy
  4265     mov        edx, [esp + 8 + 12]   // dst_u
  4266     mov        edi, [esp + 8 + 16]   // dst_v
  4267     mov        ecx, [esp + 8 + 20]   // pix
  4268     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4269     psrlw      xmm5, 8
  4270     sub        edi, edx
  4272     align      4
  4273   convertloop:
  4274     movdqa     xmm0, [eax]
  4275     movdqa     xmm1, [eax + 16]
  4276     movdqa     xmm2, [eax + esi]
  4277     movdqa     xmm3, [eax + esi + 16]
  4278     lea        eax,  [eax + 32]
  4279     pavgb      xmm0, xmm2
  4280     pavgb      xmm1, xmm3
  4281     pand       xmm0, xmm5   // UYVY -> UVUV
  4282     pand       xmm1, xmm5
  4283     packuswb   xmm0, xmm1
  4284     movdqa     xmm1, xmm0
  4285     pand       xmm0, xmm5  // U
  4286     packuswb   xmm0, xmm0
  4287     psrlw      xmm1, 8     // V
  4288     packuswb   xmm1, xmm1
  4289     movq       qword ptr [edx], xmm0
  4290     movq       qword ptr [edx + edi], xmm1
  4291     lea        edx, [edx + 8]
  4292     sub        ecx, 16
  4293     jg         convertloop
  4295     pop        edi
  4296     pop        esi
  4297     ret
  4301 __declspec(naked) __declspec(align(16))
  4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  4303                          uint8* dst_u, uint8* dst_v, int pix) {
  4304   __asm {
  4305     push       edi
  4306     mov        eax, [esp + 4 + 4]    // src_uyvy
  4307     mov        edx, [esp + 4 + 8]    // dst_u
  4308     mov        edi, [esp + 4 + 12]   // dst_v
  4309     mov        ecx, [esp + 4 + 16]   // pix
  4310     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4311     psrlw      xmm5, 8
  4312     sub        edi, edx
  4314     align      4
  4315   convertloop:
  4316     movdqa     xmm0, [eax]
  4317     movdqa     xmm1, [eax + 16]
  4318     lea        eax,  [eax + 32]
  4319     pand       xmm0, xmm5   // UYVY -> UVUV
  4320     pand       xmm1, xmm5
  4321     packuswb   xmm0, xmm1
  4322     movdqa     xmm1, xmm0
  4323     pand       xmm0, xmm5  // U
  4324     packuswb   xmm0, xmm0
  4325     psrlw      xmm1, 8     // V
  4326     packuswb   xmm1, xmm1
  4327     movq       qword ptr [edx], xmm0
  4328     movq       qword ptr [edx + edi], xmm1
  4329     lea        edx, [edx + 8]
  4330     sub        ecx, 16
  4331     jg         convertloop
  4333     pop        edi
  4334     ret
  4338 __declspec(naked) __declspec(align(16))
  4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
  4340                                uint8* dst_y, int pix) {
  4341   __asm {
  4342     mov        eax, [esp + 4]    // src_uyvy
  4343     mov        edx, [esp + 8]    // dst_y
  4344     mov        ecx, [esp + 12]   // pix
  4346     align      4
  4347   convertloop:
  4348     movdqu     xmm0, [eax]
  4349     movdqu     xmm1, [eax + 16]
  4350     lea        eax,  [eax + 32]
  4351     psrlw      xmm0, 8    // odd bytes are Y
  4352     psrlw      xmm1, 8
  4353     packuswb   xmm0, xmm1
  4354     sub        ecx, 16
  4355     movdqu     [edx], xmm0
  4356     lea        edx, [edx + 16]
  4357     jg         convertloop
  4358     ret
  4362 __declspec(naked) __declspec(align(16))
  4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
  4364                                 uint8* dst_u, uint8* dst_v, int pix) {
  4365   __asm {
  4366     push       esi
  4367     push       edi
  4368     mov        eax, [esp + 8 + 4]    // src_uyvy
  4369     mov        esi, [esp + 8 + 8]    // stride_uyvy
  4370     mov        edx, [esp + 8 + 12]   // dst_u
  4371     mov        edi, [esp + 8 + 16]   // dst_v
  4372     mov        ecx, [esp + 8 + 20]   // pix
  4373     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4374     psrlw      xmm5, 8
  4375     sub        edi, edx
  4377     align      4
  4378   convertloop:
  4379     movdqu     xmm0, [eax]
  4380     movdqu     xmm1, [eax + 16]
  4381     movdqu     xmm2, [eax + esi]
  4382     movdqu     xmm3, [eax + esi + 16]
  4383     lea        eax,  [eax + 32]
  4384     pavgb      xmm0, xmm2
  4385     pavgb      xmm1, xmm3
  4386     pand       xmm0, xmm5   // UYVY -> UVUV
  4387     pand       xmm1, xmm5
  4388     packuswb   xmm0, xmm1
  4389     movdqa     xmm1, xmm0
  4390     pand       xmm0, xmm5  // U
  4391     packuswb   xmm0, xmm0
  4392     psrlw      xmm1, 8     // V
  4393     packuswb   xmm1, xmm1
  4394     movq       qword ptr [edx], xmm0
  4395     movq       qword ptr [edx + edi], xmm1
  4396     lea        edx, [edx + 8]
  4397     sub        ecx, 16
  4398     jg         convertloop
  4400     pop        edi
  4401     pop        esi
  4402     ret
  4406 __declspec(naked) __declspec(align(16))
  4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
  4408                                    uint8* dst_u, uint8* dst_v, int pix) {
  4409   __asm {
  4410     push       edi
  4411     mov        eax, [esp + 4 + 4]    // src_uyvy
  4412     mov        edx, [esp + 4 + 8]    // dst_u
  4413     mov        edi, [esp + 4 + 12]   // dst_v
  4414     mov        ecx, [esp + 4 + 16]   // pix
  4415     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
  4416     psrlw      xmm5, 8
  4417     sub        edi, edx
  4419     align      4
  4420   convertloop:
  4421     movdqu     xmm0, [eax]
  4422     movdqu     xmm1, [eax + 16]
  4423     lea        eax,  [eax + 32]
  4424     pand       xmm0, xmm5   // UYVY -> UVUV
  4425     pand       xmm1, xmm5
  4426     packuswb   xmm0, xmm1
  4427     movdqa     xmm1, xmm0
  4428     pand       xmm0, xmm5  // U
  4429     packuswb   xmm0, xmm0
  4430     psrlw      xmm1, 8     // V
  4431     packuswb   xmm1, xmm1
  4432     movq       qword ptr [edx], xmm0
  4433     movq       qword ptr [edx + edi], xmm1
  4434     lea        edx, [edx + 8]
  4435     sub        ecx, 16
  4436     jg         convertloop
  4438     pop        edi
  4439     ret
  4442 #endif  // HAS_YUY2TOYROW_SSE2
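// Illustrative scalar sketch (not part of the library) of the packed 4:2:2
// readers above.  In YUY2 the even bytes are Y and the odd bytes alternate
// U, V; UYVY swaps those roles.  The *ToUVRow variants additionally average
// two source rows (the pavgb against [eax + esi]) before taking the chroma.
static void YUY2ToUV422Row_Sketch_C(const uint8* src_yuy2, uint8* dst_u,
                                    uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {       // one U and one V per two pixels.
    dst_u[x / 2] = src_yuy2[2 * x + 1];
    dst_v[x / 2] = src_yuy2[2 * x + 3];
  }
}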
  4444 #ifdef HAS_ARGBBLENDROW_SSE2
  4445 // Blend 8 pixels at a time.
  4446 __declspec(naked) __declspec(align(16))
  4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  4448                        uint8* dst_argb, int width) {
  4449   __asm {
  4450     push       esi
  4451     mov        eax, [esp + 4 + 4]   // src_argb0
  4452     mov        esi, [esp + 4 + 8]   // src_argb1
  4453     mov        edx, [esp + 4 + 12]  // dst_argb
  4454     mov        ecx, [esp + 4 + 16]  // width
  4455     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
  4456     psrlw      xmm7, 15
  4457     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
  4458     psrlw      xmm6, 8
  4459     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
  4460     psllw      xmm5, 8
  4461     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  4462     pslld      xmm4, 24
  4464     sub        ecx, 1
  4465     je         convertloop1     // only 1 pixel?
  4466     jl         convertloop1b
  4468     // 1 pixel loop until destination pointer is aligned.
  4469   alignloop1:
  4470     test       edx, 15          // aligned?
  4471     je         alignloop1b
  4472     movd       xmm3, [eax]
  4473     lea        eax, [eax + 4]
  4474     movdqa     xmm0, xmm3       // src argb
  4475     pxor       xmm3, xmm4       // ~alpha
  4476     movd       xmm2, [esi]      // _r_b
  4477     psrlw      xmm3, 8          // alpha
  4478     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  4479     pshuflw    xmm3, xmm3, 0F5h
  4480     pand       xmm2, xmm6       // _r_b
  4481     paddw      xmm3, xmm7       // 256 - alpha
  4482     pmullw     xmm2, xmm3       // _r_b * alpha
  4483     movd       xmm1, [esi]      // _a_g
  4484     lea        esi, [esi + 4]
  4485     psrlw      xmm1, 8          // _a_g
  4486     por        xmm0, xmm4       // set alpha to 255
  4487     pmullw     xmm1, xmm3       // _a_g * alpha
  4488     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4489     paddusb    xmm0, xmm2       // + src argb
  4490     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4491     paddusb    xmm0, xmm1       // + src argb
  4492     sub        ecx, 1
  4493     movd       [edx], xmm0
  4494     lea        edx, [edx + 4]
  4495     jge        alignloop1
  4497   alignloop1b:
  4498     add        ecx, 1 - 4
  4499     jl         convertloop4b
  4501     // 4 pixel loop.
  4502   convertloop4:
  4503     movdqu     xmm3, [eax]      // src argb
  4504     lea        eax, [eax + 16]
  4505     movdqa     xmm0, xmm3       // src argb
  4506     pxor       xmm3, xmm4       // ~alpha
  4507     movdqu     xmm2, [esi]      // _r_b
  4508     psrlw      xmm3, 8          // alpha
  4509     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  4510     pshuflw    xmm3, xmm3, 0F5h
  4511     pand       xmm2, xmm6       // _r_b
  4512     paddw      xmm3, xmm7       // 256 - alpha
  4513     pmullw     xmm2, xmm3       // _r_b * alpha
  4514     movdqu     xmm1, [esi]      // _a_g
  4515     lea        esi, [esi + 16]
  4516     psrlw      xmm1, 8          // _a_g
  4517     por        xmm0, xmm4       // set alpha to 255
  4518     pmullw     xmm1, xmm3       // _a_g * alpha
  4519     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4520     paddusb    xmm0, xmm2       // + src argb
  4521     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4522     paddusb    xmm0, xmm1       // + src argb
  4523     sub        ecx, 4
  4524     movdqa     [edx], xmm0
  4525     lea        edx, [edx + 16]
  4526     jge        convertloop4
  4528   convertloop4b:
  4529     add        ecx, 4 - 1
  4530     jl         convertloop1b
  4532     // 1 pixel loop.
  4533   convertloop1:
  4534     movd       xmm3, [eax]      // src argb
  4535     lea        eax, [eax + 4]
  4536     movdqa     xmm0, xmm3       // src argb
  4537     pxor       xmm3, xmm4       // ~alpha
  4538     movd       xmm2, [esi]      // _r_b
  4539     psrlw      xmm3, 8          // alpha
  4540     pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  4541     pshuflw    xmm3, xmm3, 0F5h
  4542     pand       xmm2, xmm6       // _r_b
  4543     paddw      xmm3, xmm7       // 256 - alpha
  4544     pmullw     xmm2, xmm3       // _r_b * alpha
  4545     movd       xmm1, [esi]      // _a_g
  4546     lea        esi, [esi + 4]
  4547     psrlw      xmm1, 8          // _a_g
  4548     por        xmm0, xmm4       // set alpha to 255
  4549     pmullw     xmm1, xmm3       // _a_g * alpha
  4550     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4551     paddusb    xmm0, xmm2       // + src argb
  4552     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4553     paddusb    xmm0, xmm1       // + src argb
  4554     sub        ecx, 1
  4555     movd       [edx], xmm0
  4556     lea        edx, [edx + 4]
  4557     jge        convertloop1
  4559   convertloop1b:
  4560     pop        esi
  4561     ret
  4564 #endif  // HAS_ARGBBLENDROW_SSE2
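// Illustrative scalar sketch (not part of the library, and not claimed to be
// bit exact) of the blend computed above: a source-over blend that uses a
// (256 - alpha) multiplier and forces the result alpha to 255.
static void ARGBBlendPixel_Sketch_C(const uint8* src, const uint8* dst,
                                    uint8* out) {
  unsigned int a = src[3];
  int i;
  for (i = 0; i < 3; ++i) {                          // B, G, R
    unsigned int c = src[i] + ((dst[i] * (256 - a)) >> 8);
    out[i] = (uint8)(c > 255 ? 255 : c);             // paddusb saturates.
  }
  out[3] = 255;                                      // por with 0xff000000.
}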
  4566 #ifdef HAS_ARGBBLENDROW_SSSE3
  4567 // Shuffle table for isolating alpha.
  4568 static const uvec8 kShuffleAlpha = {
  4569   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  4570   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  4571 };
  4572 // Same as SSE2, but replaces:
  4573 //    psrlw      xmm3, 8          // alpha
  4574 //    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
  4575 //    pshuflw    xmm3, xmm3, 0F5h
  4576 // with:
  4577 //    pshufb     xmm3, kShuffleAlpha // alpha
  4578 // Blend 8 pixels at a time.
  4580 __declspec(naked) __declspec(align(16))
  4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  4582                         uint8* dst_argb, int width) {
  4583   __asm {
  4584     push       esi
  4585     mov        eax, [esp + 4 + 4]   // src_argb0
  4586     mov        esi, [esp + 4 + 8]   // src_argb1
  4587     mov        edx, [esp + 4 + 12]  // dst_argb
  4588     mov        ecx, [esp + 4 + 16]  // width
  4589     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
  4590     psrlw      xmm7, 15
  4591     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
  4592     psrlw      xmm6, 8
  4593     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
  4594     psllw      xmm5, 8
  4595     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  4596     pslld      xmm4, 24
  4598     sub        ecx, 1
  4599     je         convertloop1     // only 1 pixel?
  4600     jl         convertloop1b
  4602     // 1 pixel loop until destination pointer is aligned.
  4603   alignloop1:
  4604     test       edx, 15          // aligned?
  4605     je         alignloop1b
  4606     movd       xmm3, [eax]
  4607     lea        eax, [eax + 4]
  4608     movdqa     xmm0, xmm3       // src argb
  4609     pxor       xmm3, xmm4       // ~alpha
  4610     movd       xmm2, [esi]      // _r_b
  4611     pshufb     xmm3, kShuffleAlpha // alpha
  4612     pand       xmm2, xmm6       // _r_b
  4613     paddw      xmm3, xmm7       // 256 - alpha
  4614     pmullw     xmm2, xmm3       // _r_b * alpha
  4615     movd       xmm1, [esi]      // _a_g
  4616     lea        esi, [esi + 4]
  4617     psrlw      xmm1, 8          // _a_g
  4618     por        xmm0, xmm4       // set alpha to 255
  4619     pmullw     xmm1, xmm3       // _a_g * alpha
  4620     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4621     paddusb    xmm0, xmm2       // + src argb
  4622     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4623     paddusb    xmm0, xmm1       // + src argb
  4624     sub        ecx, 1
  4625     movd       [edx], xmm0
  4626     lea        edx, [edx + 4]
  4627     jge        alignloop1
  4629   alignloop1b:
  4630     add        ecx, 1 - 4
  4631     jl         convertloop4b
  4633     test       eax, 15          // unaligned?
  4634     jne        convertuloop4
  4635     test       esi, 15          // unaligned?
  4636     jne        convertuloop4
  4638     // 4 pixel loop.
  4639   convertloop4:
  4640     movdqa     xmm3, [eax]      // src argb
  4641     lea        eax, [eax + 16]
  4642     movdqa     xmm0, xmm3       // src argb
  4643     pxor       xmm3, xmm4       // ~alpha
  4644     movdqa     xmm2, [esi]      // _r_b
  4645     pshufb     xmm3, kShuffleAlpha // alpha
  4646     pand       xmm2, xmm6       // _r_b
  4647     paddw      xmm3, xmm7       // 256 - alpha
  4648     pmullw     xmm2, xmm3       // _r_b * alpha
  4649     movdqa     xmm1, [esi]      // _a_g
  4650     lea        esi, [esi + 16]
  4651     psrlw      xmm1, 8          // _a_g
  4652     por        xmm0, xmm4       // set alpha to 255
  4653     pmullw     xmm1, xmm3       // _a_g * alpha
  4654     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4655     paddusb    xmm0, xmm2       // + src argb
  4656     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4657     paddusb    xmm0, xmm1       // + src argb
  4658     sub        ecx, 4
  4659     movdqa     [edx], xmm0
  4660     lea        edx, [edx + 16]
  4661     jge        convertloop4
  4662     jmp        convertloop4b
  4664     // 4 pixel unaligned loop.
  4665   convertuloop4:
  4666     movdqu     xmm3, [eax]      // src argb
  4667     lea        eax, [eax + 16]
  4668     movdqa     xmm0, xmm3       // src argb
  4669     pxor       xmm3, xmm4       // ~alpha
  4670     movdqu     xmm2, [esi]      // _r_b
  4671     pshufb     xmm3, kShuffleAlpha // alpha
  4672     pand       xmm2, xmm6       // _r_b
  4673     paddw      xmm3, xmm7       // 256 - alpha
  4674     pmullw     xmm2, xmm3       // _r_b * alpha
  4675     movdqu     xmm1, [esi]      // _a_g
  4676     lea        esi, [esi + 16]
  4677     psrlw      xmm1, 8          // _a_g
  4678     por        xmm0, xmm4       // set alpha to 255
  4679     pmullw     xmm1, xmm3       // _a_g * alpha
  4680     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4681     paddusb    xmm0, xmm2       // + src argb
  4682     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4683     paddusb    xmm0, xmm1       // + src argb
  4684     sub        ecx, 4
  4685     movdqa     [edx], xmm0
  4686     lea        edx, [edx + 16]
  4687     jge        convertuloop4
  4689   convertloop4b:
  4690     add        ecx, 4 - 1
  4691     jl         convertloop1b
  4693     // 1 pixel loop.
  4694   convertloop1:
  4695     movd       xmm3, [eax]      // src argb
  4696     lea        eax, [eax + 4]
  4697     movdqa     xmm0, xmm3       // src argb
  4698     pxor       xmm3, xmm4       // ~alpha
  4699     movd       xmm2, [esi]      // _r_b
  4700     pshufb     xmm3, kShuffleAlpha // alpha
  4701     pand       xmm2, xmm6       // _r_b
  4702     paddw      xmm3, xmm7       // 256 - alpha
  4703     pmullw     xmm2, xmm3       // _r_b * alpha
  4704     movd       xmm1, [esi]      // _a_g
  4705     lea        esi, [esi + 4]
  4706     psrlw      xmm1, 8          // _a_g
  4707     por        xmm0, xmm4       // set alpha to 255
  4708     pmullw     xmm1, xmm3       // _a_g * alpha
  4709     psrlw      xmm2, 8          // _r_b convert to 8 bits again
  4710     paddusb    xmm0, xmm2       // + src argb
  4711     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
  4712     paddusb    xmm0, xmm1       // + src argb
  4713     sub        ecx, 1
  4714     movd       [edx], xmm0
  4715     lea        edx, [edx + 4]
  4716     jge        convertloop1
  4718   convertloop1b:
  4719     pop        esi
  4720     ret
  4723 #endif  // HAS_ARGBBLENDROW_SSSE3
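// A minimal scalar sketch of the blend math in the loops above: the pixel read
// through eax is treated as the premultiplied foreground, the pixel read
// through esi as the background scaled by (256 - foreground alpha) >> 8, and
// the result alpha is forced to 255.  The helper name is illustrative only.
#if 0
static void ARGBBlendRow_Sketch(const uint8* src_fg, const uint8* src_bg,
                                uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32 a = src_fg[3];
    for (int j = 0; j < 3; ++j) {  // B, G, R
      uint32 blended = src_fg[j] + (((256 - a) * src_bg[j]) >> 8);
      dst_argb[j] = (uint8)(blended > 255 ? 255 : blended);
    }
    dst_argb[3] = 255u;  // set alpha to 255, as the SIMD path does
    src_fg += 4;
    src_bg += 4;
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.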
  4725 #ifdef HAS_ARGBATTENUATEROW_SSE2
  4726 // Attenuate 4 pixels at a time.
  4727 // Aligned to 16 bytes.
  4728 __declspec(naked) __declspec(align(16))
  4729 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  4730   __asm {
  4731     mov        eax, [esp + 4]   // src_argb0
  4732     mov        edx, [esp + 8]   // dst_argb
  4733     mov        ecx, [esp + 12]  // width
  4734     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
  4735     pslld      xmm4, 24
  4736     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
  4737     psrld      xmm5, 8
  4739     align      4
  4740  convertloop:
  4741     movdqa     xmm0, [eax]      // read 4 pixels
  4742     punpcklbw  xmm0, xmm0       // first 2
  4743     pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
  4744     pshuflw    xmm2, xmm2, 0FFh
  4745     pmulhuw    xmm0, xmm2       // rgb * a
  4746     movdqa     xmm1, [eax]      // read 4 pixels
  4747     punpckhbw  xmm1, xmm1       // next 2 pixels
  4748     pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
  4749     pshuflw    xmm2, xmm2, 0FFh
  4750     pmulhuw    xmm1, xmm2       // rgb * a
  4751     movdqa     xmm2, [eax]      // alphas
  4752     lea        eax, [eax + 16]
  4753     psrlw      xmm0, 8
  4754     pand       xmm2, xmm4
  4755     psrlw      xmm1, 8
  4756     packuswb   xmm0, xmm1
  4757     pand       xmm0, xmm5       // keep original alphas
  4758     por        xmm0, xmm2
  4759     sub        ecx, 4
  4760     movdqa     [edx], xmm0
  4761     lea        edx, [edx + 16]
  4762     jg         convertloop
  4764     ret
  4767 #endif  // HAS_ARGBATTENUATEROW_SSE2
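// A minimal scalar sketch of the fixed point math used by the attenuate rows
// above and below: each channel is widened to (v | v << 8), multiplied by the
// similarly widened alpha, and the product is shifted down 24 bits, which is
// approximately v * a / 255.  The helper name is illustrative only.
#if 0
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    for (int j = 0; j < 3; ++j) {  // B, G, R attenuated; alpha copied.
      uint32 v = src_argb[j];
      dst_argb[j] = (uint8)(((v | (v << 8)) * (a | (a << 8))) >> 24);
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.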
  4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3
  4770 // Shuffle table duplicating alpha.
  4771 static const uvec8 kShuffleAlpha0 = {
  4772   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  4773 };
  4774 static const uvec8 kShuffleAlpha1 = {
  4775   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  4776   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  4777 };
  4778 __declspec(naked) __declspec(align(16))
  4779 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  4780   __asm {
  4781     mov        eax, [esp + 4]   // src_argb0
  4782     mov        edx, [esp + 8]   // dst_argb
  4783     mov        ecx, [esp + 12]  // width
  4784     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
  4785     pslld      xmm3, 24
  4786     movdqa     xmm4, kShuffleAlpha0
  4787     movdqa     xmm5, kShuffleAlpha1
  4789     align      4
  4790  convertloop:
  4791     movdqu     xmm0, [eax]      // read 4 pixels
  4792     pshufb     xmm0, xmm4       // isolate first 2 alphas
  4793     movdqu     xmm1, [eax]      // read 4 pixels
  4794     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
  4795     pmulhuw    xmm0, xmm1       // rgb * a
  4796     movdqu     xmm1, [eax]      // read 4 pixels
  4797     pshufb     xmm1, xmm5       // isolate next 2 alphas
  4798     movdqu     xmm2, [eax]      // read 4 pixels
  4799     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
  4800     pmulhuw    xmm1, xmm2       // rgb * a
  4801     movdqu     xmm2, [eax]      // mask original alpha
  4802     lea        eax, [eax + 16]
  4803     pand       xmm2, xmm3
  4804     psrlw      xmm0, 8
  4805     psrlw      xmm1, 8
  4806     packuswb   xmm0, xmm1
  4807     por        xmm0, xmm2       // copy original alpha
  4808     sub        ecx, 4
  4809     movdqu     [edx], xmm0
  4810     lea        edx, [edx + 16]
  4811     jg         convertloop
  4813     ret
  4816 #endif  // HAS_ARGBATTENUATEROW_SSSE3
  4818 #ifdef HAS_ARGBATTENUATEROW_AVX2
  4819 // Shuffle table duplicating alpha.
  4820 static const ulvec8 kShuffleAlpha_AVX2 = {
  4821   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  4822   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  4823   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  4824   14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  4825 };
  4826 __declspec(naked) __declspec(align(16))
  4827 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  4828   __asm {
  4829     mov        eax, [esp + 4]   // src_argb0
  4830     mov        edx, [esp + 8]   // dst_argb
  4831     mov        ecx, [esp + 12]  // width
  4832     sub        edx, eax
  4833     vmovdqa    ymm4, kShuffleAlpha_AVX2
  4834     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
  4835     vpslld     ymm5, ymm5, 24
  4837     align      4
  4838  convertloop:
  4839     vmovdqu    ymm6, [eax]       // read 8 pixels.
  4840     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  4841     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  4842     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
  4843     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
  4844     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
  4845     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
  4846     vpand      ymm6, ymm6, ymm5  // isolate alpha
  4847     vpsrlw     ymm0, ymm0, 8
  4848     vpsrlw     ymm1, ymm1, 8
  4849     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  4850     vpor       ymm0, ymm0, ymm6  // copy original alpha
  4851     sub        ecx, 8
  4852     vmovdqu    [eax + edx], ymm0
  4853     lea        eax, [eax + 32]
  4854     jg         convertloop
  4856     vzeroupper
  4857     ret
  4860 #endif  // HAS_ARGBATTENUATEROW_AVX2
  4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  4863 // Unattenuate 4 pixels at a time.
  4864 // Aligned to 16 bytes.
  4865 __declspec(naked) __declspec(align(16))
  4866 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  4867                              int width) {
  4868   __asm {
  4869     push       esi
  4870     push       edi
  4871     mov        eax, [esp + 8 + 4]   // src_argb0
  4872     mov        edx, [esp + 8 + 8]   // dst_argb
  4873     mov        ecx, [esp + 8 + 12]  // width
  4875     align      4
  4876  convertloop:
  4877     movdqu     xmm0, [eax]      // read 4 pixels
  4878     movzx      esi, byte ptr [eax + 3]  // first alpha
  4879     movzx      edi, byte ptr [eax + 7]  // second alpha
  4880     punpcklbw  xmm0, xmm0       // first 2
  4881     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
  4882     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
  4883     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
  4884     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
  4885     movlhps    xmm2, xmm3
  4886     pmulhuw    xmm0, xmm2       // rgb * a
  4888     movdqu     xmm1, [eax]      // read 4 pixels
  4889     movzx      esi, byte ptr [eax + 11]  // third alpha
  4890     movzx      edi, byte ptr [eax + 15]  // fourth alpha
  4891     punpckhbw  xmm1, xmm1       // next 2
  4892     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
  4893     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
  4894     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
  4895     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
  4896     movlhps    xmm2, xmm3
  4897     pmulhuw    xmm1, xmm2       // rgb * a
  4898     lea        eax, [eax + 16]
  4900     packuswb   xmm0, xmm1
  4901     sub        ecx, 4
  4902     movdqu     [edx], xmm0
  4903     lea        edx, [edx + 16]
  4904     jg         convertloop
  4905     pop        edi
  4906     pop        esi
  4907     ret
  4910 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
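// A minimal scalar sketch of the unattenuate rows: each color channel is
// multiplied by a fixed point reciprocal of alpha and saturated to 255.  It is
// assumed here that fixed_invtbl8 holds roughly a 65536 / a reciprocal in its
// low 16 bits; the plain divide and the helper name are illustrative only.
#if 0
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    uint32 b = src_argb[0];
    uint32 g = src_argb[1];
    uint32 r = src_argb[2];
    uint32 a = src_argb[3];
    if (a) {
      uint32 ia = 65536u / a;  // stand-in for the fixed_invtbl8 entry
      b = (b * ia) >> 8;
      g = (g * ia) >> 8;
      r = (r * ia) >> 8;
    }
    dst_argb[0] = (uint8)(b > 255 ? 255 : b);
    dst_argb[1] = (uint8)(g > 255 ? 255 : g);
    dst_argb[2] = (uint8)(r > 255 ? 255 : r);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.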
  4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4913 // Shuffle table duplicating alpha.
  4914 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  4915   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
  4916   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
  4917 };
  4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  4919 // USE_GATHER is not on by default, due to being a slow instruction.
  4920 #ifdef USE_GATHER
  4921 __declspec(naked) __declspec(align(16))
  4922 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  4923                              int width) {
  4924   __asm {
  4925     mov        eax, [esp + 4]   // src_argb0
  4926     mov        edx, [esp + 8]   // dst_argb
  4927     mov        ecx, [esp + 12]  // width
  4928     sub        edx, eax
  4929     vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
  4931     align      4
  4932  convertloop:
  4933     vmovdqu    ymm6, [eax]       // read 8 pixels.
  4934     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
  4935     vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
  4936     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  4937     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  4938     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
  4939     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
  4940     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
  4941     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
  4942     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
  4943     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
  4944     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
  4945     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  4946     sub        ecx, 8
  4947     vmovdqu    [eax + edx], ymm0
  4948     lea        eax, [eax + 32]
  4949     jg         convertloop
  4951     vzeroupper
  4952     ret
  4955 #else  // USE_GATHER
  4956 __declspec(naked) __declspec(align(16))
  4957 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  4958                              int width) {
  4959   __asm {
  4961     mov        eax, [esp + 4]   // src_argb0
  4962     mov        edx, [esp + 8]   // dst_argb
  4963     mov        ecx, [esp + 12]  // width
  4964     sub        edx, eax
  4965     vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
  4967     push       esi
  4968     push       edi
  4970     align      4
  4971  convertloop:
  4972     // replace VPGATHER
  4973     movzx      esi, byte ptr [eax + 3]                 // alpha0
  4974     movzx      edi, byte ptr [eax + 7]                 // alpha1
  4975     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
  4976     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
  4977     movzx      esi, byte ptr [eax + 11]                // alpha2
  4978     movzx      edi, byte ptr [eax + 15]                // alpha3
  4979     vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
  4980     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
  4981     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
  4982     movzx      esi, byte ptr [eax + 19]                // alpha4
  4983     movzx      edi, byte ptr [eax + 23]                // alpha5
  4984     vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
  4985     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
  4986     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
  4987     movzx      esi, byte ptr [eax + 27]                // alpha6
  4988     movzx      edi, byte ptr [eax + 31]                // alpha7
  4989     vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
  4990     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
  4991     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
  4992     vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
  4993     vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
  4994     vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
  4995     vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  4996     // end of VPGATHER
  4998     vmovdqu    ymm6, [eax]       // read 8 pixels.
  4999     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
  5000     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
  5001     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
  5002     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
  5003     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
  5004     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
  5005     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
  5006     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
  5007     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
  5008     sub        ecx, 8
  5009     vmovdqu    [eax + edx], ymm0
  5010     lea        eax, [eax + 32]
  5011     jg         convertloop
  5013     pop        edi
  5014     pop        esi
  5015     vzeroupper
  5016     ret
  5019 #endif  // USE_GATHER
  5020 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
  5022 #ifdef HAS_ARGBGRAYROW_SSSE3
  5023 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
  5024 __declspec(naked) __declspec(align(16))
  5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  5026   __asm {
  5027     mov        eax, [esp + 4]   /* src_argb */
  5028     mov        edx, [esp + 8]   /* dst_argb */
  5029     mov        ecx, [esp + 12]  /* width */
  5030     movdqa     xmm4, kARGBToYJ
  5031     movdqa     xmm5, kAddYJ64
  5033     align      4
  5034  convertloop:
  5035     movdqa     xmm0, [eax]  // G
  5036     movdqa     xmm1, [eax + 16]
  5037     pmaddubsw  xmm0, xmm4
  5038     pmaddubsw  xmm1, xmm4
  5039     phaddw     xmm0, xmm1
  5040     paddw      xmm0, xmm5  // Add .5 for rounding.
  5041     psrlw      xmm0, 7
  5042     packuswb   xmm0, xmm0   // 8 G bytes
  5043     movdqa     xmm2, [eax]  // A
  5044     movdqa     xmm3, [eax + 16]
  5045     lea        eax, [eax + 32]
  5046     psrld      xmm2, 24
  5047     psrld      xmm3, 24
  5048     packuswb   xmm2, xmm3
  5049     packuswb   xmm2, xmm2   // 8 A bytes
  5050     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
  5051     punpcklbw  xmm0, xmm0   // 8 GG words
  5052     punpcklbw  xmm3, xmm2   // 8 GA words
  5053     movdqa     xmm1, xmm0
  5054     punpcklwd  xmm0, xmm3   // GGGA first 4
  5055     punpckhwd  xmm1, xmm3   // GGGA next 4
  5056     sub        ecx, 8
  5057     movdqa     [edx], xmm0
  5058     movdqa     [edx + 16], xmm1
  5059     lea        edx, [edx + 32]
  5060     jg         convertloop
  5061     ret
  5064 #endif  // HAS_ARGBGRAYROW_SSSE3
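// A minimal scalar sketch of the gray conversion above: a JPeg range luma is
// computed with the kARGBToYJ weights, rounded and replicated into B, G and R
// while the original alpha is kept.  The rounding term is assumed to be 64
// (0.5 in this 7 bit fixed point), as the kAddYJ64 name suggests; the helper
// name is illustrative only.
#if 0
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width) {
  for (int i = 0; i < width; ++i) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    int y = (b * kARGBToYJ[0] + g * kARGBToYJ[1] + r * kARGBToYJ[2] + 64) >> 7;
    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)y;
    dst_argb[3] = src_argb[3];  // preserve alpha
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.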
  5066 #ifdef HAS_ARGBSEPIAROW_SSSE3
  5067 //    b = (r * 35 + g * 68 + b * 17) >> 7
  5068 //    g = (r * 45 + g * 88 + b * 22) >> 7
  5069 //    r = (r * 50 + g * 98 + b * 24) >> 7
  5070 // Constant for ARGB color to sepia tone.
  5071 static const vec8 kARGBToSepiaB = {
  5072   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  5073 };
  5075 static const vec8 kARGBToSepiaG = {
  5076   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  5077 };
  5079 static const vec8 kARGBToSepiaR = {
  5080   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  5081 };
  5083 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  5084 __declspec(naked) __declspec(align(16))
  5085 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  5086   __asm {
  5087     mov        eax, [esp + 4]   /* dst_argb */
  5088     mov        ecx, [esp + 8]   /* width */
  5089     movdqa     xmm2, kARGBToSepiaB
  5090     movdqa     xmm3, kARGBToSepiaG
  5091     movdqa     xmm4, kARGBToSepiaR
  5093     align      4
  5094  convertloop:
  5095     movdqa     xmm0, [eax]  // B
  5096     movdqa     xmm6, [eax + 16]
  5097     pmaddubsw  xmm0, xmm2
  5098     pmaddubsw  xmm6, xmm2
  5099     phaddw     xmm0, xmm6
  5100     psrlw      xmm0, 7
  5101     packuswb   xmm0, xmm0   // 8 B values
  5102     movdqa     xmm5, [eax]  // G
  5103     movdqa     xmm1, [eax + 16]
  5104     pmaddubsw  xmm5, xmm3
  5105     pmaddubsw  xmm1, xmm3
  5106     phaddw     xmm5, xmm1
  5107     psrlw      xmm5, 7
  5108     packuswb   xmm5, xmm5   // 8 G values
  5109     punpcklbw  xmm0, xmm5   // 8 BG values
  5110     movdqa     xmm5, [eax]  // R
  5111     movdqa     xmm1, [eax + 16]
  5112     pmaddubsw  xmm5, xmm4
  5113     pmaddubsw  xmm1, xmm4
  5114     phaddw     xmm5, xmm1
  5115     psrlw      xmm5, 7
  5116     packuswb   xmm5, xmm5   // 8 R values
  5117     movdqa     xmm6, [eax]  // A
  5118     movdqa     xmm1, [eax + 16]
  5119     psrld      xmm6, 24
  5120     psrld      xmm1, 24
  5121     packuswb   xmm6, xmm1
  5122     packuswb   xmm6, xmm6   // 8 A values
  5123     punpcklbw  xmm5, xmm6   // 8 RA values
  5124     movdqa     xmm1, xmm0   // Weave BG, RA together
  5125     punpcklwd  xmm0, xmm5   // BGRA first 4
  5126     punpckhwd  xmm1, xmm5   // BGRA next 4
  5127     sub        ecx, 8
  5128     movdqa     [eax], xmm0
  5129     movdqa     [eax + 16], xmm1
  5130     lea        eax, [eax + 32]
  5131     jg         convertloop
  5132     ret
  5135 #endif  // HAS_ARGBSEPIAROW_SSSE3
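// A minimal scalar sketch of the sepia formulas and constants above (the
// constants are stored in B, G, R order to match ARGB memory layout for
// pmaddubsw); results are saturated to 255 as packuswb does, and alpha is left
// untouched.  The helper name is illustrative only.
#if 0
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.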
  5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
  5138 // Transform 8 ARGB pixels (32 bytes) with color matrix.
  5139 // Same as Sepia except matrix is provided.
  5140 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
  5141 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
  5142 __declspec(naked) __declspec(align(16))
  5143 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  5144                               const int8* matrix_argb, int width) {
  5145   __asm {
  5146     mov        eax, [esp + 4]   /* src_argb */
  5147     mov        edx, [esp + 8]   /* dst_argb */
  5148     mov        ecx, [esp + 12]  /* matrix_argb */
  5149     movdqu     xmm5, [ecx]
  5150     pshufd     xmm2, xmm5, 0x00
  5151     pshufd     xmm3, xmm5, 0x55
  5152     pshufd     xmm4, xmm5, 0xaa
  5153     pshufd     xmm5, xmm5, 0xff
  5154     mov        ecx, [esp + 16]  /* width */
  5156     align      4
  5157  convertloop:
  5158     movdqa     xmm0, [eax]  // B
  5159     movdqa     xmm7, [eax + 16]
  5160     pmaddubsw  xmm0, xmm2
  5161     pmaddubsw  xmm7, xmm2
  5162     movdqa     xmm6, [eax]  // G
  5163     movdqa     xmm1, [eax + 16]
  5164     pmaddubsw  xmm6, xmm3
  5165     pmaddubsw  xmm1, xmm3
  5166     phaddsw    xmm0, xmm7   // B
  5167     phaddsw    xmm6, xmm1   // G
  5168     psraw      xmm0, 6      // B
  5169     psraw      xmm6, 6      // G
  5170     packuswb   xmm0, xmm0   // 8 B values
  5171     packuswb   xmm6, xmm6   // 8 G values
  5172     punpcklbw  xmm0, xmm6   // 8 BG values
  5173     movdqa     xmm1, [eax]  // R
  5174     movdqa     xmm7, [eax + 16]
  5175     pmaddubsw  xmm1, xmm4
  5176     pmaddubsw  xmm7, xmm4
  5177     phaddsw    xmm1, xmm7   // R
  5178     movdqa     xmm6, [eax]  // A
  5179     movdqa     xmm7, [eax + 16]
  5180     pmaddubsw  xmm6, xmm5
  5181     pmaddubsw  xmm7, xmm5
  5182     phaddsw    xmm6, xmm7   // A
  5183     psraw      xmm1, 6      // R
  5184     psraw      xmm6, 6      // A
  5185     packuswb   xmm1, xmm1   // 8 R values
  5186     packuswb   xmm6, xmm6   // 8 A values
  5187     punpcklbw  xmm1, xmm6   // 8 RA values
  5188     movdqa     xmm6, xmm0   // Weave BG, RA together
  5189     punpcklwd  xmm0, xmm1   // BGRA first 4
  5190     punpckhwd  xmm6, xmm1   // BGRA next 4
  5191     sub        ecx, 8
  5192     movdqa     [edx], xmm0
  5193     movdqa     [edx + 16], xmm6
  5194     lea        eax, [eax + 32]
  5195     lea        edx, [edx + 32]
  5196     jg         convertloop
  5197     ret
  5200 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
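// A minimal scalar sketch of the color matrix transform above: each output
// channel is a dot product of the B, G, R, A input bytes with one row of
// matrix_argb, shifted down by 6 and saturated to 0..255.  The helper name is
// illustrative only.
#if 0
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    int a = src_argb[3];
    for (int j = 0; j < 4; ++j) {  // output B, G, R, A in turn
      int v = (b * matrix_argb[j * 4 + 0] + g * matrix_argb[j * 4 + 1] +
               r * matrix_argb[j * 4 + 2] + a * matrix_argb[j * 4 + 3]) >> 6;
      dst_argb[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.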
  5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2
  5203 // Quantize 4 ARGB pixels (16 bytes).
  5204 // Aligned to 16 bytes.
  5205 __declspec(naked) __declspec(align(16))
  5206 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
  5207                           int interval_offset, int width) {
  5208   __asm {
  5209     mov        eax, [esp + 4]    /* dst_argb */
  5210     movd       xmm2, [esp + 8]   /* scale */
  5211     movd       xmm3, [esp + 12]  /* interval_size */
  5212     movd       xmm4, [esp + 16]  /* interval_offset */
  5213     mov        ecx, [esp + 20]   /* width */
  5214     pshuflw    xmm2, xmm2, 040h
  5215     pshufd     xmm2, xmm2, 044h
  5216     pshuflw    xmm3, xmm3, 040h
  5217     pshufd     xmm3, xmm3, 044h
  5218     pshuflw    xmm4, xmm4, 040h
  5219     pshufd     xmm4, xmm4, 044h
  5220     pxor       xmm5, xmm5  // constant 0
  5221     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
  5222     pslld      xmm6, 24
  5224     align      4
  5225  convertloop:
  5226     movdqa     xmm0, [eax]  // read 4 pixels
  5227     punpcklbw  xmm0, xmm5   // first 2 pixels
  5228     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
  5229     movdqa     xmm1, [eax]  // read 4 pixels
  5230     punpckhbw  xmm1, xmm5   // next 2 pixels
  5231     pmulhuw    xmm1, xmm2
  5232     pmullw     xmm0, xmm3   // * interval_size
  5233     movdqa     xmm7, [eax]  // read 4 pixels
  5234     pmullw     xmm1, xmm3
  5235     pand       xmm7, xmm6   // mask alpha
  5236     paddw      xmm0, xmm4   // + interval_offset
  5237     paddw      xmm1, xmm4
  5238     packuswb   xmm0, xmm1
  5239     por        xmm0, xmm7
  5240     sub        ecx, 4
  5241     movdqa     [eax], xmm0
  5242     lea        eax, [eax + 16]
  5243     jg         convertloop
  5244     ret
  5247 #endif  // HAS_ARGBQUANTIZEROW_SSE2
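// A minimal scalar sketch of the quantize row above: each color channel is
// scaled by the 16.16 fixed point 'scale' (typically a reciprocal of
// interval_size, an assumption about the caller), snapped onto its interval
// and offset; alpha is preserved.  The helper name is illustrative only.
#if 0
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 3; ++j) {  // B, G, R; alpha left untouched.
      int v = dst_argb[j];
      dst_argb[j] = (uint8)((v * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}
#endif  // Illustrative sketch; not compiled.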
  5249 #ifdef HAS_ARGBSHADEROW_SSE2
  5250 // Shade 4 pixels at a time by specified value.
  5251 // Aligned to 16 bytes.
  5252 __declspec(naked) __declspec(align(16))
  5253 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
  5254                        uint32 value) {
  5255   __asm {
  5256     mov        eax, [esp + 4]   // src_argb
  5257     mov        edx, [esp + 8]   // dst_argb
  5258     mov        ecx, [esp + 12]  // width
  5259     movd       xmm2, [esp + 16]  // value
  5260     punpcklbw  xmm2, xmm2
  5261     punpcklqdq xmm2, xmm2
  5263     align      4
  5264  convertloop:
  5265     movdqa     xmm0, [eax]      // read 4 pixels
  5266     lea        eax, [eax + 16]
  5267     movdqa     xmm1, xmm0
  5268     punpcklbw  xmm0, xmm0       // first 2
  5269     punpckhbw  xmm1, xmm1       // next 2
  5270     pmulhuw    xmm0, xmm2       // argb * value
  5271     pmulhuw    xmm1, xmm2       // argb * value
  5272     psrlw      xmm0, 8
  5273     psrlw      xmm1, 8
  5274     packuswb   xmm0, xmm1
  5275     sub        ecx, 4
  5276     movdqa     [edx], xmm0
  5277     lea        edx, [edx + 16]
  5278     jg         convertloop
  5280     ret
  5283 #endif  // HAS_ARGBSHADEROW_SSE2
  5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2
  5286 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  5287 __declspec(naked) __declspec(align(16))
  5288 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  5289                           uint8* dst_argb, int width) {
  5290   __asm {
  5291     push       esi
  5292     mov        eax, [esp + 4 + 4]   // src_argb0
  5293     mov        esi, [esp + 4 + 8]   // src_argb1
  5294     mov        edx, [esp + 4 + 12]  // dst_argb
  5295     mov        ecx, [esp + 4 + 16]  // width
  5296     pxor       xmm5, xmm5  // constant 0
  5298     align      4
  5299  convertloop:
  5300     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  5301     movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
  5302     movdqu     xmm1, xmm0
  5303     movdqu     xmm3, xmm2
  5304     punpcklbw  xmm0, xmm0         // first 2
  5305     punpckhbw  xmm1, xmm1         // next 2
  5306     punpcklbw  xmm2, xmm5         // first 2
  5307     punpckhbw  xmm3, xmm5         // next 2
  5308     pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
  5309     pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
  5310     lea        eax, [eax + 16]
  5311     lea        esi, [esi + 16]
  5312     packuswb   xmm0, xmm1
  5313     sub        ecx, 4
  5314     movdqu     [edx], xmm0
  5315     lea        edx, [edx + 16]
  5316     jg         convertloop
  5318     pop        esi
  5319     ret
  5322 #endif  // HAS_ARGBMULTIPLYROW_SSE2
  5324 #ifdef HAS_ARGBADDROW_SSE2
  5325 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  5326 // TODO(fbarchard): Port this to posix, neon and other math functions.
  5327 __declspec(naked) __declspec(align(16))
  5328 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  5329                      uint8* dst_argb, int width) {
  5330   __asm {
  5331     push       esi
  5332     mov        eax, [esp + 4 + 4]   // src_argb0
  5333     mov        esi, [esp + 4 + 8]   // src_argb1
  5334     mov        edx, [esp + 4 + 12]  // dst_argb
  5335     mov        ecx, [esp + 4 + 16]  // width
  5337     sub        ecx, 4
  5338     jl         convertloop49
  5340     align      4
  5341  convertloop4:
  5342     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  5343     lea        eax, [eax + 16]
  5344     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
  5345     lea        esi, [esi + 16]
  5346     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
  5347     sub        ecx, 4
  5348     movdqu     [edx], xmm0
  5349     lea        edx, [edx + 16]
  5350     jge        convertloop4
  5352  convertloop49:
  5353     add        ecx, 4 - 1
  5354     jl         convertloop19
  5356  convertloop1:
  5357     movd       xmm0, [eax]        // read 1 pixel from src_argb0
  5358     lea        eax, [eax + 4]
  5359     movd       xmm1, [esi]        // read 1 pixel from src_argb1
  5360     lea        esi, [esi + 4]
  5361     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
  5362     sub        ecx, 1
  5363     movd       [edx], xmm0
  5364     lea        edx, [edx + 4]
  5365     jge        convertloop1
  5367  convertloop19:
  5368     pop        esi
  5369     ret
  5372 #endif  // HAS_ARGBADDROW_SSE2
  5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2
  5375 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
  5376 __declspec(naked) __declspec(align(16))
  5377 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  5378                           uint8* dst_argb, int width) {
  5379   __asm {
  5380     push       esi
  5381     mov        eax, [esp + 4 + 4]   // src_argb0
  5382     mov        esi, [esp + 4 + 8]   // src_argb1
  5383     mov        edx, [esp + 4 + 12]  // dst_argb
  5384     mov        ecx, [esp + 4 + 16]  // width
  5386     align      4
  5387  convertloop:
  5388     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
  5389     lea        eax, [eax + 16]
  5390     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
  5391     lea        esi, [esi + 16]
  5392     psubusb    xmm0, xmm1         // src_argb0 - src_argb1
  5393     sub        ecx, 4
  5394     movdqu     [edx], xmm0
  5395     lea        edx, [edx + 16]
  5396     jg         convertloop
  5398     pop        esi
  5399     ret
  5402 #endif  // HAS_ARGBSUBTRACTROW_SSE2
  5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2
  5405 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  5406 __declspec(naked) __declspec(align(16))
  5407 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  5408                           uint8* dst_argb, int width) {
  5409   __asm {
  5410     push       esi
  5411     mov        eax, [esp + 4 + 4]   // src_argb0
  5412     mov        esi, [esp + 4 + 8]   // src_argb1
  5413     mov        edx, [esp + 4 + 12]  // dst_argb
  5414     mov        ecx, [esp + 4 + 16]  // width
  5415     vpxor      ymm5, ymm5, ymm5     // constant 0
  5417     align      4
  5418  convertloop:
  5419     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
  5420     lea        eax, [eax + 32]
  5421     vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
  5422     lea        esi, [esi + 32]
  5423     vpunpcklbw ymm0, ymm1, ymm1   // low 4
  5424     vpunpckhbw ymm1, ymm1, ymm1   // high 4
  5425     vpunpcklbw ymm2, ymm3, ymm5   // low 4
  5426     vpunpckhbw ymm3, ymm3, ymm5   // high 4
  5427     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
  5428     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
  5429     vpackuswb  ymm0, ymm0, ymm1
  5430     vmovdqu    [edx], ymm0
  5431     lea        edx, [edx + 32]
  5432     sub        ecx, 8
  5433     jg         convertloop
  5435     pop        esi
  5436     vzeroupper
  5437     ret
  5440 #endif  // HAS_ARGBMULTIPLYROW_AVX2
  5442 #ifdef HAS_ARGBADDROW_AVX2
  5443 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  5444 __declspec(naked) __declspec(align(16))
  5445 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  5446                      uint8* dst_argb, int width) {
  5447   __asm {
  5448     push       esi
  5449     mov        eax, [esp + 4 + 4]   // src_argb0
  5450     mov        esi, [esp + 4 + 8]   // src_argb1
  5451     mov        edx, [esp + 4 + 12]  // dst_argb
  5452     mov        ecx, [esp + 4 + 16]  // width
  5454     align      4
  5455  convertloop:
  5456     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
  5457     lea        eax, [eax + 32]
  5458     vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
  5459     lea        esi, [esi + 32]
  5460     vmovdqu    [edx], ymm0
  5461     lea        edx, [edx + 32]
  5462     sub        ecx, 8
  5463     jg         convertloop
  5465     pop        esi
  5466     vzeroupper
  5467     ret
  5470 #endif  // HAS_ARGBADDROW_AVX2
  5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2
  5473 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
  5474 __declspec(naked) __declspec(align(16))
  5475 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  5476                           uint8* dst_argb, int width) {
  5477   __asm {
  5478     push       esi
  5479     mov        eax, [esp + 4 + 4]   // src_argb0
  5480     mov        esi, [esp + 4 + 8]   // src_argb1
  5481     mov        edx, [esp + 4 + 12]  // dst_argb
  5482     mov        ecx, [esp + 4 + 16]  // width
  5484     align      4
  5485  convertloop:
  5486     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
  5487     lea        eax, [eax + 32]
  5488     vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
  5489     lea        esi, [esi + 32]
  5490     vmovdqu    [edx], ymm0
  5491     lea        edx, [edx + 32]
  5492     sub        ecx, 8
  5493     jg         convertloop
  5495     pop        esi
  5496     vzeroupper
  5497     ret
  5500 #endif  // HAS_ARGBSUBTRACTROW_AVX2
  5502 #ifdef HAS_SOBELXROW_SSE2
  5503 // SobelX as a matrix is
  5504 // -1  0  1
  5505 // -2  0  2
  5506 // -1  0  1
  5507 __declspec(naked) __declspec(align(16))
  5508 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
  5509                     const uint8* src_y2, uint8* dst_sobelx, int width) {
  5510   __asm {
  5511     push       esi
  5512     push       edi
  5513     mov        eax, [esp + 8 + 4]   // src_y0
  5514     mov        esi, [esp + 8 + 8]   // src_y1
  5515     mov        edi, [esp + 8 + 12]  // src_y2
  5516     mov        edx, [esp + 8 + 16]  // dst_sobelx
  5517     mov        ecx, [esp + 8 + 20]  // width
  5518     sub        esi, eax
  5519     sub        edi, eax
  5520     sub        edx, eax
  5521     pxor       xmm5, xmm5  // constant 0
  5523     align      4
  5524  convertloop:
  5525     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
  5526     movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
  5527     punpcklbw  xmm0, xmm5
  5528     punpcklbw  xmm1, xmm5
  5529     psubw      xmm0, xmm1
  5530     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
  5531     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
  5532     punpcklbw  xmm1, xmm5
  5533     punpcklbw  xmm2, xmm5
  5534     psubw      xmm1, xmm2
  5535     movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
  5536     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
  5537     punpcklbw  xmm2, xmm5
  5538     punpcklbw  xmm3, xmm5
  5539     psubw      xmm2, xmm3
  5540     paddw      xmm0, xmm2
  5541     paddw      xmm0, xmm1
  5542     paddw      xmm0, xmm1
  5543     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
  5544     psubw      xmm1, xmm0
  5545     pmaxsw     xmm0, xmm1
  5546     packuswb   xmm0, xmm0
  5547     sub        ecx, 8
  5548     movq       qword ptr [eax + edx], xmm0
  5549     lea        eax, [eax + 8]
  5550     jg         convertloop
  5552     pop        edi
  5553     pop        esi
  5554     ret
  5557 #endif  // HAS_SOBELXROW_SSE2
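// A minimal scalar sketch of the SobelX row above: the kernel is applied as
// (row0[x] - row0[x+2]) + 2 * (row1[x] - row1[x+2]) + (row2[x] - row2[x+2]),
// then the absolute value is saturated to 255.  The helper name is
// illustrative only.
#if 0
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}
#endif  // Illustrative sketch; not compiled.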
  5559 #ifdef HAS_SOBELYROW_SSE2
  5560 // SobelY as a matrix is
  5561 // -1 -2 -1
  5562 //  0  0  0
  5563 //  1  2  1
  5564 __declspec(naked) __declspec(align(16))
  5565 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
  5566                     uint8* dst_sobely, int width) {
  5567   __asm {
  5568     push       esi
  5569     mov        eax, [esp + 4 + 4]   // src_y0
  5570     mov        esi, [esp + 4 + 8]   // src_y1
  5571     mov        edx, [esp + 4 + 12]  // dst_sobely
  5572     mov        ecx, [esp + 4 + 16]  // width
  5573     sub        esi, eax
  5574     sub        edx, eax
  5575     pxor       xmm5, xmm5  // constant 0
  5577     align      4
  5578  convertloop:
  5579     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
  5580     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
  5581     punpcklbw  xmm0, xmm5
  5582     punpcklbw  xmm1, xmm5
  5583     psubw      xmm0, xmm1
  5584     movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
  5585     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
  5586     punpcklbw  xmm1, xmm5
  5587     punpcklbw  xmm2, xmm5
  5588     psubw      xmm1, xmm2
  5589     movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
  5590     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
  5591     punpcklbw  xmm2, xmm5
  5592     punpcklbw  xmm3, xmm5
  5593     psubw      xmm2, xmm3
  5594     paddw      xmm0, xmm2
  5595     paddw      xmm0, xmm1
  5596     paddw      xmm0, xmm1
  5597     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
  5598     psubw      xmm1, xmm0
  5599     pmaxsw     xmm0, xmm1
  5600     packuswb   xmm0, xmm0
  5601     sub        ecx, 8
  5602     movq       qword ptr [eax + edx], xmm0
  5603     lea        eax, [eax + 8]
  5604     jg         convertloop
  5606     pop        esi
  5607     ret
  5610 #endif  // HAS_SOBELYROW_SSE2
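// A minimal scalar sketch of the SobelY row above: only two source rows are
// read, the column differences are weighted 1, 2, 1 and the absolute value is
// saturated to 255.  The helper name is illustrative only.
#if 0
static void SobelYRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             uint8* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}
#endif  // Illustrative sketch; not compiled.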
  5612 #ifdef HAS_SOBELROW_SSE2
  5613 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  5614 // A = 255
  5615 // R = Sobel
  5616 // G = Sobel
  5617 // B = Sobel
  5618 __declspec(naked) __declspec(align(16))
  5619 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  5620                    uint8* dst_argb, int width) {
  5621   __asm {
  5622     push       esi
  5623     mov        eax, [esp + 4 + 4]   // src_sobelx
  5624     mov        esi, [esp + 4 + 8]   // src_sobely
  5625     mov        edx, [esp + 4 + 12]  // dst_argb
  5626     mov        ecx, [esp + 4 + 16]  // width
  5627     sub        esi, eax
  5628     pcmpeqb    xmm5, xmm5           // alpha 255
  5629     pslld      xmm5, 24             // 0xff000000
  5631     align      4
  5632  convertloop:
  5633     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  5634     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  5635     lea        eax, [eax + 16]
  5636     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
  5637     movdqa     xmm2, xmm0             // GG
  5638     punpcklbw  xmm2, xmm0             // First 8
  5639     punpckhbw  xmm0, xmm0             // Next 8
  5640     movdqa     xmm1, xmm2             // GGGG
  5641     punpcklwd  xmm1, xmm2             // First 4
  5642     punpckhwd  xmm2, xmm2             // Next 4
  5643     por        xmm1, xmm5             // GGGA
  5644     por        xmm2, xmm5
  5645     movdqa     xmm3, xmm0             // GGGG
  5646     punpcklwd  xmm3, xmm0             // Next 4
  5647     punpckhwd  xmm0, xmm0             // Last 4
  5648     por        xmm3, xmm5             // GGGA
  5649     por        xmm0, xmm5
  5650     sub        ecx, 16
  5651     movdqa     [edx], xmm1
  5652     movdqa     [edx + 16], xmm2
  5653     movdqa     [edx + 32], xmm3
  5654     movdqa     [edx + 48], xmm0
  5655     lea        edx, [edx + 64]
  5656     jg         convertloop
  5658     pop        esi
  5659     ret
  5662 #endif  // HAS_SOBELROW_SSE2
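// A minimal scalar sketch of the packing above: the saturated sum of the two
// Sobel planes is replicated into B, G and R with alpha forced to 255.  The
// helper name is illustrative only.
#if 0
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;
    dst_argb[i * 4 + 0] = (uint8)s;  // B
    dst_argb[i * 4 + 1] = (uint8)s;  // G
    dst_argb[i * 4 + 2] = (uint8)s;  // R
    dst_argb[i * 4 + 3] = 255u;      // A
  }
}
#endif  // Illustrative sketch; not compiled.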
  5664 #ifdef HAS_SOBELTOPLANEROW_SSE2
  5665 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  5666 __declspec(naked) __declspec(align(16))
  5667 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  5668                           uint8* dst_y, int width) {
  5669   __asm {
  5670     push       esi
  5671     mov        eax, [esp + 4 + 4]   // src_sobelx
  5672     mov        esi, [esp + 4 + 8]   // src_sobely
  5673     mov        edx, [esp + 4 + 12]  // dst_y
  5674     mov        ecx, [esp + 4 + 16]  // width
  5675     sub        esi, eax
  5677     align      4
  5678  convertloop:
  5679     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  5680     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  5681     lea        eax, [eax + 16]
  5682     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
  5683     sub        ecx, 16
  5684     movdqa     [edx], xmm0
  5685     lea        edx, [edx + 16]
  5686     jg         convertloop
  5688     pop        esi
  5689     ret
  5692 #endif  // HAS_SOBELTOPLANEROW_SSE2
  5694 #ifdef HAS_SOBELXYROW_SSE2
  5695 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  5696 // A = 255
  5697 // R = Sobel X
  5698 // G = Sobel
  5699 // B = Sobel Y
  5700 __declspec(naked) __declspec(align(16))
  5701 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
  5702                      uint8* dst_argb, int width) {
  5703   __asm {
  5704     push       esi
  5705     mov        eax, [esp + 4 + 4]   // src_sobelx
  5706     mov        esi, [esp + 4 + 8]   // src_sobely
  5707     mov        edx, [esp + 4 + 12]  // dst_argb
  5708     mov        ecx, [esp + 4 + 16]  // width
  5709     sub        esi, eax
  5710     pcmpeqb    xmm5, xmm5           // alpha 255
  5712     align      4
  5713  convertloop:
  5714     movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
  5715     movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
  5716     lea        eax, [eax + 16]
  5717     movdqa     xmm2, xmm0
  5718     paddusb    xmm2, xmm1             // sobel = sobelx + sobely
  5719     movdqa     xmm3, xmm0             // XA
  5720     punpcklbw  xmm3, xmm5
  5721     punpckhbw  xmm0, xmm5
  5722     movdqa     xmm4, xmm1             // YS
  5723     punpcklbw  xmm4, xmm2
  5724     punpckhbw  xmm1, xmm2
  5725     movdqa     xmm6, xmm4             // YSXA
  5726     punpcklwd  xmm6, xmm3             // First 4
  5727     punpckhwd  xmm4, xmm3             // Next 4
  5728     movdqa     xmm7, xmm1             // YSXA
  5729     punpcklwd  xmm7, xmm0             // Next 4
  5730     punpckhwd  xmm1, xmm0             // Last 4
  5731     sub        ecx, 16
  5732     movdqa     [edx], xmm6
  5733     movdqa     [edx + 16], xmm4
  5734     movdqa     [edx + 32], xmm7
  5735     movdqa     [edx + 48], xmm1
  5736     lea        edx, [edx + 64]
  5737     jg         convertloop
  5739     pop        esi
  5740     ret
  5743 #endif  // HAS_SOBELXYROW_SSE2
  5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5746 // Consider float CumulativeSum.
  5747 // Consider calling CumulativeSum one row at a time as needed.
  5748 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
  5749 // Convert cumulative sum for an area to an average for 1 pixel.
  5750 // topleft is pointer to top left of CumulativeSum buffer for area.
  5751 // botleft is pointer to bottom left of CumulativeSum buffer.
  5752 // width is the offset from the left edge to the right edge of the area in the
  5753 //   CumulativeSum buffer, measured in number of ints.
  5754 // area is the number of pixels in the area being averaged.
  5755 // dst points to pixel to store result to.
  5756 // count is number of averaged pixels to produce.
  5757 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
  5758 // aligned.
  5759 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  5760                                     int width, int area, uint8* dst,
  5761                                     int count) {
  5762   __asm {
  5763     mov        eax, topleft  // eax topleft
  5764     mov        esi, botleft  // esi botleft
  5765     mov        edx, width
  5766     movd       xmm5, area
  5767     mov        edi, dst
  5768     mov        ecx, count
  5769     cvtdq2ps   xmm5, xmm5
  5770     rcpss      xmm4, xmm5  // 1.0f / area
  5771     pshufd     xmm4, xmm4, 0
  5772     sub        ecx, 4
  5773     jl         l4b
  5775     cmp        area, 128  // 128 pixels will not overflow 15 bits.
  5776     ja         l4
  5778     pshufd     xmm5, xmm5, 0        // area
  5779     pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
  5780     psrld      xmm6, 16
  5781     cvtdq2ps   xmm6, xmm6
  5782     addps      xmm5, xmm6           // (65536.0 + area - 1)
  5783     mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
  5784     cvtps2dq   xmm5, xmm5           // 0.16 fixed point
  5785     packssdw   xmm5, xmm5           // 16 bit shorts
  5787     // 4 pixel loop small blocks.
  5788     align      4
  5789   s4:
  5790     // top left
  5791     movdqa     xmm0, [eax]
  5792     movdqa     xmm1, [eax + 16]
  5793     movdqa     xmm2, [eax + 32]
  5794     movdqa     xmm3, [eax + 48]
  5796     // - top right
  5797     psubd      xmm0, [eax + edx * 4]
  5798     psubd      xmm1, [eax + edx * 4 + 16]
  5799     psubd      xmm2, [eax + edx * 4 + 32]
  5800     psubd      xmm3, [eax + edx * 4 + 48]
  5801     lea        eax, [eax + 64]
  5803     // - bottom left
  5804     psubd      xmm0, [esi]
  5805     psubd      xmm1, [esi + 16]
  5806     psubd      xmm2, [esi + 32]
  5807     psubd      xmm3, [esi + 48]
  5809     // + bottom right
  5810     paddd      xmm0, [esi + edx * 4]
  5811     paddd      xmm1, [esi + edx * 4 + 16]
  5812     paddd      xmm2, [esi + edx * 4 + 32]
  5813     paddd      xmm3, [esi + edx * 4 + 48]
  5814     lea        esi, [esi + 64]
  5816     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
  5817     packssdw   xmm2, xmm3
  5819     pmulhuw    xmm0, xmm5
  5820     pmulhuw    xmm2, xmm5
  5822     packuswb   xmm0, xmm2
  5823     movdqu     [edi], xmm0
  5824     lea        edi, [edi + 16]
  5825     sub        ecx, 4
  5826     jge        s4
  5828     jmp        l4b
  5830     // 4 pixel loop
  5831     align      4
  5832   l4:
  5833     // top left
  5834     movdqa     xmm0, [eax]
  5835     movdqa     xmm1, [eax + 16]
  5836     movdqa     xmm2, [eax + 32]
  5837     movdqa     xmm3, [eax + 48]
  5839     // - top right
  5840     psubd      xmm0, [eax + edx * 4]
  5841     psubd      xmm1, [eax + edx * 4 + 16]
  5842     psubd      xmm2, [eax + edx * 4 + 32]
  5843     psubd      xmm3, [eax + edx * 4 + 48]
  5844     lea        eax, [eax + 64]
  5846     // - bottom left
  5847     psubd      xmm0, [esi]
  5848     psubd      xmm1, [esi + 16]
  5849     psubd      xmm2, [esi + 32]
  5850     psubd      xmm3, [esi + 48]
  5852     // + bottom right
  5853     paddd      xmm0, [esi + edx * 4]
  5854     paddd      xmm1, [esi + edx * 4 + 16]
  5855     paddd      xmm2, [esi + edx * 4 + 32]
  5856     paddd      xmm3, [esi + edx * 4 + 48]
  5857     lea        esi, [esi + 64]
  5859     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
  5860     cvtdq2ps   xmm1, xmm1
  5861     mulps      xmm0, xmm4
  5862     mulps      xmm1, xmm4
  5863     cvtdq2ps   xmm2, xmm2
  5864     cvtdq2ps   xmm3, xmm3
  5865     mulps      xmm2, xmm4
  5866     mulps      xmm3, xmm4
  5867     cvtps2dq   xmm0, xmm0
  5868     cvtps2dq   xmm1, xmm1
  5869     cvtps2dq   xmm2, xmm2
  5870     cvtps2dq   xmm3, xmm3
  5871     packssdw   xmm0, xmm1
  5872     packssdw   xmm2, xmm3
  5873     packuswb   xmm0, xmm2
  5874     movdqu     [edi], xmm0
  5875     lea        edi, [edi + 16]
  5876     sub        ecx, 4
  5877     jge        l4
  5879   l4b:
  5880     add        ecx, 4 - 1
  5881     jl         l1b
  5883     // 1 pixel loop
  5884     align      4
  5885   l1:
  5886     movdqa     xmm0, [eax]
  5887     psubd      xmm0, [eax + edx * 4]
  5888     lea        eax, [eax + 16]
  5889     psubd      xmm0, [esi]
  5890     paddd      xmm0, [esi + edx * 4]
  5891     lea        esi, [esi + 16]
  5892     cvtdq2ps   xmm0, xmm0
  5893     mulps      xmm0, xmm4
  5894     cvtps2dq   xmm0, xmm0
  5895     packssdw   xmm0, xmm0
  5896     packuswb   xmm0, xmm0
  5897     movd       dword ptr [edi], xmm0
  5898     lea        edi, [edi + 4]
  5899     sub        ecx, 1
  5900     jge        l1
  5901   l1b:
  5904 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
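// A minimal scalar sketch of the box average above: with a cumulative sum
// table the sum of any rectangle is topleft - topright - botleft + botright,
// and the average is that sum scaled by 1 / area.  The helper name is
// illustrative only.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft, int width,
                                             int area, uint8* dst, int count) {
  float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j < 4; ++j) {  // B, G, R, A sums are interleaved.
      int32 sum = topleft[j] - topleft[width + j] - botleft[j] +
                  botleft[width + j];
      dst[j] = (uint8)(sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
#endif  // Illustrative sketch; not compiled.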
  5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  5907 // Creates a table of cumulative sums where each value is a sum of all values
  5908 // above and to the left of the value.
  5909 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
  5910                                   const int32* previous_cumsum, int width) {
  5911   __asm {
  5912     mov        eax, row
  5913     mov        edx, cumsum
  5914     mov        esi, previous_cumsum
  5915     mov        ecx, width
  5916     pxor       xmm0, xmm0
  5917     pxor       xmm1, xmm1
  5919     sub        ecx, 4
  5920     jl         l4b
  5921     test       edx, 15
  5922     jne        l4b
  5924     // 4 pixel loop
  5925     align      4
  5926   l4:
  5927     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
  5928     lea        eax, [eax + 16]
  5929     movdqa     xmm4, xmm2
  5931     punpcklbw  xmm2, xmm1
  5932     movdqa     xmm3, xmm2
  5933     punpcklwd  xmm2, xmm1
  5934     punpckhwd  xmm3, xmm1
  5936     punpckhbw  xmm4, xmm1
  5937     movdqa     xmm5, xmm4
  5938     punpcklwd  xmm4, xmm1
  5939     punpckhwd  xmm5, xmm1
  5941     paddd      xmm0, xmm2
  5942     movdqa     xmm2, [esi]  // previous row above.
  5943     paddd      xmm2, xmm0
  5945     paddd      xmm0, xmm3
  5946     movdqa     xmm3, [esi + 16]
  5947     paddd      xmm3, xmm0
  5949     paddd      xmm0, xmm4
  5950     movdqa     xmm4, [esi + 32]
  5951     paddd      xmm4, xmm0
  5953     paddd      xmm0, xmm5
  5954     movdqa     xmm5, [esi + 48]
  5955     lea        esi, [esi + 64]
  5956     paddd      xmm5, xmm0
  5958     movdqa     [edx], xmm2
  5959     movdqa     [edx + 16], xmm3
  5960     movdqa     [edx + 32], xmm4
  5961     movdqa     [edx + 48], xmm5
  5963     lea        edx, [edx + 64]
  5964     sub        ecx, 4
  5965     jge        l4
  5967   l4b:
  5968     add        ecx, 4 - 1
  5969     jl         l1b
  5971     // 1 pixel loop
  5972     align      4
  5973   l1:
  5974     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
  5975     lea        eax, [eax + 4]
  5976     punpcklbw  xmm2, xmm1
  5977     punpcklwd  xmm2, xmm1
  5978     paddd      xmm0, xmm2
  5979     movdqu     xmm2, [esi]
  5980     lea        esi, [esi + 16]
  5981     paddd      xmm2, xmm0
  5982     movdqu     [edx], xmm2
  5983     lea        edx, [edx + 16]
  5984     sub        ecx, 1
  5985     jge        l1
  5987  l1b:
  5990 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
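// A minimal scalar sketch of the cumulative sum row above: a running per
// channel sum of this row is added to the row of sums already accumulated
// above it.  The helper name is illustrative only.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int j = 0; j < 4; ++j) {
      row_sum[j] += row[x * 4 + j];
      cumsum[x * 4 + j] = row_sum[j] + previous_cumsum[x * 4 + j];
    }
  }
}
#endif  // Illustrative sketch; not compiled.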
  5992 #ifdef HAS_ARGBAFFINEROW_SSE2
  5993 // Copy a row of ARGB pixels from the source image along the affine path given by uv_dudv.
  5994 __declspec(naked) __declspec(align(16))
  5995 LIBYUV_API
  5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
  5997                         uint8* dst_argb, const float* uv_dudv, int width) {
  5998   __asm {
  5999     push       esi
  6000     push       edi
  6001     mov        eax, [esp + 12]  // src_argb
  6002     mov        esi, [esp + 16]  // stride
  6003     mov        edx, [esp + 20]  // dst_argb
  6004     mov        ecx, [esp + 24]  // pointer to uv_dudv
  6005     movq       xmm2, qword ptr [ecx]  // uv
  6006     movq       xmm7, qword ptr [ecx + 8]  // dudv
  6007     mov        ecx, [esp + 28]  // width
  6008     shl        esi, 16          // 4, stride
  6009     add        esi, 4
  6010     movd       xmm5, esi
  6011     sub        ecx, 4
  6012     jl         l4b
  6014     // setup for 4 pixel loop
  6015     pshufd     xmm7, xmm7, 0x44  // dup dudv
  6016     pshufd     xmm5, xmm5, 0  // dup 4, stride
  6017     movdqa     xmm0, xmm2    // x0, y0, x1, y1
  6018     addps      xmm0, xmm7
  6019     movlhps    xmm2, xmm0
  6020     movdqa     xmm4, xmm7
  6021     addps      xmm4, xmm4    // dudv *= 2
  6022     movdqa     xmm3, xmm2    // x2, y2, x3, y3
  6023     addps      xmm3, xmm4
  6024     addps      xmm4, xmm4    // dudv *= 4
  6026     // 4 pixel loop
  6027     align      4
  6028   l4:
  6029     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
  6030     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
  6031     packssdw   xmm0, xmm1    // x, y as 8 shorts
  6032     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
  6033     movd       esi, xmm0
  6034     pshufd     xmm0, xmm0, 0x39  // shift right
  6035     movd       edi, xmm0
  6036     pshufd     xmm0, xmm0, 0x39  // shift right
  6037     movd       xmm1, [eax + esi]  // read pixel 0
  6038     movd       xmm6, [eax + edi]  // read pixel 1
  6039     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
  6040     addps      xmm2, xmm4    // x, y += dx, dy first 2
  6041     movq       qword ptr [edx], xmm1
  6042     movd       esi, xmm0
  6043     pshufd     xmm0, xmm0, 0x39  // shift right
  6044     movd       edi, xmm0
  6045     movd       xmm6, [eax + esi]  // read pixel 2
  6046     movd       xmm0, [eax + edi]  // read pixel 3
  6047     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
  6048     addps      xmm3, xmm4    // x, y += dx, dy next 2
  6049     sub        ecx, 4
  6050     movq       qword ptr 8[edx], xmm6
  6051     lea        edx, [edx + 16]
  6052     jge        l4
  6054   l4b:
  6055     add        ecx, 4 - 1
  6056     jl         l1b
  6058     // 1 pixel loop
  6059     align      4
  6060   l1:
  6061     cvttps2dq  xmm0, xmm2    // x, y float to int
  6062     packssdw   xmm0, xmm0    // x, y as shorts
  6063     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
  6064     addps      xmm2, xmm7    // x, y += dx, dy
  6065     movd       esi, xmm0
  6066     movd       xmm0, [eax + esi]  // copy a pixel
  6067     sub        ecx, 1
  6068     movd       [edx], xmm0
  6069     lea        edx, [edx + 4]
  6070     jge        l1
  6071   l1b:
  6072     pop        edi
  6073     pop        esi
  6074     ret
  6077 #endif  // HAS_ARGBAFFINEROW_SSE2
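// A minimal scalar sketch of the affine copy above: uv_dudv holds a starting
// (u, v) source position and a per pixel (du, dv) step; each destination pixel
// is fetched from the source at the truncated (u, v) coordinate.  The helper
// name is illustrative only.
#if 0
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncation, as cvttps2dq does
    int y = (int)v;
    const uint8* src = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = src[0];
    dst_argb[1] = src[1];
    dst_argb[2] = src[2];
    dst_argb[3] = src[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif  // Illustrative sketch; not compiled.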
  6079 #ifdef HAS_INTERPOLATEROW_AVX2
  6080 // Bilinear filter 32x2 -> 32x1
  6081 __declspec(naked) __declspec(align(16))
  6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
  6083                           ptrdiff_t src_stride, int dst_width,
  6084                           int source_y_fraction) {
  6085   __asm {
  6086     push       esi
  6087     push       edi
  6088     mov        edi, [esp + 8 + 4]   // dst_ptr
  6089     mov        esi, [esp + 8 + 8]   // src_ptr
  6090     mov        edx, [esp + 8 + 12]  // src_stride
  6091     mov        ecx, [esp + 8 + 16]  // dst_width
  6092     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  6093     shr        eax, 1
  6094     // Dispatch to specialized filters if applicable.
  6095     cmp        eax, 0
  6096     je         xloop100  // 0 / 128.  Blend 100 / 0.
  6097     sub        edi, esi
  6098     cmp        eax, 32
  6099     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  6100     cmp        eax, 64
  6101     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  6102     cmp        eax, 96
  6103     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  6105     vmovd      xmm0, eax  // high fraction 0..127
  6106     neg        eax
  6107     add        eax, 128
  6108     vmovd      xmm5, eax  // low fraction 128..1
  6109     vpunpcklbw xmm5, xmm5, xmm0
  6110     vpunpcklwd xmm5, xmm5, xmm5
  6111     vpxor      ymm0, ymm0, ymm0
  6112     vpermd     ymm5, ymm0, ymm5
  6114     align      4
  6115   xloop:
  6116     vmovdqu    ymm0, [esi]
  6117     vmovdqu    ymm2, [esi + edx]
  6118     vpunpckhbw ymm1, ymm0, ymm2  // mutates
  6119     vpunpcklbw ymm0, ymm0, ymm2  // mutates
  6120     vpmaddubsw ymm0, ymm0, ymm5
  6121     vpmaddubsw ymm1, ymm1, ymm5
  6122     vpsrlw     ymm0, ymm0, 7
  6123     vpsrlw     ymm1, ymm1, 7
  6124     vpackuswb  ymm0, ymm0, ymm1  // unmutates
  6125     sub        ecx, 32
  6126     vmovdqu    [esi + edi], ymm0
  6127     lea        esi, [esi + 32]
  6128     jg         xloop
  6129     jmp        xloop99
  6131     // Blend 25 / 75.
  6132     align      4
  6133   xloop25:
  6134     vmovdqu    ymm0, [esi]
  6135     vpavgb     ymm0, ymm0, [esi + edx]
  6136     vpavgb     ymm0, ymm0, [esi + edx]
  6137     sub        ecx, 32
  6138     vmovdqu    [esi + edi], ymm0
  6139     lea        esi, [esi + 32]
  6140     jg         xloop25
  6141     jmp        xloop99
  6143     // Blend 50 / 50.
  6144     align      4
  6145   xloop50:
  6146     vmovdqu    ymm0, [esi]
  6147     vpavgb     ymm0, ymm0, [esi + edx]
  6148     sub        ecx, 32
  6149     vmovdqu    [esi + edi], ymm0
  6150     lea        esi, [esi + 32]
  6151     jg         xloop50
  6152     jmp        xloop99
  6154     // Blend 75 / 25.
  6155     align      4
  6156   xloop75:
  6157     vmovdqu    ymm0, [esi + edx]
  6158     vpavgb     ymm0, ymm0, [esi]
  6159     vpavgb     ymm0, ymm0, [esi]
  6160     sub        ecx, 32
  6161     vmovdqu     [esi + edi], ymm0
  6162     lea        esi, [esi + 32]
  6163     jg         xloop75
  6164     jmp        xloop99
  6166     // Blend 100 / 0 - Copy row unchanged.
  6167     align      4
  6168   xloop100:
  6169     rep movsb
  6171   xloop99:
  6172     pop        edi
  6173     pop        esi
  6174     vzeroupper
  6175     ret
  6178 #endif  // HAS_INTERPOLATEROW_AVX2
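// A minimal scalar sketch of the row blend used by the interpolate rows: the
// 0..255 source_y_fraction is halved to 0..127, the two rows are blended as
// (row0 * (128 - f) + row1 * f) >> 7, and the f == 0, 32, 64 and 96 cases are
// dispatched to the cheaper copy / pavgb paths.  The helper name is
// illustrative only.
#if 0
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127, as in the SIMD paths
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * (128 - f) + src_ptr1[i] * f) >> 7);
  }
}
#endif  // Illustrative sketch; not compiled.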
  6180 #ifdef HAS_INTERPOLATEROW_SSSE3
  6181 // Bilinear filter 16x2 -> 16x1
  6182 __declspec(naked) __declspec(align(16))
  6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  6184                           ptrdiff_t src_stride, int dst_width,
  6185                           int source_y_fraction) {
  6186   __asm {
  6187     push       esi
  6188     push       edi
  6189     mov        edi, [esp + 8 + 4]   // dst_ptr
  6190     mov        esi, [esp + 8 + 8]   // src_ptr
  6191     mov        edx, [esp + 8 + 12]  // src_stride
  6192     mov        ecx, [esp + 8 + 16]  // dst_width
  6193     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  6194     sub        edi, esi
  6195     shr        eax, 1
  6196     // Dispatch to specialized filters if applicable.
  6197     cmp        eax, 0
  6198     je         xloop100  // 0 / 128.  Blend 100 / 0.
  6199     cmp        eax, 32
  6200     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  6201     cmp        eax, 64
  6202     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  6203     cmp        eax, 96
  6204     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  6206     movd       xmm0, eax  // high fraction 0..127
  6207     neg        eax
  6208     add        eax, 128
  6209     movd       xmm5, eax  // low fraction 128..1
  6210     punpcklbw  xmm5, xmm0
  6211     punpcklwd  xmm5, xmm5
  6212     pshufd     xmm5, xmm5, 0
  6214     align      4
  6215   xloop:
  6216     movdqa     xmm0, [esi]
  6217     movdqa     xmm2, [esi + edx]
  6218     movdqa     xmm1, xmm0
  6219     punpcklbw  xmm0, xmm2
  6220     punpckhbw  xmm1, xmm2
  6221     pmaddubsw  xmm0, xmm5
  6222     pmaddubsw  xmm1, xmm5
  6223     psrlw      xmm0, 7
  6224     psrlw      xmm1, 7
  6225     packuswb   xmm0, xmm1
  6226     sub        ecx, 16
  6227     movdqa     [esi + edi], xmm0
  6228     lea        esi, [esi + 16]
  6229     jg         xloop
  6230     jmp        xloop99
  6232     // Blend 25 / 75.
  6233     align      4
  6234   xloop25:
  6235     movdqa     xmm0, [esi]
  6236     movdqa     xmm1, [esi + edx]
  6237     pavgb      xmm0, xmm1
  6238     pavgb      xmm0, xmm1
  6239     sub        ecx, 16
  6240     movdqa     [esi + edi], xmm0
  6241     lea        esi, [esi + 16]
  6242     jg         xloop25
  6243     jmp        xloop99
  6245     // Blend 50 / 50.
  6246     align      4
  6247   xloop50:
  6248     movdqa     xmm0, [esi]
  6249     movdqa     xmm1, [esi + edx]
  6250     pavgb      xmm0, xmm1
  6251     sub        ecx, 16
  6252     movdqa     [esi + edi], xmm0
  6253     lea        esi, [esi + 16]
  6254     jg         xloop50
  6255     jmp        xloop99
  6257     // Blend 75 / 25.
  6258     align      4
  6259   xloop75:
  6260     movdqa     xmm1, [esi]
  6261     movdqa     xmm0, [esi + edx]
  6262     pavgb      xmm0, xmm1
  6263     pavgb      xmm0, xmm1
  6264     sub        ecx, 16
  6265     movdqa     [esi + edi], xmm0
  6266     lea        esi, [esi + 16]
  6267     jg         xloop75
  6268     jmp        xloop99
  6270     // Blend 100 / 0 - Copy row unchanged.
  6271     align      4
  6272   xloop100:
  6273     movdqa     xmm0, [esi]
  6274     sub        ecx, 16
  6275     movdqa     [esi + edi], xmm0
  6276     lea        esi, [esi + 16]
  6277     jg         xloop100
  6279   xloop99:
  6280     pop        edi
  6281     pop        esi
  6282     ret
  6283   }
  6284 }
  6285 #endif  // HAS_INTERPOLATEROW_SSSE3
  6287 #ifdef HAS_INTERPOLATEROW_SSE2
  6288 // Bilinear filter 16x2 -> 16x1
  6289 __declspec(naked) __declspec(align(16))
  6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  6291                          ptrdiff_t src_stride, int dst_width,
  6292                          int source_y_fraction) {
  6293   __asm {
  6294     push       esi
  6295     push       edi
  6296     mov        edi, [esp + 8 + 4]   // dst_ptr
  6297     mov        esi, [esp + 8 + 8]   // src_ptr
  6298     mov        edx, [esp + 8 + 12]  // src_stride
  6299     mov        ecx, [esp + 8 + 16]  // dst_width
  6300     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  6301     sub        edi, esi
  6302     // Dispatch to specialized filters if applicable.
  6303     cmp        eax, 0
  6304     je         xloop100  // 0 / 256.  Blend 100 / 0.
  6305     cmp        eax, 64
  6306     je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
  6307     cmp        eax, 128
  6308     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
  6309     cmp        eax, 192
  6310     je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
  6312     movd       xmm5, eax            // xmm5 = y fraction
  6313     punpcklbw  xmm5, xmm5
  6314     psrlw      xmm5, 1
  6315     punpcklwd  xmm5, xmm5
  6316     punpckldq  xmm5, xmm5
  6317     punpcklqdq xmm5, xmm5
  6318     pxor       xmm4, xmm4
  6320     align      4
  6321   xloop:
  6322     movdqa     xmm0, [esi]  // row0
  6323     movdqa     xmm2, [esi + edx]  // row1
  6324     movdqa     xmm1, xmm0
  6325     movdqa     xmm3, xmm2
  6326     punpcklbw  xmm2, xmm4
  6327     punpckhbw  xmm3, xmm4
  6328     punpcklbw  xmm0, xmm4
  6329     punpckhbw  xmm1, xmm4
  6330     psubw      xmm2, xmm0  // row1 - row0
  6331     psubw      xmm3, xmm1
  6332     paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
  6333     paddw      xmm3, xmm3
  6334     pmulhw     xmm2, xmm5  // scale diff
  6335     pmulhw     xmm3, xmm5
  6336     paddw      xmm0, xmm2  // sum rows
  6337     paddw      xmm1, xmm3
  6338     packuswb   xmm0, xmm1
  6339     sub        ecx, 16
  6340     movdqa     [esi + edi], xmm0
  6341     lea        esi, [esi + 16]
  6342     jg         xloop
  6343     jmp        xloop99
  6345     // Blend 25 / 75.
  6346     align      4
  6347   xloop25:
  6348     movdqa     xmm0, [esi]
  6349     movdqa     xmm1, [esi + edx]
  6350     pavgb      xmm0, xmm1
  6351     pavgb      xmm0, xmm1
  6352     sub        ecx, 16
  6353     movdqa     [esi + edi], xmm0
  6354     lea        esi, [esi + 16]
  6355     jg         xloop25
  6356     jmp        xloop99
  6358     // Blend 50 / 50.
  6359     align      4
  6360   xloop50:
  6361     movdqa     xmm0, [esi]
  6362     movdqa     xmm1, [esi + edx]
  6363     pavgb      xmm0, xmm1
  6364     sub        ecx, 16
  6365     movdqa     [esi + edi], xmm0
  6366     lea        esi, [esi + 16]
  6367     jg         xloop50
  6368     jmp        xloop99
  6370     // Blend 75 / 25.
  6371     align      4
  6372   xloop75:
  6373     movdqa     xmm1, [esi]
  6374     movdqa     xmm0, [esi + edx]
  6375     pavgb      xmm0, xmm1
  6376     pavgb      xmm0, xmm1
  6377     sub        ecx, 16
  6378     movdqa     [esi + edi], xmm0
  6379     lea        esi, [esi + 16]
  6380     jg         xloop75
  6381     jmp        xloop99
  6383     // Blend 100 / 0 - Copy row unchanged.
  6384     align      4
  6385   xloop100:
  6386     movdqa     xmm0, [esi]
  6387     sub        ecx, 16
  6388     movdqa     [esi + edi], xmm0
  6389     lea        esi, [esi + 16]
  6390     jg         xloop100
  6392   xloop99:
  6393     pop        edi
  6394     pop        esi
  6395     ret
  6396   }
  6397 }
  6398 #endif  // HAS_INTERPOLATEROW_SSE2
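// The SSE2 paths keep the full 0..255 fraction and work on the row difference
// rather than two weighted terms. A sketch of the per-byte arithmetic (names
// are illustrative, not libyuv's):
//   diff  = row1 - row0                        // 9-bit signed
//   diff2 = diff * 2                           // paddw xmm2, xmm2
//   scale = (f * 0x0101) >> 1, roughly f << 7  // punpcklbw + psrlw 1
//   delta = (diff2 * scale) >> 16              // pmulhw keeps the high word
//         ~= (diff * f) >> 8
//   dst   = row0 + delta
// i.e. dst ~= row0 + ((row1 - row0) * source_y_fraction) / 256, truncated.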
  6400 // Bilinear filter 16x2 -> 16x1
  6401 __declspec(naked) __declspec(align(16))
  6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  6403                                     ptrdiff_t src_stride, int dst_width,
  6404                                     int source_y_fraction) {
  6405   __asm {
  6406     push       esi
  6407     push       edi
  6408     mov        edi, [esp + 8 + 4]   // dst_ptr
  6409     mov        esi, [esp + 8 + 8]   // src_ptr
  6410     mov        edx, [esp + 8 + 12]  // src_stride
  6411     mov        ecx, [esp + 8 + 16]  // dst_width
  6412     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  6413     sub        edi, esi
  6414     shr        eax, 1
  6415     // Dispatch to specialized filters if applicable.
  6416     cmp        eax, 0
  6417     je         xloop100  // 0 / 128.  Blend 100 / 0.
  6418     cmp        eax, 32
  6419     je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
  6420     cmp        eax, 64
  6421     je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
  6422     cmp        eax, 96
  6423     je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
  6425     movd       xmm0, eax  // high fraction 0..127
  6426     neg        eax
  6427     add        eax, 128
  6428     movd       xmm5, eax  // low fraction 128..1
  6429     punpcklbw  xmm5, xmm0
  6430     punpcklwd  xmm5, xmm5
  6431     pshufd     xmm5, xmm5, 0
  6433     align      4
  6434   xloop:
  6435     movdqu     xmm0, [esi]
  6436     movdqu     xmm2, [esi + edx]
  6437     movdqu     xmm1, xmm0
  6438     punpcklbw  xmm0, xmm2
  6439     punpckhbw  xmm1, xmm2
  6440     pmaddubsw  xmm0, xmm5
  6441     pmaddubsw  xmm1, xmm5
  6442     psrlw      xmm0, 7
  6443     psrlw      xmm1, 7
  6444     packuswb   xmm0, xmm1
  6445     sub        ecx, 16
  6446     movdqu     [esi + edi], xmm0
  6447     lea        esi, [esi + 16]
  6448     jg         xloop
  6449     jmp        xloop99
  6451     // Blend 25 / 75.
  6452     align      4
  6453   xloop25:
  6454     movdqu     xmm0, [esi]
  6455     movdqu     xmm1, [esi + edx]
  6456     pavgb      xmm0, xmm1
  6457     pavgb      xmm0, xmm1
  6458     sub        ecx, 16
  6459     movdqu     [esi + edi], xmm0
  6460     lea        esi, [esi + 16]
  6461     jg         xloop25
  6462     jmp        xloop99
  6464     // Blend 50 / 50.
  6465     align      4
  6466   xloop50:
  6467     movdqu     xmm0, [esi]
  6468     movdqu     xmm1, [esi + edx]
  6469     pavgb      xmm0, xmm1
  6470     sub        ecx, 16
  6471     movdqu     [esi + edi], xmm0
  6472     lea        esi, [esi + 16]
  6473     jg         xloop50
  6474     jmp        xloop99
  6476     // Blend 75 / 25.
  6477     align      4
  6478   xloop75:
  6479     movdqu     xmm1, [esi]
  6480     movdqu     xmm0, [esi + edx]
  6481     pavgb      xmm0, xmm1
  6482     pavgb      xmm0, xmm1
  6483     sub        ecx, 16
  6484     movdqu     [esi + edi], xmm0
  6485     lea        esi, [esi + 16]
  6486     jg         xloop75
  6487     jmp        xloop99
  6489     // Blend 100 / 0 - Copy row unchanged.
  6490     align      4
  6491   xloop100:
  6492     movdqu     xmm0, [esi]
  6493     sub        ecx, 16
  6494     movdqu     [esi + edi], xmm0
  6495     lea        esi, [esi + 16]
  6496     jg         xloop100
  6498   xloop99:
  6499     pop        edi
  6500     pop        esi
  6501     ret
  6502   }
  6503 }
  6505 #ifdef HAS_INTERPOLATEROW_SSE2
  6506 // Bilinear filter 16x2 -> 16x1
  6507 __declspec(naked) __declspec(align(16))
  6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
  6509                                    ptrdiff_t src_stride, int dst_width,
  6510                                    int source_y_fraction) {
  6511   __asm {
  6512     push       esi
  6513     push       edi
  6514     mov        edi, [esp + 8 + 4]   // dst_ptr
  6515     mov        esi, [esp + 8 + 8]   // src_ptr
  6516     mov        edx, [esp + 8 + 12]  // src_stride
  6517     mov        ecx, [esp + 8 + 16]  // dst_width
  6518     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
  6519     sub        edi, esi
  6520     // Dispatch to specialized filters if applicable.
  6521     cmp        eax, 0
  6522     je         xloop100  // 0 / 256.  Blend 100 / 0.
  6523     cmp        eax, 64
  6524     je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
  6525     cmp        eax, 128
  6526     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
  6527     cmp        eax, 192
  6528     je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
  6530     movd       xmm5, eax            // xmm5 = y fraction
  6531     punpcklbw  xmm5, xmm5
  6532     psrlw      xmm5, 1
  6533     punpcklwd  xmm5, xmm5
  6534     punpckldq  xmm5, xmm5
  6535     punpcklqdq xmm5, xmm5
  6536     pxor       xmm4, xmm4
  6538     align      4
  6539   xloop:
  6540     movdqu     xmm0, [esi]  // row0
  6541     movdqu     xmm2, [esi + edx]  // row1
  6542     movdqu     xmm1, xmm0
  6543     movdqu     xmm3, xmm2
  6544     punpcklbw  xmm2, xmm4
  6545     punpckhbw  xmm3, xmm4
  6546     punpcklbw  xmm0, xmm4
  6547     punpckhbw  xmm1, xmm4
  6548     psubw      xmm2, xmm0  // row1 - row0
  6549     psubw      xmm3, xmm1
  6550     paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
  6551     paddw      xmm3, xmm3
  6552     pmulhw     xmm2, xmm5  // scale diff
  6553     pmulhw     xmm3, xmm5
  6554     paddw      xmm0, xmm2  // sum rows
  6555     paddw      xmm1, xmm3
  6556     packuswb   xmm0, xmm1
  6557     sub        ecx, 16
  6558     movdqu     [esi + edi], xmm0
  6559     lea        esi, [esi + 16]
  6560     jg         xloop
  6561     jmp        xloop99
  6563     // Blend 25 / 75.
  6564     align      4
  6565   xloop25:
  6566     movdqu     xmm0, [esi]
  6567     movdqu     xmm1, [esi + edx]
  6568     pavgb      xmm0, xmm1
  6569     pavgb      xmm0, xmm1
  6570     sub        ecx, 16
  6571     movdqu     [esi + edi], xmm0
  6572     lea        esi, [esi + 16]
  6573     jg         xloop25
  6574     jmp        xloop99
  6576     // Blend 50 / 50.
  6577     align      4
  6578   xloop50:
  6579     movdqu     xmm0, [esi]
  6580     movdqu     xmm1, [esi + edx]
  6581     pavgb      xmm0, xmm1
  6582     sub        ecx, 16
  6583     movdqu     [esi + edi], xmm0
  6584     lea        esi, [esi + 16]
  6585     jg         xloop50
  6586     jmp        xloop99
  6588     // Blend 75 / 25.
  6589     align      4
  6590   xloop75:
  6591     movdqu     xmm1, [esi]
  6592     movdqu     xmm0, [esi + edx]
  6593     pavgb      xmm0, xmm1
  6594     pavgb      xmm0, xmm1
  6595     sub        ecx, 16
  6596     movdqu     [esi + edi], xmm0
  6597     lea        esi, [esi + 16]
  6598     jg         xloop75
  6599     jmp        xloop99
  6601     // Blend 100 / 0 - Copy row unchanged.
  6602     align      4
  6603   xloop100:
  6604     movdqu     xmm0, [esi]
  6605     sub        ecx, 16
  6606     movdqu     [esi + edi], xmm0
  6607     lea        esi, [esi + 16]
  6608     jg         xloop100
  6610   xloop99:
  6611     pop        edi
  6612     pop        esi
  6613     ret
  6614   }
  6615 }
  6616 #endif  // HAS_INTERPOLATEROW_SSE2
  6618 __declspec(naked) __declspec(align(16))
  6619 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
  6620                   uint8* dst_uv, int pix) {
  6621   __asm {
  6622     push       edi
  6623     mov        eax, [esp + 4 + 4]    // src_uv
  6624     mov        edx, [esp + 4 + 8]    // src_uv_stride
  6625     mov        edi, [esp + 4 + 12]   // dst_uv
  6626     mov        ecx, [esp + 4 + 16]   // pix
  6627     sub        edi, eax
  6629     align      4
  6630   convertloop:
  6631     movdqa     xmm0, [eax]
  6632     pavgb      xmm0, [eax + edx]
  6633     sub        ecx, 16
  6634     movdqa     [eax + edi], xmm0
  6635     lea        eax,  [eax + 16]
  6636     jg         convertloop
  6637     pop        edi
  6638     ret
  6639   }
  6640 }
  6642 #ifdef HAS_HALFROW_AVX2
  6643 __declspec(naked) __declspec(align(16))
  6644 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
  6645                   uint8* dst_uv, int pix) {
  6646   __asm {
  6647     push       edi
  6648     mov        eax, [esp + 4 + 4]    // src_uv
  6649     mov        edx, [esp + 4 + 8]    // src_uv_stride
  6650     mov        edi, [esp + 4 + 12]   // dst_uv
  6651     mov        ecx, [esp + 4 + 16]   // pix
  6652     sub        edi, eax
  6654     align      4
  6655   convertloop:
  6656     vmovdqu    ymm0, [eax]
  6657     vpavgb     ymm0, ymm0, [eax + edx]
  6658     sub        ecx, 32
  6659     vmovdqu    [eax + edi], ymm0
  6660     lea        eax,  [eax + 32]
  6661     jg         convertloop
  6663     pop        edi
  6664     vzeroupper
  6665     ret
  6666   }
  6667 }
  6668 #endif  // HAS_HALFROW_AVX2
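// A scalar sketch of the HalfRow kernels above, for reference only; the name
// is illustrative, not a libyuv function. pavgb/vpavgb compute a rounded
// average, hence the "+ 1".
static void HalfRowSketch(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
  }
}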
  6670 __declspec(naked) __declspec(align(16))
  6671 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
  6672                           uint32 selector, int pix) {
  6673   __asm {
  6674     mov        eax, [esp + 4]    // src_argb
  6675     mov        edx, [esp + 8]    // dst_bayer
  6676     movd       xmm5, [esp + 12]  // selector
  6677     mov        ecx, [esp + 16]   // pix
  6678     pshufd     xmm5, xmm5, 0
  6680     align      4
  6681   wloop:
  6682     movdqa     xmm0, [eax]
  6683     movdqa     xmm1, [eax + 16]
  6684     lea        eax, [eax + 32]
  6685     pshufb     xmm0, xmm5
  6686     pshufb     xmm1, xmm5
  6687     punpckldq  xmm0, xmm1
  6688     sub        ecx, 8
  6689     movq       qword ptr [edx], xmm0
  6690     lea        edx, [edx + 8]
  6691     jg         wloop
  6692     ret
  6693   }
  6694 }
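// An illustrative scalar equivalent of ARGBToBayerRow_SSSE3 above; the name
// and the index decoding are a sketch, not libyuv code. 'selector' packs four
// byte offsets into each 16-byte (4-pixel) group, which is how the
// pshufb + punpckldq sequence consumes it.
static void ARGBToBayerRowSketch(const uint8* src_argb, uint8* dst_bayer,
                                 uint32 selector, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    int index = (selector >> ((x & 3) * 8)) & 0xff;  // offset within the group
    dst_bayer[x] = src_argb[(x & ~3) * 4 + index];
  }
}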
  6696 // Specialized ARGB to Bayer that just isolates G channel.
  6697 __declspec(naked) __declspec(align(16))
  6698 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
  6699                            uint32 selector, int pix) {
  6700   __asm {
  6701     mov        eax, [esp + 4]    // src_argb
  6702     mov        edx, [esp + 8]    // dst_bayer
  6703                                  // selector
  6704     mov        ecx, [esp + 16]   // pix
  6705     pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
  6706     psrld      xmm5, 24
  6708     align      4
  6709   wloop:
  6710     movdqa     xmm0, [eax]
  6711     movdqa     xmm1, [eax + 16]
  6712     lea        eax, [eax + 32]
  6713     psrld      xmm0, 8  // Move green to bottom.
  6714     psrld      xmm1, 8
  6715     pand       xmm0, xmm5
  6716     pand       xmm1, xmm5
  6717     packssdw   xmm0, xmm1
  6718     packuswb   xmm0, xmm1
  6719     sub        ecx, 8
  6720     movq       qword ptr [edx], xmm0
  6721     lea        edx, [edx + 8]
  6722     jg         wloop
  6723     ret
  6724   }
  6725 }
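// A scalar sketch of the G-channel isolation above, for reference only; the
// name is illustrative. The selector argument is ignored by this
// specialization: green is byte 1 of each ARGB pixel.
static void ARGBToBayerGGRowSketch(const uint8* src_argb, uint8* dst_bayer,
                                   uint32 selector, int pix) {
  int x;
  (void)selector;  // unused, as in the SSE2 kernel
  for (x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];
  }
}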
  6727 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  6728 __declspec(naked) __declspec(align(16))
  6729 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  6730                           const uint8* shuffler, int pix) {
  6731   __asm {
  6732     mov        eax, [esp + 4]    // src_argb
  6733     mov        edx, [esp + 8]    // dst_argb
  6734     mov        ecx, [esp + 12]   // shuffler
  6735     movdqa     xmm5, [ecx]
  6736     mov        ecx, [esp + 16]   // pix
  6738     align      4
  6739   wloop:
  6740     movdqa     xmm0, [eax]
  6741     movdqa     xmm1, [eax + 16]
  6742     lea        eax, [eax + 32]
  6743     pshufb     xmm0, xmm5
  6744     pshufb     xmm1, xmm5
  6745     sub        ecx, 8
  6746     movdqa     [edx], xmm0
  6747     movdqa     [edx + 16], xmm1
  6748     lea        edx, [edx + 32]
  6749     jg         wloop
  6750     ret
  6751   }
  6752 }
  6754 __declspec(naked) __declspec(align(16))
  6755 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
  6756                                     const uint8* shuffler, int pix) {
  6757   __asm {
  6758     mov        eax, [esp + 4]    // src_argb
  6759     mov        edx, [esp + 8]    // dst_argb
  6760     mov        ecx, [esp + 12]   // shuffler
  6761     movdqa     xmm5, [ecx]
  6762     mov        ecx, [esp + 16]   // pix
  6764     align      4
  6765   wloop:
  6766     movdqu     xmm0, [eax]
  6767     movdqu     xmm1, [eax + 16]
  6768     lea        eax, [eax + 32]
  6769     pshufb     xmm0, xmm5
  6770     pshufb     xmm1, xmm5
  6771     sub        ecx, 8
  6772     movdqu     [edx], xmm0
  6773     movdqu     [edx + 16], xmm1
  6774     lea        edx, [edx + 32]
  6775     jg         wloop
  6776     ret
  6777   }
  6778 }
  6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2
  6781 __declspec(naked) __declspec(align(16))
  6782 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  6783                          const uint8* shuffler, int pix) {
  6784   __asm {
  6785     mov        eax, [esp + 4]     // src_argb
  6786     mov        edx, [esp + 8]     // dst_argb
  6787     mov        ecx, [esp + 12]    // shuffler
  6788     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
  6789     mov        ecx, [esp + 16]    // pix
  6791     align      4
  6792   wloop:
  6793     vmovdqu    ymm0, [eax]
  6794     vmovdqu    ymm1, [eax + 32]
  6795     lea        eax, [eax + 64]
  6796     vpshufb    ymm0, ymm0, ymm5
  6797     vpshufb    ymm1, ymm1, ymm5
  6798     sub        ecx, 16
  6799     vmovdqu    [edx], ymm0
  6800     vmovdqu    [edx + 32], ymm1
  6801     lea        edx, [edx + 64]
  6802     jg         wloop
  6804     vzeroupper
  6805     ret
  6806   }
  6807 }
  6808 #endif  // HAS_ARGBSHUFFLEROW_AVX2
  6810 __declspec(naked) __declspec(align(16))
  6811 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  6812                          const uint8* shuffler, int pix) {
  6813   __asm {
  6814     push       ebx
  6815     push       esi
  6816     mov        eax, [esp + 8 + 4]    // src_argb
  6817     mov        edx, [esp + 8 + 8]    // dst_argb
  6818     mov        esi, [esp + 8 + 12]   // shuffler
  6819     mov        ecx, [esp + 8 + 16]   // pix
  6820     pxor       xmm5, xmm5
  6822     mov        ebx, [esi]   // shuffler
  6823     cmp        ebx, 0x03000102
  6824     je         shuf_3012
  6825     cmp        ebx, 0x00010203
  6826     je         shuf_0123
  6827     cmp        ebx, 0x00030201
  6828     je         shuf_0321
  6829     cmp        ebx, 0x02010003
  6830     je         shuf_2103
  6832   // TODO(fbarchard): Use one source pointer and 3 offsets.
  6833   shuf_any1:
  6834     movzx      ebx, byte ptr [esi]
  6835     movzx      ebx, byte ptr [eax + ebx]
  6836     mov        [edx], bl
  6837     movzx      ebx, byte ptr [esi + 1]
  6838     movzx      ebx, byte ptr [eax + ebx]
  6839     mov        [edx + 1], bl
  6840     movzx      ebx, byte ptr [esi + 2]
  6841     movzx      ebx, byte ptr [eax + ebx]
  6842     mov        [edx + 2], bl
  6843     movzx      ebx, byte ptr [esi + 3]
  6844     movzx      ebx, byte ptr [eax + ebx]
  6845     mov        [edx + 3], bl
  6846     lea        eax, [eax + 4]
  6847     lea        edx, [edx + 4]
  6848     sub        ecx, 1
  6849     jg         shuf_any1
  6850     jmp        shuf99
  6852     align      4
  6853   shuf_0123:
  6854     movdqu     xmm0, [eax]
  6855     lea        eax, [eax + 16]
  6856     movdqa     xmm1, xmm0
  6857     punpcklbw  xmm0, xmm5
  6858     punpckhbw  xmm1, xmm5
  6859     pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
  6860     pshuflw    xmm0, xmm0, 01Bh
  6861     pshufhw    xmm1, xmm1, 01Bh
  6862     pshuflw    xmm1, xmm1, 01Bh
  6863     packuswb   xmm0, xmm1
  6864     sub        ecx, 4
  6865     movdqu     [edx], xmm0
  6866     lea        edx, [edx + 16]
  6867     jg         shuf_0123
  6868     jmp        shuf99
  6870     align      4
  6871   shuf_0321:
  6872     movdqu     xmm0, [eax]
  6873     lea        eax, [eax + 16]
  6874     movdqa     xmm1, xmm0
  6875     punpcklbw  xmm0, xmm5
  6876     punpckhbw  xmm1, xmm5
  6877     pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
  6878     pshuflw    xmm0, xmm0, 039h
  6879     pshufhw    xmm1, xmm1, 039h
  6880     pshuflw    xmm1, xmm1, 039h
  6881     packuswb   xmm0, xmm1
  6882     sub        ecx, 4
  6883     movdqu     [edx], xmm0
  6884     lea        edx, [edx + 16]
  6885     jg         shuf_0321
  6886     jmp        shuf99
  6888     align      4
  6889   shuf_2103:
  6890     movdqu     xmm0, [eax]
  6891     lea        eax, [eax + 16]
  6892     movdqa     xmm1, xmm0
  6893     punpcklbw  xmm0, xmm5
  6894     punpckhbw  xmm1, xmm5
  6895     pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
  6896     pshuflw    xmm0, xmm0, 093h
  6897     pshufhw    xmm1, xmm1, 093h
  6898     pshuflw    xmm1, xmm1, 093h
  6899     packuswb   xmm0, xmm1
  6900     sub        ecx, 4
  6901     movdqu     [edx], xmm0
  6902     lea        edx, [edx + 16]
  6903     jg         shuf_2103
  6904     jmp        shuf99
  6906     align      4
  6907   shuf_3012:
  6908     movdqu     xmm0, [eax]
  6909     lea        eax, [eax + 16]
  6910     movdqa     xmm1, xmm0
  6911     punpcklbw  xmm0, xmm5
  6912     punpckhbw  xmm1, xmm5
  6913     pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
  6914     pshuflw    xmm0, xmm0, 0C6h
  6915     pshufhw    xmm1, xmm1, 0C6h
  6916     pshuflw    xmm1, xmm1, 0C6h
  6917     packuswb   xmm0, xmm1
  6918     sub        ecx, 4
  6919     movdqu     [edx], xmm0
  6920     lea        edx, [edx + 16]
  6921     jg         shuf_3012
  6923   shuf99:
  6924     pop        esi
  6925     pop        ebx
  6926     ret
  6927   }
  6928 }
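// A scalar sketch of what the ARGBShuffleRow kernels above compute, for
// reference only; the name is illustrative. Only the first four shuffler
// bytes are used here: each output channel copies the input channel named by
// the corresponding index (the 16-byte SSSE3/AVX2 masks repeat this pattern
// per pixel).
static void ARGBShuffleRowSketch(const uint8* src_argb, uint8* dst_argb,
                                 const uint8* shuffler, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}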
  6930 // YUY2 - Macro-pixel = 2 image pixels
  6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
  6933 // UYVY - Macro-pixel = 2 image pixels
  6934 // U0Y0V0Y1
  6936 __declspec(naked) __declspec(align(16))
  6937 void I422ToYUY2Row_SSE2(const uint8* src_y,
  6938                         const uint8* src_u,
  6939                         const uint8* src_v,
  6940                         uint8* dst_frame, int width) {
  6941   __asm {
  6942     push       esi
  6943     push       edi
  6944     mov        eax, [esp + 8 + 4]    // src_y
  6945     mov        esi, [esp + 8 + 8]    // src_u
  6946     mov        edx, [esp + 8 + 12]   // src_v
  6947     mov        edi, [esp + 8 + 16]   // dst_frame
  6948     mov        ecx, [esp + 8 + 20]   // width
  6949     sub        edx, esi
  6951     align      4
  6952   convertloop:
  6953     movq       xmm2, qword ptr [esi] // U
  6954     movq       xmm3, qword ptr [esi + edx] // V
  6955     lea        esi, [esi + 8]
  6956     punpcklbw  xmm2, xmm3 // UV
  6957     movdqu     xmm0, [eax] // Y
  6958     lea        eax, [eax + 16]
  6959     movdqa     xmm1, xmm0
  6960     punpcklbw  xmm0, xmm2 // YUYV
  6961     punpckhbw  xmm1, xmm2
  6962     movdqu     [edi], xmm0
  6963     movdqu     [edi + 16], xmm1
  6964     lea        edi, [edi + 32]
  6965     sub        ecx, 16
  6966     jg         convertloop
  6968     pop        edi
  6969     pop        esi
  6970     ret
  6971   }
  6972 }
  6974 __declspec(naked) __declspec(align(16))
  6975 void I422ToUYVYRow_SSE2(const uint8* src_y,
  6976                         const uint8* src_u,
  6977                         const uint8* src_v,
  6978                         uint8* dst_frame, int width) {
  6979   __asm {
  6980     push       esi
  6981     push       edi
  6982     mov        eax, [esp + 8 + 4]    // src_y
  6983     mov        esi, [esp + 8 + 8]    // src_u
  6984     mov        edx, [esp + 8 + 12]   // src_v
  6985     mov        edi, [esp + 8 + 16]   // dst_frame
  6986     mov        ecx, [esp + 8 + 20]   // width
  6987     sub        edx, esi
  6989     align      4
  6990   convertloop:
  6991     movq       xmm2, qword ptr [esi] // U
  6992     movq       xmm3, qword ptr [esi + edx] // V
  6993     lea        esi, [esi + 8]
  6994     punpcklbw  xmm2, xmm3 // UV
  6995     movdqu     xmm0, [eax] // Y
  6996     movdqa     xmm1, xmm2
  6997     lea        eax, [eax + 16]
  6998     punpcklbw  xmm1, xmm0 // UYVY
  6999     punpckhbw  xmm2, xmm0
  7000     movdqu     [edi], xmm1
  7001     movdqu     [edi + 16], xmm2
  7002     lea        edi, [edi + 32]
  7003     sub        ecx, 16
  7004     jg         convertloop
  7006     pop        edi
  7007     pop        esi
  7008     ret
  7009   }
  7010 }
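// A scalar sketch of the packing done by the two kernels above, for reference
// only; the name is illustrative. Each macro-pixel stores two luma samples
// with one shared U/V pair: YUY2 as Y0 U Y1 V, UYVY as U Y0 V Y1.
static void I422ToYUY2RowSketch(const uint8* src_y, const uint8* src_u,
                                const uint8* src_v, uint8* dst_frame,
                                int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = *src_u;
    dst_frame[2] = src_y[1];
    dst_frame[3] = *src_v;
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}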
  7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
  7013 __declspec(naked) __declspec(align(16))
  7014 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
  7015                             uint8* dst_argb, const float* poly,
  7016                             int width) {
  7017   __asm {
  7018     push       esi
  7019     mov        eax, [esp + 4 + 4]   /* src_argb */
  7020     mov        edx, [esp + 4 + 8]   /* dst_argb */
  7021     mov        esi, [esp + 4 + 12]  /* poly */
  7022     mov        ecx, [esp + 4 + 16]  /* width */
  7023     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
  7025     // 2 pixel loop.
  7026     align      4
  7027  convertloop:
  7028 //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
  7029 //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
  7030     movq       xmm0, qword ptr [eax]  // BGRABGRA
  7031     lea        eax, [eax + 8]
  7032     punpcklbw  xmm0, xmm3
  7033     movdqa     xmm4, xmm0
  7034     punpcklwd  xmm0, xmm3  // pixel 0
  7035     punpckhwd  xmm4, xmm3  // pixel 1
  7036     cvtdq2ps   xmm0, xmm0  // 4 floats
  7037     cvtdq2ps   xmm4, xmm4
  7038     movdqa     xmm1, xmm0  // X
  7039     movdqa     xmm5, xmm4
  7040     mulps      xmm0, [esi + 16]  // C1 * X
  7041     mulps      xmm4, [esi + 16]
  7042     addps      xmm0, [esi]  // result = C0 + C1 * X
  7043     addps      xmm4, [esi]
  7044     movdqa     xmm2, xmm1
  7045     movdqa     xmm6, xmm5
  7046     mulps      xmm2, xmm1  // X * X
  7047     mulps      xmm6, xmm5
  7048     mulps      xmm1, xmm2  // X * X * X
  7049     mulps      xmm5, xmm6
  7050     mulps      xmm2, [esi + 32]  // C2 * X * X
  7051     mulps      xmm6, [esi + 32]
  7052     mulps      xmm1, [esi + 48]  // C3 * X * X * X
  7053     mulps      xmm5, [esi + 48]
  7054     addps      xmm0, xmm2  // result += C2 * X * X
  7055     addps      xmm4, xmm6
  7056     addps      xmm0, xmm1  // result += C3 * X * X * X
  7057     addps      xmm4, xmm5
  7058     cvttps2dq  xmm0, xmm0
  7059     cvttps2dq  xmm4, xmm4
  7060     packuswb   xmm0, xmm4
  7061     packuswb   xmm0, xmm0
  7062     sub        ecx, 2
  7063     movq       qword ptr [edx], xmm0
  7064     lea        edx, [edx + 8]
  7065     jg         convertloop
  7066     pop        esi
  7067     ret
  7068   }
  7069 }
  7070 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
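// A scalar sketch of the cubic polynomial ARGBPolynomialRow applies per
// channel, for reference only; the name is illustrative. 'poly' holds 16
// floats: C0, C1, C2 and C3, each with one coefficient per channel (B, G, R,
// A). The SIMD paths clamp via cvttps2dq + packuswb; the explicit clamp below
// approximates that.
static void ARGBPolynomialRowSketch(const uint8* src_argb, uint8* dst_argb,
                                    const float* poly, int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + c] = (uint8)v;
    }
  }
}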
  7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
  7073 __declspec(naked) __declspec(align(16))
  7074 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
  7075                             uint8* dst_argb, const float* poly,
  7076                             int width) {
  7077   __asm {
  7078     mov        eax, [esp + 4]   /* src_argb */
  7079     mov        edx, [esp + 8]   /* dst_argb */
  7080     mov        ecx, [esp + 12]   /* poly */
  7081     vbroadcastf128 ymm4, [ecx]       // C0
  7082     vbroadcastf128 ymm5, [ecx + 16]  // C1
  7083     vbroadcastf128 ymm6, [ecx + 32]  // C2
  7084     vbroadcastf128 ymm7, [ecx + 48]  // C3
  7085     mov        ecx, [esp + 16]  /* width */
  7087     // 2 pixel loop.
  7088     align      4
  7089  convertloop:
  7090     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
  7091     lea         eax, [eax + 8]
  7092     vcvtdq2ps   ymm0, ymm0        // X 8 floats
  7093     vmulps      ymm2, ymm0, ymm0  // X * X
  7094     vmulps      ymm3, ymm0, ymm7  // C3 * X
  7095     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
  7096     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
  7097     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
  7098     vcvttps2dq  ymm0, ymm0
  7099     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
  7100     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
  7101     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
  7102     sub         ecx, 2
  7103     vmovq       qword ptr [edx], xmm0
  7104     lea         edx, [edx + 8]
  7105     jg          convertloop
  7106     vzeroupper
  7107     ret
  7108   }
  7109 }
  7110 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
  7112 #ifdef HAS_ARGBCOLORTABLEROW_X86
  7113 // Transform ARGB pixels with color table.
  7114 __declspec(naked) __declspec(align(16))
  7115 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
  7116                            int width) {
  7117   __asm {
  7118     push       esi
  7119     mov        eax, [esp + 4 + 4]   /* dst_argb */
  7120     mov        esi, [esp + 4 + 8]   /* table_argb */
  7121     mov        ecx, [esp + 4 + 12]  /* width */
  7123     // 1 pixel loop.
  7124     align      4
  7125   convertloop:
  7126     movzx      edx, byte ptr [eax]
  7127     lea        eax, [eax + 4]
  7128     movzx      edx, byte ptr [esi + edx * 4]
  7129     mov        byte ptr [eax - 4], dl
  7130     movzx      edx, byte ptr [eax - 4 + 1]
  7131     movzx      edx, byte ptr [esi + edx * 4 + 1]
  7132     mov        byte ptr [eax - 4 + 1], dl
  7133     movzx      edx, byte ptr [eax - 4 + 2]
  7134     movzx      edx, byte ptr [esi + edx * 4 + 2]
  7135     mov        byte ptr [eax - 4 + 2], dl
  7136     movzx      edx, byte ptr [eax - 4 + 3]
  7137     movzx      edx, byte ptr [esi + edx * 4 + 3]
  7138     mov        byte ptr [eax - 4 + 3], dl
  7139     dec        ecx
  7140     jg         convertloop
  7141     pop        esi
  7142     ret
  7143   }
  7144 }
  7145 #endif  // HAS_ARGBCOLORTABLEROW_X86
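// A scalar sketch of the in-place lookup ARGBColorTableRow_X86 performs, for
// reference only; the name is illustrative. Each channel indexes its own
// column of the interleaved 256-entry table.
static void ARGBColorTableRowSketch(uint8* dst_argb, const uint8* table_argb,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}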
  7147 #ifdef HAS_RGBCOLORTABLEROW_X86
  7148 // Transform RGB pixels with color table.
  7149 __declspec(naked) __declspec(align(16))
  7150 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  7151   __asm {
  7152     push       esi
  7153     mov        eax, [esp + 4 + 4]   /* dst_argb */
  7154     mov        esi, [esp + 4 + 8]   /* table_argb */
  7155     mov        ecx, [esp + 4 + 12]  /* width */
  7157     // 1 pixel loop.
  7158     align      4
  7159   convertloop:
  7160     movzx      edx, byte ptr [eax]
  7161     lea        eax, [eax + 4]
  7162     movzx      edx, byte ptr [esi + edx * 4]
  7163     mov        byte ptr [eax - 4], dl
  7164     movzx      edx, byte ptr [eax - 4 + 1]
  7165     movzx      edx, byte ptr [esi + edx * 4 + 1]
  7166     mov        byte ptr [eax - 4 + 1], dl
  7167     movzx      edx, byte ptr [eax - 4 + 2]
  7168     movzx      edx, byte ptr [esi + edx * 4 + 2]
  7169     mov        byte ptr [eax - 4 + 2], dl
  7170     dec        ecx
  7171     jg         convertloop
  7173     pop        esi
  7174     ret
  7175   }
  7176 }
  7177 #endif  // HAS_RGBCOLORTABLEROW_X86
  7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  7180 // Transform RGB pixels with luma table.
  7181 __declspec(naked) __declspec(align(16))
  7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  7183                                  int width,
  7184                                  const uint8* luma, uint32 lumacoeff) {
  7185   __asm {
  7186     push       esi
  7187     push       edi
  7188     mov        eax, [esp + 8 + 4]   /* src_argb */
  7189     mov        edi, [esp + 8 + 8]   /* dst_argb */
  7190     mov        ecx, [esp + 8 + 12]  /* width */
  7191     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
  7192     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
  7193     pshufd     xmm2, xmm2, 0
  7194     pshufd     xmm3, xmm3, 0
  7195     pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
  7196     psllw      xmm4, 8
  7197     pxor       xmm5, xmm5
  7199     // 4 pixel loop.
  7200     align      4
  7201   convertloop:
  7202     movdqu     xmm0, qword ptr [eax]      // generate luma ptr
  7203     pmaddubsw  xmm0, xmm3
  7204     phaddw     xmm0, xmm0
  7205     pand       xmm0, xmm4  // mask out low bits
  7206     punpcklwd  xmm0, xmm5
  7207     paddd      xmm0, xmm2  // add table base
  7208     movd       esi, xmm0
  7209     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  7211     movzx      edx, byte ptr [eax]
  7212     movzx      edx, byte ptr [esi + edx]
  7213     mov        byte ptr [edi], dl
  7214     movzx      edx, byte ptr [eax + 1]
  7215     movzx      edx, byte ptr [esi + edx]
  7216     mov        byte ptr [edi + 1], dl
  7217     movzx      edx, byte ptr [eax + 2]
  7218     movzx      edx, byte ptr [esi + edx]
  7219     mov        byte ptr [edi + 2], dl
  7220     movzx      edx, byte ptr [eax + 3]  // copy alpha.
  7221     mov        byte ptr [edi + 3], dl
  7223     movd       esi, xmm0
  7224     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  7226     movzx      edx, byte ptr [eax + 4]
  7227     movzx      edx, byte ptr [esi + edx]
  7228     mov        byte ptr [edi + 4], dl
  7229     movzx      edx, byte ptr [eax + 5]
  7230     movzx      edx, byte ptr [esi + edx]
  7231     mov        byte ptr [edi + 5], dl
  7232     movzx      edx, byte ptr [eax + 6]
  7233     movzx      edx, byte ptr [esi + edx]
  7234     mov        byte ptr [edi + 6], dl
  7235     movzx      edx, byte ptr [eax + 7]  // copy alpha.
  7236     mov        byte ptr [edi + 7], dl
  7238     movd       esi, xmm0
  7239     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
  7241     movzx      edx, byte ptr [eax + 8]
  7242     movzx      edx, byte ptr [esi + edx]
  7243     mov        byte ptr [edi + 8], dl
  7244     movzx      edx, byte ptr [eax + 9]
  7245     movzx      edx, byte ptr [esi + edx]
  7246     mov        byte ptr [edi + 9], dl
  7247     movzx      edx, byte ptr [eax + 10]
  7248     movzx      edx, byte ptr [esi + edx]
  7249     mov        byte ptr [edi + 10], dl
  7250     movzx      edx, byte ptr [eax + 11]  // copy alpha.
  7251     mov        byte ptr [edi + 11], dl
  7253     movd       esi, xmm0
  7255     movzx      edx, byte ptr [eax + 12]
  7256     movzx      edx, byte ptr [esi + edx]
  7257     mov        byte ptr [edi + 12], dl
  7258     movzx      edx, byte ptr [eax + 13]
  7259     movzx      edx, byte ptr [esi + edx]
  7260     mov        byte ptr [edi + 13], dl
  7261     movzx      edx, byte ptr [eax + 14]
  7262     movzx      edx, byte ptr [esi + edx]
  7263     mov        byte ptr [edi + 14], dl
  7264     movzx      edx, byte ptr [eax + 15]  // copy alpha.
  7265     mov        byte ptr [edi + 15], dl
  7267     sub        ecx, 4
  7268     lea        eax, [eax + 16]
  7269     lea        edi, [edi + 16]
  7270     jg         convertloop
  7272     pop        edi
  7273     pop        esi
  7274     ret
  7275   }
  7276 }
  7277 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
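// A scalar sketch of ARGBLumaColorTableRow_SSSE3 above, for reference only;
// the name and the coefficient unpacking are assumptions: lumacoeff is read
// as per-channel byte weights (B, G, R from low to high, alpha weight assumed
// zero), and the weighted sum is masked to a multiple of 256 so it selects
// one 256-byte sub-table of 'luma'. Saturation details of the pmaddubsw path
// are not modelled.
static void ARGBLumaColorTableRowSketch(const uint8* src_argb, uint8* dst_argb,
                                        int width, const uint8* luma,
                                        uint32 lumacoeff) {
  int bc = (int)(lumacoeff & 0xff);
  int gc = (int)((lumacoeff >> 8) & 0xff);
  int rc = (int)((lumacoeff >> 16) & 0xff);
  int i;
  for (i = 0; i < width; ++i) {
    int sum = src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8* luma_table = luma + (sum & 0xff00);  // pick a sub-table
    dst_argb[0] = luma_table[src_argb[0]];
    dst_argb[1] = luma_table[src_argb[1]];
    dst_argb[2] = luma_table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}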
  7279 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  7281 #ifdef __cplusplus
  7282 }  // extern "C"
  7283 }  // namespace libyuv
  7284 #endif
