/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
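// A minimal scalar sketch (helper names are illustrative only, not part of the
// libyuv API) of what the row functions below compute with these constants:
// pmaddubsw/phaddw form the weighted B,G,R sum, psrlw shifts by 7, and the
// kAddY16 / kAddYJ64 tables supply the bias or rounding term.
static __inline uint8 ARGBToY_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // BT.601 video range.
}

static __inline uint8 ARGBToYJ_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);  // JPeg full range.
}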
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_rgb24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRGB24ToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRAWToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3    // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    pand       xmm0, xmm4    // G in middle 6 bits
    pmulhuw    xmm0, xmm6    // << 5 * (256 + 4)
    por        xmm0, xmm7    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
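// Scalar sketch of the pmul bit-replication trick described above (helper name
// is illustrative only): a 5-bit channel is widened to 8 bits by repeating its
// top bits, and that replication can be expressed as a single multiply, which
// is what pmulhuw performs on values positioned in the top of each 16-bit lane.
static __inline uint8 Replicate5To8_ScalarSketch(int v5) {
  // (v5 << 3) | (v5 >> 2)  ==  (v5 * (256 + 8)) >> 5  for v5 in [0, 31].
  return (uint8)((v5 * 0x0108) >> 5);
}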
// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    psllw      xmm1, 1       // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // G in middle 5 bits
    psraw      xmm2, 8       // A
    pmulhuw    xmm0, xmm6    // << 6 * (256 + 8)
    pand       xmm2, xmm7
    por        xmm0, xmm2    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0
    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld      xmm5, 4
    mov        eax, [esp + 4]   // src_argb4444
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // mask low nibbles
    pand       xmm2, xmm5    // mask high nibbles
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    psllw      xmm1, 4
    psrlw      xmm3, 4
    por        xmm0, xmm1
    por        xmm2, xmm3
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
    pslld      xmm5, 11

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    pslld      xmm0, 8       // R
    psrld      xmm1, 3       // B
    psrld      xmm2, 5       // G
    psrad      xmm0, 16      // R
    pand       xmm1, xmm3    // B
    pand       xmm2, xmm4    // G
    pand       xmm0, xmm5    // R
    por        xmm1, xmm2    // BG
    por        xmm0, xmm1    // BGR
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
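// Scalar sketch of the packing done by ARGBToRGB565Row_SSE2 above (helper name
// is illustrative only): each BGRA pixel is reduced to a 5:6:5 word; the SIMD
// version does the same with per-dword shifts and masks followed by packssdw.
static __inline uint16 ARGBToRGB565_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}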
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
    psrld      xmm4, 27
    movdqa     xmm5, xmm4       // generate mask 0x000003e0
    pslld      xmm5, 5
    movdqa     xmm6, xmm4       // generate mask 0x00007c00
    pslld      xmm6, 10
    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
    pslld      xmm7, 15

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    movdqa     xmm3, xmm0    // R
    psrad      xmm0, 16      // A
    psrld      xmm1, 3       // B
    psrld      xmm2, 6       // G
    psrld      xmm3, 9       // R
    pand       xmm0, xmm7    // A
    pand       xmm1, xmm4    // B
    pand       xmm2, xmm5    // G
    pand       xmm3, xmm6    // R
    por        xmm0, xmm1    // BA
    por        xmm2, xmm3    // GR
    por        xmm0, xmm2    // BGRA
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
    psllw      xmm4, 12
    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
    psrlw      xmm3, 8

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0
    pand       xmm0, xmm3    // low nibble
    pand       xmm1, xmm4    // high nibble
    psrl       xmm0, 4
    psrl       xmm1, 8
    por        xmm0, xmm1
    packuswb   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5    // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub        ecx, 32
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
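// Scalar sketch of the per-2x2-block chroma computed by the ARGBToUVRow
// variants in this file (helper name is illustrative only): the four source
// pixels are first averaged with pavgb/shufps, then the kARGBToU/kARGBToV
// weights are applied and the result is biased to unsigned with kAddUV128.
// The >> 8 stands in for psraw, i.e. an arithmetic shift.
static __inline void ARGBToUV_ScalarSketch(uint8 b, uint8 g, uint8 r,
                                           uint8* u, uint8* v) {
  // b, g, r are already the averages of a 2x2 block of pixels.
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}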
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqa     xmm0, [eax]          // U
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx,  16
    movdqa     [edx], xmm0

    movdqa     xmm0, [eax]          // V
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqa     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx,  16
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v
// stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kABGRToU michael@0: movdqa xmm6, kABGRToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: pavgb xmm0, [eax + esi] michael@0: pavgb xmm1, [eax + esi + 16] michael@0: pavgb xmm2, [eax + esi + 32] michael@0: pavgb xmm3, [eax + esi + 48] michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, 
xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kABGRToU michael@0: movdqa xmm6, kABGRToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kRGBAToU michael@0: movdqa xmm6, kRGBAToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] 
michael@0: movdqa xmm3, [eax + 48] michael@0: pavgb xmm0, [eax + esi] michael@0: pavgb xmm1, [eax + esi + 16] michael@0: pavgb xmm2, [eax + esi + 32] michael@0: pavgb xmm3, [eax + esi + 48] michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kRGBAToU michael@0: movdqa xmm6, kRGBAToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V 
michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBTOYROW_SSSE3 michael@0: michael@0: #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ michael@0: michael@0: #define UB 127 /* min(63,(int8)(2.018 * 64)) */ michael@0: #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ michael@0: #define UR 0 michael@0: michael@0: #define VB 0 michael@0: #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ michael@0: #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ michael@0: michael@0: // Bias michael@0: #define BB UB * 128 + VB * 128 michael@0: #define BG UG * 128 + VG * 128 michael@0: #define BR UR * 128 + VR * 128 michael@0: michael@0: #ifdef HAS_I422TOARGBROW_AVX2 michael@0: michael@0: static const lvec8 kUVToB_AVX = { michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB michael@0: }; michael@0: static const lvec8 kUVToR_AVX = { michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR michael@0: }; michael@0: static const lvec8 kUVToG_AVX = { michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG michael@0: }; michael@0: static const lvec16 kYToRgb_AVX = { michael@0: YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG michael@0: }; michael@0: static const lvec16 kYSub16_AVX = { michael@0: 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 michael@0: }; michael@0: static const lvec16 kUVBiasB_AVX = { michael@0: BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB michael@0: }; michael@0: static const lvec16 kUVBiasG_AVX = { michael@0: BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG michael@0: }; michael@0: static const lvec16 kUVBiasR_AVX = { michael@0: BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR michael@0: }; michael@0: michael@0: // 16 pixels michael@0: // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
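// For reference: with these constants the kernels below compute a BT.601
// limited-range conversion in 6-bit fixed point, roughly
//   B = clip8(((Y - 16) * 74 + 127 * (U - 128)) >> 6)
//   G = clip8(((Y - 16) * 74 -  25 * (U - 128) -  52 * (V - 128)) >> 6)
//   R = clip8(((Y - 16) * 74 + 102 * (V - 128)) >> 6)
// e.g. Y=235, U=V=128 gives (219 * 74) >> 6 = 253, and Y=16, U=V=128 gives 0.
// The kUVBias* tables fold the "- 128" into the pmaddubsw results
// (BB = 127 * 128, BG = (-25 - 52) * 128, BR = 102 * 128). UB is 127 because
// 2.018 * 64 saturates the int8 range; the "min(63, ...)" note above looks stale.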
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_AVX2(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha michael@0: vpxor ymm4, ymm4, ymm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovq xmm0, qword ptr [esi] // U michael@0: vmovq xmm1, qword ptr [esi + edi] // V michael@0: lea esi, [esi + 8] michael@0: vpunpcklbw ymm0, ymm0, ymm1 // UV michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpunpcklwd ymm0, ymm0, ymm0 // UVUV michael@0: vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV michael@0: vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV michael@0: vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV michael@0: vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed michael@0: vpsubw ymm1, ymm1, kUVBiasG_AVX michael@0: vpsubw ymm0, ymm0, kUVBiasR_AVX michael@0: michael@0: // Step 2: Find Y contribution to 16 R,G,B values michael@0: vmovdqu xmm3, [eax] // NOLINT michael@0: lea eax, [eax + 16] michael@0: vpermq ymm3, ymm3, 0xd8 michael@0: vpunpcklbw ymm3, ymm3, ymm4 michael@0: vpsubsw ymm3, ymm3, kYSub16_AVX michael@0: vpmullw ymm3, ymm3, kYToRgb_AVX michael@0: vpaddsw ymm2, ymm2, ymm3 // B += Y michael@0: vpaddsw ymm1, ymm1, ymm3 // G += Y michael@0: vpaddsw ymm0, ymm0, ymm3 // R += Y michael@0: vpsraw ymm2, ymm2, 6 michael@0: vpsraw ymm1, ymm1, 6 michael@0: vpsraw ymm0, ymm0, 6 michael@0: vpackuswb ymm2, ymm2, ymm2 // B michael@0: vpackuswb ymm1, ymm1, ymm1 // G michael@0: vpackuswb ymm0, ymm0, ymm0 // R michael@0: michael@0: // Step 3: Weave into ARGB michael@0: vpunpcklbw ymm2, ymm2, ymm1 // BG michael@0: vpermq ymm2, ymm2, 0xd8 michael@0: vpunpcklbw ymm0, ymm0, ymm5 // RA michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels michael@0: vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: vzeroupper michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_I422TOARGBROW_AVX2 michael@0: michael@0: #ifdef HAS_I422TOARGBROW_SSSE3 michael@0: michael@0: static const vec8 kUVToB = { michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB michael@0: }; michael@0: michael@0: static const vec8 kUVToR = { michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR michael@0: }; michael@0: michael@0: static const vec8 kUVToG = { michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG michael@0: }; michael@0: michael@0: static const vec8 kVUToB = { michael@0: VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, michael@0: }; michael@0: michael@0: static const vec8 kVUToR = { michael@0: VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, michael@0: }; michael@0: michael@0: static const vec8 kVUToG = { michael@0: VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, michael@0: }; michael@0: michael@0: static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; michael@0: static 
const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; michael@0: static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; michael@0: static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; michael@0: static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; michael@0: michael@0: // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. michael@0: michael@0: // Read 8 UV from 444. michael@0: #define READYUV444 __asm { \ michael@0: __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ michael@0: __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ michael@0: __asm lea esi, [esi + 8] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: } michael@0: michael@0: // Read 4 UV from 422, upsample to 8 UV. michael@0: #define READYUV422 __asm { \ michael@0: __asm movd xmm0, [esi] /* U */ \ michael@0: __asm movd xmm1, [esi + edi] /* V */ \ michael@0: __asm lea esi, [esi + 4] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Read 2 UV from 411, upsample to 8 UV. michael@0: #define READYUV411 __asm { \ michael@0: __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ michael@0: __asm movd xmm0, ebx \ michael@0: __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ michael@0: __asm movd xmm1, ebx \ michael@0: __asm lea esi, [esi + 2] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Read 4 UV from NV12, upsample to 8 UV. michael@0: #define READNV12 __asm { \ michael@0: __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ michael@0: __asm lea esi, [esi + 8] \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Convert 8 pixels: 8 UV and 8 Y. michael@0: #define YUVTORGB __asm { \ michael@0: /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ michael@0: __asm movdqa xmm1, xmm0 \ michael@0: __asm movdqa xmm2, xmm0 \ michael@0: __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ michael@0: __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ michael@0: __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ michael@0: __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ michael@0: __asm psubw xmm1, kUVBiasG \ michael@0: __asm psubw xmm2, kUVBiasR \ michael@0: /* Step 2: Find Y contribution to 8 R,G,B values */ \ michael@0: __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ michael@0: __asm lea eax, [eax + 8] \ michael@0: __asm punpcklbw xmm3, xmm4 \ michael@0: __asm psubsw xmm3, kYSub16 \ michael@0: __asm pmullw xmm3, kYToRgb \ michael@0: __asm paddsw xmm0, xmm3 /* B += Y */ \ michael@0: __asm paddsw xmm1, xmm3 /* G += Y */ \ michael@0: __asm paddsw xmm2, xmm3 /* R += Y */ \ michael@0: __asm psraw xmm0, 6 \ michael@0: __asm psraw xmm1, 6 \ michael@0: __asm psraw xmm2, 6 \ michael@0: __asm packuswb xmm0, xmm0 /* B */ \ michael@0: __asm packuswb xmm1, xmm1 /* G */ \ michael@0: __asm packuswb xmm2, xmm2 /* R */ \ michael@0: } michael@0: michael@0: // Convert 8 pixels: 8 VU and 8 Y. 
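// YVUTORGB below is the same math as YUVTORGB with the VU-ordered tables, so
// NV21's interleaved VU plane needs no byte swap. For reference, a scalar
// sketch of the per-pixel result (illustration only; the helper names are
// informal and the block is not compiled):
#if 0
static __inline uint8 Clip8(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
// Computes one BGR pixel the way YUVTORGB does, using the YG/UB/UG/VG/VR and
// BB/BG/BR constants defined above.
static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  const int bb = BB;  // Bias macros are unparenthesized, so bind them first.
  const int bg = BG;
  const int br = BR;
  int y1 = ((int)y - 16) * YG;                  // Luma term, 6-bit fixed point.
  *b = Clip8((y1 + UB * u - bb) >> 6);          // 127 * (u - 128)
  *g = Clip8((y1 + UG * u + VG * v - bg) >> 6); // -25*(u-128) - 52*(v-128)
  *r = Clip8((y1 + VR * v - br) >> 6);          // 102 * (v - 128)
}
#endif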
michael@0: #define YVUTORGB __asm { \ michael@0: /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ michael@0: __asm movdqa xmm1, xmm0 \ michael@0: __asm movdqa xmm2, xmm0 \ michael@0: __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ michael@0: __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ michael@0: __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ michael@0: __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ michael@0: __asm psubw xmm1, kUVBiasG \ michael@0: __asm psubw xmm2, kUVBiasR \ michael@0: /* Step 2: Find Y contribution to 8 R,G,B values */ \ michael@0: __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ michael@0: __asm lea eax, [eax + 8] \ michael@0: __asm punpcklbw xmm3, xmm4 \ michael@0: __asm psubsw xmm3, kYSub16 \ michael@0: __asm pmullw xmm3, kYToRgb \ michael@0: __asm paddsw xmm0, xmm3 /* B += Y */ \ michael@0: __asm paddsw xmm1, xmm3 /* G += Y */ \ michael@0: __asm paddsw xmm2, xmm3 /* R += Y */ \ michael@0: __asm psraw xmm0, 6 \ michael@0: __asm psraw xmm1, 6 \ michael@0: __asm psraw xmm2, 6 \ michael@0: __asm packuswb xmm0, xmm0 /* B */ \ michael@0: __asm packuswb xmm1, xmm1 /* G */ \ michael@0: __asm packuswb xmm2, xmm2 /* R */ \ michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I444ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV444 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
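// The RGB24 and RAW kernels below build the same BGRX pixels but then use the
// kShuffleMaskARGBToRGB24* / kShuffleMaskARGBToRAW* tables plus palignr to
// drop the fourth (alpha) byte, so each loop stores 8 pixels as 24 bytes
// (movq + movdqu) instead of the 32 bytes an ARGB store would take.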
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGB24Row_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgb24, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgb24 michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: movdqa xmm5, kShuffleMaskARGBToRGB24_0 michael@0: movdqa xmm6, kShuffleMaskARGBToRGB24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. michael@0: pshufb xmm1, xmm6 // Pack into first 12 bytes. michael@0: palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 michael@0: movq qword ptr [edx], xmm0 // First 8 bytes michael@0: movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. michael@0: lea edx, [edx + 24] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRAWRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_raw, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // raw michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: movdqa xmm5, kShuffleMaskARGBToRAW_0 michael@0: movdqa xmm6, kShuffleMaskARGBToRAW michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. michael@0: pshufb xmm1, xmm6 // Pack into first 12 bytes. michael@0: palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 michael@0: movq qword ptr [edx], xmm0 // First 8 bytes michael@0: movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. michael@0: lea edx, [edx + 24] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest unaligned. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
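// RGB565 packs each pixel into 16 bits: bits 0-4 blue, 5-10 green, 11-15 red.
// The masks built below (0x0000001f, 0x000007e0, 0xfffff800) select those
// fields after the shifts, and packssdw narrows the two dword results so each
// loop stores 8 pixels as 16 bytes. e.g. B=0xF8, G=0xFC, R=0xF8 packs to 0xFFFF.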
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGB565Row_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb565_buf, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgb565 michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x0000001f michael@0: psrld xmm5, 27 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 michael@0: psrld xmm6, 26 michael@0: pslld xmm6, 5 michael@0: pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 michael@0: pslld xmm7, 11 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: michael@0: // Step 3b: RRGB -> RGB565 michael@0: movdqa xmm3, xmm0 // B first 4 pixels of argb michael@0: movdqa xmm2, xmm0 // G michael@0: pslld xmm0, 8 // R michael@0: psrld xmm3, 3 // B michael@0: psrld xmm2, 5 // G michael@0: psrad xmm0, 16 // R michael@0: pand xmm3, xmm5 // B michael@0: pand xmm2, xmm6 // G michael@0: pand xmm0, xmm7 // R michael@0: por xmm3, xmm2 // BG michael@0: por xmm0, xmm3 // BGR michael@0: movdqa xmm3, xmm1 // B next 4 pixels of argb michael@0: movdqa xmm2, xmm1 // G michael@0: pslld xmm1, 8 // R michael@0: psrld xmm3, 3 // B michael@0: psrld xmm2, 5 // G michael@0: psrad xmm1, 16 // R michael@0: pand xmm3, xmm5 // B michael@0: pand xmm2, xmm6 // G michael@0: pand xmm1, xmm7 // R michael@0: por xmm3, xmm2 // BG michael@0: por xmm1, xmm3 // BGR michael@0: packssdw xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movdqu [edx], xmm0 // store 8 pixels of RGB565 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
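// In step 3 of these kernels xmm0/xmm1/xmm2 hold 8 bytes each of B, G and R.
// punpcklbw xmm0, xmm1 interleaves them to BGBG..., punpcklbw xmm2, xmm5 pairs
// R with the 0xff alpha bytes, and punpcklwd/punpckhwd then weave those word
// pairs into 8 BGRA pixels (32 bytes).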
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: // Similar to I420 but duplicate UV once more. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I411ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12 + 4] // Y michael@0: mov esi, [esp + 12 + 8] // U michael@0: mov edi, [esp + 12 + 12] // V michael@0: mov edx, [esp + 12 + 16] // argb michael@0: mov ecx, [esp + 12 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV411 // modifies EBX michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: pop ebx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
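// NV12 keeps chroma in a single interleaved UVUV... plane, so READNV12 loads
// 8 bytes (4 U/V pairs) and punpcklwd duplicates each pair to cover 2 pixels.
// NV21 (VU order) uses the same read; only the VU-ordered tables in YVUTORGB
// differ.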
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV12ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // UV michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV21ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // VU michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YVUTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
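// The _Unaligned variants that follow are the same kernels with movdqu memory
// accesses where the aligned versions use movdqa; the arithmetic is identical.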
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV444 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: // Similar to I420 but duplicate UV once more. 
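// I411 carries one U and one V sample per 4 pixels, so READYUV411 fetches
// 2 bytes of each with movzx (which is why these kernels also save EBX) and
// widens them with punpcklwd/punpckldq so each UV pair covers 4 pixels.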
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12 + 4] // Y michael@0: mov esi, [esp + 12 + 8] // U michael@0: mov edi, [esp + 12 + 12] // V michael@0: mov edx, [esp + 12 + 16] // argb michael@0: mov ecx, [esp + 12 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV411 // modifies EBX michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: pop ebx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // UV michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // VU michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YVUTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToBGRARow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_bgra, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // bgra michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into BGRA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm0 // GB michael@0: punpcklbw xmm5, xmm2 // AR michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // BGRA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // BGRA next 4 pixels michael@0: movdqa [edx], xmm5 michael@0: movdqa [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_bgra, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // bgra michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into BGRA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm0 // GB michael@0: punpcklbw xmm5, xmm2 // AR michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // BGRA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // BGRA next 4 pixels michael@0: movdqu [edx], xmm5 michael@0: movdqu [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) 
michael@0: void I422ToABGRRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_abgr, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // abgr michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm2, xmm1 // RG michael@0: punpcklbw xmm0, xmm5 // BA michael@0: movdqa xmm1, xmm2 michael@0: punpcklwd xmm2, xmm0 // RGBA first 4 pixels michael@0: punpckhwd xmm1, xmm0 // RGBA next 4 pixels michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_abgr, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // abgr michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm2, xmm1 // RG michael@0: punpcklbw xmm0, xmm5 // BA michael@0: movdqa xmm1, xmm2 michael@0: punpcklwd xmm2, xmm0 // RGBA first 4 pixels michael@0: punpckhwd xmm1, xmm0 // RGBA next 4 pixels michael@0: movdqu [edx], xmm2 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGBARow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgba, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgba michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RGBA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm2 // GR michael@0: punpcklbw xmm5, xmm0 // AB michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // RGBA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // RGBA next 4 pixels michael@0: movdqa [edx], xmm5 michael@0: movdqa [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: 
} michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgba, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgba michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RGBA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm2 // GR michael@0: punpcklbw xmm5, xmm0 // AB michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // RGBA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // RGBA next 4 pixels michael@0: movdqu [edx], xmm5 michael@0: movdqu [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #endif // HAS_I422TOARGBROW_SSSE3 michael@0: michael@0: #ifdef HAS_YTOARGBROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YToARGBRow_SSE2(const uint8* y_buf, michael@0: uint8* rgb_buf, michael@0: int width) { michael@0: __asm { michael@0: pxor xmm5, xmm5 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: mov eax, 0x00100010 michael@0: movd xmm3, eax michael@0: pshufd xmm3, xmm3, 0 michael@0: mov eax, 0x004a004a // 74 michael@0: movd xmm2, eax michael@0: pshufd xmm2, xmm2,0 michael@0: mov eax, [esp + 4] // Y michael@0: mov edx, [esp + 8] // rgb michael@0: mov ecx, [esp + 12] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 michael@0: movq xmm0, qword ptr [eax] michael@0: lea eax, [eax + 8] michael@0: punpcklbw xmm0, xmm5 // 0.Y michael@0: psubusw xmm0, xmm3 michael@0: pmullw xmm0, xmm2 michael@0: psrlw xmm0, 6 michael@0: packuswb xmm0, xmm0 // G michael@0: michael@0: // Step 2: Weave into ARGB michael@0: punpcklbw xmm0, xmm0 // GG michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm0 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm1 // BGRA next 4 pixels michael@0: por xmm0, xmm4 michael@0: por xmm1, xmm4 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YTOARGBROW_SSE2 michael@0: michael@0: #ifdef HAS_MIRRORROW_SSSE3 michael@0: // Shuffle table for reversing the bytes. 
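// With indices 15..0 a single pshufb reverses all 16 bytes of a register.
// MirrorRow_SSSE3 below biases the source pointer by -16 and indexes it with
// the remaining width in ecx, so it reads the row from the end while writing
// the destination forward.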
michael@0: static const uvec8 kShuffleMirror = { michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: movdqa xmm5, kShuffleMirror michael@0: lea eax, [eax - 16] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax + ecx] michael@0: pshufb xmm0, xmm5 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_SSSE3 michael@0: michael@0: #ifdef HAS_MIRRORROW_AVX2 michael@0: // Shuffle table for reversing the bytes. michael@0: static const ulvec8 kShuffleMirror_AVX2 = { michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: vmovdqa ymm5, kShuffleMirror_AVX2 michael@0: lea eax, [eax - 32] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax + ecx] michael@0: vpshufb ymm0, ymm0, ymm5 michael@0: vpermq ymm0, ymm0, 0x4e // swap high and low halfs michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_AVX2 michael@0: michael@0: #ifdef HAS_MIRRORROW_SSE2 michael@0: // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 michael@0: // version can not. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 16] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax + ecx] michael@0: movdqa xmm1, xmm0 // swap bytes michael@0: psllw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: por xmm0, xmm1 michael@0: pshuflw xmm0, xmm0, 0x1b // swap words michael@0: pshufhw xmm0, xmm0, 0x1b michael@0: pshufd xmm0, xmm0, 0x4e // swap qwords michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_SSE2 michael@0: michael@0: #ifdef HAS_MIRRORROW_UV_SSSE3 michael@0: // Shuffle table for reversing the bytes of UV channels. 
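// This table mirrors and deinterleaves at the same time: the even (U) bytes
// are gathered in reverse order into the low 8 bytes and the odd (V) bytes
// into the high 8, so movlpd/movhpd can store the mirrored U and V rows
// directly.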
michael@0: static const uvec8 kShuffleMirrorUV = { michael@0: 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // width michael@0: movdqa xmm1, kShuffleMirrorUV michael@0: lea eax, [eax + ecx * 2 - 16] michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: lea eax, [eax - 16] michael@0: pshufb xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movlpd qword ptr [edx], xmm0 michael@0: movhpd qword ptr [edx + edi], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_UV_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBMIRRORROW_SSSE3 michael@0: // Shuffle table for reversing the bytes. michael@0: static const uvec8 kARGBShuffleMirror = { michael@0: 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 16 + ecx * 4] // last 4 pixels. michael@0: movdqa xmm5, kARGBShuffleMirror michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: lea eax, [eax - 16] michael@0: pshufb xmm0, xmm5 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMIRRORROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBMIRRORROW_AVX2 michael@0: // Shuffle table for reversing the bytes. 
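// For the AVX2 mirror the table holds dword indices 7..0, so a single vpermd
// reverses 8 whole ARGB pixels per iteration; as in the SSSE3 version above,
// pixels are reversed as 4-byte units so each pixel keeps its channel order.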
michael@0: static const ulvec32 kARGBShuffleMirror_AVX2 = { michael@0: 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 32] michael@0: vmovdqa ymm5, kARGBShuffleMirror_AVX2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order michael@0: sub ecx, 8 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMIRRORROW_AVX2 michael@0: michael@0: #ifdef HAS_SPLITUVROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm2, xmm0 michael@0: movdqa xmm3, xmm1 michael@0: pand xmm0, xmm5 // even bytes michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: psrlw xmm2, 8 // odd bytes michael@0: psrlw xmm3, 8 michael@0: packuswb xmm2, xmm3 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + edi], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, michael@0: int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm2, xmm0 michael@0: movdqa xmm3, xmm1 michael@0: pand xmm0, xmm5 // even bytes michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: psrlw xmm2, 8 // odd bytes michael@0: psrlw xmm3, 8 michael@0: packuswb xmm2, xmm3 michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + edi], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SPLITUVROW_SSE2 michael@0: michael@0: #ifdef HAS_SPLITUVROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 
0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm2, ymm0, 8 // odd bytes michael@0: vpsrlw ymm3, ymm1, 8 michael@0: vpand ymm0, ymm0, ymm5 // even bytes michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 michael@0: vpackuswb ymm2, ymm2, ymm3 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpermq ymm2, ymm2, 0xd8 michael@0: vmovdqu [edx], ymm0 michael@0: vmovdqu [edx + edi], ymm2 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SPLITUVROW_AVX2 michael@0: michael@0: #ifdef HAS_MERGEUVROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 U's michael@0: movdqa xmm1, [eax + edx] // and 16 V's michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: punpcklbw xmm0, xmm1 // first 8 UV pairs michael@0: punpckhbw xmm2, xmm1 // next 8 UV pairs michael@0: movdqa [edi], xmm0 michael@0: movdqa [edi + 16], xmm2 michael@0: lea edi, [edi + 32] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, michael@0: uint8* dst_uv, int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 16 U's michael@0: movdqu xmm1, [eax + edx] // and 16 V's michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: punpcklbw xmm0, xmm1 // first 8 UV pairs michael@0: punpckhbw xmm2, xmm1 // next 8 UV pairs michael@0: movdqu [edi], xmm0 michael@0: movdqu [edi + 16], xmm2 michael@0: lea edi, [edi + 32] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MERGEUVROW_SSE2 michael@0: michael@0: #ifdef HAS_MERGEUVROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 32 U's michael@0: vmovdqu ymm1, [eax + edx] // and 32 V's michael@0: lea eax, [eax + 32] michael@0: vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 michael@0: vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 michael@0: vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 michael@0: vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 michael@0: vmovdqu [edi], ymm1 michael@0: vmovdqu [edi + 32], ymm2 michael@0: lea edi, [edi + 64] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MERGEUVROW_AVX2 michael@0: michael@0: #ifdef HAS_COPYROW_SSE2 michael@0: // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_COPYROW_SSE2 michael@0: michael@0: // Unaligned Multiple of 1. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, esi michael@0: mov edx, edi michael@0: mov esi, [esp + 4] // src michael@0: mov edi, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: rep movsb michael@0: mov edi, edx michael@0: mov esi, eax michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_COPYROW_X86 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_X86(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, esi michael@0: mov edx, edi michael@0: mov esi, [esp + 4] // src michael@0: mov edi, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: shr ecx, 2 michael@0: rep movsd michael@0: mov edi, edx michael@0: mov esi, eax michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_COPYROW_X86 michael@0: michael@0: #ifdef HAS_ARGBCOPYALPHAROW_SSE2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: pcmpeqb xmm0, xmm0 // generate mask 0xff000000 michael@0: pslld xmm0, 24 michael@0: pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff michael@0: psrld xmm1, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm2, [eax] michael@0: movdqa xmm3, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm4, [edx] michael@0: movdqa xmm5, [edx + 16] michael@0: pand xmm2, xmm0 michael@0: pand xmm3, xmm0 michael@0: pand xmm4, xmm1 michael@0: pand xmm5, xmm1 michael@0: por xmm2, xmm4 michael@0: por xmm3, xmm5 michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYALPHAROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBCOPYALPHAROW_AVX2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, 
int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: vpcmpeqb ymm0, ymm0, ymm0 michael@0: vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm1, [eax] michael@0: vmovdqu ymm2, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpblendvb ymm1, ymm1, [edx], ymm0 michael@0: vpblendvb ymm2, ymm2, [edx + 32], ymm0 michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYALPHAROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: pcmpeqb xmm0, xmm0 // generate mask 0xff000000 michael@0: pslld xmm0, 24 michael@0: pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff michael@0: psrld xmm1, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm2, qword ptr [eax] // 8 Y's michael@0: lea eax, [eax + 8] michael@0: punpcklbw xmm2, xmm2 michael@0: punpckhwd xmm3, xmm2 michael@0: punpcklwd xmm2, xmm2 michael@0: movdqa xmm4, [edx] michael@0: movdqa xmm5, [edx + 16] michael@0: pand xmm2, xmm0 michael@0: pand xmm3, xmm0 michael@0: pand xmm4, xmm1 michael@0: pand xmm5, xmm1 michael@0: por xmm2, xmm4 michael@0: por xmm3, xmm5 michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: vpcmpeqb ymm0, ymm0, ymm0 michael@0: vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vpmovzxbd ymm1, qword ptr [eax] michael@0: vpmovzxbd ymm2, qword ptr [eax + 8] michael@0: lea eax, [eax + 16] michael@0: vpslld ymm1, ymm1, 24 michael@0: vpslld ymm2, ymm2, 24 michael@0: vpblendvb ymm1, ymm1, [edx], ymm0 michael@0: vpblendvb ymm2, ymm2, [edx + 32], ymm0 michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 michael@0: michael@0: #ifdef HAS_SETROW_X86 michael@0: // SetRow8 writes 'count' bytes using a 32 bit value repeated. 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SetRow_X86(uint8* dst, uint32 v32, int count) { michael@0: __asm { michael@0: mov edx, edi michael@0: mov edi, [esp + 4] // dst michael@0: mov eax, [esp + 8] // v32 michael@0: mov ecx, [esp + 12] // count michael@0: shr ecx, 2 michael@0: rep stosd michael@0: mov edi, edx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // SetRow32 writes 'count' words using a 32 bit value repeated. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, michael@0: int dst_stride, int height) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: push ebp michael@0: mov edi, [esp + 12 + 4] // dst michael@0: mov eax, [esp + 12 + 8] // v32 michael@0: mov ebp, [esp + 12 + 12] // width michael@0: mov edx, [esp + 12 + 16] // dst_stride michael@0: mov esi, [esp + 12 + 20] // height michael@0: lea ecx, [ebp * 4] michael@0: sub edx, ecx // stride - width * 4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: mov ecx, ebp michael@0: rep stosd michael@0: add edi, edx michael@0: sub esi, 1 michael@0: jg convertloop michael@0: michael@0: pop ebp michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SETROW_X86 michael@0: michael@0: #ifdef HAS_YUY2TOYROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_AVX2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // even bytes are Y michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: vpavgb ymm0, ymm0, [eax + esi] michael@0: vpavgb ymm1, ymm1, [eax + esi + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. 
michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_AVX2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // odd bytes are Y michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: vzeroupper michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: vpavgb ymm0, ymm0, [eax + esi] michael@0: vpavgb ymm1, ymm1, [eax + esi + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // UYVY -> UVUV michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. 
michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUV422Row_AVX2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // UYVY -> UVUV michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YUY2TOYROW_AVX2 michael@0: michael@0: #ifdef HAS_YUY2TOYROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // even bytes are Y michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + esi] michael@0: movdqa xmm3, [eax + esi + 16] michael@0: lea 
eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // even bytes are Y michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + esi] michael@0: movdqu xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 
michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // odd bytes are Y michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + esi] michael@0: movdqa xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) 
__declspec(align(16)) michael@0: void UYVYToUV422Row_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // odd bytes are Y michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + esi] michael@0: movdqu xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // 
pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YUY2TOYROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBBLENDROW_SSE2 michael@0: // Blend 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm7, xmm7 // generate constant 1 michael@0: psrlw xmm7, 15 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff michael@0: psrlw xmm6, 8 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 michael@0: psllw xmm5, 8 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: michael@0: sub ecx, 1 michael@0: je convertloop1 // only 1 pixel? michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop until destination pointer is aligned. michael@0: alignloop1: michael@0: test edx, 15 // aligned? michael@0: je alignloop1b michael@0: movd xmm3, [eax] michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge alignloop1 michael@0: michael@0: alignloop1b: michael@0: add ecx, 1 - 4 michael@0: jl convertloop4b michael@0: michael@0: // 4 pixel loop. 
michael@0: convertloop4: michael@0: movdqu xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqu xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqu xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: michael@0: convertloop4b: michael@0: add ecx, 4 - 1 michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop. michael@0: convertloop1: michael@0: movd xmm3, [eax] // src argb michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop1b: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBBLENDROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBBLENDROW_SSSE3 michael@0: // Shuffle table for isolating alpha. michael@0: static const uvec8 kShuffleAlpha = { michael@0: 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, michael@0: 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 michael@0: }; michael@0: // Same as SSE2, but replaces: michael@0: // psrlw xmm3, 8 // alpha michael@0: // pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: // pshuflw xmm3, xmm3, 0F5h michael@0: // with.. michael@0: // pshufb xmm3, kShuffleAlpha // alpha michael@0: // Blend 8 pixels at a time. 
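// Illustrative only: a scalar sketch (not a libyuv function) of the per-pixel
// "over" blend computed by ARGBBlendRow_SSE2 above and ARGBBlendRow_SSSE3
// below. It assumes src is already attenuated (premultiplied), as those row
// functions expect.
static uint32 BlendPixel_C_Sketch(uint32 src, uint32 dst) {
  const uint32 ia = 256 - (src >> 24);     // 256 - source alpha
  uint32 out = 0xff000000u;                // destination alpha forced to 255
  for (int shift = 0; shift < 24; shift += 8) {
    const uint32 s = (src >> shift) & 0xff;
    const uint32 d = (dst >> shift) & 0xff;
    uint32 c = s + ((d * ia) >> 8);        // src + dst * (256 - alpha) / 256
    if (c > 255) c = 255;                  // saturate, like paddusb
    out |= c << shift;
  }
  return out;
}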
michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm7, xmm7 // generate constant 0x0001 michael@0: psrlw xmm7, 15 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff michael@0: psrlw xmm6, 8 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 michael@0: psllw xmm5, 8 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: michael@0: sub ecx, 1 michael@0: je convertloop1 // only 1 pixel? michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop until destination pointer is aligned. michael@0: alignloop1: michael@0: test edx, 15 // aligned? michael@0: je alignloop1b michael@0: movd xmm3, [eax] michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge alignloop1 michael@0: michael@0: alignloop1b: michael@0: add ecx, 1 - 4 michael@0: jl convertloop4b michael@0: michael@0: test eax, 15 // unaligned? michael@0: jne convertuloop4 michael@0: test esi, 15 // unaligned? michael@0: jne convertuloop4 michael@0: michael@0: // 4 pixel loop. michael@0: convertloop4: michael@0: movdqa xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqa xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqa xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: jmp convertloop4b michael@0: michael@0: // 4 pixel unaligned loop. 
michael@0: convertuloop4: michael@0: movdqu xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqu xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqu xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertuloop4 michael@0: michael@0: convertloop4b: michael@0: add ecx, 4 - 1 michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop. michael@0: convertloop1: michael@0: movd xmm3, [eax] // src argb michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop1b: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBBLENDROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_SSE2 michael@0: // Attenuate 4 pixels at a time. michael@0: // Aligned to 16 bytes. 
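// Illustrative only: a scalar sketch (not a libyuv function) of alpha
// attenuation as approximated by the SSE2/SSSE3/AVX2 rows below: each color
// channel is scaled by the pixel's alpha and alpha itself is preserved. The
// SIMD code uses a fixed-point approximation of the divide by 255.
static uint32 AttenuatePixel_C_Sketch(uint32 argb) {
  const uint32 a = argb >> 24;
  const uint32 b = ((argb & 0xff) * a) / 255;
  const uint32 g = (((argb >> 8) & 0xff) * a) / 255;
  const uint32 r = (((argb >> 16) & 0xff) * a) / 255;
  return (a << 24) | (r << 16) | (g << 8) | b;
}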
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff michael@0: psrld xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: pshufhw xmm2, xmm0, 0FFh // 8 alpha words michael@0: pshuflw xmm2, xmm2, 0FFh michael@0: pmulhuw xmm0, xmm2 // rgb * a michael@0: movdqa xmm1, [eax] // read 4 pixels michael@0: punpckhbw xmm1, xmm1 // next 2 pixels michael@0: pshufhw xmm2, xmm1, 0FFh // 8 alpha words michael@0: pshuflw xmm2, xmm2, 0FFh michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: movdqa xmm2, [eax] // alphas michael@0: lea eax, [eax + 16] michael@0: psrlw xmm0, 8 michael@0: pand xmm2, xmm4 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: pand xmm0, xmm5 // keep original alphas michael@0: por xmm0, xmm2 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_SSSE3 michael@0: // Shuffle table duplicating alpha. michael@0: static const uvec8 kShuffleAlpha0 = { michael@0: 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, michael@0: }; michael@0: static const uvec8 kShuffleAlpha1 = { michael@0: 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, michael@0: 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, michael@0: }; michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: pcmpeqb xmm3, xmm3 // generate mask 0xff000000 michael@0: pslld xmm3, 24 michael@0: movdqa xmm4, kShuffleAlpha0 michael@0: movdqa xmm5, kShuffleAlpha1 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels michael@0: pshufb xmm0, xmm4 // isolate first 2 alphas michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: punpcklbw xmm1, xmm1 // first 2 pixel rgbs michael@0: pmulhuw xmm0, xmm1 // rgb * a michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: pshufb xmm1, xmm5 // isolate next 2 alphas michael@0: movdqu xmm2, [eax] // read 4 pixels michael@0: punpckhbw xmm2, xmm2 // next 2 pixel rgbs michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: movdqu xmm2, [eax] // mask original alpha michael@0: lea eax, [eax + 16] michael@0: pand xmm2, xmm3 michael@0: psrlw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: por xmm0, xmm2 // copy original alpha michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_AVX2 michael@0: // Shuffle table duplicating alpha. 
michael@0: static const ulvec8 kShuffleAlpha_AVX2 = { michael@0: 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, michael@0: 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, michael@0: 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, michael@0: 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, michael@0: }; michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm4, kShuffleAlpha_AVX2 michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 michael@0: vpslld ymm5, ymm5, 24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpshufb ymm2, ymm0, ymm4 // low 4 alphas michael@0: vpshufb ymm3, ymm1, ymm4 // high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * a michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * a michael@0: vpand ymm6, ymm6, ymm5 // isolate alpha michael@0: vpsrlw ymm0, ymm0, 8 michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. michael@0: vpor ymm0, ymm0, ymm6 // copy original alpha michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBUNATTENUATEROW_SSE2 michael@0: // Unattenuate 4 pixels at a time. michael@0: // Aligned to 16 bytes. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb0 michael@0: mov edx, [esp + 8 + 8] // dst_argb michael@0: mov ecx, [esp + 8 + 12] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels michael@0: movzx esi, byte ptr [eax + 3] // first alpha michael@0: movzx edi, byte ptr [eax + 7] // second alpha michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: movd xmm2, dword ptr fixed_invtbl8[esi * 4] michael@0: movd xmm3, dword ptr fixed_invtbl8[edi * 4] michael@0: pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a michael@0: pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words michael@0: movlhps xmm2, xmm3 michael@0: pmulhuw xmm0, xmm2 // rgb * a michael@0: michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: movzx esi, byte ptr [eax + 11] // third alpha michael@0: movzx edi, byte ptr [eax + 15] // forth alpha michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: movd xmm2, dword ptr fixed_invtbl8[esi * 4] michael@0: movd xmm3, dword ptr fixed_invtbl8[edi * 4] michael@0: pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words michael@0: pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words michael@0: movlhps xmm2, xmm3 michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: lea eax, [eax + 16] michael@0: michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBUNATTENUATEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBUNATTENUATEROW_AVX2 michael@0: // Shuffle table duplicating alpha. michael@0: static const ulvec8 kUnattenShuffleAlpha_AVX2 = { michael@0: 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, michael@0: 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, michael@0: }; michael@0: // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. michael@0: // USE_GATHER is not on by default, due to being a slow instruction. michael@0: #ifdef USE_GATHER michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. michael@0: vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a michael@0: vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a michael@0: vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. michael@0: vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a michael@0: vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * ia michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * ia michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. 
michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #else // USE_GATHER michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 michael@0: michael@0: push esi michael@0: push edi michael@0: michael@0: align 4 michael@0: convertloop: michael@0: // replace VPGATHER michael@0: movzx esi, byte ptr [eax + 3] // alpha0 michael@0: movzx edi, byte ptr [eax + 7] // alpha1 michael@0: vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] michael@0: vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] michael@0: movzx esi, byte ptr [eax + 11] // alpha2 michael@0: movzx edi, byte ptr [eax + 15] // alpha3 michael@0: vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] michael@0: vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] michael@0: vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] michael@0: movzx esi, byte ptr [eax + 19] // alpha4 michael@0: movzx edi, byte ptr [eax + 23] // alpha5 michael@0: vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] michael@0: vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] michael@0: vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] michael@0: movzx esi, byte ptr [eax + 27] // alpha6 michael@0: movzx edi, byte ptr [eax + 31] // alpha7 michael@0: vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] michael@0: vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] michael@0: vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] michael@0: vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] michael@0: vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] michael@0: vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] michael@0: vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] michael@0: // end of VPGATHER michael@0: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a michael@0: vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. michael@0: vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a michael@0: vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * ia michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * ia michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // USE_GATHER michael@0: #endif // HAS_ARGBATTENUATEROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBGRAYROW_SSSE3 michael@0: // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 
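// Illustrative only: a scalar sketch (not a libyuv function) of the gray
// conversion done by ARGBGrayRow_SSSE3 below: a full-range luma using the
// kARGBToYJ weights (+64 for rounding, >>7) is written to B, G and R, and
// alpha is preserved.
static uint32 GrayPixel_C_Sketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  const uint32 y = (b * 15 + g * 75 + r * 38 + 64) >> 7;  // 0..255
  return (argb & 0xff000000u) | (y << 16) | (y << 8) | y;
}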
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* src_argb */ michael@0: mov edx, [esp + 8] /* dst_argb */ michael@0: mov ecx, [esp + 12] /* width */ michael@0: movdqa xmm4, kARGBToYJ michael@0: movdqa xmm5, kAddYJ64 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm0, xmm4 michael@0: pmaddubsw xmm1, xmm4 michael@0: phaddw xmm0, xmm1 michael@0: paddw xmm0, xmm5 // Add .5 for rounding. michael@0: psrlw xmm0, 7 michael@0: packuswb xmm0, xmm0 // 8 G bytes michael@0: movdqa xmm2, [eax] // A michael@0: movdqa xmm3, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrld xmm2, 24 michael@0: psrld xmm3, 24 michael@0: packuswb xmm2, xmm3 michael@0: packuswb xmm2, xmm2 // 8 A bytes michael@0: movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA michael@0: punpcklbw xmm0, xmm0 // 8 GG words michael@0: punpcklbw xmm3, xmm2 // 8 GA words michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm3 // GGGA first 4 michael@0: punpckhwd xmm1, xmm3 // GGGA next 4 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBGRAYROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBSEPIAROW_SSSE3 michael@0: // b = (r * 35 + g * 68 + b * 17) >> 7 michael@0: // g = (r * 45 + g * 88 + b * 22) >> 7 michael@0: // r = (r * 50 + g * 98 + b * 24) >> 7 michael@0: // Constant for ARGB color to sepia tone. michael@0: static const vec8 kARGBToSepiaB = { michael@0: 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 michael@0: }; michael@0: michael@0: static const vec8 kARGBToSepiaG = { michael@0: 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 michael@0: }; michael@0: michael@0: static const vec8 kARGBToSepiaR = { michael@0: 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 michael@0: }; michael@0: michael@0: // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
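// Illustrative only: a scalar sketch (not a libyuv function) of the in-place
// sepia transform applied by ARGBSepiaRow_SSSE3 below, using the coefficient
// tables above (>>7, no rounding term, alpha preserved).
static uint32 SepiaPixel_C_Sketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  uint32 sb = (b * 17 + g * 68 + r * 35) >> 7;
  uint32 sg = (b * 22 + g * 88 + r * 45) >> 7;
  uint32 sr = (b * 24 + g * 98 + r * 50) >> 7;
  if (sb > 255) sb = 255;  // packuswb saturates in the asm version
  if (sg > 255) sg = 255;
  if (sr > 255) sr = 255;
  return (argb & 0xff000000u) | (sr << 16) | (sg << 8) | sb;
}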
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* dst_argb */ michael@0: mov ecx, [esp + 8] /* width */ michael@0: movdqa xmm2, kARGBToSepiaB michael@0: movdqa xmm3, kARGBToSepiaG michael@0: movdqa xmm4, kARGBToSepiaR michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // B michael@0: movdqa xmm6, [eax + 16] michael@0: pmaddubsw xmm0, xmm2 michael@0: pmaddubsw xmm6, xmm2 michael@0: phaddw xmm0, xmm6 michael@0: psrlw xmm0, 7 michael@0: packuswb xmm0, xmm0 // 8 B values michael@0: movdqa xmm5, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm5, xmm3 michael@0: pmaddubsw xmm1, xmm3 michael@0: phaddw xmm5, xmm1 michael@0: psrlw xmm5, 7 michael@0: packuswb xmm5, xmm5 // 8 G values michael@0: punpcklbw xmm0, xmm5 // 8 BG values michael@0: movdqa xmm5, [eax] // R michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm5, xmm4 michael@0: pmaddubsw xmm1, xmm4 michael@0: phaddw xmm5, xmm1 michael@0: psrlw xmm5, 7 michael@0: packuswb xmm5, xmm5 // 8 R values michael@0: movdqa xmm6, [eax] // A michael@0: movdqa xmm1, [eax + 16] michael@0: psrld xmm6, 24 michael@0: psrld xmm1, 24 michael@0: packuswb xmm6, xmm1 michael@0: packuswb xmm6, xmm6 // 8 A values michael@0: punpcklbw xmm5, xmm6 // 8 RA values michael@0: movdqa xmm1, xmm0 // Weave BG, RA together michael@0: punpcklwd xmm0, xmm5 // BGRA first 4 michael@0: punpckhwd xmm1, xmm5 // BGRA next 4 michael@0: sub ecx, 8 michael@0: movdqa [eax], xmm0 michael@0: movdqa [eax + 16], xmm1 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSEPIAROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 michael@0: // Tranform 8 ARGB pixels (32 bytes) with color matrix. michael@0: // Same as Sepia except matrix is provided. michael@0: // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R michael@0: // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
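// Illustrative only: a scalar sketch (not a libyuv function) of how
// ARGBColorMatrixRow_SSSE3 below forms each output channel: a signed dot
// product of the (B,G,R,A) bytes with one 4-byte row of matrix_argb,
// shifted right by 6 and clamped to 0..255.
static uint8 MatrixChannel_C_Sketch(const uint8* bgra, const int8* row) {
  int v = bgra[0] * row[0] + bgra[1] * row[1] +
          bgra[2] * row[2] + bgra[3] * row[3];
  v >>= 6;                 // the asm uses psraw 6
  if (v < 0) v = 0;        // packuswb clamps to 0..255
  if (v > 255) v = 255;
  return static_cast<uint8>(v);
}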
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const int8* matrix_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* src_argb */ michael@0: mov edx, [esp + 8] /* dst_argb */ michael@0: mov ecx, [esp + 12] /* matrix_argb */ michael@0: movdqu xmm5, [ecx] michael@0: pshufd xmm2, xmm5, 0x00 michael@0: pshufd xmm3, xmm5, 0x55 michael@0: pshufd xmm4, xmm5, 0xaa michael@0: pshufd xmm5, xmm5, 0xff michael@0: mov ecx, [esp + 16] /* width */ michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // B michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm0, xmm2 michael@0: pmaddubsw xmm7, xmm2 michael@0: movdqa xmm6, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm6, xmm3 michael@0: pmaddubsw xmm1, xmm3 michael@0: phaddsw xmm0, xmm7 // B michael@0: phaddsw xmm6, xmm1 // G michael@0: psraw xmm0, 6 // B michael@0: psraw xmm6, 6 // G michael@0: packuswb xmm0, xmm0 // 8 B values michael@0: packuswb xmm6, xmm6 // 8 G values michael@0: punpcklbw xmm0, xmm6 // 8 BG values michael@0: movdqa xmm1, [eax] // R michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm1, xmm4 michael@0: pmaddubsw xmm7, xmm4 michael@0: phaddsw xmm1, xmm7 // R michael@0: movdqa xmm6, [eax] // A michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm6, xmm5 michael@0: pmaddubsw xmm7, xmm5 michael@0: phaddsw xmm6, xmm7 // A michael@0: psraw xmm1, 6 // R michael@0: psraw xmm6, 6 // A michael@0: packuswb xmm1, xmm1 // 8 R values michael@0: packuswb xmm6, xmm6 // 8 A values michael@0: punpcklbw xmm1, xmm6 // 8 RA values michael@0: movdqa xmm6, xmm0 // Weave BG, RA together michael@0: punpcklwd xmm0, xmm1 // BGRA first 4 michael@0: punpckhwd xmm6, xmm1 // BGRA next 4 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm6 michael@0: lea eax, [eax + 32] michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBQUANTIZEROW_SSE2 michael@0: // Quantize 4 ARGB pixels (16 bytes). michael@0: // Aligned to 16 bytes. 
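// A minimal scalar sketch of the quantize step below (illustrative only; the
// helper name is hypothetical). Each B, G and R byte is posterized as
// (v * scale >> 16) * interval_size + interval_offset; alpha is carried over.
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 3; ++j) {  // B, G, R; alpha is left unchanged.
      int v = (dst_argb[j] * scale >> 16) * interval_size + interval_offset;
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);  // packuswb saturates likewise.
    }
    dst_argb += 4;
  }
}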
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, michael@0: int interval_offset, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* dst_argb */ michael@0: movd xmm2, [esp + 8] /* scale */ michael@0: movd xmm3, [esp + 12] /* interval_size */ michael@0: movd xmm4, [esp + 16] /* interval_offset */ michael@0: mov ecx, [esp + 20] /* width */ michael@0: pshuflw xmm2, xmm2, 040h michael@0: pshufd xmm2, xmm2, 044h michael@0: pshuflw xmm3, xmm3, 040h michael@0: pshufd xmm3, xmm3, 044h michael@0: pshuflw xmm4, xmm4, 040h michael@0: pshufd xmm4, xmm4, 044h michael@0: pxor xmm5, xmm5 // constant 0 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0xff000000 michael@0: pslld xmm6, 24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: punpcklbw xmm0, xmm5 // first 2 pixels michael@0: pmulhuw xmm0, xmm2 // pixel * scale >> 16 michael@0: movdqa xmm1, [eax] // read 4 pixels michael@0: punpckhbw xmm1, xmm5 // next 2 pixels michael@0: pmulhuw xmm1, xmm2 michael@0: pmullw xmm0, xmm3 // * interval_size michael@0: movdqa xmm7, [eax] // read 4 pixels michael@0: pmullw xmm1, xmm3 michael@0: pand xmm7, xmm6 // mask alpha michael@0: paddw xmm0, xmm4 // + interval_size / 2 michael@0: paddw xmm1, xmm4 michael@0: packuswb xmm0, xmm1 michael@0: por xmm0, xmm7 michael@0: sub ecx, 4 michael@0: movdqa [eax], xmm0 michael@0: lea eax, [eax + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBQUANTIZEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBSHADEROW_SSE2 michael@0: // Shade 4 pixels at a time by specified value. michael@0: // Aligned to 16 bytes. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, michael@0: uint32 value) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: movd xmm2, [esp + 16] // value michael@0: punpcklbw xmm2, xmm2 michael@0: punpcklqdq xmm2, xmm2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: pmulhuw xmm0, xmm2 // argb * value michael@0: pmulhuw xmm1, xmm2 // argb * value michael@0: psrlw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSHADEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBMULTIPLYROW_SSE2 michael@0: // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
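// A minimal scalar sketch of the per-channel multiply below (illustrative
// only; the helper name is hypothetical). Duplicating a byte into both halves
// of a 16-bit word is the same as multiplying by 257, so the pmulhuw in the
// SSE2 path computes (a * 257 * b) >> 16, a close approximation of a * b / 255.
static void ARGBMultiplyRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // every channel, including alpha
    dst_argb[i] = (uint8)((src_argb0[i] * 257 * src_argb1[i]) >> 16);
  }
}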
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: movdqu xmm2, [esi] // read 4 pixels from src_argb1 michael@0: movdqu xmm1, xmm0 michael@0: movdqu xmm3, xmm2 michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: punpcklbw xmm2, xmm5 // first 2 michael@0: punpckhbw xmm3, xmm5 // next 2 michael@0: pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 michael@0: pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 michael@0: lea eax, [eax + 16] michael@0: lea esi, [esi + 16] michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMULTIPLYROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBADDROW_SSE2 michael@0: // Add 2 rows of ARGB pixels together, 4 pixels at a time. michael@0: // TODO(fbarchard): Port this to posix, neon and other math functions. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: sub ecx, 4 michael@0: jl convertloop49 michael@0: michael@0: align 4 michael@0: convertloop4: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: lea eax, [eax + 16] michael@0: movdqu xmm1, [esi] // read 4 pixels from src_argb1 michael@0: lea esi, [esi + 16] michael@0: paddusb xmm0, xmm1 // src_argb0 + src_argb1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: michael@0: convertloop49: michael@0: add ecx, 4 - 1 michael@0: jl convertloop19 michael@0: michael@0: convertloop1: michael@0: movd xmm0, [eax] // read 1 pixels from src_argb0 michael@0: lea eax, [eax + 4] michael@0: movd xmm1, [esi] // read 1 pixels from src_argb1 michael@0: lea esi, [esi + 4] michael@0: paddusb xmm0, xmm1 // src_argb0 + src_argb1 michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop19: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBADDROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBSUBTRACTROW_SSE2 michael@0: // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
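// A minimal scalar sketch of the saturating subtract below (illustrative
// only; the helper name is hypothetical). psubusb clamps each byte at zero.
static void ARGBSubtractRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // every channel, including alpha
    int d = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(d < 0 ? 0 : d);
  }
}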
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: lea eax, [eax + 16] michael@0: movdqu xmm1, [esi] // read 4 pixels from src_argb1 michael@0: lea esi, [esi + 16] michael@0: psubusb xmm0, xmm1 // src_argb0 - src_argb1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSUBTRACTROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBMULTIPLYROW_AVX2 michael@0: // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: vpxor ymm5, ymm5, ymm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 michael@0: lea esi, [esi + 32] michael@0: vpunpcklbw ymm0, ymm1, ymm1 // low 4 michael@0: vpunpckhbw ymm1, ymm1, ymm1 // high 4 michael@0: vpunpcklbw ymm2, ymm3, ymm5 // low 4 michael@0: vpunpckhbw ymm3, ymm3, ymm5 // high 4 michael@0: vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 michael@0: vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 michael@0: vpackuswb ymm0, ymm0, ymm1 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMULTIPLYROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBADDROW_AVX2 michael@0: // Add 2 rows of ARGB pixels together, 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 michael@0: lea esi, [esi + 32] michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBADDROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBSUBTRACTROW_AVX2 michael@0: // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 michael@0: lea esi, [esi + 32] michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSUBTRACTROW_AVX2 michael@0: michael@0: #ifdef HAS_SOBELXROW_SSE2 michael@0: // SobelX as a matrix is michael@0: // -1 0 1 michael@0: // -2 0 2 michael@0: // -1 0 1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, michael@0: const uint8* src_y2, uint8* dst_sobelx, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_y0 michael@0: mov esi, [esp + 8 + 8] // src_y1 michael@0: mov edi, [esp + 8 + 12] // src_y2 michael@0: mov edx, [esp + 8 + 16] // dst_sobelx michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub esi, eax michael@0: sub edi, eax michael@0: sub edx, eax michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] michael@0: movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] michael@0: punpcklbw xmm0, xmm5 michael@0: punpcklbw xmm1, xmm5 michael@0: psubw xmm0, xmm1 michael@0: movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] michael@0: movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] michael@0: punpcklbw xmm1, xmm5 michael@0: punpcklbw xmm2, xmm5 michael@0: psubw xmm1, xmm2 michael@0: movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] michael@0: movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] michael@0: punpcklbw xmm2, xmm5 michael@0: punpcklbw xmm3, xmm5 michael@0: psubw xmm2, xmm3 michael@0: paddw xmm0, xmm2 michael@0: paddw xmm0, xmm1 michael@0: paddw xmm0, xmm1 michael@0: pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw michael@0: psubw xmm1, xmm0 michael@0: pmaxsw xmm0, xmm1 michael@0: packuswb xmm0, xmm0 michael@0: sub ecx, 8 michael@0: movq qword ptr [eax + edx], xmm0 michael@0: lea eax, [eax + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELXROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELYROW_SSE2 michael@0: // SobelY as a matrix is michael@0: // -1 -2 -1 michael@0: // 0 0 0 michael@0: // 1 2 1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, michael@0: uint8* dst_sobely, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_y0 michael@0: mov esi, [esp + 4 + 8] // src_y1 michael@0: mov edx, [esp + 4 + 12] // dst_sobely michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: sub edx, eax michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] michael@0: movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] michael@0: punpcklbw xmm0, xmm5 michael@0: punpcklbw xmm1, xmm5 michael@0: psubw xmm0, xmm1 michael@0: movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] michael@0: movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] michael@0: punpcklbw xmm1, xmm5 michael@0: punpcklbw xmm2, xmm5 michael@0: psubw xmm1, xmm2 michael@0: movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] michael@0: movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] michael@0: punpcklbw xmm2, xmm5 michael@0: punpcklbw xmm3, xmm5 michael@0: psubw xmm2, xmm3 michael@0: paddw xmm0, xmm2 michael@0: paddw xmm0, xmm1 michael@0: paddw xmm0, xmm1 michael@0: pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw michael@0: psubw xmm1, xmm0 michael@0: pmaxsw xmm0, xmm1 michael@0: packuswb xmm0, xmm0 michael@0: sub ecx, 8 michael@0: movq qword ptr [eax + edx], xmm0 michael@0: lea eax, [eax + 8] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELYROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELROW_SSE2 michael@0: // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
michael@0: // A = 255 michael@0: // R = Sobel michael@0: // G = Sobel michael@0: // B = Sobel michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: pcmpeqb xmm5, xmm5 // alpha 255 michael@0: pslld xmm5, 24 // 0xff000000 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: paddusb xmm0, xmm1 // sobel = sobelx + sobely michael@0: movdqa xmm2, xmm0 // GG michael@0: punpcklbw xmm2, xmm0 // First 8 michael@0: punpckhbw xmm0, xmm0 // Next 8 michael@0: movdqa xmm1, xmm2 // GGGG michael@0: punpcklwd xmm1, xmm2 // First 4 michael@0: punpckhwd xmm2, xmm2 // Next 4 michael@0: por xmm1, xmm5 // GGGA michael@0: por xmm2, xmm5 michael@0: movdqa xmm3, xmm0 // GGGG michael@0: punpcklwd xmm3, xmm0 // Next 4 michael@0: punpckhwd xmm0, xmm0 // Last 4 michael@0: por xmm3, xmm5 // GGGA michael@0: por xmm0, xmm5 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm1 michael@0: movdqa [edx + 16], xmm2 michael@0: movdqa [edx + 32], xmm3 michael@0: movdqa [edx + 48], xmm0 michael@0: lea edx, [edx + 64] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELTOPLANEROW_SSE2 michael@0: // Adds Sobel X and Sobel Y and stores Sobel into a plane. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_y, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: paddusb xmm0, xmm1 // sobel = sobelx + sobely michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELTOPLANEROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELXYROW_SSE2 michael@0: // Mixes Sobel X, Sobel Y and Sobel into ARGB. 
michael@0: // A = 255 michael@0: // R = Sobel X michael@0: // G = Sobel michael@0: // B = Sobel Y michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: pcmpeqb xmm5, xmm5 // alpha 255 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: paddusb xmm2, xmm1 // sobel = sobelx + sobely michael@0: movdqa xmm3, xmm0 // XA michael@0: punpcklbw xmm3, xmm5 michael@0: punpckhbw xmm0, xmm5 michael@0: movdqa xmm4, xmm1 // YS michael@0: punpcklbw xmm4, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: movdqa xmm6, xmm4 // YSXA michael@0: punpcklwd xmm6, xmm3 // First 4 michael@0: punpckhwd xmm4, xmm3 // Next 4 michael@0: movdqa xmm7, xmm1 // YSXA michael@0: punpcklwd xmm7, xmm0 // Next 4 michael@0: punpckhwd xmm1, xmm0 // Last 4 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm6 michael@0: movdqa [edx + 16], xmm4 michael@0: movdqa [edx + 32], xmm7 michael@0: movdqa [edx + 48], xmm1 michael@0: lea edx, [edx + 64] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELXYROW_SSE2 michael@0: michael@0: #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 michael@0: // Consider float CumulativeSum. michael@0: // Consider calling CumulativeSum one row at time as needed. michael@0: // Consider circular CumulativeSum buffer of radius * 2 + 1 height. michael@0: // Convert cumulative sum for an area to an average for 1 pixel. michael@0: // topleft is pointer to top left of CumulativeSum buffer for area. michael@0: // botleft is pointer to bottom left of CumulativeSum buffer. michael@0: // width is offset from left to right of area in CumulativeSum buffer measured michael@0: // in number of ints. michael@0: // area is the number of pixels in the area being averaged. michael@0: // dst points to pixel to store result to. michael@0: // count is number of averaged pixels to produce. michael@0: // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte michael@0: // aligned. michael@0: void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, michael@0: int width, int area, uint8* dst, michael@0: int count) { michael@0: __asm { michael@0: mov eax, topleft // eax topleft michael@0: mov esi, botleft // esi botleft michael@0: mov edx, width michael@0: movd xmm5, area michael@0: mov edi, dst michael@0: mov ecx, count michael@0: cvtdq2ps xmm5, xmm5 michael@0: rcpss xmm4, xmm5 // 1.0f / area michael@0: pshufd xmm4, xmm4, 0 michael@0: sub ecx, 4 michael@0: jl l4b michael@0: michael@0: cmp area, 128 // 128 pixels will not overflow 15 bits. 
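 // For areas of 128 pixels or fewer, the 32-bit sums fit in signed 16-bit
 // words, so the s4 loop below packs them and multiplies by 1/area prescaled
 // to 0.16 fixed point (pmulhuw). Larger areas take the ja below to the
 // float path at l4.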
michael@0: ja l4 michael@0: michael@0: pshufd xmm5, xmm5, 0 // area michael@0: pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 michael@0: psrld xmm6, 16 michael@0: cvtdq2ps xmm6, xmm6 michael@0: addps xmm5, xmm6 // (65536.0 + area - 1) michael@0: mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area michael@0: cvtps2dq xmm5, xmm5 // 0.16 fixed point michael@0: packssdw xmm5, xmm5 // 16 bit shorts michael@0: michael@0: // 4 pixel loop small blocks. michael@0: align 4 michael@0: s4: michael@0: // top left michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: michael@0: // - top right michael@0: psubd xmm0, [eax + edx * 4] michael@0: psubd xmm1, [eax + edx * 4 + 16] michael@0: psubd xmm2, [eax + edx * 4 + 32] michael@0: psubd xmm3, [eax + edx * 4 + 48] michael@0: lea eax, [eax + 64] michael@0: michael@0: // - bottom left michael@0: psubd xmm0, [esi] michael@0: psubd xmm1, [esi + 16] michael@0: psubd xmm2, [esi + 32] michael@0: psubd xmm3, [esi + 48] michael@0: michael@0: // + bottom right michael@0: paddd xmm0, [esi + edx * 4] michael@0: paddd xmm1, [esi + edx * 4 + 16] michael@0: paddd xmm2, [esi + edx * 4 + 32] michael@0: paddd xmm3, [esi + edx * 4 + 48] michael@0: lea esi, [esi + 64] michael@0: michael@0: packssdw xmm0, xmm1 // pack 4 pixels into 2 registers michael@0: packssdw xmm2, xmm3 michael@0: michael@0: pmulhuw xmm0, xmm5 michael@0: pmulhuw xmm2, xmm5 michael@0: michael@0: packuswb xmm0, xmm2 michael@0: movdqu [edi], xmm0 michael@0: lea edi, [edi + 16] michael@0: sub ecx, 4 michael@0: jge s4 michael@0: michael@0: jmp l4b michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: // top left michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: michael@0: // - top right michael@0: psubd xmm0, [eax + edx * 4] michael@0: psubd xmm1, [eax + edx * 4 + 16] michael@0: psubd xmm2, [eax + edx * 4 + 32] michael@0: psubd xmm3, [eax + edx * 4 + 48] michael@0: lea eax, [eax + 64] michael@0: michael@0: // - bottom left michael@0: psubd xmm0, [esi] michael@0: psubd xmm1, [esi + 16] michael@0: psubd xmm2, [esi + 32] michael@0: psubd xmm3, [esi + 48] michael@0: michael@0: // + bottom right michael@0: paddd xmm0, [esi + edx * 4] michael@0: paddd xmm1, [esi + edx * 4 + 16] michael@0: paddd xmm2, [esi + edx * 4 + 32] michael@0: paddd xmm3, [esi + edx * 4 + 48] michael@0: lea esi, [esi + 64] michael@0: michael@0: cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area michael@0: cvtdq2ps xmm1, xmm1 michael@0: mulps xmm0, xmm4 michael@0: mulps xmm1, xmm4 michael@0: cvtdq2ps xmm2, xmm2 michael@0: cvtdq2ps xmm3, xmm3 michael@0: mulps xmm2, xmm4 michael@0: mulps xmm3, xmm4 michael@0: cvtps2dq xmm0, xmm0 michael@0: cvtps2dq xmm1, xmm1 michael@0: cvtps2dq xmm2, xmm2 michael@0: cvtps2dq xmm3, xmm3 michael@0: packssdw xmm0, xmm1 michael@0: packssdw xmm2, xmm3 michael@0: packuswb xmm0, xmm2 michael@0: movdqu [edi], xmm0 michael@0: lea edi, [edi + 16] michael@0: sub ecx, 4 michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: movdqa xmm0, [eax] michael@0: psubd xmm0, [eax + edx * 4] michael@0: lea eax, [eax + 16] michael@0: psubd xmm0, [esi] michael@0: paddd xmm0, [esi + edx * 4] michael@0: lea esi, [esi + 16] michael@0: cvtdq2ps xmm0, xmm0 michael@0: mulps xmm0, xmm4 michael@0: cvtps2dq xmm0, 
xmm0 michael@0: packssdw xmm0, xmm0 michael@0: packuswb xmm0, xmm0 michael@0: movd dword ptr [edi], xmm0 michael@0: lea edi, [edi + 4] michael@0: sub ecx, 1 michael@0: jge l1 michael@0: l1b: michael@0: } michael@0: } michael@0: #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 michael@0: michael@0: #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 michael@0: // Creates a table of cumulative sums where each value is a sum of all values michael@0: // above and to the left of the value. michael@0: void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, michael@0: const int32* previous_cumsum, int width) { michael@0: __asm { michael@0: mov eax, row michael@0: mov edx, cumsum michael@0: mov esi, previous_cumsum michael@0: mov ecx, width michael@0: pxor xmm0, xmm0 michael@0: pxor xmm1, xmm1 michael@0: michael@0: sub ecx, 4 michael@0: jl l4b michael@0: test edx, 15 michael@0: jne l4b michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. michael@0: lea eax, [eax + 16] michael@0: movdqa xmm4, xmm2 michael@0: michael@0: punpcklbw xmm2, xmm1 michael@0: movdqa xmm3, xmm2 michael@0: punpcklwd xmm2, xmm1 michael@0: punpckhwd xmm3, xmm1 michael@0: michael@0: punpckhbw xmm4, xmm1 michael@0: movdqa xmm5, xmm4 michael@0: punpcklwd xmm4, xmm1 michael@0: punpckhwd xmm5, xmm1 michael@0: michael@0: paddd xmm0, xmm2 michael@0: movdqa xmm2, [esi] // previous row above. michael@0: paddd xmm2, xmm0 michael@0: michael@0: paddd xmm0, xmm3 michael@0: movdqa xmm3, [esi + 16] michael@0: paddd xmm3, xmm0 michael@0: michael@0: paddd xmm0, xmm4 michael@0: movdqa xmm4, [esi + 32] michael@0: paddd xmm4, xmm0 michael@0: michael@0: paddd xmm0, xmm5 michael@0: movdqa xmm5, [esi + 48] michael@0: lea esi, [esi + 64] michael@0: paddd xmm5, xmm0 michael@0: michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: movdqa [edx + 32], xmm4 michael@0: movdqa [edx + 48], xmm5 michael@0: michael@0: lea edx, [edx + 64] michael@0: sub ecx, 4 michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. michael@0: lea eax, [eax + 4] michael@0: punpcklbw xmm2, xmm1 michael@0: punpcklwd xmm2, xmm1 michael@0: paddd xmm0, xmm2 michael@0: movdqu xmm2, [esi] michael@0: lea esi, [esi + 16] michael@0: paddd xmm2, xmm0 michael@0: movdqu [edx], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 1 michael@0: jge l1 michael@0: michael@0: l1b: michael@0: } michael@0: } michael@0: #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBAFFINEROW_SSE2 michael@0: // Copy ARGB pixels from source image with slope to a row of destination. 
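// A minimal scalar sketch of the affine copy below (illustrative only; the
// helper name is hypothetical). uv_dudv holds the starting source position
// (u, v) and the per-destination-pixel step (du, dv); each output pixel is
// fetched from src at offset x * 4 + y * stride after truncating to ints.
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq also truncates toward zero.
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}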
michael@0: __declspec(naked) __declspec(align(16)) michael@0: LIBYUV_API michael@0: void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, michael@0: uint8* dst_argb, const float* uv_dudv, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12] // src_argb michael@0: mov esi, [esp + 16] // stride michael@0: mov edx, [esp + 20] // dst_argb michael@0: mov ecx, [esp + 24] // pointer to uv_dudv michael@0: movq xmm2, qword ptr [ecx] // uv michael@0: movq xmm7, qword ptr [ecx + 8] // dudv michael@0: mov ecx, [esp + 28] // width michael@0: shl esi, 16 // 4, stride michael@0: add esi, 4 michael@0: movd xmm5, esi michael@0: sub ecx, 4 michael@0: jl l4b michael@0: michael@0: // setup for 4 pixel loop michael@0: pshufd xmm7, xmm7, 0x44 // dup dudv michael@0: pshufd xmm5, xmm5, 0 // dup 4, stride michael@0: movdqa xmm0, xmm2 // x0, y0, x1, y1 michael@0: addps xmm0, xmm7 michael@0: movlhps xmm2, xmm0 michael@0: movdqa xmm4, xmm7 michael@0: addps xmm4, xmm4 // dudv *= 2 michael@0: movdqa xmm3, xmm2 // x2, y2, x3, y3 michael@0: addps xmm3, xmm4 michael@0: addps xmm4, xmm4 // dudv *= 4 michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: cvttps2dq xmm0, xmm2 // x, y float to int first 2 michael@0: cvttps2dq xmm1, xmm3 // x, y float to int next 2 michael@0: packssdw xmm0, xmm1 // x, y as 8 shorts michael@0: pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. michael@0: movd esi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd edi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd xmm1, [eax + esi] // read pixel 0 michael@0: movd xmm6, [eax + edi] // read pixel 1 michael@0: punpckldq xmm1, xmm6 // combine pixel 0 and 1 michael@0: addps xmm2, xmm4 // x, y += dx, dy first 2 michael@0: movq qword ptr [edx], xmm1 michael@0: movd esi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd edi, xmm0 michael@0: movd xmm6, [eax + esi] // read pixel 2 michael@0: movd xmm0, [eax + edi] // read pixel 3 michael@0: punpckldq xmm6, xmm0 // combine pixel 2 and 3 michael@0: addps xmm3, xmm4 // x, y += dx, dy next 2 michael@0: sub ecx, 4 michael@0: movq qword ptr 8[edx], xmm6 michael@0: lea edx, [edx + 16] michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: cvttps2dq xmm0, xmm2 // x, y float to int michael@0: packssdw xmm0, xmm0 // x, y as shorts michael@0: pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride michael@0: addps xmm2, xmm7 // x, y += dx, dy michael@0: movd esi, xmm0 michael@0: movd xmm0, [eax + esi] // copy a pixel michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge l1 michael@0: l1b: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBAFFINEROW_SSE2 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_AVX2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction 
(0..255) michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: sub edi, esi michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. michael@0: michael@0: vmovd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: vmovd xmm5, eax // low fraction 128..1 michael@0: vpunpcklbw xmm5, xmm5, xmm0 michael@0: vpunpcklwd xmm5, xmm5, xmm5 michael@0: vpxor ymm0, ymm0, ymm0 michael@0: vpermd ymm5, ymm0, ymm5 michael@0: michael@0: align 4 michael@0: xloop: michael@0: vmovdqu ymm0, [esi] michael@0: vmovdqu ymm2, [esi + edx] michael@0: vpunpckhbw ymm1, ymm0, ymm2 // mutates michael@0: vpunpcklbw ymm0, ymm0, ymm2 // mutates michael@0: vpmaddubsw ymm0, ymm0, ymm5 michael@0: vpmaddubsw ymm1, ymm1, ymm5 michael@0: vpsrlw ymm0, ymm0, 7 michael@0: vpsrlw ymm1, ymm1, 7 michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutates michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: vmovdqu ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: vmovdqu ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: vmovdqu ymm0, [esi + edx] michael@0: vpavgb ymm0, ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: rep movsb michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_AVX2 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSSE3 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. 
Blend 25 / 75. michael@0: michael@0: movd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: movd xmm5, eax // low fraction 128..1 michael@0: punpcklbw xmm5, xmm0 michael@0: punpcklwd xmm5, xmm5 michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm2, [esi + edx] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: pmaddubsw xmm0, xmm5 michael@0: pmaddubsw xmm1, xmm5 michael@0: psrlw xmm0, 7 michael@0: psrlw xmm1, 7 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqa xmm1, [esi] michael@0: movdqa xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqa xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSSE3 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSE2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 256. Blend 100 / 0. michael@0: cmp eax, 64 michael@0: je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. michael@0: cmp eax, 128 michael@0: je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. michael@0: cmp eax, 192 michael@0: je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
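 // General path: out is roughly row0 + ((row1 - row0) * fraction) / 256,
 // computed in 16-bit lanes. The fraction byte is duplicated into both halves
 // of a word and halved so pmulhw of the doubled difference yields the scaled
 // diff directly.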
michael@0: michael@0: movd xmm5, eax // xmm5 = y fraction michael@0: punpcklbw xmm5, xmm5 michael@0: psrlw xmm5, 1 michael@0: punpcklwd xmm5, xmm5 michael@0: punpckldq xmm5, xmm5 michael@0: punpcklqdq xmm5, xmm5 michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqa xmm0, [esi] // row0 michael@0: movdqa xmm2, [esi + edx] // row1 michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: punpcklbw xmm2, xmm4 michael@0: punpckhbw xmm3, xmm4 michael@0: punpcklbw xmm0, xmm4 michael@0: punpckhbw xmm1, xmm4 michael@0: psubw xmm2, xmm0 // row1 - row0 michael@0: psubw xmm3, xmm1 michael@0: paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 michael@0: paddw xmm3, xmm3 michael@0: pmulhw xmm2, xmm5 // scale diff michael@0: pmulhw xmm3, xmm5 michael@0: paddw xmm0, xmm2 // sum rows michael@0: paddw xmm1, xmm3 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqa xmm1, [esi] michael@0: movdqa xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqa xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSE2 michael@0: michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 
michael@0: michael@0: movd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: movd xmm5, eax // low fraction 128..1 michael@0: punpcklbw xmm5, xmm0 michael@0: punpcklwd xmm5, xmm5 michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm2, [esi + edx] michael@0: movdqu xmm1, xmm0 michael@0: punpcklbw xmm0, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: pmaddubsw xmm0, xmm5 michael@0: pmaddubsw xmm1, xmm5 michael@0: psrlw xmm0, 7 michael@0: psrlw xmm1, 7 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqu xmm1, [esi] michael@0: movdqu xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqu xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSE2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 256. Blend 100 / 0. michael@0: cmp eax, 64 michael@0: je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. michael@0: cmp eax, 128 michael@0: je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. michael@0: cmp eax, 192 michael@0: je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
michael@0: michael@0: movd xmm5, eax // xmm5 = y fraction michael@0: punpcklbw xmm5, xmm5 michael@0: psrlw xmm5, 1 michael@0: punpcklwd xmm5, xmm5 michael@0: punpckldq xmm5, xmm5 michael@0: punpcklqdq xmm5, xmm5 michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqu xmm0, [esi] // row0 michael@0: movdqu xmm2, [esi + edx] // row1 michael@0: movdqu xmm1, xmm0 michael@0: movdqu xmm3, xmm2 michael@0: punpcklbw xmm2, xmm4 michael@0: punpckhbw xmm3, xmm4 michael@0: punpcklbw xmm0, xmm4 michael@0: punpckhbw xmm1, xmm4 michael@0: psubw xmm2, xmm0 // row1 - row0 michael@0: psubw xmm3, xmm1 michael@0: paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 michael@0: paddw xmm3, xmm3 michael@0: pmulhw xmm2, xmm5 // scale diff michael@0: pmulhw xmm3, xmm5 michael@0: paddw xmm0, xmm2 // sum rows michael@0: paddw xmm1, xmm3 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqu xmm1, [esi] michael@0: movdqu xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. 
michael@0: align 4 michael@0: xloop100: michael@0: movdqu xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSE2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, michael@0: uint8* dst_uv, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // src_uv_stride michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: sub edi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: pavgb xmm0, [eax + edx] michael@0: sub ecx, 16 michael@0: movdqa [eax + edi], xmm0 michael@0: lea eax, [eax + 16] michael@0: jg convertloop michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_HALFROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, michael@0: uint8* dst_uv, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // src_uv_stride michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: sub edi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vpavgb ymm0, ymm0, [eax + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [eax + edi], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_HALFROW_AVX2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, michael@0: uint32 selector, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_bayer michael@0: movd xmm5, [esp + 12] // selector michael@0: mov ecx, [esp + 16] // pix michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: punpckldq xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movq qword ptr [edx], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // Specialized ARGB to Bayer that just isolates G channel. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, michael@0: uint32 selector, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_bayer michael@0: // selector michael@0: mov ecx, [esp + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x000000ff michael@0: psrld xmm5, 24 michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrld xmm0, 8 // Move green to bottom. 
michael@0: psrld xmm1, 8 michael@0: pand xmm0, xmm5 michael@0: pand xmm1, xmm5 michael@0: packssdw xmm0, xmm1 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movq qword ptr [edx], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: movdqa xmm5, [ecx] michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: movdqa xmm5, [ecx] michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: sub ecx, 8 michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_ARGBSHUFFLEROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpshufb ymm0, ymm0, ymm5 michael@0: vpshufb ymm1, ymm1, ymm5 michael@0: sub ecx, 16 michael@0: vmovdqu [edx], ymm0 michael@0: vmovdqu [edx + 32], ymm1 michael@0: lea edx, [edx + 64] michael@0: jg wloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSHUFFLEROW_AVX2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov edx, [esp + 8 + 8] // dst_argb michael@0: mov esi, [esp + 8 + 12] // shuffler michael@0: mov ecx, [esp + 8 + 16] // pix michael@0: pxor xmm5, xmm5 michael@0: michael@0: mov ebx, [esi] // shuffler michael@0: cmp ebx, 0x03000102 michael@0: je shuf_3012 michael@0: cmp ebx, 0x00010203 michael@0: je shuf_0123 michael@0: cmp ebx, 0x00030201 michael@0: je shuf_0321 michael@0: cmp ebx, 0x02010003 michael@0: je shuf_2103 michael@0: michael@0: // TODO(fbarchard): Use one source pointer and 3 offsets. michael@0: shuf_any1: michael@0: movzx ebx, byte ptr [esi] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx], bl michael@0: movzx ebx, byte ptr [esi + 1] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 1], bl michael@0: movzx ebx, byte ptr [esi + 2] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 2], bl michael@0: movzx ebx, byte ptr [esi + 3] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 3], bl michael@0: lea eax, [eax + 4] michael@0: lea edx, [edx + 4] michael@0: sub ecx, 1 michael@0: jg shuf_any1 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_0123: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB michael@0: pshuflw xmm0, xmm0, 01Bh michael@0: pshufhw xmm1, xmm1, 01Bh michael@0: pshuflw xmm1, xmm1, 01Bh michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_0123 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_0321: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB michael@0: pshuflw xmm0, xmm0, 039h michael@0: pshufhw xmm1, xmm1, 039h michael@0: pshuflw xmm1, xmm1, 039h michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_0321 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_2103: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA michael@0: pshuflw xmm0, xmm0, 093h michael@0: pshufhw xmm1, xmm1, 093h michael@0: pshuflw xmm1, xmm1, 093h michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_2103 michael@0: jmp 

    align      4
  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1

__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]         // U
    movq       xmm3, qword ptr [esi + edx]   // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                    // UV
    movdqu     xmm0, [eax]                   // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2                    // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]         // U
    movq       xmm3, qword ptr [esi + edx]   // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                    // UV
    movdqu     xmm0, [eax]                   // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0                    // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
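    // poly points at 4 vec4 coefficient sets (C0..C3, one float per channel).
    // Each channel value X is evaluated below as C0 + C1*X + C2*X*X + C3*X*X*X
    // and the truncated result is packed back to unsigned bytes.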
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
    align      4
  convertloop:
    // pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
    // pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    sub        ecx, 2
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
    align      4
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    sub         ecx, 2
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
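// For reference, a scalar sketch of the same per-channel lookup (illustrative
// only; the helper name below is hypothetical and not part of this file).
// Each channel byte indexes a 256-entry, stride-4 slice of the table:
//
//   static void ARGBColorTableRow_Reference(uint8* dst_argb,
//                                           const uint8* table_argb,
//                                           int width) {
//     for (int i = 0; i < width; ++i) {
//       dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
//       dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
//       dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
//       dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
//       dst_argb += 4;
//     }
//   }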
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) __declspec(align(16))
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) __declspec(align(16))
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
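    // For each pixel, pmaddubsw/phaddw against lumacoeff forms a weighted sum
    // of its channels; masking off the low 8 bits quantizes that luma to a
    // multiple of 256, which selects a 256-byte slice of the luma table.
    // B, G and R are then looked up in that slice and alpha is copied as-is.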
    align      4
  convertloop:
    movdqu     xmm0, qword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    sub        ecx, 4
    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif