/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
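
// Note on the kScaleAc33 / kScaleAb2 entries: they are 0.16 fixed-point
// reciprocals, so a pmulhuw by 65536 / N computes (sum * (65536 / N)) >> 16,
// i.e. an approximate divide of a box sum by N (9, 6, 3 or 2) without a
// division instruction. For example, a 3x3 box sum of 9 * 200 = 1800 scaled
// by 65536 / 9 = 7281 gives (1800 * 7281) >> 16 = 199.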

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
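
// The box filter below approximates the exact 2x2 average with rounded
// pavg steps: pavgb merges the two source rows, then pavgw averages the
// even (pand) and odd (psrlw $0x8) bytes of that result. Per output pixel
// this is roughly
//   dst[x] = avg(avg(r0[2*x], r1[2*x]), avg(r0[2*x + 1], r1[2*x + 1]))
// with avg(a, b) = (a + b + 1) >> 1, which is within 1 of the true
// (sum + 2) >> 2 box average.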

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0x8,%%xmm7 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)  // movdqa (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)  // movdqa 0x10(%0,%4,1),%%xmm3
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)  // movdqa (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)  // movdqa 0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)  // movdqa (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)  // movdqa 0x10(%0,%3,1),%%xmm5
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm4,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm5,%%xmm3 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pand %%xmm7,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))   // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}
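
// 3/4 point-sample scaler: kShuf0/kShuf1/kShuf2 simply keep 3 of every 4
// source bytes (offsets such as 0, 1, 3, 4, 5, 7, ... above), so each loop
// iteration turns 32 source pixels into 24 destination pixels with no
// filtering.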
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
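
// The two 3/4 box filters below blend the two source rows with pavgb
// (1:1 in the _1_Box version, roughly 3:1 toward the src_ptr row in the
// _0_Box version), then apply the kShuf*/kMadd* tables with pmaddubsw.
// Per group of 4 source pixels a..d this produces 3 outputs, roughly
//   (3*a + 1*b + 2) >> 2, (2*b + 2*c + 2) >> 2, (1*c + 3*d + 2) >> 2
// with kRound34 supplying the +2 rounding term.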

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
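
// 3/8 scalers: kShuf38a/kShuf38b pick source bytes 0,3,6,8,11,14 from each
// 16-byte half, so the point-sample version below emits 12 pixels for every
// 32 read. For the box versions, _3_Box sums 3 rows and then 3 (or 2)
// columns into 16-bit words and scales by kScaleAc33, while _2_Box averages
// 2 rows with pavgb, sums the columns and scales by kScaleAb2.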

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "m"(kShuf38a),    // %3
    "m"(kShuf38b)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)  // pavgb (%0,%3,1),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)  // movdqa (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)  // movdqa (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor %%xmm4,%%xmm4 \n"
    "sub $0x1,%5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "mov %0,%3 \n"
    "add %6,%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "mov %5,%2 \n"
    "test %2,%2 \n"
    "je 3f \n"

    LABELALIGN
    "2: \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "add %6,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "sub $0x1,%2 \n"
    "jg 2b \n"

    LABELALIGN
    "3: \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x10,3) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)  // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "sub $0x2,%5 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "+a"(temp_pixel),   // %2
    "+r"(x0),           // %3
    "+r"(x1),           // %4
    "+rm"(dst_width)    // %5
  : "rm"(x),            // %6
    "rm"(dx)            // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "sub $0x20,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "+r"(src_stepx_x12)   // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
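// Each output pixel below is the pavgb-rounded average of a 2x2 block of
// ARGB pixels gathered src_stepx pixels apart: the two rows are averaged
// first, then shufps $0x88 / $0xdd split the pixel pairs so a final pavgb
// averages horizontally, per channel.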
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"

    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)  // movq (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)  // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)  // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "+a"(x0),         // %0
    "+d"(x1),         // %1
    "+r"(dst_argb),   // %2
    "+r"(src_argb),   // %3
    "+r"(dst_width)   // %4
  : "rm"(x),          // %5
    "rm"(dx)          // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
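// Both ScaleFilterCols_SSSE3 above and the ARGB filter below step a 16.16
// fixed-point source position x by dx per output pixel and use only the top
// 7 bits of the fraction (psrlw $0x9). Each output is then roughly
//   dst[i] = (src[xi] * (127 - f) + src[xi + 1] * f) >> 7
// with xi = x >> 16 and f = (x >> 9) & 0x7f; the pxor against 0x7f-filled
// words is what turns f into the (127 - f) weight for pmaddubsw.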
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"

    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)   // %0
  : "c"(div)    // %1
  : "memory", "cc", "edx"
  );
  return num;
}
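
// In plain C the same computation is roughly (an illustrative equivalent):
//   return (int)((((int64)(num)) << 16) / div);
// cdq/shld/shl build the 64-bit value num << 16 in edx:eax so a single
// 32-bit idiv by div produces the 16.16 quotient.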

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)   // %0
  : "c"(div)    // %1
  : "memory", "cc", "edx"
  );
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif