/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
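
// Note on the kScaleAc33 / kScaleAb2 entries: they are 0.16 fixed-point
// reciprocals, so a pmulhuw by 65536 / N computes (sum * (65536 / N)) >> 16,
// i.e. an approximate divide of a box sum by N (9, 6, 3 or 2) without a
// division instruction. For example, a 3x3 box sum of 9 * 200 = 1800 scaled
// by 65536 / 9 = 7281 gives (1800 * 7281) >> 16 = 199.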

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
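
// The box filter below approximates the exact 2x2 average with rounded
// pavg steps: pavgb merges the two source rows, then pavgw averages the
// even (pand) and odd (psrlw $0x8) bytes of that result. Per output pixel
// this is roughly
//   dst[x] = avg(avg(r0[2*x], r1[2*x]), avg(r0[2*x + 1], r1[2*x + 1]))
// with avg(a, b) = (a + b + 1) >> 1, which is within 1 of the true
// (sum + 2) >> 2 box average.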

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0x8,%%xmm7 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)  // movdqa (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)  // movdqa 0x10(%0,%4,1),%%xmm3
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)  // movdqa (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)  // movdqa 0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)  // movdqa (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)  // movdqa 0x10(%0,%3,1),%%xmm5
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm4,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm5,%%xmm3 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pand %%xmm7,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))   // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}
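
// 3/4 point-sample scaler: kShuf0/kShuf1/kShuf2 simply keep 3 of every 4
// source bytes (offsets such as 0, 1, 3, 4, 5, 7, ... above), so each loop
// iteration turns 32 source pixels into 24 destination pixels with no
// filtering.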
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
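
// The two 3/4 box filters below blend the two source rows with pavgb
// (1:1 in the _1_Box version, roughly 3:1 toward the src_ptr row in the
// _0_Box version), then apply the kShuf*/kMadd* tables with pmaddubsw.
// Per group of 4 source pixels a..d this produces 3 outputs, roughly
//   (3*a + 1*b + 2) >> 2, (2*b + 2*c + 2) >> 2, (1*c + 3*d + 2) >> 2
// with kRound34 supplying the +2 rounding term.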

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
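
// 3/8 scalers: kShuf38a/kShuf38b pick source bytes 0,3,6,8,11,14 from each
// 16-byte half, so the point-sample version below emits 12 pixels for every
// 32 read. For the box versions, _3_Box sums 3 rows and then 3 (or 2)
// columns into 16-bit words and scales by kScaleAc33, while _2_Box averages
// 2 rows with pavgb, sums the columns and scales by kScaleAb2.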

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "m"(kShuf38a),    // %3
    "m"(kShuf38b)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)  // pavgb (%0,%3,1),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)  // movdqa (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)  // movdqa (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor %%xmm4,%%xmm4 \n"
    "sub $0x1,%5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "mov %0,%3 \n"
    "add %6,%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "mov %5,%2 \n"
    "test %2,%2 \n"
    "je 3f \n"

    LABELALIGN
    "2: \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "add %6,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "sub $0x1,%2 \n"
    "jg 2b \n"

    LABELALIGN
    "3: \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x10,3) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)  // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "sub $0x2,%5 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "+a"(temp_pixel),   // %2
    "+r"(x0),           // %3
    "+r"(x1),           // %4
    "+rm"(dst_width)    // %5
  : "rm"(x),            // %6
    "rm"(dx)            // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "sub $0x20,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "+r"(src_stepx_x12)   // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
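// Each output pixel below is the pavgb-rounded average of a 2x2 block of
// ARGB pixels gathered src_stepx pixels apart: the two rows are averaged
// first, then shufps $0x88 / $0xdd split the pixel pairs so a final pavgb
// averages horizontally, per channel.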
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"

    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)  // movq (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)  // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)  // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "+a"(x0),         // %0
    "+d"(x1),         // %1
    "+r"(dst_argb),   // %2
    "+r"(src_argb),   // %3
    "+r"(dst_width)   // %4
  : "rm"(x),          // %5
    "rm"(dx)          // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
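// Both ScaleFilterCols_SSSE3 above and the ARGB filter below step a 16.16
// fixed-point source position x by dx per output pixel and use only the top
// 7 bits of the fraction (psrlw $0x9). Each output is then roughly
//   dst[i] = (src[xi] * (127 - f) + src[xi + 1] * f) >> 7
// with xi = x >> 16 and f = (x >> 9) & 0x7f; the pxor against 0x7f-filled
// words is what turns f into the (127 - f) weight for pmaddubsw.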
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"

    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)   // %0
  : "c"(div)    // %1
  : "memory", "cc", "edx"
  );
  return num;
}
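
// In plain C the same computation is roughly (an illustrative equivalent):
//   return (int)((((int64)(num)) << 16) / div);
// cdq/shld/shl build the 64-bit value num << 16 in edx:eax so a single
// 32-bit idiv by div produces the 16.16 quotient.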

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)   // %0
  : "c"(div)    // %1
  : "memory", "cc", "edx"
  );
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif