/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters (added before the >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
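// Editor's note: a rough C model of the per-pixel math in the 32-to-16
// linear blend routine that follows (sketch for reference only, not part of
// the original source; the helper name is hypothetical):
//
//   static void ScaleRowDown2Linear_C_sketch(const uint8* src_ptr,
//                                            uint8* dst_ptr, int dst_width) {
//     int x;
//     for (x = 0; x < dst_width; ++x) {
//       // pavgw computes a rounding average: (a + b + 1) >> 1.
//       dst_ptr[x] = (uint8)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
//     }
//   }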
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 32 pixels, throws half away and writes 16 pixels.
// No alignment requirement: handles unaligned src_ptr and dst_ptr.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// No alignment requirement: handles unaligned src_ptr and dst_ptr.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// No alignment requirement: handles unaligned src_ptr and dst_ptr.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]      // src_ptr
    mov esi, [esp + 8 + 8]      // src_stride
    mov edx, [esp + 8 + 12]     // dst_ptr
    mov ecx, [esp + 8 + 16]     // dst_width
    lea edi, [esi + esi * 2]    // src_stride * 3
    pcmpeqb xmm7, xmm7          // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2            // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0           // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0           // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
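// Editor's note: a rough C model of the 3/4 point sampling performed by the
// SSSE3 routine that follows (sketch for reference only, not part of the
// original source; the helper name is hypothetical). The shuffle tables
// kShuf0/kShuf1/kShuf2 keep bytes 0, 1 and 3 out of every group of 4:
//
//   static void ScaleRowDown34_C_sketch(const uint8* src_ptr,
//                                       uint8* dst_ptr, int dst_width) {
//     int x;
//     for (x = 0; x < dst_width; x += 3) {
//       dst_ptr[0] = src_ptr[0];
//       dst_ptr[1] = src_ptr[1];
//       dst_ptr[2] = src_ptr[3];
//       dst_ptr += 3;
//       src_ptr += 4;
//     }
//   }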
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]   // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]          // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0           // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1           // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4          // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6            // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]          // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0           // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5          // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1            // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
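// Editor's note: a rough C model of the row summing done by the SSE2 routine
// that follows (sketch for reference only, not part of the original source;
// the helper name is hypothetical). The assembly widens each byte to 16 bits
// and accumulates src_height rows with saturating adds (paddusw):
//
//   static void ScaleAddRows_C_sketch(const uint8* src_ptr,
//                                     ptrdiff_t src_stride, uint16* dst_ptr,
//                                     int src_width, int src_height) {
//     int x, y;
//     for (x = 0; x < src_width; ++x) {
//       uint32 sum = 0;
//       for (y = 0; y < src_height; ++y) {
//         sum += src_ptr[x + y * src_stride];
//       }
//       dst_ptr[x] = (uint16)(sum < 65535 ? sum : 65535);  // saturate
//     }
//   }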
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]    // src_ptr
    mov edx, [esp + 16 + 8]    // src_stride
    mov edi, [esp + 16 + 12]   // dst_ptr
    mov ecx, [esp + 16 + 16]   // src_width
    mov ebx, [esp + 16 + 20]   // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]        // read 16 pixels
    lea eax, [eax + edx]      // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2        // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//     xor ebx, ebx
//     mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]    // dst_ptr
    mov esi, [esp + 12 + 8]    // src_ptr
    mov ecx, [esp + 12 + 12]   // dst_width
    movd xmm2, [esp + 12 + 16] // x
    movd xmm3, [esp + 12 + 20] // dx
    mov eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9              // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5          // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9              // 7 bit fractions.
    pshufb xmm2, xmm5          // 0011
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // 16 bit
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_ptr
    mov eax, [esp + 8]    // src_ptr
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_argb
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_argb
    mov ecx, [esp + 4 + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0         // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]    // src_argb
                              // src_stride ignored
    mov ebx, [esp + 8 + 12]   // src_stepx
    mov edx, [esp + 8 + 16]   // dst_argb
    mov ecx, [esp + 8 + 20]   // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]    // src_argb
    mov esi, [esp + 12 + 8]    // src_stride
    mov ebx, [esp + 12 + 12]   // src_stepx
    mov edx, [esp + 12 + 16]   // dst_argb
    mov ecx, [esp + 12 + 20]   // dst_width
    lea esi, [eax + esi]       // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]           // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]           // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2                     // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0                    // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88              // even pixels
    shufps xmm2, xmm1, 0xdd              // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]    // dst_argb
    mov esi, [esp + 8 + 8]    // src_argb
    mov ecx, [esp + 8 + 12]   // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx

    pshufd xmm2, xmm2, 0      // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11   // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3          // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05   // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0          // x3 x2 x1 x0
    paddd xmm3, xmm3          // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0      // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1       // get x0 integer.
    pextrw edx, xmm2, 3       // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    pextrw edx, xmm2, 7         // get x3 integer.
    paddd xmm2, xmm3            // x += dx
    punpckldq xmm0, xmm1        // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    punpckldq xmm1, xmm4        // x2 x3
    punpcklqdq xmm0, xmm1       // x0 x1 x2 x3
    sub ecx, 4                  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    punpckldq xmm0, xmm1        // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixel.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]    // dst_argb
    mov esi, [esp + 8 + 8]    // src_argb
    mov ecx, [esp + 8 + 12]   // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6        // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1       // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2         // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0      // x0 x1
    punpckldq xmm3, xmm3      // dx dx
    paddd xmm3, xmm3          // dx * 2, dx * 2
    pextrw edx, xmm2, 3       // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2         // x0, x1 fractions.
    paddd xmm2, xmm3          // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9             // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5         // 0000000011111111
    pshufb xmm0, xmm4         // arrange pixels into pairs
    pxor xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1      // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1       // get x0 integer. next iteration.
    pextrw edx, xmm2, 3       // get x1 integer. next iteration.
    psrlw xmm0, 7             // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0       // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2                // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9             // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5         // 00000000
    pshufb xmm0, xmm4         // arrange pixels into pairs
    pxor xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2      // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0       // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_argb
    mov eax, [esp + 8]    // src_argb
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    mov ecx, [esp + 8]    // denom
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
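// Editor's note: rough C equivalents of the fixed point division helpers
// defined above (sketch for reference only, not part of the original source;
// the helper names are hypothetical):
//
//   // FixedDiv_X86: (num << 16) / div, i.e. num / div in 16.16 fixed point.
//   static int FixedDiv_C_sketch(int num, int div) {
//     return (int)(((int64)(num) << 16) / div);
//   }
//
//   // FixedDiv1_X86: ((num << 16) - 0x00010001) / (div - 1), matching the
//   // assembly's pre-subtraction and decremented divisor.
//   static int FixedDiv1_C_sketch(int num, int div) {
//     return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
//   }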