/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
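// A minimal scalar sketch (helper names are illustrative only, not part of the
// libyuv API) of what the row functions below compute with these constants:
// pmaddubsw/phaddw form the weighted B,G,R sum, psrlw shifts by 7, and the
// kAddY16 / kAddYJ64 tables supply the bias or rounding term.
static __inline uint8 ARGBToY_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // BT.601 video range.
}

static __inline uint8 ARGBToYJ_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);  // JPeg full range.
}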
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_rgb24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRGB24ToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRAWToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3    // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    pand       xmm0, xmm4    // G in middle 6 bits
    pmulhuw    xmm0, xmm6    // << 5 * (256 + 4)
    por        xmm0, xmm7    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
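// Scalar sketch of the pmul bit-replication trick described above (helper name
// is illustrative only): a 5-bit channel is widened to 8 bits by repeating its
// top bits, and that replication can be expressed as a single multiply, which
// is what pmulhuw performs on values positioned in the top of each 16-bit lane.
static __inline uint8 Replicate5To8_ScalarSketch(int v5) {
  // (v5 << 3) | (v5 >> 2)  ==  (v5 * (256 + 8)) >> 5  for v5 in [0, 31].
  return (uint8)((v5 * 0x0108) >> 5);
}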
// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    psllw      xmm1, 1       // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // G in middle 5 bits
    psraw      xmm2, 8       // A
    pmulhuw    xmm0, xmm6    // << 6 * (256 + 8)
    pand       xmm2, xmm7
    por        xmm0, xmm2    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0
    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld      xmm5, 4
    mov        eax, [esp + 4]   // src_argb4444
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // mask low nibbles
    pand       xmm2, xmm5    // mask high nibbles
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    psllw      xmm1, 4
    psrlw      xmm3, 4
    por        xmm0, xmm1
    por        xmm2, xmm3
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
    pslld      xmm5, 11

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    pslld      xmm0, 8       // R
    psrld      xmm1, 3       // B
    psrld      xmm2, 5       // G
    psrad      xmm0, 16      // R
    pand       xmm1, xmm3    // B
    pand       xmm2, xmm4    // G
    pand       xmm0, xmm5    // R
    por        xmm1, xmm2    // BG
    por        xmm0, xmm1    // BGR
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
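// Scalar sketch of the packing done by ARGBToRGB565Row_SSE2 above (helper name
// is illustrative only): each BGRA pixel is reduced to a 5:6:5 word; the SIMD
// version does the same with per-dword shifts and masks followed by packssdw.
static __inline uint16 ARGBToRGB565_ScalarSketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}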
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
    psrld      xmm4, 27
    movdqa     xmm5, xmm4       // generate mask 0x000003e0
    pslld      xmm5, 5
    movdqa     xmm6, xmm4       // generate mask 0x00007c00
    pslld      xmm6, 10
    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
    pslld      xmm7, 15

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    movdqa     xmm3, xmm0    // R
    psrad      xmm0, 16      // A
    psrld      xmm1, 3       // B
    psrld      xmm2, 6       // G
    psrld      xmm3, 9       // R
    pand       xmm0, xmm7    // A
    pand       xmm1, xmm4    // B
    pand       xmm2, xmm5    // G
    pand       xmm3, xmm6    // R
    por        xmm0, xmm1    // BA
    por        xmm2, xmm3    // GR
    por        xmm0, xmm2    // BGRA
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
    psllw      xmm4, 12
    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
    psrlw      xmm3, 8

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0
    pand       xmm0, xmm3    // low nibble
    pand       xmm1, xmm4    // high nibble
    psrl       xmm0, 4
    psrl       xmm1, 8
    por        xmm0, xmm1
    packuswb   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5    // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub        ecx, 32
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
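// Scalar sketch of the per-2x2-block chroma computed by the ARGBToUVRow
// variants in this file (helper name is illustrative only): the four source
// pixels are first averaged with pavgb/shufps, then the kARGBToU/kARGBToV
// weights are applied and the result is biased to unsigned with kAddUV128.
// The >> 8 stands in for psraw, i.e. an arithmetic shift.
static __inline void ARGBToUV_ScalarSketch(uint8 b, uint8 g, uint8 r,
                                           uint8* u, uint8* v) {
  // b, g, r are already the averages of a 2x2 block of pixels.
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}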
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqa     xmm0, [eax]          // U
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx,  16
    movdqa     [edx], xmm0

    movdqa     xmm0, [eax]          // V
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqa     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx,  16
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx    // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v
// stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kABGRToU michael@0: movdqa xmm6, kABGRToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: pavgb xmm0, [eax + esi] michael@0: pavgb xmm1, [eax + esi + 16] michael@0: pavgb xmm2, [eax + esi + 32] michael@0: pavgb xmm3, [eax + esi + 48] michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, 
xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kABGRToU michael@0: movdqa xmm6, kABGRToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kRGBAToU michael@0: movdqa xmm6, kRGBAToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] 
michael@0: movdqa xmm3, [eax + 48] michael@0: pavgb xmm0, [eax + esi] michael@0: pavgb xmm1, [eax + esi + 16] michael@0: pavgb xmm2, [eax + esi + 32] michael@0: pavgb xmm3, [eax + esi + 48] michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, michael@0: uint8* dst_u, uint8* dst_v, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov esi, [esp + 8 + 8] // src_stride_argb michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: movdqa xmm7, kRGBAToU michael@0: movdqa xmm6, kRGBAToV michael@0: movdqa xmm5, kAddUV128 michael@0: sub edi, edx // stride from u to v michael@0: michael@0: align 4 michael@0: convertloop: michael@0: /* step 1 - subsample 16x2 argb pixels to 8x1 */ michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + 32] michael@0: movdqu xmm3, [eax + 48] michael@0: movdqu xmm4, [eax + esi] michael@0: pavgb xmm0, xmm4 michael@0: movdqu xmm4, [eax + esi + 16] michael@0: pavgb xmm1, xmm4 michael@0: movdqu xmm4, [eax + esi + 32] michael@0: pavgb xmm2, xmm4 michael@0: movdqu xmm4, [eax + esi + 48] michael@0: pavgb xmm3, xmm4 michael@0: lea eax, [eax + 64] michael@0: movdqa xmm4, xmm0 michael@0: shufps xmm0, xmm1, 0x88 michael@0: shufps xmm4, xmm1, 0xdd michael@0: pavgb xmm0, xmm4 michael@0: movdqa xmm4, xmm2 michael@0: shufps xmm2, xmm3, 0x88 michael@0: shufps xmm4, xmm3, 0xdd michael@0: pavgb xmm2, xmm4 michael@0: michael@0: // step 2 - convert to U and V michael@0: // from here down is very similar to Y code except michael@0: // instead of 16 different pixels, its 8 pixels of U and 8 of V michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: pmaddubsw xmm0, xmm7 // U michael@0: pmaddubsw xmm2, xmm7 michael@0: pmaddubsw xmm1, xmm6 // V michael@0: pmaddubsw xmm3, xmm6 michael@0: phaddw xmm0, xmm2 michael@0: phaddw xmm1, xmm3 michael@0: psraw xmm0, 8 michael@0: psraw xmm1, 8 michael@0: packsswb xmm0, xmm1 michael@0: paddb xmm0, xmm5 // -> unsigned michael@0: michael@0: // step 3 - store 8 U and 8 V values michael@0: sub ecx, 16 michael@0: movlps qword ptr [edx], xmm0 // U michael@0: movhps qword ptr [edx + edi], xmm0 // V 
michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBTOYROW_SSSE3 michael@0: michael@0: #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ michael@0: michael@0: #define UB 127 /* min(63,(int8)(2.018 * 64)) */ michael@0: #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ michael@0: #define UR 0 michael@0: michael@0: #define VB 0 michael@0: #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ michael@0: #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ michael@0: michael@0: // Bias michael@0: #define BB UB * 128 + VB * 128 michael@0: #define BG UG * 128 + VG * 128 michael@0: #define BR UR * 128 + VR * 128 michael@0: michael@0: #ifdef HAS_I422TOARGBROW_AVX2 michael@0: michael@0: static const lvec8 kUVToB_AVX = { michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB michael@0: }; michael@0: static const lvec8 kUVToR_AVX = { michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR michael@0: }; michael@0: static const lvec8 kUVToG_AVX = { michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG michael@0: }; michael@0: static const lvec16 kYToRgb_AVX = { michael@0: YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG michael@0: }; michael@0: static const lvec16 kYSub16_AVX = { michael@0: 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 michael@0: }; michael@0: static const lvec16 kUVBiasB_AVX = { michael@0: BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB michael@0: }; michael@0: static const lvec16 kUVBiasG_AVX = { michael@0: BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG michael@0: }; michael@0: static const lvec16 kUVBiasR_AVX = { michael@0: BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR michael@0: }; michael@0: michael@0: // 16 pixels michael@0: // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
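// For reference: with these constants the kernels below compute a BT.601
// limited-range conversion in 6-bit fixed point, roughly
//   B = clip8(((Y - 16) * 74 + 127 * (U - 128)) >> 6)
//   G = clip8(((Y - 16) * 74 -  25 * (U - 128) -  52 * (V - 128)) >> 6)
//   R = clip8(((Y - 16) * 74 + 102 * (V - 128)) >> 6)
// e.g. Y=235, U=V=128 gives (219 * 74) >> 6 = 253, and Y=16, U=V=128 gives 0.
// The kUVBias* tables fold the "- 128" into the pmaddubsw results
// (BB = 127 * 128, BG = (-25 - 52) * 128, BR = 102 * 128). UB is 127 because
// 2.018 * 64 saturates the int8 range; the "min(63, ...)" note above looks stale.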
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_AVX2(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha michael@0: vpxor ymm4, ymm4, ymm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovq xmm0, qword ptr [esi] // U michael@0: vmovq xmm1, qword ptr [esi + edi] // V michael@0: lea esi, [esi + 8] michael@0: vpunpcklbw ymm0, ymm0, ymm1 // UV michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpunpcklwd ymm0, ymm0, ymm0 // UVUV michael@0: vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV michael@0: vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV michael@0: vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV michael@0: vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed michael@0: vpsubw ymm1, ymm1, kUVBiasG_AVX michael@0: vpsubw ymm0, ymm0, kUVBiasR_AVX michael@0: michael@0: // Step 2: Find Y contribution to 16 R,G,B values michael@0: vmovdqu xmm3, [eax] // NOLINT michael@0: lea eax, [eax + 16] michael@0: vpermq ymm3, ymm3, 0xd8 michael@0: vpunpcklbw ymm3, ymm3, ymm4 michael@0: vpsubsw ymm3, ymm3, kYSub16_AVX michael@0: vpmullw ymm3, ymm3, kYToRgb_AVX michael@0: vpaddsw ymm2, ymm2, ymm3 // B += Y michael@0: vpaddsw ymm1, ymm1, ymm3 // G += Y michael@0: vpaddsw ymm0, ymm0, ymm3 // R += Y michael@0: vpsraw ymm2, ymm2, 6 michael@0: vpsraw ymm1, ymm1, 6 michael@0: vpsraw ymm0, ymm0, 6 michael@0: vpackuswb ymm2, ymm2, ymm2 // B michael@0: vpackuswb ymm1, ymm1, ymm1 // G michael@0: vpackuswb ymm0, ymm0, ymm0 // R michael@0: michael@0: // Step 3: Weave into ARGB michael@0: vpunpcklbw ymm2, ymm2, ymm1 // BG michael@0: vpermq ymm2, ymm2, 0xd8 michael@0: vpunpcklbw ymm0, ymm0, ymm5 // RA michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels michael@0: vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: vzeroupper michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_I422TOARGBROW_AVX2 michael@0: michael@0: #ifdef HAS_I422TOARGBROW_SSSE3 michael@0: michael@0: static const vec8 kUVToB = { michael@0: UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB michael@0: }; michael@0: michael@0: static const vec8 kUVToR = { michael@0: UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR michael@0: }; michael@0: michael@0: static const vec8 kUVToG = { michael@0: UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG michael@0: }; michael@0: michael@0: static const vec8 kVUToB = { michael@0: VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, michael@0: }; michael@0: michael@0: static const vec8 kVUToR = { michael@0: VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, michael@0: }; michael@0: michael@0: static const vec8 kVUToG = { michael@0: VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, michael@0: }; michael@0: michael@0: static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; michael@0: static 
const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; michael@0: static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; michael@0: static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; michael@0: static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; michael@0: michael@0: // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. michael@0: michael@0: // Read 8 UV from 444. michael@0: #define READYUV444 __asm { \ michael@0: __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ michael@0: __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ michael@0: __asm lea esi, [esi + 8] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: } michael@0: michael@0: // Read 4 UV from 422, upsample to 8 UV. michael@0: #define READYUV422 __asm { \ michael@0: __asm movd xmm0, [esi] /* U */ \ michael@0: __asm movd xmm1, [esi + edi] /* V */ \ michael@0: __asm lea esi, [esi + 4] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Read 2 UV from 411, upsample to 8 UV. michael@0: #define READYUV411 __asm { \ michael@0: __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ michael@0: __asm movd xmm0, ebx \ michael@0: __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ michael@0: __asm movd xmm1, ebx \ michael@0: __asm lea esi, [esi + 2] \ michael@0: __asm punpcklbw xmm0, xmm1 /* UV */ \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Read 4 UV from NV12, upsample to 8 UV. michael@0: #define READNV12 __asm { \ michael@0: __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ michael@0: __asm lea esi, [esi + 8] \ michael@0: __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ michael@0: } michael@0: michael@0: // Convert 8 pixels: 8 UV and 8 Y. michael@0: #define YUVTORGB __asm { \ michael@0: /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ michael@0: __asm movdqa xmm1, xmm0 \ michael@0: __asm movdqa xmm2, xmm0 \ michael@0: __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ michael@0: __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ michael@0: __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ michael@0: __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ michael@0: __asm psubw xmm1, kUVBiasG \ michael@0: __asm psubw xmm2, kUVBiasR \ michael@0: /* Step 2: Find Y contribution to 8 R,G,B values */ \ michael@0: __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ michael@0: __asm lea eax, [eax + 8] \ michael@0: __asm punpcklbw xmm3, xmm4 \ michael@0: __asm psubsw xmm3, kYSub16 \ michael@0: __asm pmullw xmm3, kYToRgb \ michael@0: __asm paddsw xmm0, xmm3 /* B += Y */ \ michael@0: __asm paddsw xmm1, xmm3 /* G += Y */ \ michael@0: __asm paddsw xmm2, xmm3 /* R += Y */ \ michael@0: __asm psraw xmm0, 6 \ michael@0: __asm psraw xmm1, 6 \ michael@0: __asm psraw xmm2, 6 \ michael@0: __asm packuswb xmm0, xmm0 /* B */ \ michael@0: __asm packuswb xmm1, xmm1 /* G */ \ michael@0: __asm packuswb xmm2, xmm2 /* R */ \ michael@0: } michael@0: michael@0: // Convert 8 pixels: 8 VU and 8 Y. 
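// YVUTORGB below is the same math as YUVTORGB with the VU-ordered tables, so
// NV21's interleaved VU plane needs no byte swap. For reference, a scalar
// sketch of the per-pixel result (illustration only; the helper names are
// informal and the block is not compiled):
#if 0
static __inline uint8 Clip8(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
// Computes one BGR pixel the way YUVTORGB does, using the YG/UB/UG/VG/VR and
// BB/BG/BR constants defined above.
static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  const int bb = BB;  // Bias macros are unparenthesized, so bind them first.
  const int bg = BG;
  const int br = BR;
  int y1 = ((int)y - 16) * YG;                  // Luma term, 6-bit fixed point.
  *b = Clip8((y1 + UB * u - bb) >> 6);          // 127 * (u - 128)
  *g = Clip8((y1 + UG * u + VG * v - bg) >> 6); // -25*(u-128) - 52*(v-128)
  *r = Clip8((y1 + VR * v - br) >> 6);          // 102 * (v - 128)
}
#endif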
michael@0: #define YVUTORGB __asm { \ michael@0: /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ michael@0: __asm movdqa xmm1, xmm0 \ michael@0: __asm movdqa xmm2, xmm0 \ michael@0: __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ michael@0: __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ michael@0: __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ michael@0: __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ michael@0: __asm psubw xmm1, kUVBiasG \ michael@0: __asm psubw xmm2, kUVBiasR \ michael@0: /* Step 2: Find Y contribution to 8 R,G,B values */ \ michael@0: __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ michael@0: __asm lea eax, [eax + 8] \ michael@0: __asm punpcklbw xmm3, xmm4 \ michael@0: __asm psubsw xmm3, kYSub16 \ michael@0: __asm pmullw xmm3, kYToRgb \ michael@0: __asm paddsw xmm0, xmm3 /* B += Y */ \ michael@0: __asm paddsw xmm1, xmm3 /* G += Y */ \ michael@0: __asm paddsw xmm2, xmm3 /* R += Y */ \ michael@0: __asm psraw xmm0, 6 \ michael@0: __asm psraw xmm1, 6 \ michael@0: __asm psraw xmm2, 6 \ michael@0: __asm packuswb xmm0, xmm0 /* B */ \ michael@0: __asm packuswb xmm1, xmm1 /* G */ \ michael@0: __asm packuswb xmm2, xmm2 /* R */ \ michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I444ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV444 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
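// The RGB24 and RAW kernels below build the same BGRX pixels but then use the
// kShuffleMaskARGBToRGB24* / kShuffleMaskARGBToRAW* tables plus palignr to
// drop the fourth (alpha) byte, so each loop stores 8 pixels as 24 bytes
// (movq + movdqu) instead of the 32 bytes an ARGB store would take.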
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGB24Row_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgb24, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgb24 michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: movdqa xmm5, kShuffleMaskARGBToRGB24_0 michael@0: movdqa xmm6, kShuffleMaskARGBToRGB24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. michael@0: pshufb xmm1, xmm6 // Pack into first 12 bytes. michael@0: palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 michael@0: movq qword ptr [edx], xmm0 // First 8 bytes michael@0: movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. michael@0: lea edx, [edx + 24] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRAWRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_raw, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // raw michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: movdqa xmm5, kShuffleMaskARGBToRAW_0 michael@0: movdqa xmm6, kShuffleMaskARGBToRAW michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. michael@0: pshufb xmm1, xmm6 // Pack into first 12 bytes. michael@0: palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 michael@0: movq qword ptr [edx], xmm0 // First 8 bytes michael@0: movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. michael@0: lea edx, [edx + 24] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest unaligned. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
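// RGB565 packs each pixel into 16 bits: bits 0-4 blue, 5-10 green, 11-15 red.
// The masks built below (0x0000001f, 0x000007e0, 0xfffff800) select those
// fields after the shifts, and packssdw narrows the two dword results so each
// loop stores 8 pixels as 16 bytes. e.g. B=0xF8, G=0xFC, R=0xF8 packs to 0xFFFF.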
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGB565Row_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb565_buf, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgb565 michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x0000001f michael@0: psrld xmm5, 27 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 michael@0: psrld xmm6, 26 michael@0: pslld xmm6, 5 michael@0: pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 michael@0: pslld xmm7, 11 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RRGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm2 // RR michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRR first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRR next 4 pixels michael@0: michael@0: // Step 3b: RRGB -> RGB565 michael@0: movdqa xmm3, xmm0 // B first 4 pixels of argb michael@0: movdqa xmm2, xmm0 // G michael@0: pslld xmm0, 8 // R michael@0: psrld xmm3, 3 // B michael@0: psrld xmm2, 5 // G michael@0: psrad xmm0, 16 // R michael@0: pand xmm3, xmm5 // B michael@0: pand xmm2, xmm6 // G michael@0: pand xmm0, xmm7 // R michael@0: por xmm3, xmm2 // BG michael@0: por xmm0, xmm3 // BGR michael@0: movdqa xmm3, xmm1 // B next 4 pixels of argb michael@0: movdqa xmm2, xmm1 // G michael@0: pslld xmm1, 8 // R michael@0: psrld xmm3, 3 // B michael@0: psrld xmm2, 5 // G michael@0: psrad xmm1, 16 // R michael@0: pand xmm3, xmm5 // B michael@0: pand xmm2, xmm6 // G michael@0: pand xmm1, xmm7 // R michael@0: por xmm3, xmm2 // BG michael@0: por xmm1, xmm3 // BGR michael@0: packssdw xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movdqu [edx], xmm0 // store 8 pixels of RGB565 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
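// In step 3 of these kernels xmm0/xmm1/xmm2 hold 8 bytes each of B, G and R.
// punpcklbw xmm0, xmm1 interleaves them to BGBG..., punpcklbw xmm2, xmm5 pairs
// R with the 0xff alpha bytes, and punpcklwd/punpckhwd then weave those word
// pairs into 8 BGRA pixels (32 bytes).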
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: // Similar to I420 but duplicate UV once more. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I411ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12 + 4] // Y michael@0: mov esi, [esp + 12 + 8] // U michael@0: mov edi, [esp + 12 + 12] // V michael@0: mov edx, [esp + 12 + 16] // argb michael@0: mov ecx, [esp + 12 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV411 // modifies EBX michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: pop ebx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
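// NV12 keeps chroma in a single interleaved UVUV... plane, so READNV12 loads
// 8 bytes (4 U/V pairs) and punpcklwd duplicates each pair to cover 2 pixels.
// NV21 (VU order) uses the same read; only the VU-ordered tables in YVUTORGB
// differ.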
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV12ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // UV michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV21ToARGBRow_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // VU michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YVUTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
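// The _Unaligned variants that follow are the same kernels with movdqu memory
// accesses where the aligned versions use movdqa; the arithmetic is identical.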
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV444 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // argb michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, unaligned. michael@0: // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: // Similar to I420 but duplicate UV once more. 
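// I411 carries one U and one V sample per 4 pixels, so READYUV411 fetches
// 2 bytes of each with movzx (which is why these kernels also save EBX) and
// widens them with punpcklwd/punpckldq so each UV pair covers 4 pixels.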
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12 + 4] // Y michael@0: mov esi, [esp + 12 + 8] // U michael@0: mov edi, [esp + 12 + 12] // V michael@0: mov edx, [esp + 12 + 16] // argb michael@0: mov ecx, [esp + 12 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV411 // modifies EBX michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: pop ebx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // UV michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // 8 pixels, dest aligned 16. michael@0: // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* uv_buf, michael@0: uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // Y michael@0: mov esi, [esp + 4 + 8] // VU michael@0: mov edx, [esp + 4 + 12] // argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READNV12 michael@0: YVUTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm0, xmm1 // BG michael@0: punpcklbw xmm2, xmm5 // RA michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm2 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm2 // BGRA next 4 pixels michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToBGRARow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_bgra, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // bgra michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into BGRA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm0 // GB michael@0: punpcklbw xmm5, xmm2 // AR michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // BGRA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // BGRA next 4 pixels michael@0: movdqa [edx], xmm5 michael@0: movdqa [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_bgra, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // bgra michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into BGRA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm0 // GB michael@0: punpcklbw xmm5, xmm2 // AR michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // BGRA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // BGRA next 4 pixels michael@0: movdqu [edx], xmm5 michael@0: movdqu [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) 
michael@0: void I422ToABGRRow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_abgr, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // abgr michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm2, xmm1 // RG michael@0: punpcklbw xmm0, xmm5 // BA michael@0: movdqa xmm1, xmm2 michael@0: punpcklwd xmm2, xmm0 // RGBA first 4 pixels michael@0: punpckhwd xmm1, xmm0 // RGBA next 4 pixels michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_abgr, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // abgr michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into ARGB michael@0: punpcklbw xmm2, xmm1 // RG michael@0: punpcklbw xmm0, xmm5 // BA michael@0: movdqa xmm1, xmm2 michael@0: punpcklwd xmm2, xmm0 // RGBA first 4 pixels michael@0: punpckhwd xmm1, xmm0 // RGBA next 4 pixels michael@0: movdqu [edx], xmm2 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGBARow_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgba, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgba michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RGBA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm2 // GR michael@0: punpcklbw xmm5, xmm0 // AB michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // RGBA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // RGBA next 4 pixels michael@0: movdqa [edx], xmm5 michael@0: movdqa [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: 
} michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* dst_rgba, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // Y michael@0: mov esi, [esp + 8 + 8] // U michael@0: mov edi, [esp + 8 + 12] // V michael@0: mov edx, [esp + 8 + 16] // rgba michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub edi, esi michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: READYUV422 michael@0: YUVTORGB michael@0: michael@0: // Step 3: Weave into RGBA michael@0: pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha michael@0: punpcklbw xmm1, xmm2 // GR michael@0: punpcklbw xmm5, xmm0 // AB michael@0: movdqa xmm0, xmm5 michael@0: punpcklwd xmm5, xmm1 // RGBA first 4 pixels michael@0: punpckhwd xmm0, xmm1 // RGBA next 4 pixels michael@0: movdqu [edx], xmm5 michael@0: movdqu [edx + 16], xmm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #endif // HAS_I422TOARGBROW_SSSE3 michael@0: michael@0: #ifdef HAS_YTOARGBROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YToARGBRow_SSE2(const uint8* y_buf, michael@0: uint8* rgb_buf, michael@0: int width) { michael@0: __asm { michael@0: pxor xmm5, xmm5 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: mov eax, 0x00100010 michael@0: movd xmm3, eax michael@0: pshufd xmm3, xmm3, 0 michael@0: mov eax, 0x004a004a // 74 michael@0: movd xmm2, eax michael@0: pshufd xmm2, xmm2,0 michael@0: mov eax, [esp + 4] // Y michael@0: mov edx, [esp + 8] // rgb michael@0: mov ecx, [esp + 12] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 michael@0: movq xmm0, qword ptr [eax] michael@0: lea eax, [eax + 8] michael@0: punpcklbw xmm0, xmm5 // 0.Y michael@0: psubusw xmm0, xmm3 michael@0: pmullw xmm0, xmm2 michael@0: psrlw xmm0, 6 michael@0: packuswb xmm0, xmm0 // G michael@0: michael@0: // Step 2: Weave into ARGB michael@0: punpcklbw xmm0, xmm0 // GG michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm0 // BGRA first 4 pixels michael@0: punpckhwd xmm1, xmm1 // BGRA next 4 pixels michael@0: por xmm0, xmm4 michael@0: por xmm1, xmm4 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YTOARGBROW_SSE2 michael@0: michael@0: #ifdef HAS_MIRRORROW_SSSE3 michael@0: // Shuffle table for reversing the bytes. 
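// With indices 15..0 a single pshufb reverses all 16 bytes of a register.
// MirrorRow_SSSE3 below biases the source pointer by -16 and indexes it with
// the remaining width in ecx, so it reads the row from the end while writing
// the destination forward.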
michael@0: static const uvec8 kShuffleMirror = { michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: movdqa xmm5, kShuffleMirror michael@0: lea eax, [eax - 16] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax + ecx] michael@0: pshufb xmm0, xmm5 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_SSSE3 michael@0: michael@0: #ifdef HAS_MIRRORROW_AVX2 michael@0: // Shuffle table for reversing the bytes. michael@0: static const ulvec8 kShuffleMirror_AVX2 = { michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, michael@0: 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: vmovdqa ymm5, kShuffleMirror_AVX2 michael@0: lea eax, [eax - 32] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax + ecx] michael@0: vpshufb ymm0, ymm0, ymm5 michael@0: vpermq ymm0, ymm0, 0x4e // swap high and low halfs michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_AVX2 michael@0: michael@0: #ifdef HAS_MIRRORROW_SSE2 michael@0: // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 michael@0: // version can not. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 16] michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax + ecx] michael@0: movdqa xmm1, xmm0 // swap bytes michael@0: psllw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: por xmm0, xmm1 michael@0: pshuflw xmm0, xmm0, 0x1b // swap words michael@0: pshufhw xmm0, xmm0, 0x1b michael@0: pshufd xmm0, xmm0, 0x4e // swap qwords michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_SSE2 michael@0: michael@0: #ifdef HAS_MIRRORROW_UV_SSSE3 michael@0: // Shuffle table for reversing the bytes of UV channels. 
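// This table mirrors and deinterleaves at the same time: the even (U) bytes
// are gathered in reverse order into the low 8 bytes and the odd (V) bytes
// into the high 8, so movlpd/movhpd can store the mirrored U and V rows
// directly.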
michael@0: static const uvec8 kShuffleMirrorUV = { michael@0: 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // width michael@0: movdqa xmm1, kShuffleMirrorUV michael@0: lea eax, [eax + ecx * 2 - 16] michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: lea eax, [eax - 16] michael@0: pshufb xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movlpd qword ptr [edx], xmm0 michael@0: movhpd qword ptr [edx + edi], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MIRRORROW_UV_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBMIRRORROW_SSSE3 michael@0: // Shuffle table for reversing the bytes. michael@0: static const uvec8 kARGBShuffleMirror = { michael@0: 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 16 + ecx * 4] // last 4 pixels. michael@0: movdqa xmm5, kARGBShuffleMirror michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: lea eax, [eax - 16] michael@0: pshufb xmm0, xmm5 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMIRRORROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBMIRRORROW_AVX2 michael@0: // Shuffle table for reversing the bytes. 
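// For the AVX2 mirror the table holds dword indices 7..0, so a single vpermd
// reverses 8 whole ARGB pixels per iteration; as in the SSSE3 version above,
// pixels are reversed as 4-byte units so each pixel keeps its channel order.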
michael@0: static const ulvec32 kARGBShuffleMirror_AVX2 = { michael@0: 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u michael@0: }; michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // width michael@0: lea eax, [eax - 32] michael@0: vmovdqa ymm5, kARGBShuffleMirror_AVX2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order michael@0: sub ecx, 8 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMIRRORROW_AVX2 michael@0: michael@0: #ifdef HAS_SPLITUVROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm2, xmm0 michael@0: movdqa xmm3, xmm1 michael@0: pand xmm0, xmm5 // even bytes michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: psrlw xmm2, 8 // odd bytes michael@0: psrlw xmm3, 8 michael@0: packuswb xmm2, xmm3 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + edi], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, michael@0: int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm2, xmm0 michael@0: movdqa xmm3, xmm1 michael@0: pand xmm0, xmm5 // even bytes michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: psrlw xmm2, 8 // odd bytes michael@0: psrlw xmm3, 8 michael@0: packuswb xmm2, xmm3 michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + edi], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SPLITUVROW_SSE2 michael@0: michael@0: #ifdef HAS_SPLITUVROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 
0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm2, ymm0, 8 // odd bytes michael@0: vpsrlw ymm3, ymm1, 8 michael@0: vpand ymm0, ymm0, ymm5 // even bytes michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 michael@0: vpackuswb ymm2, ymm2, ymm3 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpermq ymm2, ymm2, 0xd8 michael@0: vmovdqu [edx], ymm0 michael@0: vmovdqu [edx + edi], ymm2 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SPLITUVROW_AVX2 michael@0: michael@0: #ifdef HAS_MERGEUVROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 U's michael@0: movdqa xmm1, [eax + edx] // and 16 V's michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: punpcklbw xmm0, xmm1 // first 8 UV pairs michael@0: punpckhbw xmm2, xmm1 // next 8 UV pairs michael@0: movdqa [edi], xmm0 michael@0: movdqa [edi + 16], xmm2 michael@0: lea edi, [edi + 32] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, michael@0: uint8* dst_uv, int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 16 U's michael@0: movdqu xmm1, [eax + edx] // and 16 V's michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: punpcklbw xmm0, xmm1 // first 8 UV pairs michael@0: punpckhbw xmm2, xmm1 // next 8 UV pairs michael@0: movdqu [edi], xmm0 michael@0: movdqu [edi + 16], xmm2 michael@0: lea edi, [edi + 32] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MERGEUVROW_SSE2 michael@0: michael@0: #ifdef HAS_MERGEUVROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, michael@0: int width) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_u michael@0: mov edx, [esp + 4 + 8] // src_v michael@0: mov edi, [esp + 4 + 12] // dst_uv michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub edx, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 32 U's michael@0: vmovdqu ymm1, [eax + edx] // and 32 V's michael@0: lea eax, [eax + 32] michael@0: vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 michael@0: vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 michael@0: vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 michael@0: vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 michael@0: vmovdqu [edi], ymm1 michael@0: vmovdqu [edi + 32], ymm2 michael@0: lea edi, [edi + 64] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_MERGEUVROW_AVX2 michael@0: michael@0: #ifdef HAS_COPYROW_SSE2 michael@0: // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_COPYROW_SSE2 michael@0: michael@0: // Unaligned Multiple of 1. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, esi michael@0: mov edx, edi michael@0: mov esi, [esp + 4] // src michael@0: mov edi, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: rep movsb michael@0: mov edi, edx michael@0: mov esi, eax michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_COPYROW_X86 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void CopyRow_X86(const uint8* src, uint8* dst, int count) { michael@0: __asm { michael@0: mov eax, esi michael@0: mov edx, edi michael@0: mov esi, [esp + 4] // src michael@0: mov edi, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: shr ecx, 2 michael@0: rep movsd michael@0: mov edi, edx michael@0: mov esi, eax michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_COPYROW_X86 michael@0: michael@0: #ifdef HAS_ARGBCOPYALPHAROW_SSE2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: pcmpeqb xmm0, xmm0 // generate mask 0xff000000 michael@0: pslld xmm0, 24 michael@0: pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff michael@0: psrld xmm1, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm2, [eax] michael@0: movdqa xmm3, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: movdqa xmm4, [edx] michael@0: movdqa xmm5, [edx + 16] michael@0: pand xmm2, xmm0 michael@0: pand xmm3, xmm0 michael@0: pand xmm4, xmm1 michael@0: pand xmm5, xmm1 michael@0: por xmm2, xmm4 michael@0: por xmm3, xmm5 michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYALPHAROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBCOPYALPHAROW_AVX2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, 
int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: vpcmpeqb ymm0, ymm0, ymm0 michael@0: vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm1, [eax] michael@0: vmovdqu ymm2, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpblendvb ymm1, ymm1, [edx], ymm0 michael@0: vpblendvb ymm2, ymm2, [edx + 32], ymm0 michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYALPHAROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: pcmpeqb xmm0, xmm0 // generate mask 0xff000000 michael@0: pslld xmm0, 24 michael@0: pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff michael@0: psrld xmm1, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm2, qword ptr [eax] // 8 Y's michael@0: lea eax, [eax + 8] michael@0: punpcklbw xmm2, xmm2 michael@0: punpckhwd xmm3, xmm2 michael@0: punpcklwd xmm2, xmm2 michael@0: movdqa xmm4, [edx] michael@0: movdqa xmm5, [edx + 16] michael@0: pand xmm2, xmm0 michael@0: pand xmm3, xmm0 michael@0: pand xmm4, xmm1 michael@0: pand xmm5, xmm1 michael@0: por xmm2, xmm4 michael@0: por xmm3, xmm5 michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 michael@0: // width in pixels michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src michael@0: mov edx, [esp + 8] // dst michael@0: mov ecx, [esp + 12] // count michael@0: vpcmpeqb ymm0, ymm0, ymm0 michael@0: vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vpmovzxbd ymm1, qword ptr [eax] michael@0: vpmovzxbd ymm2, qword ptr [eax + 8] michael@0: lea eax, [eax + 16] michael@0: vpslld ymm1, ymm1, 24 michael@0: vpslld ymm2, ymm2, 24 michael@0: vpblendvb ymm1, ymm1, [edx], ymm0 michael@0: vpblendvb ymm2, ymm2, [edx + 32], ymm0 michael@0: vmovdqu [edx], ymm1 michael@0: vmovdqu [edx + 32], ymm2 michael@0: lea edx, [edx + 64] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 michael@0: michael@0: #ifdef HAS_SETROW_X86 michael@0: // SetRow8 writes 'count' bytes using a 32 bit value repeated. 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SetRow_X86(uint8* dst, uint32 v32, int count) { michael@0: __asm { michael@0: mov edx, edi michael@0: mov edi, [esp + 4] // dst michael@0: mov eax, [esp + 8] // v32 michael@0: mov ecx, [esp + 12] // count michael@0: shr ecx, 2 michael@0: rep stosd michael@0: mov edi, edx michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // SetRow32 writes 'count' words using a 32 bit value repeated. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, michael@0: int dst_stride, int height) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: push ebp michael@0: mov edi, [esp + 12 + 4] // dst michael@0: mov eax, [esp + 12 + 8] // v32 michael@0: mov ebp, [esp + 12 + 12] // width michael@0: mov edx, [esp + 12 + 16] // dst_stride michael@0: mov esi, [esp + 12 + 20] // height michael@0: lea ecx, [ebp * 4] michael@0: sub edx, ecx // stride - width * 4 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: mov ecx, ebp michael@0: rep stosd michael@0: add edi, edx michael@0: sub esi, 1 michael@0: jg convertloop michael@0: michael@0: pop ebp michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SETROW_X86 michael@0: michael@0: #ifdef HAS_YUY2TOYROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_AVX2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // even bytes are Y michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: vpavgb ymm0, ymm0, [eax + esi] michael@0: vpavgb ymm1, ymm1, [eax + esi + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. 
michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_AVX2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpsrlw ymm0, ymm0, 8 // odd bytes are Y michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: sub ecx, 32 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: vzeroupper michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: vpavgb ymm0, ymm0, [eax + esi] michael@0: vpavgb ymm1, ymm1, [eax + esi + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // UYVY -> UVUV michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. 
michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUV422Row_AVX2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff michael@0: vpsrlw ymm5, ymm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpand ymm0, ymm0, ymm5 // UYVY -> UVUV michael@0: vpand ymm1, ymm1, ymm5 michael@0: vpackuswb ymm0, ymm0, ymm1 // mutates. michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vpand ymm1, ymm0, ymm5 // U michael@0: vpsrlw ymm0, ymm0, 8 // V michael@0: vpackuswb ymm1, ymm1, ymm1 // mutates. michael@0: vpackuswb ymm0, ymm0, ymm0 // mutates. michael@0: vpermq ymm1, ymm1, 0xd8 michael@0: vpermq ymm0, ymm0, 0xd8 michael@0: vextractf128 [edx], ymm1, 0 // U michael@0: vextractf128 [edx + edi], ymm0, 0 // V michael@0: lea edx, [edx + 16] michael@0: sub ecx, 32 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YUY2TOYROW_AVX2 michael@0: michael@0: #ifdef HAS_YUY2TOYROW_SSE2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // even bytes are Y michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + esi] michael@0: movdqa xmm3, [eax + esi + 16] michael@0: lea 
eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_yuy2 michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // even bytes are Y michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + esi] michael@0: movdqu xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 
michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // YUYV -> UVUV michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // odd bytes are Y michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + esi] michael@0: movdqa xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) 
__declspec(align(16)) michael@0: void UYVYToUV422Row_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_y, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_uyvy michael@0: mov edx, [esp + 8] // dst_y michael@0: mov ecx, [esp + 12] // pix michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrlw xmm0, 8 // odd bytes are Y michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_yuy2 michael@0: mov esi, [esp + 8 + 8] // stride_yuy2 michael@0: mov edx, [esp + 8 + 12] // dst_u michael@0: mov edi, [esp + 8 + 16] // dst_v michael@0: mov ecx, [esp + 8 + 20] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: movdqu xmm2, [eax + esi] michael@0: movdqu xmm3, [eax + esi + 16] michael@0: lea eax, [eax + 32] michael@0: pavgb xmm0, xmm2 michael@0: pavgb xmm1, xmm3 michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, michael@0: uint8* dst_u, uint8* dst_v, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_yuy2 michael@0: mov edx, [esp + 4 + 8] // dst_u michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // 
pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff michael@0: psrlw xmm5, 8 michael@0: sub edi, edx michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pand xmm0, xmm5 // UYVY -> UVUV michael@0: pand xmm1, xmm5 michael@0: packuswb xmm0, xmm1 michael@0: movdqa xmm1, xmm0 michael@0: pand xmm0, xmm5 // U michael@0: packuswb xmm0, xmm0 michael@0: psrlw xmm1, 8 // V michael@0: packuswb xmm1, xmm1 michael@0: movq qword ptr [edx], xmm0 michael@0: movq qword ptr [edx + edi], xmm1 michael@0: lea edx, [edx + 8] michael@0: sub ecx, 16 michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_YUY2TOYROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBBLENDROW_SSE2 michael@0: // Blend 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm7, xmm7 // generate constant 1 michael@0: psrlw xmm7, 15 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff michael@0: psrlw xmm6, 8 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 michael@0: psllw xmm5, 8 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: michael@0: sub ecx, 1 michael@0: je convertloop1 // only 1 pixel? michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop until destination pointer is aligned. michael@0: alignloop1: michael@0: test edx, 15 // aligned? michael@0: je alignloop1b michael@0: movd xmm3, [eax] michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge alignloop1 michael@0: michael@0: alignloop1b: michael@0: add ecx, 1 - 4 michael@0: jl convertloop4b michael@0: michael@0: // 4 pixel loop. 
michael@0: convertloop4: michael@0: movdqu xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqu xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqu xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: michael@0: convertloop4b: michael@0: add ecx, 4 - 1 michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop. michael@0: convertloop1: michael@0: movd xmm3, [eax] // src argb michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: psrlw xmm3, 8 // alpha michael@0: pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: pshuflw xmm3, xmm3, 0F5h michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop1b: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBBLENDROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBBLENDROW_SSSE3 michael@0: // Shuffle table for isolating alpha. michael@0: static const uvec8 kShuffleAlpha = { michael@0: 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, michael@0: 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 michael@0: }; michael@0: // Same as SSE2, but replaces: michael@0: // psrlw xmm3, 8 // alpha michael@0: // pshufhw xmm3, xmm3, 0F5h // 8 alpha words michael@0: // pshuflw xmm3, xmm3, 0F5h michael@0: // with.. michael@0: // pshufb xmm3, kShuffleAlpha // alpha michael@0: // Blend 8 pixels at a time. 
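// Illustrative only: a scalar sketch (not a libyuv function) of the per-pixel
// "over" blend computed by ARGBBlendRow_SSE2 above and ARGBBlendRow_SSSE3
// below. It assumes src is already attenuated (premultiplied), as those row
// functions expect.
static uint32 BlendPixel_C_Sketch(uint32 src, uint32 dst) {
  const uint32 ia = 256 - (src >> 24);     // 256 - source alpha
  uint32 out = 0xff000000u;                // destination alpha forced to 255
  for (int shift = 0; shift < 24; shift += 8) {
    const uint32 s = (src >> shift) & 0xff;
    const uint32 d = (dst >> shift) & 0xff;
    uint32 c = s + ((d * ia) >> 8);        // src + dst * (256 - alpha) / 256
    if (c > 255) c = 255;                  // saturate, like paddusb
    out |= c << shift;
  }
  return out;
}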
michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pcmpeqb xmm7, xmm7 // generate constant 0x0001 michael@0: psrlw xmm7, 15 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff michael@0: psrlw xmm6, 8 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 michael@0: psllw xmm5, 8 michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: michael@0: sub ecx, 1 michael@0: je convertloop1 // only 1 pixel? michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop until destination pointer is aligned. michael@0: alignloop1: michael@0: test edx, 15 // aligned? michael@0: je alignloop1b michael@0: movd xmm3, [eax] michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge alignloop1 michael@0: michael@0: alignloop1b: michael@0: add ecx, 1 - 4 michael@0: jl convertloop4b michael@0: michael@0: test eax, 15 // unaligned? michael@0: jne convertuloop4 michael@0: test esi, 15 // unaligned? michael@0: jne convertuloop4 michael@0: michael@0: // 4 pixel loop. michael@0: convertloop4: michael@0: movdqa xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqa xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqa xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: jmp convertloop4b michael@0: michael@0: // 4 pixel unaligned loop. 
michael@0: convertuloop4: michael@0: movdqu xmm3, [eax] // src argb michael@0: lea eax, [eax + 16] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movdqu xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movdqu xmm1, [esi] // _a_g michael@0: lea esi, [esi + 16] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertuloop4 michael@0: michael@0: convertloop4b: michael@0: add ecx, 4 - 1 michael@0: jl convertloop1b michael@0: michael@0: // 1 pixel loop. michael@0: convertloop1: michael@0: movd xmm3, [eax] // src argb michael@0: lea eax, [eax + 4] michael@0: movdqa xmm0, xmm3 // src argb michael@0: pxor xmm3, xmm4 // ~alpha michael@0: movd xmm2, [esi] // _r_b michael@0: pshufb xmm3, kShuffleAlpha // alpha michael@0: pand xmm2, xmm6 // _r_b michael@0: paddw xmm3, xmm7 // 256 - alpha michael@0: pmullw xmm2, xmm3 // _r_b * alpha michael@0: movd xmm1, [esi] // _a_g michael@0: lea esi, [esi + 4] michael@0: psrlw xmm1, 8 // _a_g michael@0: por xmm0, xmm4 // set alpha to 255 michael@0: pmullw xmm1, xmm3 // _a_g * alpha michael@0: psrlw xmm2, 8 // _r_b convert to 8 bits again michael@0: paddusb xmm0, xmm2 // + src argb michael@0: pand xmm1, xmm5 // a_g_ convert to 8 bits again michael@0: paddusb xmm0, xmm1 // + src argb michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop1b: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBBLENDROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_SSE2 michael@0: // Attenuate 4 pixels at a time. michael@0: // Aligned to 16 bytes. 
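// Illustrative only: a scalar sketch (not a libyuv function) of alpha
// attenuation as approximated by the SSE2/SSSE3/AVX2 rows below: each color
// channel is scaled by the pixel's alpha and alpha itself is preserved. The
// SIMD code uses a fixed-point approximation of the divide by 255.
static uint32 AttenuatePixel_C_Sketch(uint32 argb) {
  const uint32 a = argb >> 24;
  const uint32 b = ((argb & 0xff) * a) / 255;
  const uint32 g = (((argb >> 8) & 0xff) * a) / 255;
  const uint32 r = (((argb >> 16) & 0xff) * a) / 255;
  return (a << 24) | (r << 16) | (g << 8) | b;
}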
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: pcmpeqb xmm4, xmm4 // generate mask 0xff000000 michael@0: pslld xmm4, 24 michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff michael@0: psrld xmm5, 8 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: pshufhw xmm2, xmm0, 0FFh // 8 alpha words michael@0: pshuflw xmm2, xmm2, 0FFh michael@0: pmulhuw xmm0, xmm2 // rgb * a michael@0: movdqa xmm1, [eax] // read 4 pixels michael@0: punpckhbw xmm1, xmm1 // next 2 pixels michael@0: pshufhw xmm2, xmm1, 0FFh // 8 alpha words michael@0: pshuflw xmm2, xmm2, 0FFh michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: movdqa xmm2, [eax] // alphas michael@0: lea eax, [eax + 16] michael@0: psrlw xmm0, 8 michael@0: pand xmm2, xmm4 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: pand xmm0, xmm5 // keep original alphas michael@0: por xmm0, xmm2 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_SSSE3 michael@0: // Shuffle table duplicating alpha. michael@0: static const uvec8 kShuffleAlpha0 = { michael@0: 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, michael@0: }; michael@0: static const uvec8 kShuffleAlpha1 = { michael@0: 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, michael@0: 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, michael@0: }; michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: pcmpeqb xmm3, xmm3 // generate mask 0xff000000 michael@0: pslld xmm3, 24 michael@0: movdqa xmm4, kShuffleAlpha0 michael@0: movdqa xmm5, kShuffleAlpha1 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels michael@0: pshufb xmm0, xmm4 // isolate first 2 alphas michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: punpcklbw xmm1, xmm1 // first 2 pixel rgbs michael@0: pmulhuw xmm0, xmm1 // rgb * a michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: pshufb xmm1, xmm5 // isolate next 2 alphas michael@0: movdqu xmm2, [eax] // read 4 pixels michael@0: punpckhbw xmm2, xmm2 // next 2 pixel rgbs michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: movdqu xmm2, [eax] // mask original alpha michael@0: lea eax, [eax + 16] michael@0: pand xmm2, xmm3 michael@0: psrlw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: por xmm0, xmm2 // copy original alpha michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBATTENUATEROW_AVX2 michael@0: // Shuffle table duplicating alpha. 
michael@0: static const ulvec8 kShuffleAlpha_AVX2 = { michael@0: 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, michael@0: 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, michael@0: 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, michael@0: 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, michael@0: }; michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm4, kShuffleAlpha_AVX2 michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 michael@0: vpslld ymm5, ymm5, 24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpshufb ymm2, ymm0, ymm4 // low 4 alphas michael@0: vpshufb ymm3, ymm1, ymm4 // high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * a michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * a michael@0: vpand ymm6, ymm6, ymm5 // isolate alpha michael@0: vpsrlw ymm0, ymm0, 8 michael@0: vpsrlw ymm1, ymm1, 8 michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. michael@0: vpor ymm0, ymm0, ymm6 // copy original alpha michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBATTENUATEROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBUNATTENUATEROW_SSE2 michael@0: // Unattenuate 4 pixels at a time. michael@0: // Aligned to 16 bytes. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_argb0 michael@0: mov edx, [esp + 8 + 8] // dst_argb michael@0: mov ecx, [esp + 8 + 12] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels michael@0: movzx esi, byte ptr [eax + 3] // first alpha michael@0: movzx edi, byte ptr [eax + 7] // second alpha michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: movd xmm2, dword ptr fixed_invtbl8[esi * 4] michael@0: movd xmm3, dword ptr fixed_invtbl8[edi * 4] michael@0: pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a michael@0: pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words michael@0: movlhps xmm2, xmm3 michael@0: pmulhuw xmm0, xmm2 // rgb * a michael@0: michael@0: movdqu xmm1, [eax] // read 4 pixels michael@0: movzx esi, byte ptr [eax + 11] // third alpha michael@0: movzx edi, byte ptr [eax + 15] // forth alpha michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: movd xmm2, dword ptr fixed_invtbl8[esi * 4] michael@0: movd xmm3, dword ptr fixed_invtbl8[edi * 4] michael@0: pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words michael@0: pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words michael@0: movlhps xmm2, xmm3 michael@0: pmulhuw xmm1, xmm2 // rgb * a michael@0: lea eax, [eax + 16] michael@0: michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBUNATTENUATEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBUNATTENUATEROW_AVX2 michael@0: // Shuffle table duplicating alpha. michael@0: static const ulvec8 kUnattenShuffleAlpha_AVX2 = { michael@0: 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, michael@0: 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, michael@0: }; michael@0: // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. michael@0: // USE_GATHER is not on by default, due to being a slow instruction. michael@0: #ifdef USE_GATHER michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. michael@0: vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a michael@0: vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a michael@0: vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. michael@0: vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a michael@0: vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * ia michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * ia michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. 
michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #else // USE_GATHER michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: int width) { michael@0: __asm { michael@0: michael@0: mov eax, [esp + 4] // src_argb0 michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: sub edx, eax michael@0: vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 michael@0: michael@0: push esi michael@0: push edi michael@0: michael@0: align 4 michael@0: convertloop: michael@0: // replace VPGATHER michael@0: movzx esi, byte ptr [eax + 3] // alpha0 michael@0: movzx edi, byte ptr [eax + 7] // alpha1 michael@0: vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] michael@0: vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] michael@0: movzx esi, byte ptr [eax + 11] // alpha2 michael@0: movzx edi, byte ptr [eax + 15] // alpha3 michael@0: vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] michael@0: vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] michael@0: vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] michael@0: movzx esi, byte ptr [eax + 19] // alpha4 michael@0: movzx edi, byte ptr [eax + 23] // alpha5 michael@0: vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] michael@0: vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] michael@0: vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] michael@0: movzx esi, byte ptr [eax + 27] // alpha6 michael@0: movzx edi, byte ptr [eax + 31] // alpha7 michael@0: vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] michael@0: vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] michael@0: vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] michael@0: vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] michael@0: vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] michael@0: vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] michael@0: vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] michael@0: // end of VPGATHER michael@0: michael@0: vmovdqu ymm6, [eax] // read 8 pixels. michael@0: vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. michael@0: vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. michael@0: vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a michael@0: vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. michael@0: vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a michael@0: vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas michael@0: vpmulhuw ymm0, ymm0, ymm2 // rgb * ia michael@0: vpmulhuw ymm1, ymm1, ymm3 // rgb * ia michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutated. michael@0: sub ecx, 8 michael@0: vmovdqu [eax + edx], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // USE_GATHER michael@0: #endif // HAS_ARGBATTENUATEROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBGRAYROW_SSSE3 michael@0: // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 
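// Illustrative only: a scalar sketch (not a libyuv function) of the gray
// conversion done by ARGBGrayRow_SSSE3 below: a full-range luma using the
// kARGBToYJ weights (+64 for rounding, >>7) is written to B, G and R, and
// alpha is preserved.
static uint32 GrayPixel_C_Sketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  const uint32 y = (b * 15 + g * 75 + r * 38 + 64) >> 7;  // 0..255
  return (argb & 0xff000000u) | (y << 16) | (y << 8) | y;
}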
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* src_argb */ michael@0: mov edx, [esp + 8] /* dst_argb */ michael@0: mov ecx, [esp + 12] /* width */ michael@0: movdqa xmm4, kARGBToYJ michael@0: movdqa xmm5, kAddYJ64 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm0, xmm4 michael@0: pmaddubsw xmm1, xmm4 michael@0: phaddw xmm0, xmm1 michael@0: paddw xmm0, xmm5 // Add .5 for rounding. michael@0: psrlw xmm0, 7 michael@0: packuswb xmm0, xmm0 // 8 G bytes michael@0: movdqa xmm2, [eax] // A michael@0: movdqa xmm3, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrld xmm2, 24 michael@0: psrld xmm3, 24 michael@0: packuswb xmm2, xmm3 michael@0: packuswb xmm2, xmm2 // 8 A bytes michael@0: movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA michael@0: punpcklbw xmm0, xmm0 // 8 GG words michael@0: punpcklbw xmm3, xmm2 // 8 GA words michael@0: movdqa xmm1, xmm0 michael@0: punpcklwd xmm0, xmm3 // GGGA first 4 michael@0: punpckhwd xmm1, xmm3 // GGGA next 4 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBGRAYROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBSEPIAROW_SSSE3 michael@0: // b = (r * 35 + g * 68 + b * 17) >> 7 michael@0: // g = (r * 45 + g * 88 + b * 22) >> 7 michael@0: // r = (r * 50 + g * 98 + b * 24) >> 7 michael@0: // Constant for ARGB color to sepia tone. michael@0: static const vec8 kARGBToSepiaB = { michael@0: 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 michael@0: }; michael@0: michael@0: static const vec8 kARGBToSepiaG = { michael@0: 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 michael@0: }; michael@0: michael@0: static const vec8 kARGBToSepiaR = { michael@0: 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 michael@0: }; michael@0: michael@0: // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
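// Illustrative only: a scalar sketch (not a libyuv function) of the in-place
// sepia transform applied by ARGBSepiaRow_SSSE3 below, using the coefficient
// tables above (>>7, no rounding term, alpha preserved).
static uint32 SepiaPixel_C_Sketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  uint32 sb = (b * 17 + g * 68 + r * 35) >> 7;
  uint32 sg = (b * 22 + g * 88 + r * 45) >> 7;
  uint32 sr = (b * 24 + g * 98 + r * 50) >> 7;
  if (sb > 255) sb = 255;  // packuswb saturates in the asm version
  if (sg > 255) sg = 255;
  if (sr > 255) sr = 255;
  return (argb & 0xff000000u) | (sr << 16) | (sg << 8) | sb;
}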
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* dst_argb */ michael@0: mov ecx, [esp + 8] /* width */ michael@0: movdqa xmm2, kARGBToSepiaB michael@0: movdqa xmm3, kARGBToSepiaG michael@0: movdqa xmm4, kARGBToSepiaR michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // B michael@0: movdqa xmm6, [eax + 16] michael@0: pmaddubsw xmm0, xmm2 michael@0: pmaddubsw xmm6, xmm2 michael@0: phaddw xmm0, xmm6 michael@0: psrlw xmm0, 7 michael@0: packuswb xmm0, xmm0 // 8 B values michael@0: movdqa xmm5, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm5, xmm3 michael@0: pmaddubsw xmm1, xmm3 michael@0: phaddw xmm5, xmm1 michael@0: psrlw xmm5, 7 michael@0: packuswb xmm5, xmm5 // 8 G values michael@0: punpcklbw xmm0, xmm5 // 8 BG values michael@0: movdqa xmm5, [eax] // R michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm5, xmm4 michael@0: pmaddubsw xmm1, xmm4 michael@0: phaddw xmm5, xmm1 michael@0: psrlw xmm5, 7 michael@0: packuswb xmm5, xmm5 // 8 R values michael@0: movdqa xmm6, [eax] // A michael@0: movdqa xmm1, [eax + 16] michael@0: psrld xmm6, 24 michael@0: psrld xmm1, 24 michael@0: packuswb xmm6, xmm1 michael@0: packuswb xmm6, xmm6 // 8 A values michael@0: punpcklbw xmm5, xmm6 // 8 RA values michael@0: movdqa xmm1, xmm0 // Weave BG, RA together michael@0: punpcklwd xmm0, xmm5 // BGRA first 4 michael@0: punpckhwd xmm1, xmm5 // BGRA next 4 michael@0: sub ecx, 8 michael@0: movdqa [eax], xmm0 michael@0: movdqa [eax + 16], xmm1 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSEPIAROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 michael@0: // Tranform 8 ARGB pixels (32 bytes) with color matrix. michael@0: // Same as Sepia except matrix is provided. michael@0: // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R michael@0: // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
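// Illustrative only: a scalar sketch (not a libyuv function) of how
// ARGBColorMatrixRow_SSSE3 below forms each output channel: a signed dot
// product of the (B,G,R,A) bytes with one 4-byte row of matrix_argb,
// shifted right by 6 and clamped to 0..255.
static uint8 MatrixChannel_C_Sketch(const uint8* bgra, const int8* row) {
  int v = bgra[0] * row[0] + bgra[1] * row[1] +
          bgra[2] * row[2] + bgra[3] * row[3];
  v >>= 6;                 // the asm uses psraw 6
  if (v < 0) v = 0;        // packuswb clamps to 0..255
  if (v > 255) v = 255;
  return static_cast<uint8>(v);
}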
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const int8* matrix_argb, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* src_argb */ michael@0: mov edx, [esp + 8] /* dst_argb */ michael@0: mov ecx, [esp + 12] /* matrix_argb */ michael@0: movdqu xmm5, [ecx] michael@0: pshufd xmm2, xmm5, 0x00 michael@0: pshufd xmm3, xmm5, 0x55 michael@0: pshufd xmm4, xmm5, 0xaa michael@0: pshufd xmm5, xmm5, 0xff michael@0: mov ecx, [esp + 16] /* width */ michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // B michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm0, xmm2 michael@0: pmaddubsw xmm7, xmm2 michael@0: movdqa xmm6, [eax] // G michael@0: movdqa xmm1, [eax + 16] michael@0: pmaddubsw xmm6, xmm3 michael@0: pmaddubsw xmm1, xmm3 michael@0: phaddsw xmm0, xmm7 // B michael@0: phaddsw xmm6, xmm1 // G michael@0: psraw xmm0, 6 // B michael@0: psraw xmm6, 6 // G michael@0: packuswb xmm0, xmm0 // 8 B values michael@0: packuswb xmm6, xmm6 // 8 G values michael@0: punpcklbw xmm0, xmm6 // 8 BG values michael@0: movdqa xmm1, [eax] // R michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm1, xmm4 michael@0: pmaddubsw xmm7, xmm4 michael@0: phaddsw xmm1, xmm7 // R michael@0: movdqa xmm6, [eax] // A michael@0: movdqa xmm7, [eax + 16] michael@0: pmaddubsw xmm6, xmm5 michael@0: pmaddubsw xmm7, xmm5 michael@0: phaddsw xmm6, xmm7 // A michael@0: psraw xmm1, 6 // R michael@0: psraw xmm6, 6 // A michael@0: packuswb xmm1, xmm1 // 8 R values michael@0: packuswb xmm6, xmm6 // 8 A values michael@0: punpcklbw xmm1, xmm6 // 8 RA values michael@0: movdqa xmm6, xmm0 // Weave BG, RA together michael@0: punpcklwd xmm0, xmm1 // BGRA first 4 michael@0: punpckhwd xmm6, xmm1 // BGRA next 4 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm6 michael@0: lea eax, [eax + 32] michael@0: lea edx, [edx + 32] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 michael@0: michael@0: #ifdef HAS_ARGBQUANTIZEROW_SSE2 michael@0: // Quantize 4 ARGB pixels (16 bytes). michael@0: // Aligned to 16 bytes. 
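// A minimal scalar sketch of the quantize step below (illustrative only; the
// helper name is hypothetical). Each B, G and R byte is posterized as
// (v * scale >> 16) * interval_size + interval_offset; alpha is carried over.
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 3; ++j) {  // B, G, R; alpha is left unchanged.
      int v = (dst_argb[j] * scale >> 16) * interval_size + interval_offset;
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);  // packuswb saturates likewise.
    }
    dst_argb += 4;
  }
}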
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, michael@0: int interval_offset, int width) { michael@0: __asm { michael@0: mov eax, [esp + 4] /* dst_argb */ michael@0: movd xmm2, [esp + 8] /* scale */ michael@0: movd xmm3, [esp + 12] /* interval_size */ michael@0: movd xmm4, [esp + 16] /* interval_offset */ michael@0: mov ecx, [esp + 20] /* width */ michael@0: pshuflw xmm2, xmm2, 040h michael@0: pshufd xmm2, xmm2, 044h michael@0: pshuflw xmm3, xmm3, 040h michael@0: pshufd xmm3, xmm3, 044h michael@0: pshuflw xmm4, xmm4, 040h michael@0: pshufd xmm4, xmm4, 044h michael@0: pxor xmm5, xmm5 // constant 0 michael@0: pcmpeqb xmm6, xmm6 // generate mask 0xff000000 michael@0: pslld xmm6, 24 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: punpcklbw xmm0, xmm5 // first 2 pixels michael@0: pmulhuw xmm0, xmm2 // pixel * scale >> 16 michael@0: movdqa xmm1, [eax] // read 4 pixels michael@0: punpckhbw xmm1, xmm5 // next 2 pixels michael@0: pmulhuw xmm1, xmm2 michael@0: pmullw xmm0, xmm3 // * interval_size michael@0: movdqa xmm7, [eax] // read 4 pixels michael@0: pmullw xmm1, xmm3 michael@0: pand xmm7, xmm6 // mask alpha michael@0: paddw xmm0, xmm4 // + interval_size / 2 michael@0: paddw xmm1, xmm4 michael@0: packuswb xmm0, xmm1 michael@0: por xmm0, xmm7 michael@0: sub ecx, 4 michael@0: movdqa [eax], xmm0 michael@0: lea eax, [eax + 16] michael@0: jg convertloop michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBQUANTIZEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBSHADEROW_SSE2 michael@0: // Shade 4 pixels at a time by specified value. michael@0: // Aligned to 16 bytes. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, michael@0: uint32 value) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // width michael@0: movd xmm2, [esp + 16] // value michael@0: punpcklbw xmm2, xmm2 michael@0: punpcklqdq xmm2, xmm2 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 4 pixels michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: pmulhuw xmm0, xmm2 // argb * value michael@0: pmulhuw xmm1, xmm2 // argb * value michael@0: psrlw xmm0, 8 michael@0: psrlw xmm1, 8 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSHADEROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBMULTIPLYROW_SSE2 michael@0: // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
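// A minimal scalar sketch of the per-channel multiply below (illustrative
// only; the helper name is hypothetical). Duplicating a byte into both halves
// of a 16-bit word is the same as multiplying by 257, so the pmulhuw in the
// SSE2 path computes (a * 257 * b) >> 16, a close approximation of a * b / 255.
static void ARGBMultiplyRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // every channel, including alpha
    dst_argb[i] = (uint8)((src_argb0[i] * 257 * src_argb1[i]) >> 16);
  }
}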
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: movdqu xmm2, [esi] // read 4 pixels from src_argb1 michael@0: movdqu xmm1, xmm0 michael@0: movdqu xmm3, xmm2 michael@0: punpcklbw xmm0, xmm0 // first 2 michael@0: punpckhbw xmm1, xmm1 // next 2 michael@0: punpcklbw xmm2, xmm5 // first 2 michael@0: punpckhbw xmm3, xmm5 // next 2 michael@0: pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 michael@0: pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 michael@0: lea eax, [eax + 16] michael@0: lea esi, [esi + 16] michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMULTIPLYROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBADDROW_SSE2 michael@0: // Add 2 rows of ARGB pixels together, 4 pixels at a time. michael@0: // TODO(fbarchard): Port this to posix, neon and other math functions. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: sub ecx, 4 michael@0: jl convertloop49 michael@0: michael@0: align 4 michael@0: convertloop4: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: lea eax, [eax + 16] michael@0: movdqu xmm1, [esi] // read 4 pixels from src_argb1 michael@0: lea esi, [esi + 16] michael@0: paddusb xmm0, xmm1 // src_argb0 + src_argb1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jge convertloop4 michael@0: michael@0: convertloop49: michael@0: add ecx, 4 - 1 michael@0: jl convertloop19 michael@0: michael@0: convertloop1: michael@0: movd xmm0, [eax] // read 1 pixels from src_argb0 michael@0: lea eax, [eax + 4] michael@0: movd xmm1, [esi] // read 1 pixels from src_argb1 michael@0: lea esi, [esi + 4] michael@0: paddusb xmm0, xmm1 // src_argb0 + src_argb1 michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge convertloop1 michael@0: michael@0: convertloop19: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBADDROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBSUBTRACTROW_SSE2 michael@0: // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
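// A minimal scalar sketch of the saturating subtract below (illustrative
// only; the helper name is hypothetical). psubusb clamps each byte at zero.
static void ARGBSubtractRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // every channel, including alpha
    int d = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(d < 0 ? 0 : d);
  }
}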
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqu xmm0, [eax] // read 4 pixels from src_argb0 michael@0: lea eax, [eax + 16] michael@0: movdqu xmm1, [esi] // read 4 pixels from src_argb1 michael@0: lea esi, [esi + 16] michael@0: psubusb xmm0, xmm1 // src_argb0 - src_argb1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSUBTRACTROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBMULTIPLYROW_AVX2 michael@0: // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: vpxor ymm5, ymm5, ymm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 michael@0: lea esi, [esi + 32] michael@0: vpunpcklbw ymm0, ymm1, ymm1 // low 4 michael@0: vpunpckhbw ymm1, ymm1, ymm1 // high 4 michael@0: vpunpcklbw ymm2, ymm3, ymm5 // low 4 michael@0: vpunpckhbw ymm3, ymm3, ymm5 // high 4 michael@0: vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 michael@0: vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 michael@0: vpackuswb ymm0, ymm0, ymm1 michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBMULTIPLYROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBADDROW_AVX2 michael@0: // Add 2 rows of ARGB pixels together, 8 pixels at a time. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 michael@0: lea esi, [esi + 32] michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBADDROW_AVX2 michael@0: michael@0: #ifdef HAS_ARGBSUBTRACTROW_AVX2 michael@0: // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_argb0 michael@0: mov esi, [esp + 4 + 8] // src_argb1 michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 michael@0: lea eax, [eax + 32] michael@0: vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 michael@0: lea esi, [esi + 32] michael@0: vmovdqu [edx], ymm0 michael@0: lea edx, [edx + 32] michael@0: sub ecx, 8 michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSUBTRACTROW_AVX2 michael@0: michael@0: #ifdef HAS_SOBELXROW_SSE2 michael@0: // SobelX as a matrix is michael@0: // -1 0 1 michael@0: // -2 0 2 michael@0: // -1 0 1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, michael@0: const uint8* src_y2, uint8* dst_sobelx, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 8 + 4] // src_y0 michael@0: mov esi, [esp + 8 + 8] // src_y1 michael@0: mov edi, [esp + 8 + 12] // src_y2 michael@0: mov edx, [esp + 8 + 16] // dst_sobelx michael@0: mov ecx, [esp + 8 + 20] // width michael@0: sub esi, eax michael@0: sub edi, eax michael@0: sub edx, eax michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] michael@0: movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] michael@0: punpcklbw xmm0, xmm5 michael@0: punpcklbw xmm1, xmm5 michael@0: psubw xmm0, xmm1 michael@0: movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] michael@0: movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] michael@0: punpcklbw xmm1, xmm5 michael@0: punpcklbw xmm2, xmm5 michael@0: psubw xmm1, xmm2 michael@0: movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] michael@0: movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] michael@0: punpcklbw xmm2, xmm5 michael@0: punpcklbw xmm3, xmm5 michael@0: psubw xmm2, xmm3 michael@0: paddw xmm0, xmm2 michael@0: paddw xmm0, xmm1 michael@0: paddw xmm0, xmm1 michael@0: pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw michael@0: psubw xmm1, xmm0 michael@0: pmaxsw xmm0, xmm1 michael@0: packuswb xmm0, xmm0 michael@0: sub ecx, 8 michael@0: movq qword ptr [eax + edx], xmm0 michael@0: lea eax, [eax + 8] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELXROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELYROW_SSE2 michael@0: // SobelY as a matrix is michael@0: // -1 -2 -1 michael@0: // 0 0 0 michael@0: // 1 2 1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, michael@0: uint8* dst_sobely, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_y0 michael@0: mov esi, [esp + 4 + 8] // src_y1 michael@0: mov edx, [esp + 4 + 12] // dst_sobely michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: sub edx, eax michael@0: pxor xmm5, xmm5 // constant 0 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] michael@0: movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] michael@0: punpcklbw xmm0, xmm5 michael@0: punpcklbw xmm1, xmm5 michael@0: psubw xmm0, xmm1 michael@0: movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] michael@0: movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] michael@0: punpcklbw xmm1, xmm5 michael@0: punpcklbw xmm2, xmm5 michael@0: psubw xmm1, xmm2 michael@0: movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] michael@0: movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] michael@0: punpcklbw xmm2, xmm5 michael@0: punpcklbw xmm3, xmm5 michael@0: psubw xmm2, xmm3 michael@0: paddw xmm0, xmm2 michael@0: paddw xmm0, xmm1 michael@0: paddw xmm0, xmm1 michael@0: pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw michael@0: psubw xmm1, xmm0 michael@0: pmaxsw xmm0, xmm1 michael@0: packuswb xmm0, xmm0 michael@0: sub ecx, 8 michael@0: movq qword ptr [eax + edx], xmm0 michael@0: lea eax, [eax + 8] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELYROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELROW_SSE2 michael@0: // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
michael@0: // A = 255 michael@0: // R = Sobel michael@0: // G = Sobel michael@0: // B = Sobel michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: pcmpeqb xmm5, xmm5 // alpha 255 michael@0: pslld xmm5, 24 // 0xff000000 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: paddusb xmm0, xmm1 // sobel = sobelx + sobely michael@0: movdqa xmm2, xmm0 // GG michael@0: punpcklbw xmm2, xmm0 // First 8 michael@0: punpckhbw xmm0, xmm0 // Next 8 michael@0: movdqa xmm1, xmm2 // GGGG michael@0: punpcklwd xmm1, xmm2 // First 4 michael@0: punpckhwd xmm2, xmm2 // Next 4 michael@0: por xmm1, xmm5 // GGGA michael@0: por xmm2, xmm5 michael@0: movdqa xmm3, xmm0 // GGGG michael@0: punpcklwd xmm3, xmm0 // Next 4 michael@0: punpckhwd xmm0, xmm0 // Last 4 michael@0: por xmm3, xmm5 // GGGA michael@0: por xmm0, xmm5 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm1 michael@0: movdqa [edx + 16], xmm2 michael@0: movdqa [edx + 32], xmm3 michael@0: movdqa [edx + 48], xmm0 michael@0: lea edx, [edx + 64] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELTOPLANEROW_SSE2 michael@0: // Adds Sobel X and Sobel Y and stores Sobel into a plane. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_y, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: paddusb xmm0, xmm1 // sobel = sobelx + sobely michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELTOPLANEROW_SSE2 michael@0: michael@0: #ifdef HAS_SOBELXYROW_SSE2 michael@0: // Mixes Sobel X, Sobel Y and Sobel into ARGB. 
michael@0: // A = 255 michael@0: // R = Sobel X michael@0: // G = Sobel michael@0: // B = Sobel Y michael@0: __declspec(naked) __declspec(align(16)) michael@0: void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, michael@0: uint8* dst_argb, int width) { michael@0: __asm { michael@0: push esi michael@0: mov eax, [esp + 4 + 4] // src_sobelx michael@0: mov esi, [esp + 4 + 8] // src_sobely michael@0: mov edx, [esp + 4 + 12] // dst_argb michael@0: mov ecx, [esp + 4 + 16] // width michael@0: sub esi, eax michael@0: pcmpeqb xmm5, xmm5 // alpha 255 michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] // read 16 pixels src_sobelx michael@0: movdqa xmm1, [eax + esi] // read 16 pixels src_sobely michael@0: lea eax, [eax + 16] michael@0: movdqa xmm2, xmm0 michael@0: paddusb xmm2, xmm1 // sobel = sobelx + sobely michael@0: movdqa xmm3, xmm0 // XA michael@0: punpcklbw xmm3, xmm5 michael@0: punpckhbw xmm0, xmm5 michael@0: movdqa xmm4, xmm1 // YS michael@0: punpcklbw xmm4, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: movdqa xmm6, xmm4 // YSXA michael@0: punpcklwd xmm6, xmm3 // First 4 michael@0: punpckhwd xmm4, xmm3 // Next 4 michael@0: movdqa xmm7, xmm1 // YSXA michael@0: punpcklwd xmm7, xmm0 // Next 4 michael@0: punpckhwd xmm1, xmm0 // Last 4 michael@0: sub ecx, 16 michael@0: movdqa [edx], xmm6 michael@0: movdqa [edx + 16], xmm4 michael@0: movdqa [edx + 32], xmm7 michael@0: movdqa [edx + 48], xmm1 michael@0: lea edx, [edx + 64] michael@0: jg convertloop michael@0: michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_SOBELXYROW_SSE2 michael@0: michael@0: #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 michael@0: // Consider float CumulativeSum. michael@0: // Consider calling CumulativeSum one row at time as needed. michael@0: // Consider circular CumulativeSum buffer of radius * 2 + 1 height. michael@0: // Convert cumulative sum for an area to an average for 1 pixel. michael@0: // topleft is pointer to top left of CumulativeSum buffer for area. michael@0: // botleft is pointer to bottom left of CumulativeSum buffer. michael@0: // width is offset from left to right of area in CumulativeSum buffer measured michael@0: // in number of ints. michael@0: // area is the number of pixels in the area being averaged. michael@0: // dst points to pixel to store result to. michael@0: // count is number of averaged pixels to produce. michael@0: // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte michael@0: // aligned. michael@0: void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, michael@0: int width, int area, uint8* dst, michael@0: int count) { michael@0: __asm { michael@0: mov eax, topleft // eax topleft michael@0: mov esi, botleft // esi botleft michael@0: mov edx, width michael@0: movd xmm5, area michael@0: mov edi, dst michael@0: mov ecx, count michael@0: cvtdq2ps xmm5, xmm5 michael@0: rcpss xmm4, xmm5 // 1.0f / area michael@0: pshufd xmm4, xmm4, 0 michael@0: sub ecx, 4 michael@0: jl l4b michael@0: michael@0: cmp area, 128 // 128 pixels will not overflow 15 bits. 
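 // For areas of 128 pixels or fewer, the 32-bit sums fit in signed 16-bit
 // words, so the s4 loop below packs them and multiplies by 1/area prescaled
 // to 0.16 fixed point (pmulhuw). Larger areas take the ja below to the
 // float path at l4.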
michael@0: ja l4 michael@0: michael@0: pshufd xmm5, xmm5, 0 // area michael@0: pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 michael@0: psrld xmm6, 16 michael@0: cvtdq2ps xmm6, xmm6 michael@0: addps xmm5, xmm6 // (65536.0 + area - 1) michael@0: mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area michael@0: cvtps2dq xmm5, xmm5 // 0.16 fixed point michael@0: packssdw xmm5, xmm5 // 16 bit shorts michael@0: michael@0: // 4 pixel loop small blocks. michael@0: align 4 michael@0: s4: michael@0: // top left michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: michael@0: // - top right michael@0: psubd xmm0, [eax + edx * 4] michael@0: psubd xmm1, [eax + edx * 4 + 16] michael@0: psubd xmm2, [eax + edx * 4 + 32] michael@0: psubd xmm3, [eax + edx * 4 + 48] michael@0: lea eax, [eax + 64] michael@0: michael@0: // - bottom left michael@0: psubd xmm0, [esi] michael@0: psubd xmm1, [esi + 16] michael@0: psubd xmm2, [esi + 32] michael@0: psubd xmm3, [esi + 48] michael@0: michael@0: // + bottom right michael@0: paddd xmm0, [esi + edx * 4] michael@0: paddd xmm1, [esi + edx * 4 + 16] michael@0: paddd xmm2, [esi + edx * 4 + 32] michael@0: paddd xmm3, [esi + edx * 4 + 48] michael@0: lea esi, [esi + 64] michael@0: michael@0: packssdw xmm0, xmm1 // pack 4 pixels into 2 registers michael@0: packssdw xmm2, xmm3 michael@0: michael@0: pmulhuw xmm0, xmm5 michael@0: pmulhuw xmm2, xmm5 michael@0: michael@0: packuswb xmm0, xmm2 michael@0: movdqu [edi], xmm0 michael@0: lea edi, [edi + 16] michael@0: sub ecx, 4 michael@0: jge s4 michael@0: michael@0: jmp l4b michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: // top left michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: movdqa xmm2, [eax + 32] michael@0: movdqa xmm3, [eax + 48] michael@0: michael@0: // - top right michael@0: psubd xmm0, [eax + edx * 4] michael@0: psubd xmm1, [eax + edx * 4 + 16] michael@0: psubd xmm2, [eax + edx * 4 + 32] michael@0: psubd xmm3, [eax + edx * 4 + 48] michael@0: lea eax, [eax + 64] michael@0: michael@0: // - bottom left michael@0: psubd xmm0, [esi] michael@0: psubd xmm1, [esi + 16] michael@0: psubd xmm2, [esi + 32] michael@0: psubd xmm3, [esi + 48] michael@0: michael@0: // + bottom right michael@0: paddd xmm0, [esi + edx * 4] michael@0: paddd xmm1, [esi + edx * 4 + 16] michael@0: paddd xmm2, [esi + edx * 4 + 32] michael@0: paddd xmm3, [esi + edx * 4 + 48] michael@0: lea esi, [esi + 64] michael@0: michael@0: cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area michael@0: cvtdq2ps xmm1, xmm1 michael@0: mulps xmm0, xmm4 michael@0: mulps xmm1, xmm4 michael@0: cvtdq2ps xmm2, xmm2 michael@0: cvtdq2ps xmm3, xmm3 michael@0: mulps xmm2, xmm4 michael@0: mulps xmm3, xmm4 michael@0: cvtps2dq xmm0, xmm0 michael@0: cvtps2dq xmm1, xmm1 michael@0: cvtps2dq xmm2, xmm2 michael@0: cvtps2dq xmm3, xmm3 michael@0: packssdw xmm0, xmm1 michael@0: packssdw xmm2, xmm3 michael@0: packuswb xmm0, xmm2 michael@0: movdqu [edi], xmm0 michael@0: lea edi, [edi + 16] michael@0: sub ecx, 4 michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: movdqa xmm0, [eax] michael@0: psubd xmm0, [eax + edx * 4] michael@0: lea eax, [eax + 16] michael@0: psubd xmm0, [esi] michael@0: paddd xmm0, [esi + edx * 4] michael@0: lea esi, [esi + 16] michael@0: cvtdq2ps xmm0, xmm0 michael@0: mulps xmm0, xmm4 michael@0: cvtps2dq xmm0, 
xmm0 michael@0: packssdw xmm0, xmm0 michael@0: packuswb xmm0, xmm0 michael@0: movd dword ptr [edi], xmm0 michael@0: lea edi, [edi + 4] michael@0: sub ecx, 1 michael@0: jge l1 michael@0: l1b: michael@0: } michael@0: } michael@0: #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 michael@0: michael@0: #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 michael@0: // Creates a table of cumulative sums where each value is a sum of all values michael@0: // above and to the left of the value. michael@0: void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, michael@0: const int32* previous_cumsum, int width) { michael@0: __asm { michael@0: mov eax, row michael@0: mov edx, cumsum michael@0: mov esi, previous_cumsum michael@0: mov ecx, width michael@0: pxor xmm0, xmm0 michael@0: pxor xmm1, xmm1 michael@0: michael@0: sub ecx, 4 michael@0: jl l4b michael@0: test edx, 15 michael@0: jne l4b michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. michael@0: lea eax, [eax + 16] michael@0: movdqa xmm4, xmm2 michael@0: michael@0: punpcklbw xmm2, xmm1 michael@0: movdqa xmm3, xmm2 michael@0: punpcklwd xmm2, xmm1 michael@0: punpckhwd xmm3, xmm1 michael@0: michael@0: punpckhbw xmm4, xmm1 michael@0: movdqa xmm5, xmm4 michael@0: punpcklwd xmm4, xmm1 michael@0: punpckhwd xmm5, xmm1 michael@0: michael@0: paddd xmm0, xmm2 michael@0: movdqa xmm2, [esi] // previous row above. michael@0: paddd xmm2, xmm0 michael@0: michael@0: paddd xmm0, xmm3 michael@0: movdqa xmm3, [esi + 16] michael@0: paddd xmm3, xmm0 michael@0: michael@0: paddd xmm0, xmm4 michael@0: movdqa xmm4, [esi + 32] michael@0: paddd xmm4, xmm0 michael@0: michael@0: paddd xmm0, xmm5 michael@0: movdqa xmm5, [esi + 48] michael@0: lea esi, [esi + 64] michael@0: paddd xmm5, xmm0 michael@0: michael@0: movdqa [edx], xmm2 michael@0: movdqa [edx + 16], xmm3 michael@0: movdqa [edx + 32], xmm4 michael@0: movdqa [edx + 48], xmm5 michael@0: michael@0: lea edx, [edx + 64] michael@0: sub ecx, 4 michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. michael@0: lea eax, [eax + 4] michael@0: punpcklbw xmm2, xmm1 michael@0: punpcklwd xmm2, xmm1 michael@0: paddd xmm0, xmm2 michael@0: movdqu xmm2, [esi] michael@0: lea esi, [esi + 16] michael@0: paddd xmm2, xmm0 michael@0: movdqu [edx], xmm2 michael@0: lea edx, [edx + 16] michael@0: sub ecx, 1 michael@0: jge l1 michael@0: michael@0: l1b: michael@0: } michael@0: } michael@0: #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 michael@0: michael@0: #ifdef HAS_ARGBAFFINEROW_SSE2 michael@0: // Copy ARGB pixels from source image with slope to a row of destination. 
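// A minimal scalar sketch of the affine copy below (illustrative only; the
// helper name is hypothetical). uv_dudv holds the starting source position
// (u, v) and the per-destination-pixel step (du, dv); each output pixel is
// fetched from src at offset x * 4 + y * stride after truncating to ints.
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq also truncates toward zero.
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}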
michael@0: __declspec(naked) __declspec(align(16)) michael@0: LIBYUV_API michael@0: void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, michael@0: uint8* dst_argb, const float* uv_dudv, int width) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov eax, [esp + 12] // src_argb michael@0: mov esi, [esp + 16] // stride michael@0: mov edx, [esp + 20] // dst_argb michael@0: mov ecx, [esp + 24] // pointer to uv_dudv michael@0: movq xmm2, qword ptr [ecx] // uv michael@0: movq xmm7, qword ptr [ecx + 8] // dudv michael@0: mov ecx, [esp + 28] // width michael@0: shl esi, 16 // 4, stride michael@0: add esi, 4 michael@0: movd xmm5, esi michael@0: sub ecx, 4 michael@0: jl l4b michael@0: michael@0: // setup for 4 pixel loop michael@0: pshufd xmm7, xmm7, 0x44 // dup dudv michael@0: pshufd xmm5, xmm5, 0 // dup 4, stride michael@0: movdqa xmm0, xmm2 // x0, y0, x1, y1 michael@0: addps xmm0, xmm7 michael@0: movlhps xmm2, xmm0 michael@0: movdqa xmm4, xmm7 michael@0: addps xmm4, xmm4 // dudv *= 2 michael@0: movdqa xmm3, xmm2 // x2, y2, x3, y3 michael@0: addps xmm3, xmm4 michael@0: addps xmm4, xmm4 // dudv *= 4 michael@0: michael@0: // 4 pixel loop michael@0: align 4 michael@0: l4: michael@0: cvttps2dq xmm0, xmm2 // x, y float to int first 2 michael@0: cvttps2dq xmm1, xmm3 // x, y float to int next 2 michael@0: packssdw xmm0, xmm1 // x, y as 8 shorts michael@0: pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. michael@0: movd esi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd edi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd xmm1, [eax + esi] // read pixel 0 michael@0: movd xmm6, [eax + edi] // read pixel 1 michael@0: punpckldq xmm1, xmm6 // combine pixel 0 and 1 michael@0: addps xmm2, xmm4 // x, y += dx, dy first 2 michael@0: movq qword ptr [edx], xmm1 michael@0: movd esi, xmm0 michael@0: pshufd xmm0, xmm0, 0x39 // shift right michael@0: movd edi, xmm0 michael@0: movd xmm6, [eax + esi] // read pixel 2 michael@0: movd xmm0, [eax + edi] // read pixel 3 michael@0: punpckldq xmm6, xmm0 // combine pixel 2 and 3 michael@0: addps xmm3, xmm4 // x, y += dx, dy next 2 michael@0: sub ecx, 4 michael@0: movq qword ptr 8[edx], xmm6 michael@0: lea edx, [edx + 16] michael@0: jge l4 michael@0: michael@0: l4b: michael@0: add ecx, 4 - 1 michael@0: jl l1b michael@0: michael@0: // 1 pixel loop michael@0: align 4 michael@0: l1: michael@0: cvttps2dq xmm0, xmm2 // x, y float to int michael@0: packssdw xmm0, xmm0 // x, y as shorts michael@0: pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride michael@0: addps xmm2, xmm7 // x, y += dx, dy michael@0: movd esi, xmm0 michael@0: movd xmm0, [eax + esi] // copy a pixel michael@0: sub ecx, 1 michael@0: movd [edx], xmm0 michael@0: lea edx, [edx + 4] michael@0: jge l1 michael@0: l1b: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBAFFINEROW_SSE2 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_AVX2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction 
(0..255) michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: sub edi, esi michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. michael@0: michael@0: vmovd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: vmovd xmm5, eax // low fraction 128..1 michael@0: vpunpcklbw xmm5, xmm5, xmm0 michael@0: vpunpcklwd xmm5, xmm5, xmm5 michael@0: vpxor ymm0, ymm0, ymm0 michael@0: vpermd ymm5, ymm0, ymm5 michael@0: michael@0: align 4 michael@0: xloop: michael@0: vmovdqu ymm0, [esi] michael@0: vmovdqu ymm2, [esi + edx] michael@0: vpunpckhbw ymm1, ymm0, ymm2 // mutates michael@0: vpunpcklbw ymm0, ymm0, ymm2 // mutates michael@0: vpmaddubsw ymm0, ymm0, ymm5 michael@0: vpmaddubsw ymm1, ymm1, ymm5 michael@0: vpsrlw ymm0, ymm0, 7 michael@0: vpsrlw ymm1, ymm1, 7 michael@0: vpackuswb ymm0, ymm0, ymm1 // unmutates michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: vmovdqu ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: vmovdqu ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: vmovdqu ymm0, [esi + edx] michael@0: vpavgb ymm0, ymm0, [esi] michael@0: vpavgb ymm0, ymm0, [esi] michael@0: sub ecx, 32 michael@0: vmovdqu [esi + edi], ymm0 michael@0: lea esi, [esi + 32] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: rep movsb michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_AVX2 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSSE3 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. 
Blend 25 / 75. michael@0: michael@0: movd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: movd xmm5, eax // low fraction 128..1 michael@0: punpcklbw xmm5, xmm0 michael@0: punpcklwd xmm5, xmm5 michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm2, [esi + edx] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: pmaddubsw xmm0, xmm5 michael@0: pmaddubsw xmm1, xmm5 michael@0: psrlw xmm0, 7 michael@0: psrlw xmm1, 7 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqa xmm1, [esi] michael@0: movdqa xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqa xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSSE3 michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSE2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 256. Blend 100 / 0. michael@0: cmp eax, 64 michael@0: je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. michael@0: cmp eax, 128 michael@0: je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. michael@0: cmp eax, 192 michael@0: je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
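 // General path: out is roughly row0 + ((row1 - row0) * fraction) / 256,
 // computed in 16-bit lanes. The fraction byte is duplicated into both halves
 // of a word and halved so pmulhw of the doubled difference yields the scaled
 // diff directly.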
michael@0: michael@0: movd xmm5, eax // xmm5 = y fraction michael@0: punpcklbw xmm5, xmm5 michael@0: psrlw xmm5, 1 michael@0: punpcklwd xmm5, xmm5 michael@0: punpckldq xmm5, xmm5 michael@0: punpcklqdq xmm5, xmm5 michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqa xmm0, [esi] // row0 michael@0: movdqa xmm2, [esi + edx] // row1 michael@0: movdqa xmm1, xmm0 michael@0: movdqa xmm3, xmm2 michael@0: punpcklbw xmm2, xmm4 michael@0: punpckhbw xmm3, xmm4 michael@0: punpcklbw xmm0, xmm4 michael@0: punpckhbw xmm1, xmm4 michael@0: psubw xmm2, xmm0 // row1 - row0 michael@0: psubw xmm3, xmm1 michael@0: paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 michael@0: paddw xmm3, xmm3 michael@0: pmulhw xmm2, xmm5 // scale diff michael@0: pmulhw xmm3, xmm5 michael@0: paddw xmm0, xmm2 // sum rows michael@0: paddw xmm1, xmm3 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqa xmm0, [esi] michael@0: movdqa xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqa xmm1, [esi] michael@0: movdqa xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqa xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqa [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSE2 michael@0: michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: shr eax, 1 michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 128. Blend 100 / 0. michael@0: cmp eax, 32 michael@0: je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. michael@0: cmp eax, 64 michael@0: je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. michael@0: cmp eax, 96 michael@0: je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 
michael@0: michael@0: movd xmm0, eax // high fraction 0..127 michael@0: neg eax michael@0: add eax, 128 michael@0: movd xmm5, eax // low fraction 128..1 michael@0: punpcklbw xmm5, xmm0 michael@0: punpcklwd xmm5, xmm5 michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm2, [esi + edx] michael@0: movdqu xmm1, xmm0 michael@0: punpcklbw xmm0, xmm2 michael@0: punpckhbw xmm1, xmm2 michael@0: pmaddubsw xmm0, xmm5 michael@0: pmaddubsw xmm1, xmm5 michael@0: psrlw xmm0, 7 michael@0: psrlw xmm1, 7 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqu xmm1, [esi] michael@0: movdqu xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. michael@0: align 4 michael@0: xloop100: michael@0: movdqu xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_INTERPOLATEROW_SSE2 michael@0: // Bilinear filter 16x2 -> 16x1 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, michael@0: ptrdiff_t src_stride, int dst_width, michael@0: int source_y_fraction) { michael@0: __asm { michael@0: push esi michael@0: push edi michael@0: mov edi, [esp + 8 + 4] // dst_ptr michael@0: mov esi, [esp + 8 + 8] // src_ptr michael@0: mov edx, [esp + 8 + 12] // src_stride michael@0: mov ecx, [esp + 8 + 16] // dst_width michael@0: mov eax, [esp + 8 + 20] // source_y_fraction (0..255) michael@0: sub edi, esi michael@0: // Dispatch to specialized filters if applicable. michael@0: cmp eax, 0 michael@0: je xloop100 // 0 / 256. Blend 100 / 0. michael@0: cmp eax, 64 michael@0: je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. michael@0: cmp eax, 128 michael@0: je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. michael@0: cmp eax, 192 michael@0: je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
michael@0: michael@0: movd xmm5, eax // xmm5 = y fraction michael@0: punpcklbw xmm5, xmm5 michael@0: psrlw xmm5, 1 michael@0: punpcklwd xmm5, xmm5 michael@0: punpckldq xmm5, xmm5 michael@0: punpcklqdq xmm5, xmm5 michael@0: pxor xmm4, xmm4 michael@0: michael@0: align 4 michael@0: xloop: michael@0: movdqu xmm0, [esi] // row0 michael@0: movdqu xmm2, [esi + edx] // row1 michael@0: movdqu xmm1, xmm0 michael@0: movdqu xmm3, xmm2 michael@0: punpcklbw xmm2, xmm4 michael@0: punpckhbw xmm3, xmm4 michael@0: punpcklbw xmm0, xmm4 michael@0: punpckhbw xmm1, xmm4 michael@0: psubw xmm2, xmm0 // row1 - row0 michael@0: psubw xmm3, xmm1 michael@0: paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 michael@0: paddw xmm3, xmm3 michael@0: pmulhw xmm2, xmm5 // scale diff michael@0: pmulhw xmm3, xmm5 michael@0: paddw xmm0, xmm2 // sum rows michael@0: paddw xmm1, xmm3 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop michael@0: jmp xloop99 michael@0: michael@0: // Blend 25 / 75. michael@0: align 4 michael@0: xloop25: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop25 michael@0: jmp xloop99 michael@0: michael@0: // Blend 50 / 50. michael@0: align 4 michael@0: xloop50: michael@0: movdqu xmm0, [esi] michael@0: movdqu xmm1, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop50 michael@0: jmp xloop99 michael@0: michael@0: // Blend 75 / 25. michael@0: align 4 michael@0: xloop75: michael@0: movdqu xmm1, [esi] michael@0: movdqu xmm0, [esi + edx] michael@0: pavgb xmm0, xmm1 michael@0: pavgb xmm0, xmm1 michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop75 michael@0: jmp xloop99 michael@0: michael@0: // Blend 100 / 0 - Copy row unchanged. 
michael@0: align 4 michael@0: xloop100: michael@0: movdqu xmm0, [esi] michael@0: sub ecx, 16 michael@0: movdqu [esi + edi], xmm0 michael@0: lea esi, [esi + 16] michael@0: jg xloop100 michael@0: michael@0: xloop99: michael@0: pop edi michael@0: pop esi michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_INTERPOLATEROW_SSE2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, michael@0: uint8* dst_uv, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // src_uv_stride michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: sub edi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: movdqa xmm0, [eax] michael@0: pavgb xmm0, [eax + edx] michael@0: sub ecx, 16 michael@0: movdqa [eax + edi], xmm0 michael@0: lea eax, [eax + 16] michael@0: jg convertloop michael@0: pop edi michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_HALFROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, michael@0: uint8* dst_uv, int pix) { michael@0: __asm { michael@0: push edi michael@0: mov eax, [esp + 4 + 4] // src_uv michael@0: mov edx, [esp + 4 + 8] // src_uv_stride michael@0: mov edi, [esp + 4 + 12] // dst_v michael@0: mov ecx, [esp + 4 + 16] // pix michael@0: sub edi, eax michael@0: michael@0: align 4 michael@0: convertloop: michael@0: vmovdqu ymm0, [eax] michael@0: vpavgb ymm0, ymm0, [eax + edx] michael@0: sub ecx, 32 michael@0: vmovdqu [eax + edi], ymm0 michael@0: lea eax, [eax + 32] michael@0: jg convertloop michael@0: michael@0: pop edi michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_HALFROW_AVX2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, michael@0: uint32 selector, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_bayer michael@0: movd xmm5, [esp + 12] // selector michael@0: mov ecx, [esp + 16] // pix michael@0: pshufd xmm5, xmm5, 0 michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: punpckldq xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movq qword ptr [edx], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // Specialized ARGB to Bayer that just isolates G channel. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, michael@0: uint32 selector, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_bayer michael@0: // selector michael@0: mov ecx, [esp + 16] // pix michael@0: pcmpeqb xmm5, xmm5 // generate mask 0x000000ff michael@0: psrld xmm5, 24 michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: psrld xmm0, 8 // Move green to bottom. 
michael@0: psrld xmm1, 8 michael@0: pand xmm0, xmm5 michael@0: pand xmm1, xmm5 michael@0: packssdw xmm0, xmm1 michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 8 michael@0: movq qword ptr [edx], xmm0 michael@0: lea edx, [edx + 8] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: movdqa xmm5, [ecx] michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqa xmm0, [eax] michael@0: movdqa xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: sub ecx, 8 michael@0: movdqa [edx], xmm0 michael@0: movdqa [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: movdqa xmm5, [ecx] michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: movdqu xmm0, [eax] michael@0: movdqu xmm1, [eax + 16] michael@0: lea eax, [eax + 32] michael@0: pshufb xmm0, xmm5 michael@0: pshufb xmm1, xmm5 michael@0: sub ecx, 8 michael@0: movdqu [edx], xmm0 michael@0: movdqu [edx + 16], xmm1 michael@0: lea edx, [edx + 32] michael@0: jg wloop michael@0: ret michael@0: } michael@0: } michael@0: michael@0: #ifdef HAS_ARGBSHUFFLEROW_AVX2 michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: mov eax, [esp + 4] // src_argb michael@0: mov edx, [esp + 8] // dst_argb michael@0: mov ecx, [esp + 12] // shuffler michael@0: vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
michael@0: mov ecx, [esp + 16] // pix michael@0: michael@0: align 4 michael@0: wloop: michael@0: vmovdqu ymm0, [eax] michael@0: vmovdqu ymm1, [eax + 32] michael@0: lea eax, [eax + 64] michael@0: vpshufb ymm0, ymm0, ymm5 michael@0: vpshufb ymm1, ymm1, ymm5 michael@0: sub ecx, 16 michael@0: vmovdqu [edx], ymm0 michael@0: vmovdqu [edx + 32], ymm1 michael@0: lea edx, [edx + 64] michael@0: jg wloop michael@0: michael@0: vzeroupper michael@0: ret michael@0: } michael@0: } michael@0: #endif // HAS_ARGBSHUFFLEROW_AVX2 michael@0: michael@0: __declspec(naked) __declspec(align(16)) michael@0: void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, michael@0: const uint8* shuffler, int pix) { michael@0: __asm { michael@0: push ebx michael@0: push esi michael@0: mov eax, [esp + 8 + 4] // src_argb michael@0: mov edx, [esp + 8 + 8] // dst_argb michael@0: mov esi, [esp + 8 + 12] // shuffler michael@0: mov ecx, [esp + 8 + 16] // pix michael@0: pxor xmm5, xmm5 michael@0: michael@0: mov ebx, [esi] // shuffler michael@0: cmp ebx, 0x03000102 michael@0: je shuf_3012 michael@0: cmp ebx, 0x00010203 michael@0: je shuf_0123 michael@0: cmp ebx, 0x00030201 michael@0: je shuf_0321 michael@0: cmp ebx, 0x02010003 michael@0: je shuf_2103 michael@0: michael@0: // TODO(fbarchard): Use one source pointer and 3 offsets. michael@0: shuf_any1: michael@0: movzx ebx, byte ptr [esi] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx], bl michael@0: movzx ebx, byte ptr [esi + 1] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 1], bl michael@0: movzx ebx, byte ptr [esi + 2] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 2], bl michael@0: movzx ebx, byte ptr [esi + 3] michael@0: movzx ebx, byte ptr [eax + ebx] michael@0: mov [edx + 3], bl michael@0: lea eax, [eax + 4] michael@0: lea edx, [edx + 4] michael@0: sub ecx, 1 michael@0: jg shuf_any1 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_0123: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB michael@0: pshuflw xmm0, xmm0, 01Bh michael@0: pshufhw xmm1, xmm1, 01Bh michael@0: pshuflw xmm1, xmm1, 01Bh michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_0123 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_0321: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB michael@0: pshuflw xmm0, xmm0, 039h michael@0: pshufhw xmm1, xmm1, 039h michael@0: pshuflw xmm1, xmm1, 039h michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_0321 michael@0: jmp shuf99 michael@0: michael@0: align 4 michael@0: shuf_2103: michael@0: movdqu xmm0, [eax] michael@0: lea eax, [eax + 16] michael@0: movdqa xmm1, xmm0 michael@0: punpcklbw xmm0, xmm5 michael@0: punpckhbw xmm1, xmm5 michael@0: pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA michael@0: pshuflw xmm0, xmm0, 093h michael@0: pshufhw xmm1, xmm1, 093h michael@0: pshuflw xmm1, xmm1, 093h michael@0: packuswb xmm0, xmm1 michael@0: sub ecx, 4 michael@0: movdqu [edx], xmm0 michael@0: lea edx, [edx + 16] michael@0: jg shuf_2103 michael@0: jmp 

    align      4
  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1

__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]         // U
    movq       xmm3, qword ptr [esi + edx]   // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                    // UV
    movdqu     xmm0, [eax]                   // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2                    // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]         // U
    movq       xmm3, qword ptr [esi + edx]   // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                    // UV
    movdqu     xmm0, [eax]                   // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0                    // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
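    // poly points at 4 vec4 coefficient sets (C0..C3, one float per channel).
    // Each channel value X is evaluated below as C0 + C1*X + C2*X*X + C3*X*X*X
    // and the truncated result is packed back to unsigned bytes.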
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
    align      4
  convertloop:
    // pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
    // pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    sub        ecx, 2
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
    align      4
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    sub         ecx, 2
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
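// For reference, a scalar sketch of the same per-channel lookup (illustrative
// only; the helper name below is hypothetical and not part of this file).
// Each channel byte indexes a 256-entry, stride-4 slice of the table:
//
//   static void ARGBColorTableRow_Reference(uint8* dst_argb,
//                                           const uint8* table_argb,
//                                           int width) {
//     for (int i = 0; i < width; ++i) {
//       dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
//       dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
//       dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
//       dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
//       dst_argb += 4;
//     }
//   }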
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) __declspec(align(16))
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) __declspec(align(16))
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
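    // For each pixel, pmaddubsw/phaddw against lumacoeff forms a weighted sum
    // of its channels; masking off the low 8 bits quantizes that luma to a
    // multiple of 256, which selects a 256-byte slice of the luma table.
    // B, G and R are then looked up in that slice and alpha is copied as-is.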
    align      4
  convertloop:
    movdqu     xmm0, qword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    sub        ecx, 4
    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif