;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
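; Note on the instantiations below: INIT_MMX / INIT_XMM / INIT_YMM (from
; x86inc.asm) select the register bank and width (mmsize = 8, 16 or 32 bytes),
; so each invocation expands the same macro body into an MMX, SSE2 or AVX2
; variant.  The 'a'/'u' argument selects mova (aligned) or movu (unaligned)
; memory accesses, and the optional trailing argument appends _Unaligned to
; the generated symbol name.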
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
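
; Usage sketch (illustrative only, not part of the library code in this file):
; each row function converts one row of 'pix' pixels, so a caller typically
; walks the image row by row, e.g. with hypothetical stride variables:
;
;   for (int y = 0; y < height; ++y) {
;     SplitUVRow_SSE2(src_uv, dst_u, dst_v, width);
;     src_uv += src_stride_uv;
;     dst_u  += dst_stride_u;
;     dst_v  += dst_stride_v;
;   }
;
; The loops above subtract mmsize from pix each iteration, so pix is assumed
; to be a multiple of mmsize (8/16/32 for MMX/SSE2/AVX2), and the aligned 'a'
; variants assume mmsize-aligned pointers.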