;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
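; Note on the instantiations below: INIT_MMX / INIT_XMM / INIT_YMM (from
; x86inc.asm) select the register bank and width (mmsize = 8, 16 or 32 bytes),
; so each invocation expands the same macro body into an MMX, SSE2 or AVX2
; variant.  The 'a'/'u' argument selects mova (aligned) or movu (unaligned)
; memory accesses, and the optional trailing argument appends _Unaligned to
; the generated symbol name.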
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
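
; Usage sketch (illustrative only, not part of the library code in this file):
; each row function converts one row of 'pix' pixels, so a caller typically
; walks the image row by row, e.g. with hypothetical stride variables:
;
;   for (int y = 0; y < height; ++y) {
;     SplitUVRow_SSE2(src_uv, dst_u, dst_v, width);
;     src_uv += src_stride_uv;
;     dst_u  += dst_stride_u;
;     dst_v  += dst_stride_v;
;   }
;
; The loops above subtract mmsize from pix each iteration, so pix is assumed
; to be a multiple of mmsize (8/16/32 for MMX/SSE2/AVX2), and the aligned 'a'
; variants assume mmsize-aligned pointers.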