media/libyuv/source/row_x86.asm

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/row_x86.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal's numeric arguments are the number of parameters, the number of
+; gpr regs used, and the number of mm regs used (e.g. "3, 3, 3" below).
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
+    psrlw      m2, m2, 8
+%endif
+
+    ALIGN      4
+.convertloop:
+    mov%2      m0, [src_yuy2q]
+    mov%2      m1, [src_yuy2q + mmsize]
+    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+    pand       m0, m0, m2   ; YUY2 even bytes are Y
+    pand       m1, m1, m2
+%else
+    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
+    psrlw      m1, m1, 8
+%endif
+    packuswb   m0, m0, m1
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8  ; restore linear order after the per-lane pack
+%endif
+    sub        pixd, mmsize
+    mov%2      [dst_yq], m0
+    lea        dst_yq, [dst_yq + mmsize]
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
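+
+; For reference, a scalar C sketch of what these row functions compute. This
+; is illustrative only and not part of this change; the _C naming and the
+; uint8 typedef are assumed from libyuv's scalar-fallback conventions:
+;
+;   void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix) {
+;     int x;
+;     for (x = 0; x < pix; ++x) {
+;       dst_y[x] = src_yuy2[x * 2];     /* Y is the even byte of each pair. */
+;     }                                 /* For UYVY, Y is the odd byte:     */
+;   }                                   /* dst_y[x] = src_uyvy[x * 2 + 1].  */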
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
+    psrlw      m4, m4, 8
+    sub        dst_vq, dst_uq    ; dst_v is addressed relative to dst_u
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uvq]
+    mov%1      m1, [src_uvq + mmsize]
+    lea        src_uvq, [src_uvq + mmsize * 2]
+    psrlw      m2, m0, 8         ; odd bytes
+    psrlw      m3, m1, 8
+    pand       m0, m0, m4        ; even bytes
+    pand       m1, m1, m4
+    packuswb   m0, m0, m1
+    packuswb   m2, m2, m3
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8      ; restore linear order after per-lane packs
+    vpermq     m2, m2, 0xd8
+%endif
+    mov%1      [dst_uq], m0
+    mov%1      [dst_uq + dst_vq], m2
+    lea        dst_uq, [dst_uq + mmsize]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
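+
+; A scalar C sketch of the deinterleave this macro vectorizes. Illustrative
+; only; the _C naming and the uint8 typedef are assumed from libyuv's
+; conventions:
+;
+;   void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+;                     int pix) {
+;     int x;
+;     for (x = 0; x < pix; ++x) {
+;       dst_u[x] = src_uv[x * 2];       /* even bytes are U */
+;       dst_v[x] = src_uv[x * 2 + 1];   /* odd bytes are V  */
+;     }
+;   }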
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                      int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub        src_vq, src_uq    ; src_v is addressed relative to src_u
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uq]
+    mov%1      m1, [src_vq + src_uq]
+    lea        src_uq, [src_uq + mmsize]
+    punpcklbw  m2, m0, m1        ; first 8 UV pairs
+    punpckhbw  m0, m0, m1        ; next 8 UV pairs
+%if cpuflag(AVX2)
+    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
+    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
+    mov%1      [dst_uvq], m1
+    mov%1      [dst_uvq + mmsize], m2
+%else
+    mov%1      [dst_uvq], m2
+    mov%1      [dst_uvq + mmsize], m0
+%endif
+    lea        dst_uvq, [dst_uvq + mmsize * 2]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
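+; A scalar C sketch of the interleave this macro vectorizes. Illustrative
+; only; the _C naming and the uint8 typedef are assumed from libyuv's
+; conventions:
+;
+;   void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                     int width) {
+;     int x;
+;     for (x = 0; x < width; ++x) {
+;       dst_uv[x * 2] = src_u[x];       /* U into even bytes */
+;       dst_uv[x * 2 + 1] = src_v[x];   /* V into odd bytes  */
+;     }
+;   }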
