media/libyuv/source/row_x86.asm

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate changes requested by Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs
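; cglobal (from x86inc.asm) declares an exported function: the three numbers
; after the name are the argument count, the number of general-purpose
; registers used, and the number of vector registers used.  The trailing names
; create register aliases for the arguments, e.g. src_yuy2q is the
; pointer-width alias and pixd the 32-bit alias used below.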

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
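; YUY2 packs pixels as Y0 U0 Y1 V0 (luma in the even bytes); UYVY packs them
; as U0 Y0 V0 Y1 (luma in the odd bytes).  The macro below therefore keeps the
; even bytes with a 0x00ff word mask for YUY2, or shifts the odd bytes down
; with psrlw for UYVY, then packs the words back to bytes to emit one Y per
; pixel.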
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2   ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro
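; Note: on AVX2, packuswb packs within each 128-bit lane, so the vpermq with
; immediate 0xd8 (qword order 0,2,1,3) above restores sequential byte order
; before the store.  The same fix-up appears in SplitUVRow below.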

; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,
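; Each INIT_* above selects the register set (mmsize is 8 for MMX, 16 for
; SSE2, 32 for AVX2) and the suffix that cglobal appends to the name, so the
; invocations emit YUY2ToYRow_MMX, YUY2ToYRow_Unaligned_SSE2, YUY2ToYRow_AVX2
; and so on.  The a/u argument selects aligned (mova) or unaligned (movu)
; loads and stores.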

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
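; SplitUVRow de-interleaves a packed UV row (U in the even bytes, V in the odd
; bytes, as in NV12 chroma) into separate U and V rows.  dst_v is converted to
; an offset relative to dst_u up front, so the loop only has to advance a
; single destination pointer.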

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
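; MergeUVRow is the inverse of SplitUVRow: it interleaves planar U and V rows
; into packed UV pairs.  src_v is again kept as an offset from src_u, and
; punpcklbw/punpckhbw perform the byte interleave; on AVX2 those operate per
; 128-bit lane, so vperm2i128 reorders the halves before the stores.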

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]  ; V is addressed relative to U (see sub above)
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1       ; first 8 UV pairs
    punpckhbw  m0, m0, m1       ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
