media/libyuv/source/row_x86.asm

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text
; cglobal numeric constants are parameters, gpr regs, mm regs

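; As a reading aid (my annotation, not part of the original file): with the
; x86inc.asm cglobal convention used below, the three numeric constants give
; the number of function parameters, general-purpose registers, and vector
; (mm/xmm/ymm) registers, followed by the parameter names. For example:
;
;   cglobal YUY2ToYRow, 3, 3, 3, src_yuy2, dst_y, pix
;                       |  |  |
;                       |  |  +-- 3 vector registers used (m0..m2)
;                       |  +----- 3 general-purpose registers needed
;                       +-------- 3 parameters, named src_yuy2, dst_y, pix
;                                 (accessed below as src_yuy2q, dst_yq, pixd)
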
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

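; What the YUY2TOYROW macro below computes, as a rough scalar C reference
; (an illustrative sketch, not part of the original file): in YUY2 the byte
; order is Y0 U0 Y1 V0, so Y lives in the even bytes; in UYVY it is
; U0 Y0 V0 Y1, so Y lives in the odd bytes.
;
;   void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
;     for (int x = 0; x < pix; ++x)
;       dst_y[x] = src_yuy2[2 * x];      /* even bytes are Y */
;   }
;
;   void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int pix) {
;     for (int x = 0; x < pix; ++x)
;       dst_y[x] = src_uyvy[2 * x + 1];  /* odd bytes are Y */
;   }
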
; %1 = YUY2 or UYVY, %2 = a (aligned) or u (unaligned) load/store suffix,
; %3 = optional function name suffix (e.g. _Unaligned).
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8      ; fix up lane order after 256-bit packuswb
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

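; A rough scalar C reference for what SplitUVRow computes (an illustrative
; sketch, not part of the original file): the interleaved UVUV... plane is
; split into separate U and V planes.
;
;   void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
;                     int pix) {
;     for (int x = 0; x < pix; ++x) {
;       dst_u[x] = src_uv[2 * x];      /* even bytes are U */
;       dst_v[x] = src_uv[2 * x + 1];  /* odd bytes are V */
;     }
;   }
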
; %1 = a (aligned) or u (unaligned) load/store suffix,
; %2 = optional function name suffix (e.g. _Unaligned).
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq    ; keep dst_v as an offset from dst_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

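; A rough scalar C reference for what MergeUVRow computes (an illustrative
; sketch, not part of the original file): separate U and V planes are
; interleaved back into a single UVUV... plane.
;
;   void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v,
;                     uint8_t* dst_uv, int width) {
;     for (int x = 0; x < width; ++x) {
;       dst_uv[2 * x]     = src_u[x];
;       dst_uv[2 * x + 1] = src_v[x];
;     }
;   }
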
; %1 = a (aligned) or u (unaligned) load/store suffix,
; %2 = optional function name suffix (e.g. _Unaligned).
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq    ; keep src_v as an offset from src_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,

