Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
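;
; YUY2 stores pixels as Y0 U0 Y1 V0, so the Y samples are the even bytes;
; UYVY stores U0 Y0 V0 Y1, so the Y samples are the odd bytes. The macro below
; masks with 0x00ff (YUY2) or shifts each word right by 8 (UYVY) to extract Y.
; pcmpeqb of a register with itself sets all bits, and a word shift right by 8
; then yields the 0x00ff00ff.. mask without loading a constant from memory.
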
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
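    ; On AVX2, packuswb packs within each 128-bit lane, leaving the four
    ; qwords as lo(m0), lo(m1), hi(m0), hi(m1); vpermq 0xd8 selects source
    ; qwords 0, 2, 1, 3, restoring sequential order.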
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
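    ; mov and lea leave the flags untouched, so the jg below still tests the
    ; result of the sub above.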
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq
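    ; dst_v is kept as a byte offset from dst_u ([dst_uq + dst_vq] below), so
    ; a single lea advances both destination pointers per iteration.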

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
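    ; As in YUY2TOYROW: AVX2 packuswb packs per 128-bit lane, so vpermq 0xd8
    ; is needed to put the packed qwords back in sequential order.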
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq
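    ; As in SplitUVRow, src_v becomes a byte offset from src_u so that one
    ; lea advances both source pointers; V is loaded from [src_vq + src_uq].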

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_vq + src_uq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
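    ; punpcklbw/punpckhbw interleave within each 128-bit lane on AVX2, so the
    ; UV pairs come out lane-swizzled; vperm2i128 reassembles them: imm 0x20
    ; takes the low 128 bits of both sources, 0x31 the high 128 bits.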
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,