;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs
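; e.g. "cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix" below declares a
; function with 3 parameters that uses 3 general purpose registers and
; 3 mm/xmm/ymm registers.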

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

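; YUY2TOYROW extracts the Y bytes from packed YUY2 or UYVY pixels.
; %1 = source format (YUY2 or UYVY), %2 = mova/movu suffix (a = aligned,
; u = unaligned), %3 = optional function name suffix (e.g. _Unaligned).
; Each iteration reads 2 * mmsize source bytes, keeps every second byte
; (even bytes for YUY2, odd bytes for UYVY), packs them down to mmsize
; Y bytes and stores them.  On AVX2, vpermq fixes the output order because
; packuswb packs within 128 bit lanes.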
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
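; INIT_MMX/INIT_XMM/INIT_YMM select the register width (mmsize = 8, 16 or
; 32 bytes) and make cglobal append the cpu name to the symbol, so the
; SSE2 instantiations below emit YUY2ToYRow_SSE2, YUY2ToYRow_Unaligned_SSE2,
; UYVYToYRow_SSE2 and UYVYToYRow_Unaligned_SSE2.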
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

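; SplitUVRow deinterleaves packed UV into separate U and V planes.
; %1 = mova/movu suffix, %2 = optional function name suffix.  dst_v is
; kept as an offset from dst_u (sub dst_vq, dst_uq) so only one store
; pointer has to be advanced per iteration.  Even source bytes are U,
; odd source bytes are V.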
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

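; MergeUVRow_ interleaves separate U and V planes into packed UV.
; %1 = mova/movu suffix, %2 = optional function name suffix.  src_v is
; kept as an offset from src_u so only src_u has to be advanced per
; iteration.  On AVX2, punpcklbw/punpckhbw interleave within 128 bit
; lanes, so vperm2i128 reorders the lanes before the two stores.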
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq    ; src_vq now holds src_v - src_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]  ; V bytes at src_u + (src_v - src_u)
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,