|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 |
|
12 %include "vpx_ports/x86_abi_support.asm" |
|
13 |
|
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Common prologue for the transforms in this file, which share the C
; signature (short *input, short *output, int pitch).  It binds the
; aliases `input`, `output` and `pitch` to the registers that hold the
; three arguments for the ABI selected at build time:
;
;   32-bit      : input = rsi, output = rdi, pitch = rax (loaded via arg())
;   64-bit Win  : input = rcx, output = rdx, pitch = r8
;   64-bit other: input = rdi, output = rsi, pitch = rdx
;
; `pitch` is always a sign-extended, full-width value after this macro,
; so it can be used directly as an address component.
;
; ABI_IS_32BIT, LIBVPX_YASM_WIN64, GET_GOT, SAVE_XMM and arg() come from
; the included x86_abi_support.asm.  Must be paired with
; STACK_FRAME_DESTROY.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output
    movsxd      rax, dword ptr arg(2)       ; pitch, widened from int
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    ; NOTE(review): presumably preserves the callee-saved xmm6/xmm7 that
    ; the 8x4 routine touches (xmm6) - confirm against the include.
    SAVE_XMM 7, u
    ; pitch is a 32-bit int; the upper half of the argument register is
    ; not guaranteed by the calling convention, so widen it before it is
    ; used in 64-bit address arithmetic.
    movsxd      r8, r8d
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    ; Same reasoning as above: widen the 32-bit int argument.
    movsxd      rdx, edx
  %endif
%endif
%endmacro
|
44 |
|
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Epilogue matching STACK_FRAME_CREATE.  Blanks the `input`, `output`
; and `pitch` aliases, undoes the prologue for the active ABI (pops in
; the reverse order of the 32-bit pushes, or RESTORE_XMM on 64-bit
; Windows; nothing to undo on other 64-bit targets) and returns to the
; caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define       input
  %define       output
  %define       pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%elif LIBVPX_YASM_WIN64
    RESTORE_XMM
%endif
    ret
%endmacro
|
62 |
|
;-----------------------------------------------------------------------
; void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit samples.
;   input  - first of four rows of four words; successive rows are
;            `pitch` bytes apart
;   output - 16 words, written with two movdqa stores (so it must be
;            16-byte aligned)
;   pitch  - byte distance between input rows
; Argument registers are bound by STACK_FRAME_CREATE.  Uses xmm0-xmm5.
;
; Lane comments list the eight words of a register from the highest
; lane down to the lowest.  "rc" is row r, position c.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    ; ---- load the four input rows ------------------------------------
    movq        xmm0, MMWORD PTR[input]                 ; 03 02 01 00
    movq        xmm2, MMWORD PTR[input + pitch]         ; 13 12 11 10
    lea         input, [input + 2*pitch]
    movq        xmm1, MMWORD PTR[input]                 ; 23 22 21 20
    movq        xmm3, MMWORD PTR[input + pitch]         ; 33 32 31 30

    punpcklqdq  xmm0, xmm2                  ; 13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                  ; 33 32 31 30 23 22 21 20

    ; ---- line up the butterfly operands -------------------------------
    ; xmm0 ends up with positions 1,0 of every row, xmm1 with positions
    ; 2,3 (swapped), so one add/sub yields all a1/b1 and c1/d1 terms.
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                  ; 23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                  ; 33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ; 31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0xb1            ; 22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0xb1            ; 32 33 12 13 xx xx xx xx
    punpckhdq   xmm1, xmm2                  ; 32 33 22 23 12 13 02 03

    ; ---- first pass ---------------------------------------------------
    ; per row: a1 = x0 + x3, b1 = x1 + x2, c1 = x1 - x2, d1 = x0 - x3,
    ; each scaled by 8.
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                  ; b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                  ; c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3
    psllw       xmm3, 3

    ; even results: pmaddwd against (1,1) / (1,-1) forms the 32-bit
    ; sum and difference of each a1/b1 pair
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ; a1 - b1
    packssdw    xmm0, xmm1                  ; op[2] x4 | op[0] x4

    ; odd results
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ; c1*2217 + d1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    psrad       xmm3, 12                    ; (c1*2217 + d1*5352 + 14500) >> 12
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm4, 12                    ; (d1*2217 - c1*5352 + 7500) >> 12
    packssdw    xmm3, xmm4                  ; op[3] x4 | op[1] x4

    ; ---- regroup first-pass results by row ----------------------------
    ; from here "rc" is first-pass result c of row r
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                  ; 31 21 11 01 30 20 10 00
    punpckhqdq  xmm2, xmm3                  ; 33 23 13 03 32 22 12 02

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                  ; 32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                  ; 33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                  ; 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                  ; 33 32 31 30 23 22 21 20

    ; ---- second pass --------------------------------------------------
    ; per column: a1 = r0 + r3, b1 = r1 + r2, c1 = r1 - r2, d1 = r0 - r3
    pshufd      xmm2, xmm2, 0x4e            ; 23 22 21 20 33 32 31 30
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                  ; b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                  ; c1 c1 c1 c1 d1 d1 d1 d1

    ; xmm2 = (d1 != 0) ? 1 : 0 in the four low lanes, 0 in the high lanes
    movdqa      xmm2, xmm3
    pxor        xmm4, xmm4
    pcmpeqw     xmm2, xmm4                  ; all-ones where the word is zero
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)]

    ; interleave so that every dword holds one (a1,b1) or (d1,c1) pair
    pshufd      xmm0, xmm0, 0xd8            ; b1 b1 a1 a1 b1 b1 a1 a1
    pshuflw     xmm0, xmm0, 0xd8            ; b1 b1 a1 a1 b1 a1 b1 a1
    pshufhw     xmm0, xmm0, 0xd8            ; b1 a1 b1 a1 b1 a1 b1 a1
    pshufd      xmm3, xmm3, 0xd8            ; c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm3, xmm3, 0xd8            ; c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm3, xmm3, 0xd8            ; c1 d1 c1 d1 c1 d1 c1 d1

    ; even results with rounding
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ; a1 - b1
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    psrad       xmm0, 4                     ; (a1 + b1 + 7) >> 4
    psrad       xmm1, 4                     ; (a1 - b1 + 7) >> 4
    packssdw    xmm0, xmm1                  ; op[8..11] | op[0..3]

    ; odd results
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ; c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    psrad       xmm3, 16                    ; (c1*2217 + d1*5352 + 12000) >> 16
    psrad       xmm4, 16                    ; (d1*2217 - c1*5352 + 51000) >> 16
    packssdw    xmm3, xmm4                  ; op[12..15] | op[4..7]
    paddw       xmm3, xmm2                  ; op[4..7] += (d1 != 0)

    ; ---- store the 16 results in row order ----------------------------
    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm3                  ; op[4..7]   | op[0..3]
    punpckhqdq  xmm1, xmm3                  ; op[12..15] | op[8..11]

    movdqa      XMMWORD PTR[output + 0],  xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY
|
167 |
|
;-----------------------------------------------------------------------
; void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Transforms two side-by-side 4x4 blocks (an 8-wide, 4-high strip of
; 16-bit samples) at once, one block per register half.
;   input  - first of four rows of eight words; rows are `pitch` bytes
;            apart and are read with movdqa (16-byte alignment needed)
;   output - 64 bytes written with movdqa: the 16 results of the left
;            block at offset 0, those of the right block at offset 32
;   pitch  - byte distance between input rows
; Argument registers are bound by STACK_FRAME_CREATE.  Uses xmm0-xmm6.
;
; Lane comments list the eight words of a register from the lowest
; lane up to the highest.  "rc" is row r, position c (0-7).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

    ; ---- load four rows of eight samples ------------------------------
    movdqa      xmm0, [input]
    movdqa      xmm2, [input + pitch]
    lea         input, [input + 2*pitch]
    movdqa      xmm4, [input]
    movdqa      xmm3, [input + pitch]

    ; ---- transpose: one register per position within a block ----------
    movdqa      xmm1, xmm0                  ; 00 01 02 03 04 05 06 07
    movdqa      xmm5, xmm4                  ; 20 21 22 23 24 25 26 27

    punpcklwd   xmm0, xmm2                  ; 00 10 01 11 02 12 03 13
    punpckhwd   xmm1, xmm2                  ; 04 14 05 15 06 16 07 17
    punpcklwd   xmm4, xmm3                  ; 20 30 21 31 22 32 23 33
    punpckhwd   xmm5, xmm3                  ; 24 34 25 35 26 36 27 37

    movdqa      xmm2, xmm0                  ; 00 10 01 11 02 12 03 13
    punpckldq   xmm0, xmm4                  ; 00 10 20 30 01 11 21 31
    punpckhdq   xmm2, xmm4                  ; 02 12 22 32 03 13 23 33

    movdqa      xmm4, xmm1                  ; 04 14 05 15 06 16 07 17
    punpckldq   xmm4, xmm5                  ; 04 14 24 34 05 15 25 35
    punpckhdq   xmm1, xmm5                  ; 06 16 26 36 07 17 27 37

    movdqa      xmm3, xmm2                  ; 02 12 22 32 03 13 23 33
    punpckhqdq  xmm3, xmm1                  ; 03 13 23 33 07 17 27 37
    punpcklqdq  xmm2, xmm1                  ; 02 12 22 32 06 16 26 36

    movdqa      xmm1, xmm0                  ; 00 10 20 30 01 11 21 31
    punpcklqdq  xmm0, xmm4                  ; 00 10 20 30 04 14 24 34
    punpckhqdq  xmm1, xmm4                  ; 01 11 21 31 05 15 25 35

    ; xmm0..xmm3 now hold positions 0..3 of every row, left block in the
    ; low four lanes and right block in the high four lanes.

    ; ---- first pass ---------------------------------------------------
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1

    paddw       xmm0, xmm3                  ; a1 = x0 + x3
    paddw       xmm1, xmm2                  ; b1 = x1 + x2
    psubw       xmm4, xmm2                  ; c1 = x1 - x2
    psubw       xmm5, xmm3                  ; d1 = x0 - x3

    psllw       xmm0, 3                     ; all four terms scaled by 8
    psllw       xmm1, 3
    psllw       xmm4, 3
    psllw       xmm5, 3

    ; even results
    movdqa      xmm2, xmm0                  ; a1
    paddw       xmm0, xmm1                  ; op[0] = a1 + b1
    psubw       xmm2, xmm1                  ; op[2] = a1 - b1

    ; odd results: pair each d1 (low word) with its c1 (high word)
    movdqa      xmm1, xmm5                  ; d1
    punpcklwd   xmm1, xmm4                  ; d1 c1 pairs, lanes 0-3
    punpckhwd   xmm5, xmm4                  ; d1 c1 pairs, lanes 4-7
    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm5

    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_5352_2217)]   ; c1*2217 + d1*5352
    paddd       xmm1, XMMWORD PTR[GLOBAL(_14500)]
    psrad       xmm1, 12                    ; (c1*2217 + d1*5352 + 14500) >> 12
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_5352_2217)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_14500)]
    psrad       xmm4, 12
    packssdw    xmm1, xmm4                  ; op[1]

    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12                    ; (d1*2217 - c1*5352 + 7500) >> 12
    pmaddwd     xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)]
    paddd       xmm5, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm5, 12
    packssdw    xmm3, xmm5                  ; op[3]

    ; ---- transpose back: one register per row -------------------------
    ; from here "rc" is first-pass result c of row r
    movdqa      xmm4, xmm0                  ; 00 10 20 30 04 14 24 34
    movdqa      xmm5, xmm2                  ; 02 12 22 32 06 16 26 36

    punpcklwd   xmm0, xmm1                  ; 00 01 10 11 20 21 30 31
    punpckhwd   xmm4, xmm1                  ; 04 05 14 15 24 25 34 35
    punpcklwd   xmm2, xmm3                  ; 02 03 12 13 22 23 32 33
    punpckhwd   xmm5, xmm3                  ; 06 07 16 17 26 27 36 37

    movdqa      xmm1, xmm0                  ; 00 01 10 11 20 21 30 31
    punpckldq   xmm0, xmm2                  ; 00 01 02 03 10 11 12 13
    punpckhdq   xmm1, xmm2                  ; 20 21 22 23 30 31 32 33

    movdqa      xmm2, xmm4                  ; 04 05 14 15 24 25 34 35
    punpckldq   xmm2, xmm5                  ; 04 05 06 07 14 15 16 17
    punpckhdq   xmm4, xmm5                  ; 24 25 26 27 34 35 36 37

    movdqa      xmm3, xmm1                  ; 20 21 22 23 30 31 32 33
    punpckhqdq  xmm3, xmm4                  ; 30 31 32 33 34 35 36 37
    punpcklqdq  xmm1, xmm4                  ; 20 21 22 23 24 25 26 27

    movdqa      xmm4, xmm0                  ; 00 01 02 03 10 11 12 13
    punpcklqdq  xmm0, xmm2                  ; 00 01 02 03 04 05 06 07
    punpckhqdq  xmm4, xmm2                  ; 10 11 12 13 14 15 16 17

    ; xmm0 = row 0, xmm4 = row 1, xmm1 = row 2, xmm3 = row 3

    ; ---- second pass --------------------------------------------------
    movdqa      xmm5, xmm0                  ; row 0
    movdqa      xmm2, xmm1                  ; row 2

    paddw       xmm0, xmm3                  ; a1 = r0 + r3
    paddw       xmm1, xmm4                  ; b1 = r1 + r2
    psubw       xmm4, xmm2                  ; c1 = r1 - r2
    psubw       xmm5, xmm3                  ; d1 = r0 - r3

    ; xmm6 = (d1 != 0) ? 1 : 0 in every lane
    pxor        xmm6, xmm6
    pcmpeqw     xmm6, xmm5                  ; all-ones where d1 == 0
    pandn       xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)]

    ; even results
    movdqa      xmm2, xmm0                  ; a1
    paddw       xmm0, xmm1                  ; a1 + b1
    psubw       xmm2, xmm1                  ; a1 - b1
    paddw       xmm0, XMMWORD PTR[GLOBAL(_7w)]
    psraw       xmm0, 4                     ; op[0] = (a1 + b1 + 7) >> 4
    paddw       xmm2, XMMWORD PTR[GLOBAL(_7w)]
    psraw       xmm2, 4                     ; op[8] = (a1 - b1 + 7) >> 4

    ; odd results: pair each d1 (low word) with its c1 (high word)
    movdqa      xmm1, xmm5                  ; d1
    punpcklwd   xmm1, xmm4                  ; d1 c1 pairs, lanes 0-3
    punpckhwd   xmm5, xmm4                  ; d1 c1 pairs, lanes 4-7
    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm5

    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_5352_2217)]   ; c1*2217 + d1*5352
    paddd       xmm1, XMMWORD PTR[GLOBAL(_12000)]
    psrad       xmm1, 16                    ; (c1*2217 + d1*5352 + 12000) >> 16
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_5352_2217)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_12000)]
    psrad       xmm4, 16
    packssdw    xmm1, xmm4                  ; op[4]

    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_51000)]
    psrad       xmm3, 16                    ; (d1*2217 - c1*5352 + 51000) >> 16
    pmaddwd     xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)]
    paddd       xmm5, XMMWORD PTR[GLOBAL(_51000)]
    psrad       xmm5, 16
    packssdw    xmm3, xmm5                  ; op[12]

    paddw       xmm1, xmm6                  ; op[4] += (d1 != 0)

    ; ---- split the halves back into two 4x4 blocks and store ----------
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm1                  ; left block,  rows 0-1
    punpckhqdq  xmm4, xmm1                  ; right block, rows 0-1

    movdqa      xmm5, xmm2
    punpcklqdq  xmm2, xmm3                  ; left block,  rows 2-3
    punpckhqdq  xmm5, xmm3                  ; right block, rows 2-3

    movdqa      XMMWORD PTR[output + 0 ], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm2
    movdqa      XMMWORD PTR[output + 32], xmm4
    movdqa      XMMWORD PTR[output + 48], xmm5

    STACK_FRAME_DESTROY
|
373 |
|
;-----------------------------------------------------------------------
; Read-only constants shared by both transforms.  Every table is one
; 16-byte vector, aligned so it can be used directly as an SSE2 memory
; operand.
;-----------------------------------------------------------------------
SECTION_RODATA

; pmaddwd multipliers for the odd outputs: each dword holds one
; (low word, high word) coefficient pair
align 16
_5352_2217:
    times 4 dw 5352, 2217
align 16
_2217_neg5352:
    times 4 dw 2217, -5352

; pmaddwd multipliers (1, 1): sum of each word pair
align 16
_mult_add:
    times 8 dw 1

; keeps bit 0 of the four low words only (4x4 "d1 != 0" correction)
align 16
_cmp_mask:
    dw 1, 1, 1, 1, 0, 0, 0, 0

; keeps bit 0 of all eight words (8x4 "d1 != 0" correction)
align 16
_cmp_mask8x4:
    times 8 dw 1

; pmaddwd multipliers (1, -1): difference of each word pair
align 16
_mult_sub:
    times 4 dw 1, -1

; rounding term for the second-pass even outputs, dword lanes (4x4)
align 16
_7:
    times 4 dd 7

; rounding term for the second-pass even outputs, word lanes (8x4)
align 16
_7w:
    times 8 dw 7

; rounding terms added before the >>12 in the first pass
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500

; rounding terms added before the >>16 in the second pass
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000