media/libvpx/vp8/encoder/x86/dct_sse2.asm

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:edd32fbb87d1
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
; Prologue shared by the DCT entry points in this file. Binds the names
; `input`, `output` and `pitch` to the registers that carry the three
; arguments for the ABI being assembled:
;   32-bit       : input = rsi, output = rdi, pitch = rax
;                  (all three loaded from the stack via arg(n))
;   Win64        : input = rcx, output = rdx, pitch = r8
;   other 64-bit : input = rdi, output = rsi, pitch = rdx
; `pitch` is declared as `int` by the C prototypes but is used as a full
; 64-bit address offset, so it is sign-extended from 32 bits on every
; path before the body runs.
; Must be paired with STACK_FRAME_DESTROY, which undoes the pushes and
; the SAVE_XMM performed here.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)   ; pitch = (intptr_t)(int)arg2
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u                       ; NOTE(review): presumably spills xmm6-xmm7; defined in the include
    movsxd      r8, r8d                 ; upper 32 bits of an int arg are unspecified
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                ; upper 32 bits of an int arg are unspecified
  %endif
%endif
%endmacro
44
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
; Epilogue matching STACK_FRAME_CREATE. Redefines the `input`, `output`
; and `pitch` aliases as empty, unwinds whatever the prologue pushed for
; the ABI being assembled, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  ; The aliases are only meaningful between CREATE and DESTROY.
  %define       pitch
  %define       output
  %define       input

%if ABI_IS_32BIT
    ; Reverse order of the pushes made in the 32-bit prologue.
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%elif LIBVPX_YASM_WIN64
    ; Counterpart of the SAVE_XMM issued in the Win64 prologue.
    RESTORE_XMM
%endif
    ret
%endmacro
62
63 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit values, implemented with SSE2.
;   input  - first of four rows of four 16-bit values. Rows are read at
;            input, input+pitch, input+2*pitch and input+3*pitch, so pitch
;            is applied as a byte offset.
;   output - receives 16 16-bit results through two movdqa stores
;            (output+0 and output+16); movdqa requires 16-byte alignment.
;   pitch  - distance between consecutive input rows, in bytes.
;
; Pass 1 works within each input row:
;   a1 = ip0+ip3, b1 = ip1+ip2, c1 = ip1-ip2, d1 = ip0-ip3, all << 3
;   results: a1+b1, a1-b1,
;            (c1*2217 + d1*5352 + 14500) >> 12,
;            (d1*2217 - c1*5352 +  7500) >> 12
; Pass 2 applies the same butterfly across the pass-1 rows:
;   results: (a1+b1+7) >> 4, (a1-b1+7) >> 4,
;            ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0),
;            (d1*2217 - c1*5352 + 51000) >> 16
;
; Only xmm0-xmm5 are written. Lane diagrams in the trailing comments list
; 16-bit words from most significant to least significant.
64 global sym(vp8_short_fdct4x4_sse2) PRIVATE
65 sym(vp8_short_fdct4x4_sse2):
66
67 STACK_FRAME_CREATE
68
; Load the four 8-byte rows into the low halves of xmm0..xmm3.
69 movq xmm0, MMWORD PTR[input ] ;03 02 01 00
70 movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
71 lea input, [input+2*pitch] ; advance to row 2
72 movq xmm1, MMWORD PTR[input ] ;23 22 21 20
73 movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
74
75 punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
76 punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
77
; Rearrange so that xmm0 holds elements 0,1 of every row and xmm1 holds
; elements 3,2 (swapped) of every row; one paddw/psubw then yields
; a1/b1 and d1/c1 for all four rows.
78 movdqa xmm2, xmm0
79 punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
80 punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
81 movdqa xmm1, xmm0
82 punpckldq xmm0, xmm2 ;31 30 21 20 11 10 01 00
83 pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
84 pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
85
86 punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
87 movdqa xmm3, xmm0
88 paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
89 psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
90 psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
91 psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
92
; pmaddwd multiplies adjacent word pairs by the table entries and sums each
; pair into a 32-bit lane, giving one result per input row.
93 movdqa xmm1, xmm0
94 pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
95 pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
96 movdqa xmm4, xmm3
97 pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
98 pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
99
100 paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
101 paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
102 psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
103 psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
104
; Narrow the 32-bit results back to 16 bits with signed saturation.
105 packssdw xmm0, xmm1 ;op[2] op[0]
106 packssdw xmm3, xmm4 ;op[3] op[1]
107 ; 23 22 21 20 03 02 01 00
108 ;
109 ; 33 32 31 30 13 12 11 10
110 ;
; Transpose the pass-1 results so that pass 2 can combine whole rows.
111 movdqa xmm2, xmm0
112 punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
113 punpckhqdq xmm2, xmm3 ;33 32 31 30 23 22 21 20
114
115 movdqa xmm3, xmm0
116 punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
117 punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
118 movdqa xmm2, xmm0
119 punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
120 punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
121
122 movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] ; rounding term for the >>4 outputs
123 pshufd xmm2, xmm2, 04eh ; swap the two 64-bit halves
124 movdqa xmm3, xmm0
125 paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
126 psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
127
; Interleave a1/b1 and d1/c1 word-by-word so pmaddwd can be reused.
128 pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
129 movdqa xmm2, xmm3 ;save d1 for compare
130 pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
131 pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
132 pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
133 pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
134 pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
135 movdqa xmm1, xmm0
136 pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
137 pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
138
139 pxor xmm4, xmm4 ;zero out for compare
140 paddd xmm0, xmm5
141 paddd xmm1, xmm5
142 pcmpeqw xmm2, xmm4 ; 0xffff in every word that equals 0
143 psrad xmm0, 4 ;(a1 + b1 + 7)>>4
144 psrad xmm1, 4 ;(a1 - b1 + 7)>>4
; pandn computes (~xmm2) & mask: low four words become (d1 != 0) ? 1 : 0,
; high four words become 0.
145 pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
146 ;and keep bit 0 of lower
147
148 movdqa xmm4, xmm3
149 pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
150 pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
151 paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
152 paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
153 packssdw xmm0, xmm1 ;op[8] op[0]
154 psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
155 psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
156
157 packssdw xmm3, xmm4 ;op[12] op[4]
158 movdqa xmm1, xmm0
159 paddw xmm3, xmm2 ;op[4] += (d1!=0)
160 punpcklqdq xmm0, xmm3 ;op[4] op[0]
161 punpckhqdq xmm1, xmm3 ;op[12] op[8]
162
; Aligned stores of the 16 results.
163 movdqa XMMWORD PTR[output + 0], xmm0
164 movdqa XMMWORD PTR[output + 16], xmm1
165
166 STACK_FRAME_DESTROY
167
168 ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Applies the same two-pass 4x4 forward transform as vp8_short_fdct4x4_sse2
; to two horizontally adjacent 4x4 blocks at once (eight 16-bit values per
; row, four rows).
;   input  - first of four rows of eight 16-bit values. Rows are read with
;            movdqa at input, input+pitch, input+2*pitch and input+3*pitch,
;            so every row address must be 16-byte aligned.
;   output - receives 32 16-bit results through four movdqa stores
;            (output+0 .. output+48); must be 16-byte aligned. The first
;            32 bytes hold the results for columns 0-3, the next 32 bytes
;            the results for columns 4-7.
;   pitch  - distance between consecutive input rows, in bytes.
;
; Writes xmm0-xmm6. Lane diagrams in the trailing comments list 16-bit
; words from least significant to most significant.
169 global sym(vp8_short_fdct8x4_sse2) PRIVATE
170 sym(vp8_short_fdct8x4_sse2):
171
172 STACK_FRAME_CREATE
173
174 ; read the input data
175 movdqa xmm0, [input ]
176 movdqa xmm2, [input+ pitch]
177 lea input, [input+2*pitch] ; advance to row 2
178 movdqa xmm4, [input ]
179 movdqa xmm3, [input+ pitch]
180
181 ; transpose for the first stage
182 movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
183 movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
184
185 punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
186 punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
187
188 punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
189 punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
190
191 movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
192 punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
193
194 punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
195
196 movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
197 punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
198
199 punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
200 movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
201
202 punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
203 punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
204
205 movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
206 punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
207
208 punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
209
; Register n now holds element n of every row for both blocks:
210 ; xmm0 0
211 ; xmm1 1
212 ; xmm2 2
213 ; xmm3 3
214
215 ; first stage
216 movdqa xmm5, xmm0
217 movdqa xmm4, xmm1
218
219 paddw xmm0, xmm3 ; a1 = 0 + 3
220 paddw xmm1, xmm2 ; b1 = 1 + 2
221
222 psubw xmm4, xmm2 ; c1 = 1 - 2
223 psubw xmm5, xmm3 ; d1 = 0 - 3
224
; scale a1, b1, c1 and d1 by 8
225 psllw xmm5, 3
226 psllw xmm4, 3
227
228 psllw xmm0, 3
229 psllw xmm1, 3
230
231 ; output 0 and 2
232 movdqa xmm2, xmm0 ; a1
233
234 paddw xmm0, xmm1 ; op[0] = a1 + b1
235 psubw xmm2, xmm1 ; op[2] = a1 - b1
236
237 ; output 1 and 3
238 ; interleave c1, d1
; (d1 lands in the low word of each pair, c1 in the high word, matching the
; word order of the _5352_2217 and _2217_neg5352 tables)
239 movdqa xmm1, xmm5 ; d1
240 punpcklwd xmm1, xmm4 ; c1 d1
241 punpckhwd xmm5, xmm4 ; c1 d1
242
243 movdqa xmm3, xmm1
244 movdqa xmm4, xmm5
245
246 pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
247 pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
248
249 pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
250 pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
251
252 paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
253 paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
254 paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
255 paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
256
257 psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
258 psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
259 psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
260 psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
261
; narrow back to 16 bits with signed saturation
262 packssdw xmm1, xmm4 ; op[1]
263 packssdw xmm3, xmm5 ; op[3]
264
265 ; done with vertical
266 ; transpose for the second stage
267 movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
268 movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
269
270 punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
271 punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
272
273 punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
274 punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
275
276 movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
277 punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
278
279 punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
280
281 movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
282 punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
283
284 punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
285 movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
286
287 punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
288 punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
289
290 movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
291 punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
292
293 punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
294
; Row n of the first-stage results is now held in:
295 ; xmm0 0
296 ; xmm4 1
297 ; xmm1 2
298 ; xmm3 3
299
300 movdqa xmm5, xmm0
301 movdqa xmm2, xmm1 ; keep row 2; xmm1 is overwritten with b1 below
302
303 paddw xmm0, xmm3 ; a1 = 0 + 3
304 paddw xmm1, xmm4 ; b1 = 1 + 2
305
306 psubw xmm4, xmm2 ; c1 = 1 - 2
307 psubw xmm5, xmm3 ; d1 = 0 - 3
308
309 pxor xmm6, xmm6 ; zero out for compare
310
311 pcmpeqw xmm6, xmm5 ; 0xffff in every word where d1 == 0
312
313 pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; (~xmm6) & 1:
314 ; each word = (d1 != 0) ? 1 : 0
315
316 ; output 0 and 2
317 movdqa xmm2, xmm0 ; a1
318
319 paddw xmm0, xmm1 ; a1 + b1
320 psubw xmm2, xmm1 ; a1 - b1
321
322 paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
323 paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
324
325 psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
326 psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
327
328 ; output 1 and 3
329 ; interleave c1, d1
330 movdqa xmm1, xmm5 ; d1
331 punpcklwd xmm1, xmm4 ; c1 d1
332 punpckhwd xmm5, xmm4 ; c1 d1
333
334 movdqa xmm3, xmm1
335 movdqa xmm4, xmm5
336
337 pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
338 pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
339
340 pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
341 pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
342
343 paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
344 paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
345 paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
346 paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
347
348 psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
349 psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
350 psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
351 psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
352
353 packssdw xmm1, xmm4 ; op[4]
354 packssdw xmm3, xmm5 ; op[12]
355
356 paddw xmm1, xmm6 ; op[4] += (d1!=0)
357
; Split each 8-wide result row into its two 4-wide halves and regroup them
; so that each block's 16 results are contiguous in the output.
358 movdqa xmm4, xmm0
359 movdqa xmm5, xmm2
360
361 punpcklqdq xmm0, xmm1 ; columns 0-3: op[0], op[4]
362 punpckhqdq xmm4, xmm1 ; columns 4-7: op[0], op[4]
363
364 punpcklqdq xmm2, xmm3 ; columns 0-3: op[8], op[12]
365 punpckhqdq xmm5, xmm3 ; columns 4-7: op[8], op[12]
366
367 movdqa XMMWORD PTR[output + 0 ], xmm0
368 movdqa XMMWORD PTR[output + 16], xmm2
369 movdqa XMMWORD PTR[output + 32], xmm4
370 movdqa XMMWORD PTR[output + 48], xmm5
371
372 STACK_FRAME_DESTROY
373
SECTION_RODATA

; 16-byte constant vectors referenced through GLOBAL() by the DCT routines
; in this file. Every table is aligned so it can be used directly as an
; SSE2 memory operand.

; pmaddwd multipliers, listed as (even word, odd word) pairs.
align 16
_5352_2217:
    times 4 dw 5352, 2217
align 16
_2217_neg5352:
    times 4 dw 2217, -5352

; (1, 1) pairs: pmaddwd yields the sum of each adjacent word pair.
align 16
_mult_add:
    dw 1, 1, 1, 1, 1, 1, 1, 1

; Word mask: 1 in the four low words, 0 in the four high words.
align 16
_cmp_mask:
    dw 1, 1, 1, 1
    dw 0, 0, 0, 0

; Word mask: 1 in all eight words.
align 16
_cmp_mask8x4:
    dw 1, 1, 1, 1, 1, 1, 1, 1

; (1, -1) pairs: pmaddwd yields the difference of each adjacent word pair.
align 16
_mult_sub:
    times 4 dw 1, -1

; Rounding terms added before the arithmetic right shifts.
align 16
_7:
    dd 7, 7, 7, 7
align 16
_7w:
    dw 7, 7, 7, 7, 7, 7, 7, 7
align 16
_14500:
    dd 14500, 14500, 14500, 14500
align 16
_7500:
    dd 7500, 7500, 7500, 7500
align 16
_12000:
    dd 12000, 12000, 12000, 12000
align 16
_51000:
    dd 51000, 51000, 51000, 51000

mercurial