;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_idct_dequant_0_2x_sse2(
;     short         *qcoeff,      arg(0)
;     short         *dequant,     arg(1)
;     unsigned char *dst,         arg(2)
;     int            dst_stride   arg(3)
; )
;
; DC-only path for a pair of coefficient blocks stored 32 bytes apart.
;   * dc0 = qcoeff[0]  * dequant[0]
;   * dc1 = qcoeff[16] * dequant[0]
;   * the four bytes at qcoeff+0 and the four bytes at qcoeff+32 are
;     cleared to zero
;   * (dc + 4) >> 3 is added to four rows of eight bytes at dst; the
;     low four bytes of every row receive dc0, the high four dc1
;   * the sums are saturated to 0..255 and written back to dst
;
; Scratch registers: rax, rcx, rdx, xmm0-xmm5.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                 ; rax -> qcoeff
    mov         rdx, arg(1)                 ; rdx -> dequant

    ; word 0 <- first DC, word 4 <- the coefficient 32 bytes further on;
    ; both are paired with dequant[0]
    movd        xmm4, [rax]
    movd        xmm5, [rdx]

    pinsrw      xmm4, [rax+32], 4
    pinsrw      xmm5, [rdx], 4

    pmullw      xmm4, xmm5                  ; dequantise

    ; xmm5 is the all-zero register from here on
    pxor        xmm5, xmm5

    ; wipe the coefficients that were just consumed
    movd        [rax], xmm5
    movd        [rax+32], xmm5

    mov         rax, arg(2)                 ; rax -> dst
    movsxd      rdx, dword ptr arg(3)       ; rdx = dst_stride

    ; broadcast: words 0-3 <- word 0, words 4-7 <- word 4
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    lea         rcx, [rdx + rdx*2]          ; rcx = 3 * dst_stride
    paddw       xmm4, [GLOBAL(fours)]       ; + 4 (rounding)

    psraw       xmm4, 3                     ; >> 3

    ; fetch four destination rows, eight bytes each
    movq        xmm0, [rax]
    movq        xmm1, [rax + rdx]
    movq        xmm2, [rax + rdx*2]
    movq        xmm3, [rax + rcx]

    ; widen bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; add the broadcast DC residual to every row
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; write rows 0 and 1
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1

    lea         rax, [rax + rdx*2]          ; advance to row 2

    ; write rows 2 and 3
    movq        [rax], xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_idct_dequant_full_2x_sse2(
;     short         *qcoeff,      arg(0)
;     short         *dequant,     arg(1)
;     unsigned char *dst,         arg(2)
;     int            dst_stride   arg(3)
; )
;
; Full dequantise + two-pass inverse transform for two 16-coefficient
; blocks stored back to back at qcoeff (64 bytes in total).
;   * all 64 coefficient bytes are read and then cleared to zero
;   * both blocks are multiplied by the same 16 dequant factors
;   * each pass is a butterfly followed by a transpose; the second
;     pass adds 4 and shifts right by 3
;   * the residual is added to four rows of eight bytes at dst
;     (low four bytes: first block, high four bytes: second block),
;     saturated to 0..255 and stored back
;
; qcoeff and dequant are accessed through movdqa / SSE memory operands,
; so both have to be 16-byte aligned.
;
; Scratch registers: rax, rcx, rdx, xmm0-xmm5.  rdi is saved and
; restored here; xmm6/xmm7 are covered by SAVE_XMM 7 / RESTORE_XMM.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant
    mov         rdi, arg(2)                 ; dst

    ; Zero out xmm7, used below to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ; to spit out sensible data
    movdqa      xmm0, [rax]                 ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]              ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]              ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]              ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    movsxd      rdx, dword ptr arg(3)       ; rdx = dst_stride from here on

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b       ; row 0 of both blocks
    pshufd      xmm1, xmm4, 11011000b       ; row 1 of both blocks

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b       ; row 2 of both blocks
    pshufd      xmm3, xmm4, 11011000b       ; row 3 of both blocks

    ; first pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    ; x_s1sqr2 is negative as a signed word, hence the paddw that
    ; follows each pmulhw with it
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    lea         rcx, [rdx + rdx*2]          ; rcx = dst_stride * 3
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1 (xmm7 - xmm5)

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1

    paddw       xmm4, xmm7                  ; b1 + c1
    psubw       xmm0, xmm7                  ; b1 - c1

    psubw       xmm6, xmm3                  ; a1 - d1

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1 (xmm7 - xmm5)

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    paddw       xmm0, [GLOBAL(fours)]       ; rounding folded into b1 ...

    paddw       xmm2, [GLOBAL(fours)]       ; ... and into a1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1

    paddw       xmm4, xmm7                  ; b1 + c1
    psubw       xmm0, xmm7                  ; b1 - c1

    psubw       xmm6, xmm3                  ; a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 is zero again, for byte <-> word conversion
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+2*rdx]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_idct_dequant_dc_0_2x_sse2(
;     short         *qcoeff,      arg(0)   (not read)
;     short         *dequant,     arg(1)   (not read)
;     unsigned char *dst,         arg(2)
;     int            dst_stride,  arg(3)
;     short         *dc           arg(4)
; )
;
; DC-only path where the two DC values are supplied directly through
; dc[0] and dc[1]; nothing is read from or written to qcoeff/dequant.
;   * (dc[0] + 4) >> 3 is added to the low four bytes and
;     (dc[1] + 4) >> 3 to the high four bytes of four rows of eight
;     bytes at dst
;   * the sums are saturated to 0..255 and written back to dst
;
; Scratch registers: rcx, rdx, xmm0-xmm5.  rdi is saved and restored.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is passed in separately, so qcoeff is not touched at all
    mov         rdi, arg(2)                 ; dst
    mov         rdx, arg(4)                 ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4, [rdx]

    movsxd      rdx, dword ptr arg(3)       ; rdx = dst_stride from here on
    lea         rcx, [rdx + rdx*2]          ; rcx = dst_stride * 3

    ; Load up predict blocks
    movq        xmm0, [rdi]
    movq        xmm1, [rdi+rdx*1]
    movq        xmm2, [rdi+rdx*2]
    movq        xmm3, [rdi+rcx]

    ; Duplicate and expand dc across: words 0-3 = dc[0], words 4-7 = dc[1]
    punpcklwd   xmm4, xmm4
    punpckldq   xmm4, xmm4

    ; Rounding and downshift
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_idct_dequant_dc_full_2x_sse2(
;     short         *qcoeff,      arg(0)
;     short         *dequant,     arg(1)
;     unsigned char *dst,         arg(2)
;     int            dst_stride,  arg(3)
;     short         *dc           arg(4)
; )
;
; Full dequantise + two-pass inverse transform for two 16-coefficient
; blocks stored back to back at qcoeff (64 bytes in total), with the
; first coefficient of each block taken from dc[] instead.
;   * all 64 coefficient bytes are read and then cleared to zero
;   * both blocks are multiplied by the same 16 dequant factors
;   * after dequantisation, coefficient 0 of the first block is
;     replaced by dc[0] and coefficient 0 of the second by dc[1]
;     (the dc values themselves are not multiplied by dequant)
;   * each pass is a butterfly followed by a transpose; the second
;     pass adds 4 and shifts right by 3
;   * the residual is added to four rows of eight bytes at dst
;     (low four bytes: first block, high four bytes: second block),
;     saturated to 0..255 and stored back
;
; qcoeff and dequant are accessed through movdqa / SSE memory operands,
; so both have to be 16-byte aligned.
;
; Scratch registers: rax, rcx, rdx, xmm0-xmm5.  rdi is saved and
; restored here; xmm6/xmm7 are covered by SAVE_XMM 7 / RESTORE_XMM.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant

    mov         rdi, arg(2)                 ; dst

    ; Zero out xmm7, used below to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ; to spit out sensible data
    movdqa      xmm0, [rax]                 ; block 0, coeffs 0-7
    movdqa      xmm2, [rax+16]              ; block 0, coeffs 8-15
    movdqa      xmm1, [rax+32]              ; block 1, coeffs 0-7
    movdqa      xmm3, [rax+48]              ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; DC component
    mov         rdx, arg(4)                 ; rdx -> dc

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b       ; row 0 of both blocks
    pshufd      xmm1, xmm4, 11011000b       ; row 1 of both blocks

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b       ; row 2 of both blocks
    pshufd      xmm3, xmm4, 11011000b       ; row 3 of both blocks

    ; insert DC component: word 0 = dc[0], word 4 = dc[1]
    pinsrw      xmm0, [rdx], 0
    pinsrw      xmm0, [rdx+2], 4

    ; first pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    ; x_s1sqr2 is negative as a signed word, hence the paddw that
    ; follows each pmulhw with it
    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1 (xmm7 - xmm5)

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1

    paddw       xmm4, xmm7                  ; b1 + c1
    psubw       xmm0, xmm7                  ; b1 - c1

    psubw       xmm6, xmm3                  ; a1 - d1

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1 (xmm7 - xmm5)

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    paddw       xmm0, [GLOBAL(fours)]       ; rounding folded into b1 ...

    paddw       xmm2, [GLOBAL(fours)]       ; ... and into a1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1

    paddw       xmm4, xmm7                  ; b1 + c1
    psubw       xmm0, xmm7                  ; b1 - c1

    psubw       xmm6, xmm3                  ; a1 - d1
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 is zero again, for byte <-> word conversion
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movsxd      rdx, dword ptr arg(3)       ; rdx = dst_stride until the end
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]
    lea         rcx, [rdx + rdx*2]          ; rcx = dst_stride * 3

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+rdx*2]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out; rdx and rcx still hold stride and 3*stride
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Eight words of 4: the rounding term added before every ">> 3".
align 16
fours:          times 8 dw 0x0004

; 0x8A8C = 35468, i.e. sin(pi/8) * sqrt(2) scaled by 65536.  The value
; does not fit a positive signed word, so pmulhw sees it as 35468 - 65536
; and the code adds the multiplicand back afterwards.
align 16
x_s1sqr2:       times 8 dw 0x8A8C

; 0x4E7B = 20091, i.e. (cos(pi/8) * sqrt(2) - 1) scaled by 65536; the
; "- 1" is restored by adding the multiplicand back after pmulhw.
align 16
x_c1sqr2less1:  times 8 dw 0x4E7B