Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; VERTx4 <avg>
; Vertical 8-tap filter for a column 4 pixels wide. Every pass of the
; loop reads 4 bytes from each of 8 consecutive source rows and writes
; one 4-byte output row.
;   %1 = 0   store the filtered row
;   %1 = 1   pavgb the filtered row with the bytes already at the
;            destination, then store
; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, and r8
; when ABI_IS_32BIT is 0.
;-----------------------------------------------------------------------
%macro VERTx4 1
    mov         rsi, arg(0)                 ; source, row of the first tap
    mov         rdi, arg(2)                 ; destination
    mov         rdx, arg(5)                 ; filter taps (8 x 16-bit)
    mov         rcx, 0x0400040              ; rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ; aligned load of the 8 taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ; taps as signed bytes k0..k7

    ; Broadcast each pair of adjacent taps across a whole register.
    pshuflw     xmm0, xmm4, 0b              ; k0 k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2 k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4 k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6 k7
    pshufd      xmm5, xmm5, 0               ; 64 in all eight words
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ; rdx = source pitch
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; r8 = destination pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rcx = rows still to write

    lea         rax, [rsi + rdx]            ; rax = source + 1 row
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ; rbx = 6 * source pitch

.loop:
    ; Rows A..H line up with taps 0..7.
    movd        xmm0, [rsi]                 ; A
    movd        xmm1, [rsi + rdx]           ; B
    movd        xmm2, [rsi + rdx * 2]       ; C
    movd        xmm3, [rax + rdx * 2]       ; D
    movd        xmm4, [rsi + rdx * 4]       ; E
    movd        xmm5, [rax + rdx * 4]       ; F
    movd        xmm6, [rsi + rbx]           ; G
    movd        xmm7, [rax + rbx]           ; H

    ; Interleave neighbouring rows so pmaddubsw multiplies each byte
    ; pair by its tap pair and adds the two products.
    punpcklbw   xmm0, xmm1                  ; A B
    punpcklbw   xmm2, xmm3                  ; C D
    punpcklbw   xmm4, xmm5                  ; E F
    punpcklbw   xmm6, xmm7                  ; G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle terms, then the larger one.
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ; larger middle term
    pminsw      xmm4, xmm1                  ; smaller middle term
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ; + 64
    psraw       xmm0, 7                     ; >> 7
    packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movd        [rdi], xmm0

    ; Step both source cursors and the destination down one row.
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
;-----------------------------------------------------------------------
; VERTx8 <avg>
; Vertical 8-tap filter for a column 8 pixels wide. Every pass of the
; loop reads 8 bytes from each of 8 consecutive source rows and writes
; one 8-byte output row.
;   %1 = 0   store the filtered row
;   %1 = 1   pavgb the filtered row with the bytes already at the
;            destination, then store
; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, and r8
; when ABI_IS_32BIT is 0.
;-----------------------------------------------------------------------
%macro VERTx8 1
    mov         rsi, arg(0)                 ; source, row of the first tap
    mov         rdi, arg(2)                 ; destination
    mov         rdx, arg(5)                 ; filter taps (8 x 16-bit)
    mov         rcx, 0x0400040              ; rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ; aligned load of the 8 taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ; taps as signed bytes k0..k7

    ; Broadcast each pair of adjacent taps across a whole register.
    pshuflw     xmm0, xmm4, 0b              ; k0 k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2 k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4 k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6 k7
    pshufd      xmm5, xmm5, 0               ; 64 in all eight words
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ; rdx = source pitch
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; r8 = destination pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ; rcx = rows still to write

    lea         rax, [rsi + rdx]            ; rax = source + 1 row
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ; rbx = 6 * source pitch

.loop:
    ; Rows A..H line up with taps 0..7.
    movq        xmm0, [rsi]                 ; A
    movq        xmm1, [rsi + rdx]           ; B
    movq        xmm2, [rsi + rdx * 2]       ; C
    movq        xmm3, [rax + rdx * 2]       ; D
    movq        xmm4, [rsi + rdx * 4]       ; E
    movq        xmm5, [rax + rdx * 4]       ; F
    movq        xmm6, [rsi + rbx]           ; G
    movq        xmm7, [rax + rbx]           ; H

    ; Interleave neighbouring rows so pmaddubsw multiplies each byte
    ; pair by its tap pair and adds the two products.
    punpcklbw   xmm0, xmm1                  ; A B
    punpcklbw   xmm2, xmm3                  ; C D
    punpcklbw   xmm4, xmm5                  ; E F
    punpcklbw   xmm6, xmm7                  ; G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle terms, then the larger one.
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ; larger middle term
    pminsw      xmm4, xmm1                  ; smaller middle term
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ; + 64
    psraw       xmm0, 7                     ; >> 7
    packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movq        [rdi], xmm0

    ; Step both source cursors and the destination down one row.
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
;-----------------------------------------------------------------------
; VERTx16 <avg>
; Vertical 8-tap filter for a column 16 pixels wide. Every pass of the
; loop produces one 16-byte output row as two independent 8-byte halves
; (columns 0..7, then columns 8..15), each read from 8 consecutive
; source rows.
;   %1 = 0   store the filtered row
;   %1 = 1   pavgb the filtered row with the bytes already at the
;            destination, then store
; Both halves accumulate their partial products in the same order
; (outer pairs, then min of the middle pairs, then max). paddsw
; saturates, so the order of the additions affects the result; keeping
; it identical makes the two halves agree with each other and with the
; narrower vertical macros.
; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, and r8
; when ABI_IS_32BIT is 0.
;-----------------------------------------------------------------------
%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr (row of the first tap)
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ;load filters (aligned)
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in all eight words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + 1 row

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ; ---- columns 0..7 ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;outer pairs
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;larger middle term
    pminsw      xmm4, xmm1                  ;smaller middle term
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+ 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ; ---- columns 8..15 ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Same accumulation order as columns 0..7. xmm1 is free here: row
    ; B was already merged into xmm0 by punpcklbw.
    paddsw      xmm0, xmm6                  ;outer pairs
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;larger middle term
    pminsw      xmm4, xmm1                  ;smaller middle term
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+ 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi, rdx
    add         rax, rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 4 pixels wide, plain store (VERTx4 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx4, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4      0

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 8 pixels wide, plain store (VERTx8 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx8, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8      0

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 16 pixels wide, plain store (VERTx16 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx16, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16     0

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
435 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v8_avg_ssse3
; Vertical 8-tap filter, 4 pixels wide; each filtered row is averaged
; with the bytes already at the destination before the store (VERTx4 1).
; Arguments, as read through arg(0)..arg(5) by VERTx4: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx4, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4      1

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v8_avg_ssse3
; Vertical 8-tap filter, 8 pixels wide; each filtered row is averaged
; with the bytes already at the destination before the store (VERTx8 1).
; Arguments, as read through arg(0)..arg(5) by VERTx8: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx8, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8      1

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v8_avg_ssse3
; Vertical 8-tap filter, 16 pixels wide; each filtered row is averaged
; with the bytes already at the destination before the store
; (VERTx16 1).
; Arguments, as read through arg(0)..arg(5) by VERTx16: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots used by VERTx16, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16     1

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
534 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------
; HORIZx4_ROW src_and_result, scratch
; Filters one row of 4 pixels horizontally.
;   %1  in : 16 source bytes
;       out: the 4 filtered pixels as unsigned bytes in the low dword
;            (repeated in the upper half by packuswb)
;   %2  scratch register
; Also overwrites xmm4, xmm5 and xmm6. Reads k0k1k4k5, k2k3k6k7 and krd
; (defined by the invoking function) and the shuf_t0t1 / shuf_t2t3
; tables.
;-----------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2, %1
    pshufb      %1, [GLOBAL(shuf_t0t1)]     ; low: (0,1) pairs, high: (4,5) pairs
    pshufb      %2, [GLOBAL(shuf_t2t3)]     ; low: (2,3) pairs, high: (6,7) pairs
    pmaddubsw   %1, k0k1k4k5                ; [ k0k1 sums | k4k5 sums ]
    pmaddubsw   %2, k2k3k6k7                ; [ k2k3 sums | k6k7 sums ]

    movdqa      xmm4, %1                    ; low half: k0k1 sums
    movdqa      xmm5, %2                    ; low half: k2k3 sums
    movdqa      xmm6, xmm5                  ; second copy of the k2k3 sums
    psrldq      %1, 8                       ; low half: k4k5 sums
    psrldq      %2, 8                       ; low half: k6k7 sums

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle terms, then the larger one.
    paddsw      xmm4, %2                    ; k0k1 + k6k7
    pmaxsw      xmm5, %1                    ; larger middle term
    pminsw      %1, xmm6                    ; smaller middle term
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1, krd                     ; + rounding
    psraw       %1, 7
    packuswb    %1, %1
%endm
;-----------------------------------------------------------------------
; HORIZx4 <avg>
; Horizontal 8-tap filter, 4 pixels wide. Rows are processed two at a
; time; a trailing odd row is handled after the pair loop. An
; output_height below 2 skips the pair loop entirely, so heights 0 and
; 1 are safe.
;   %1 = 0   store the filtered pixels
;   %1 = 1   pavgb the filtered pixels with the bytes already at the
;            destination, then store
; Each row reads 16 source bytes starting at src - 3.
; The invoking function must %define k0k1k4k5, k2k3k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ;load filters (aligned)
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of row pairs
    jz          .last_row                   ;no full pair: never enter the loop with rcx = 0
.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi + rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm
;-----------------------------------------------------------------------
; HORIZx8_ROW src_and_result, tmp1, tmp2, tmp3
; Filters one row of 8 pixels horizontally.
;   %1      in : 16 source bytes
;           out: the 8 filtered pixels as unsigned bytes in the low
;                qword (repeated in the high qword by packuswb)
;   %2-%4   scratch registers, overwritten
; Reads k0k1, k2k3, k4k5, k6k7 and krd (defined by the invoking
; function) and the four shuf_t* tables.
;-----------------------------------------------------------------------
%macro HORIZx8_ROW 4
    ; One copy of the source window per tap pair.
    movdqa      %2, %1
    movdqa      %3, %1
    movdqa      %4, %1

    pshufb      %1, [GLOBAL(shuf_t0t1)]     ; bytes for taps 0,1
    pmaddubsw   %1, k0k1
    pshufb      %2, [GLOBAL(shuf_t2t3)]     ; bytes for taps 2,3
    pmaddubsw   %2, k2k3
    pshufb      %3, [GLOBAL(shuf_t4t5)]     ; bytes for taps 4,5
    pmaddubsw   %3, k4k5
    pshufb      %4, [GLOBAL(shuf_t6t7)]     ; bytes for taps 6,7
    pmaddubsw   %4, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle terms, then the larger one.
    paddsw      %1, %4
    movdqa      %4, %2                      ; keep the k2k3 sums for the min
    pmaxsw      %2, %3                      ; larger middle term
    pminsw      %3, %4                      ; smaller middle term
    paddsw      %1, %3
    paddsw      %1, %2

    paddsw      %1, krd                     ; + rounding
    psraw       %1, 7
    packuswb    %1, %1
%endm
;-----------------------------------------------------------------------
; HORIZx8 <avg>
; Horizontal 8-tap filter, 8 pixels wide. Rows are processed two at a
; time; a trailing odd row is handled after the pair loop. An
; output_height below 2 skips the pair loop entirely, so heights 0 and
; 1 are safe.
;   %1 = 0   store the filtered pixels
;   %1 = 1   pavgb the filtered pixels with the bytes already at the
;            destination, then store
; Each row reads 16 source bytes starting at src - 3.
; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ;load filters (aligned)
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in all eight words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of row pairs
    jz          .last_row                   ;no full pair: never enter the loop with rcx = 0

.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done:
%endm
;-----------------------------------------------------------------------
; HORIZx16 <avg>
; Horizontal 8-tap filter, 16 pixels wide, one output row per loop
; pass. The row is computed as two 8-pixel halves and written with a
; single 16-byte store.
;   %1 = 0   store the filtered pixels
;   %1 = 1   pavgb the filtered pixels with the bytes already at the
;            destination, then store
; Each row reads 24 source bytes starting at src - 3. The destination
; row is accessed with movdqa, which requires 16-byte alignment.
; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
; 16-byte-aligned memory slots (they are written with movdqa).
; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx16 1
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rdx, arg(5)                 ; filter taps (8 x 16-bit)
    mov         rcx, 0x0400040              ; rounding: 64 in both low words

    movdqa      xmm4, [rdx]                 ; aligned load of the 8 taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ; taps as signed bytes k0..k7

    ; Broadcast each pair of adjacent taps across a whole register.
    pshuflw     xmm0, xmm4, 0b              ; k0 k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2 k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4 k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6 k7
    pshufd      xmm5, xmm5, 0               ; 64 in all eight words
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ; rax = source pitch
    movsxd      rdx, dword ptr arg(3)       ; rdx = destination pitch
    movsxd      rcx, dword ptr arg(4)       ; rcx = rows still to write

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0, [rsi - 3]
    movq        xmm4, [rsi + 5]
    movq        xmm7, [rsi + 13]
    punpcklqdq  xmm0, xmm4                  ; window for pixels 0..7
    punpcklqdq  xmm4, xmm7                  ; window for pixels 8..15

    ; ---- pixels 0..7: xmm0..xmm3 ----
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pshufb      xmm3, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm1, k2k3
    pmaddubsw   xmm2, k4k5
    pmaddubsw   xmm3, k6k7

    paddsw      xmm0, xmm3                  ; outer pairs
    movdqa      xmm3, xmm1
    pmaxsw      xmm1, xmm2                  ; larger middle term
    pminsw      xmm2, xmm3                  ; smaller middle term
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1

    ; ---- pixels 8..15: xmm4..xmm7 ----
    movdqa      xmm5, xmm4
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    pshufb      xmm4, [GLOBAL(shuf_t0t1)]
    pshufb      xmm5, [GLOBAL(shuf_t2t3)]
    pshufb      xmm6, [GLOBAL(shuf_t4t5)]
    pshufb      xmm7, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4, k0k1
    pmaddubsw   xmm5, k2k3
    pmaddubsw   xmm6, k4k5
    pmaddubsw   xmm7, k6k7

    paddsw      xmm4, xmm7                  ; outer pairs
    movdqa      xmm7, xmm5
    pmaxsw      xmm5, xmm6                  ; larger middle term
    pminsw      xmm6, xmm7                  ; smaller middle term
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm5

    ; Round, shift, narrow to bytes and join the two halves.
    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    paddsw      xmm4, krd
    psraw       xmm4, 7
    packuswb    xmm4, xmm4
    punpcklqdq  xmm0, xmm4                  ; 16 output bytes
%if %1
    movdqa      xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movdqa      [rdi], xmm0

    lea         rsi, [rsi + rax]            ; next source row
    lea         rdi, [rdi + rdx]            ; next destination row
    dec         rcx
    jnz         .loop
%endm
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 4 pixels wide, plain store (HORIZx4 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Three 16-byte scratch slots used by HORIZx4, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4     0

    add         rsp, 16 * 3
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 8 pixels wide, plain store (HORIZx8 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots used by HORIZx8, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8     0

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 16 pixels wide, plain store (HORIZx16 0).
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots used by HORIZx16, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16    0

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h8_avg_ssse3
; Horizontal 8-tap filter, 4 pixels wide; the filtered pixels are
; averaged with the bytes already at the destination before the store
; (HORIZx4 1).
; Arguments, as read through arg(0)..arg(5) by HORIZx4: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Three 16-byte scratch slots used by HORIZx4, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4     1

    add         rsp, 16 * 3
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h8_avg_ssse3
; Horizontal 8-tap filter, 8 pixels wide; the filtered pixels are
; averaged with the bytes already at the destination before the store
; (HORIZx8 1).
; Arguments, as read through arg(0)..arg(5) by HORIZx8: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots used by HORIZx8, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8     1

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h8_avg_ssse3
; Horizontal 8-tap filter, 16 pixels wide; the filtered pixels are
; averaged with the bytes already at the destination before the store
; (HORIZx16 1).
; Arguments, as read through arg(0)..arg(5) by HORIZx16: src_ptr, source
; pitch, output_ptr, output pitch, output_height, filter.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots used by HORIZx16, addressed from the
    ; stack pointer once the frame below has been set up.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16    1

    add         rsp, 16*5
    pop         rsp                         ; NOTE(review): presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA

; pshufb masks used by the horizontal filters. In table shuf_tNtM,
; output byte pair i selects source bytes (i + N, i + M), so a
; following pmaddubsw against a broadcast (kN, kM) tap pair yields
; kN*src[i+N] + kM*src[i+M] for output pixel i.
align 16
shuf_t0t1:
    db   0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8

align 16
shuf_t2t3:
    db   2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10

align 16
shuf_t4t5:
    db   4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12

align 16
shuf_t6t7:
    db   6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14