media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:e33c7cf310d5
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; VERTx4 avg
;   Vertical 8-tap filter over a 4-pixel-wide column; every loop pass
;   produces one 4-byte output row.
;     %1 = 0 : store the filtered row.
;     %1 = 1 : pavgb the filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT=0).  The loop is do-while shaped: it runs at least once.
;-----------------------------------------------------------------------
%macro VERTx4 1
    mov         rdx, arg(5)                 ; filter taps
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rcx, 0x0400040              ; words {64, 64}: rounding for the >> 7 below

    movdqa      xmm4, [rdx]                 ; 16 bytes of taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ; signed words -> signed bytes
    pshufd      xmm5, xmm5, 0               ; rounding value in every word

    ; Replicate each adjacent tap pair across a whole register.
    pshuflw     xmm0, xmm4, 00000000b       ; k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6_k7
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ; pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
    add         rax, rdx                    ; rax = rsi + pitch (odd rows)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ; pitch * 6

.loop:
    ; Eight consecutive source rows A..H, 4 bytes each.
    movd        xmm0, [rsi]                 ; A
    movd        xmm1, [rsi + rdx]           ; B
    movd        xmm2, [rsi + rdx * 2]       ; C
    movd        xmm3, [rax + rdx * 2]       ; D
    movd        xmm4, [rsi + rdx * 4]       ; E
    movd        xmm5, [rax + rdx * 4]       ; F
    movd        xmm6, [rsi + rbx]           ; G
    movd        xmm7, [rax + rbx]           ; H

    ; Interleave row pairs so pmaddubsw multiplies each pair by its taps.
    punpcklbw   xmm0, xmm1                  ; A B
    punpcklbw   xmm2, xmm3                  ; C D
    punpcklbw   xmm4, xmm5                  ; E F
    punpcklbw   xmm6, xmm7                  ; G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle products, then the larger one.
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ; max(CD, EF)
    pminsw      xmm4, xmm1                  ; min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ; + 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to unsigned bytes

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movd        [rdi], xmm0

    add         rsi, rdx                    ; next source row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
100
;-----------------------------------------------------------------------
; VERTx8 avg
;   Vertical 8-tap filter over an 8-pixel-wide column; every loop pass
;   produces one 8-byte output row.
;     %1 = 0 : store the filtered row.
;     %1 = 1 : pavgb the filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT=0).  The loop is do-while shaped: it runs at least once.
;-----------------------------------------------------------------------
%macro VERTx8 1
    mov         rdx, arg(5)                 ; filter taps
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rcx, 0x0400040              ; words {64, 64}: rounding for the >> 7 below

    movdqa      xmm4, [rdx]                 ; 16 bytes of taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ; signed words -> signed bytes
    pshufd      xmm5, xmm5, 0               ; rounding value in every word

    ; Replicate each adjacent tap pair across a whole register.
    pshuflw     xmm0, xmm4, 00000000b       ; k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6_k7
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ; pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
    add         rax, rdx                    ; rax = rsi + pitch (odd rows)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ; pitch * 6

.loop:
    ; Eight consecutive source rows A..H, 8 bytes each.
    movq        xmm0, [rsi]                 ; A
    movq        xmm1, [rsi + rdx]           ; B
    movq        xmm2, [rsi + rdx * 2]       ; C
    movq        xmm3, [rax + rdx * 2]       ; D
    movq        xmm4, [rsi + rdx * 4]       ; E
    movq        xmm5, [rax + rdx * 4]       ; F
    movq        xmm6, [rsi + rbx]           ; G
    movq        xmm7, [rax + rbx]           ; H

    ; Interleave row pairs so pmaddubsw multiplies each pair by its taps.
    punpcklbw   xmm0, xmm1                  ; A B
    punpcklbw   xmm2, xmm3                  ; C D
    punpcklbw   xmm4, xmm5                  ; E F
    punpcklbw   xmm6, xmm7                  ; G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle products, then the larger one.
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ; max(CD, EF)
    pminsw      xmm4, xmm1                  ; min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ; + 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ; clamp to unsigned bytes

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movq        [rdi], xmm0

    add         rsi, rdx                    ; next source row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
187
188
;-----------------------------------------------------------------------
; VERTx16 avg
;   Vertical 8-tap filter over a 16-pixel-wide column.  Every loop pass
;   produces one output row as two independent 8-byte halves
;   (columns 0-7, then columns 8-15).
;     %1 = 0 : store the filtered row.
;     %1 = 1 : pavgb the filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT=0).  The loop is do-while shaped: it runs at least once.
;
;   Both halves accumulate the four products in the same order (outer
;   pairs, then min(CD,EF), then max(CD,EF)); paddsw saturates, so the
;   order of the adds can change the result.
;-----------------------------------------------------------------------
%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;words {64, 64}: rounding for >> 7

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;signed words -> signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding value in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = rsi + pitch (odd rows)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ; ---- columns 0-7 ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;outer pairs
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ; ---- columns 8-15 ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Same accumulation order as columns 0-7.  xmm1 is free here: its
    ; row-B bytes were consumed by the punpcklbw above.
    paddsw      xmm0, xmm6                  ;outer pairs
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi, rdx                    ;next source row
    add         rax, rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
311
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 4 pixels wide; filtered rows are stored
; directly (VERTx4 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 0

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
352
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 8 pixels wide; filtered rows are stored
; directly (VERTx8 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 0

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
393
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_v8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Vertical 8-tap filter, 16 pixels wide; filtered rows are stored
; directly (VERTx16 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 0

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
434
435 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
436
437
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v8_avg_ssse3
; Vertical 8-tap filter, 4 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (VERTx4 with
; averaging on).
; Arguments as read by VERTx4 through arg(n):
;     arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;     arg(3) out_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 1

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
469
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v8_avg_ssse3
; Vertical 8-tap filter, 8 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (VERTx8 with
; averaging on).
; Arguments as read by VERTx8 through arg(n):
;     arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;     arg(3) out_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 1

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
501
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v8_avg_ssse3
; Vertical 8-tap filter, 16 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (VERTx16 with
; averaging on).
; Arguments as read by VERTx16 through arg(n):
;     arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;     arg(3) out_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM and ALIGN_STACK are not
; defined in this file; the `pop rsp` in the epilog presumes ALIGN_STACK
; left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 1

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
533
534 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------
; HORIZx4_ROW src, tmp
;   Filters one row of four pixels horizontally.
;     %1  in : 16 source bytes
;         out: the 4 result bytes in its low dword (the same bytes are
;              repeated in the next dword by packuswb)
;     %2  scratch
;   Also overwrites xmm4, xmm5 and xmm6.
;   Needs k0k1k4k5 (k0_k1 in the low four words, k4_k5 in the high
;   four), k2k3k6k7 (likewise) and krd to be %define'd by the caller.
;-----------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2, %1
    pshufb      %1, [GLOBAL(shuf_t0t1)]     ; byte pairs (i, i+1),   i = 0..7
    pshufb      %2, [GLOBAL(shuf_t2t3)]     ; byte pairs (i+2, i+3), i = 0..7
    pmaddubsw   %1, k0k1k4k5                ; low: taps 0-1   high: taps 4-5
    pmaddubsw   %2, k2k3k6k7                ; low: taps 2-3   high: taps 6-7

    movdqa      xmm4, %1                    ; low words: taps 0-1
    movdqa      xmm5, %2                    ; low words: taps 2-3
    movdqa      xmm6, xmm5
    psrldq      %1, 8                       ; low words: taps 4-5
    psrldq      %2, 8                       ; low words: taps 6-7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle products, then the larger one.
    paddsw      xmm4, %2                    ; taps 0-1 + taps 6-7
    pmaxsw      xmm5, %1                    ; max(taps 2-3, taps 4-5)
    pminsw      %1, xmm6                    ; min(taps 2-3, taps 4-5)
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1, krd                     ; + rounding
    psraw       %1, 7
    packuswb    %1, %1                      ; clamp to unsigned bytes
%endm
558
;-----------------------------------------------------------------------
; HORIZx4 avg
;   Horizontal 8-tap filter, 4 pixels wide.  Rows are processed two per
;   loop pass; a trailing single row is handled when output_height is
;   odd.  An output_height below 2 skips the pair loop entirely.
;     %1 = 0 : store the filtered rows.
;     %1 = 1 : pavgb each filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1k4k5, k2k3k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;   Each row reads 16 source bytes starting at src - 3.
;-----------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;words {64, 64}: rounding for >> 7

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;signed words -> signed bytes
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of row pairs
    jz          .last_row                   ;no pair: the loop below would
                                            ;otherwise wrap rcx and run away
.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi +rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ; load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm
628
;-----------------------------------------------------------------------
; HORIZx8_ROW src, tmp1, tmp2, tmp3
;   Filters one row of eight pixels horizontally.
;     %1        in : 16 source bytes
;               out: the 8 result bytes in its low qword (repeated in
;                    the high qword by packuswb)
;     %2 %3 %4  scratch
;   Needs k0k1, k2k3, k4k5, k6k7 and krd to be %define'd by the caller.
;-----------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2, %1
    movdqa      %3, %1
    movdqa      %4, %1

    ; One shuffled copy per tap pair: byte pairs (i+t, i+t+1), i = 0..7.
    pshufb      %1, [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1, k0k1
    pshufb      %2, [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2, k2k3
    pshufb      %3, [GLOBAL(shuf_t4t5)]
    pmaddubsw   %3, k4k5
    pshufb      %4, [GLOBAL(shuf_t6t7)]
    pmaddubsw   %4, k6k7

    ; Saturating sum: outer pairs first, then the smaller of the two
    ; middle products, then the larger one.
    paddsw      %1, %4                      ; taps 0-1 + taps 6-7
    movdqa      %4, %2
    pmaxsw      %2, %3                      ; max(taps 2-3, taps 4-5)
    pminsw      %3, %4                      ; min(taps 2-3, taps 4-5)
    paddsw      %1, %3
    paddsw      %1, %2

    paddsw      %1, krd                     ; + rounding
    psraw       %1, 7
    packuswb    %1, %1                      ; clamp to unsigned bytes
%endm
655
;-----------------------------------------------------------------------
; HORIZx8 avg
;   Horizontal 8-tap filter, 8 pixels wide.  Rows are processed two per
;   loop pass; a trailing single row is handled when output_height is
;   odd.  An output_height below 2 skips the pair loop entirely.
;     %1 = 0 : store the filtered rows.
;     %1 = 1 : pavgb each filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;   Each row reads 16 source bytes starting at src - 3.
;-----------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;words {64, 64}: rounding for >> 7

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;signed words -> signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding value in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;rcx = number of row pairs
    jz          .last_row                   ;no pair: the loop below would
                                            ;otherwise wrap rcx and run away

.loop:
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done:
%endm
731
;-----------------------------------------------------------------------
; HORIZx16 avg
;   Horizontal 8-tap filter, 16 pixels wide, one row per loop pass.
;     %1 = 0 : store the filtered row.
;     %1 = 1 : pavgb the filtered row with the bytes already at the
;              destination, then store.
;   The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte memory slots that movdqa can address, and arg(n) must work.
;   Writes rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;   Each row reads 24 source bytes starting at src - 3.  The destination
;   is accessed with movdqa, which faults unless rdi is 16-byte aligned
;   on every row.  The loop is do-while shaped: it runs at least once.
;-----------------------------------------------------------------------
%macro HORIZx16 1
    mov         rdx, arg(5)                 ; filter taps
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rcx, 0x0400040              ; words {64, 64}: rounding for the >> 7 below

    movdqa      xmm4, [rdx]                 ; 16 bytes of taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ; signed words -> signed bytes
    pshufd      xmm5, xmm5, 0               ; rounding value in every word

    ; Replicate each adjacent tap pair across a whole register.
    pshuflw     xmm0, xmm4, 00000000b       ; k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ; k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ; k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ; k6_k7
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ; output_pitch
    movsxd      rcx, dword ptr arg(4)       ; output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0, [rsi - 3]             ; bytes -3 .. 4
    movq        xmm4, [rsi + 5]             ; bytes  5 .. 12
    movq        xmm7, [rsi + 13]            ; bytes 13 .. 20
    punpcklqdq  xmm0, xmm4                  ; window for outputs 0-7
    punpcklqdq  xmm4, xmm7                  ; window for outputs 8-15

    ; ---- outputs 0-7: xmm0..xmm3 ----
    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pshufb      xmm3, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm1, k2k3
    pmaddubsw   xmm2, k4k5
    pmaddubsw   xmm3, k6k7

    ; ---- outputs 8-15: xmm4..xmm7 ----
    movdqa      xmm5, xmm4
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    pshufb      xmm4, [GLOBAL(shuf_t0t1)]
    pshufb      xmm5, [GLOBAL(shuf_t2t3)]
    pshufb      xmm6, [GLOBAL(shuf_t4t5)]
    pshufb      xmm7, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4, k0k1
    pmaddubsw   xmm5, k2k3
    pmaddubsw   xmm6, k4k5
    pmaddubsw   xmm7, k6k7

    ; Saturating sums: outer pairs first, then the smaller of the two
    ; middle products, then the larger one.
    paddsw      xmm0, xmm3
    movdqa      xmm3, xmm1
    pmaxsw      xmm1, xmm2
    pminsw      xmm2, xmm3
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1

    paddsw      xmm4, xmm7
    movdqa      xmm7, xmm5
    pmaxsw      xmm5, xmm6
    pminsw      xmm6, xmm7
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm5

    paddsw      xmm0, krd                   ; + rounding
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    paddsw      xmm4, krd
    psraw       xmm4, 7
    packuswb    xmm4, xmm4
    punpcklqdq  xmm0, xmm4                  ; 16 output bytes
%if %1
    movdqa      xmm1, [rdi]
    pavgb       xmm0, xmm1                  ; average with existing output
%endif
    movdqa      [rdi], xmm0

    lea         rsi, [rsi + rax]            ; next source row
    lea         rdi, [rdi + rdx]            ; next destination row
    dec         rcx
    jnz         .loop
%endm
829
;-----------------------------------------------------------------------
; void vp9_filter_block1d4_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 4 pixels wide; filtered rows are stored
; directly (HORIZx4 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Three 16-byte scratch slots: two combined tap pairs plus rounding.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 0

    add         rsp, 16 * 3                 ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
868
;-----------------------------------------------------------------------
; void vp9_filter_block1d8_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 8 pixels wide; filtered rows are stored
; directly (HORIZx8 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 0

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
910
;-----------------------------------------------------------------------
; void vp9_filter_block1d16_h8_ssse3
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     unsigned int   output_pitch,
;     unsigned int   output_height,
;     short         *filter
; )
; Horizontal 8-tap filter, 16 pixels wide; filtered rows are stored
; directly (HORIZx16 with averaging off).
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 0

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
952
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h8_avg_ssse3
; Horizontal 8-tap filter, 4 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (HORIZx4 with
; averaging on).
; Arguments as read by HORIZx4 through arg(n):
;     arg(0) src_ptr, arg(1) src_pixels_per_line, arg(2) output_ptr,
;     arg(3) output_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Three 16-byte scratch slots: two combined tap pairs plus rounding.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 1

    add         rsp, 16 * 3                 ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
982
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h8_avg_ssse3
; Horizontal 8-tap filter, 8 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (HORIZx8 with
; averaging on).
; Arguments as read by HORIZx8 through arg(n):
;     arg(0) src_ptr, arg(1) src_pixels_per_line, arg(2) output_ptr,
;     arg(3) output_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 1

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1015
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h8_avg_ssse3
; Horizontal 8-tap filter, 16 pixels wide; each filtered row is averaged
; (pavgb) with the bytes already at the destination (HORIZx16 with
; averaging on).
; Arguments as read by HORIZx16 through arg(n):
;     arg(0) src_ptr, arg(1) src_pixels_per_line, arg(2) output_ptr,
;     arg(3) output_pitch, arg(4) output_height, arg(5) filter
; NOTE(review): SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and ALIGN_STACK
; are not defined in this file; the `pop rsp` in the epilog presumes
; ALIGN_STACK left the unaligned rsp on top of the stack - confirm in
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Five 16-byte scratch slots: four tap pairs plus the rounding term.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 1

    add         rsp, 16*5                   ; release the scratch slots
    pop         rsp                         ; presumably the rsp saved by ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; pshufb control masks used by the HORIZ* macros.  Mask shuf_tNtM picks
; the byte pairs (i+N, i+M) for i = 0..7 out of a 16-byte source window,
; so that pmaddubsw can multiply each pair by the tap pair kN_kM.
align 16
shuf_t0t1:
    db  0,  1,  1,  2,  2,  3,  3,  4
    db  4,  5,  5,  6,  6,  7,  7,  8
align 16
shuf_t2t3:
    db  2,  3,  3,  4,  4,  5,  5,  6
    db  6,  7,  7,  8,  8,  9,  9, 10
align 16
shuf_t4t5:
    db  4,  5,  5,  6,  6,  7,  7,  8
    db  8,  9,  9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6,  7,  7,  8,  8,  9,  9, 10
    db 10, 11, 11, 12, 12, 13, 13, 14

mercurial