Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
14 %define BLOCK_HEIGHT_WIDTH 4
15 %define VP8_FILTER_WEIGHT 128
16 %define VP8_FILTER_SHIFT 7
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
; row each iteration to take advantage of the 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Horizontal filter pass producing 8 output pixels per row, one row per loop
; iteration. vp8_filter_index is scaled by 16 and selects one 16-byte row in
; each of k0_k5, k1_k3 (k0_k5 + 128) and k2_k4 (k0_k5 + 256).
; If the first dword of the selected k0_k5 row is zero (k0 == k5 == 0) the
; outer taps are skipped and the 4-tap loop is used instead.
;
; Register roles inside both loops:
;   rsi = source row              rax = source stride
;   rdi = destination row         rdx = destination pitch
;   rcx = rows remaining          xmm7 = rounding constant (rd)
; rdi is pre-decremented by one pitch because each iteration advances it
; before the store.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared against taps below
    shl         rdx, 4                          ; 16 bytes per table row

    movdqa      xmm7, [GLOBAL(rd)]              ; rounding constant

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row
    mov         rdi, arg(2)                     ; output_ptr

    cmp         esi, DWORD PTR [rax]            ; k0 == k5 == 0 ?
    je          .four_tap                       ; local label, like the sibling functions

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    sub         rdi, rdx                        ; compensate for the lea at loop top
    ; xmm3 is unused on this path
.six_tap_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm2                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0*p[x-2] + k5*p[x+3]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ; (p[x],   p[x+2]) pairs
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ; (p[x-1], p[x+1]) pairs
    pmaddubsw   xmm1, xmm5                      ; k2*p[x]   + k4*p[x+2]

    lea         rdi, [rdi + rdx]                ; next destination row
    pmaddubsw   xmm2, xmm6                      ; k1*p[x-1] + k3*p[x+1]

    lea         rsi, [rsi + rax]                ; next source row
    dec         rcx                             ; ZF is consumed by jnz below; the
                                                ; SSE instructions in between leave
                                                ; the flags untouched
    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                      ; + rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0                      ; clamp to 0..255

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: only k1..k4 are applied. rax still points at the selected
; k0_k5 row and rdi already holds output_ptr.
.four_tap:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    sub         rdi, rdx                        ; compensate for the lea at loop top

.four_tap_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm1                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3                      ; (p[x],   p[x+2]) pairs
    pshufb      xmm2, xmm4                      ; (p[x-1], p[x+1]) pairs
    pmaddubsw   xmm0, xmm5

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    dec         rcx                             ; ZF is consumed by jnz below

    paddsw      xmm0, xmm7                      ; + rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Horizontal 6-tap pass producing 16 output pixels per row. Each iteration
; filters the left 8 pixels and the right 8 pixels separately and writes
; them with a single aligned 16-byte store (movdqa), so every output row
; must be 16-byte aligned.
; Unlike the 8- and 4-wide variants there is no 4-tap shortcut here.
;
; Register roles inside the loop:
;   rsi = source row              rax = source stride
;   rdi = destination row         rdx = destination pitch
;   rcx = rows remaining
;   xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3 tap pairs
; All eight xmm registers are in use, so the rounding constant is added
; straight from memory.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    shl         rdx, 4                          ; 16 bytes per table row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row

    mov         rdi, arg(2)                     ; output_ptr
    mov         rsi, arg(0)                     ; src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

.row_loop:
    ; ---- left 8 pixels ----
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm3                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0/k5 contribution

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    ; ---- right 8 pixels, interleaved with the left half ----
    movq        xmm3, MMWORD PTR [rsi + 6]      ; pixels 6..13
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution (left)
    movq        xmm7, MMWORD PTR [rsi + 11]     ; pixels 11..18
    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution (left)
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3
    pmaddubsw   xmm3, xmm4                      ; k0/k5 contribution (right)
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]              ; + rounding (left)

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, VP8_FILTER_SHIFT
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution (right)
    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution (right)
    packuswb    xmm0, xmm0                      ; left 8 results in the low qword

    lea         rsi, [rsi + rax]                ; next source row
    paddsw      xmm3, xmm1
    paddsw      xmm3, xmm2
    paddsw      xmm3, [GLOBAL(rd)]              ; + rounding (right)
    psraw       xmm3, VP8_FILTER_SHIFT
    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3                      ; left | right
    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store

    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
    jnz         .row_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Horizontal filter pass producing 4 output pixels per row. Each iteration
; does one unaligned 16-byte load starting at src - 2 and builds the tap
; pairs with pshufb (shuf1b / shuf2b / shuf3b).
; If the first dword of the selected k0_k5 row is zero (k0 == k5 == 0) the
; 4-tap loop is used instead.
;
; Register roles inside both loops:
;   rsi = source row              rax = source stride
;   rdi = destination row         rdx = destination pitch
;   rcx = rows remaining          xmm7 = rounding constant (rd)
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared against taps below
    shl         rdx, 4                          ; 16 bytes per table row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row
    movdqa      xmm7, [GLOBAL(rd)]              ; rounding constant

    cmp         esi, DWORD PTR [rax]            ; k0 == k5 == 0 ?
    je          .four_tap

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

    ; xmm3 is unused on this path
.six_tap_row:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]     ; 16 bytes starting at p[-2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; (p[x-2], p[x+3]) pairs

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; (p[x],   p[x+2]) pairs
    pmaddubsw   xmm0, xmm4                      ; k0/k5 contribution
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; (p[x-1], p[x+1]) pairs
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution
    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution

    lea         rsi, [rsi + rax]                ; next source row

    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                      ; + rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0                      ; clamp to 0..255

    movd        DWORD PTR [rdi], xmm0           ; 4 output pixels

    add         rdi, rdx                        ; next destination row
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: only k1..k4 are applied; rax still points at the selected
; k0_k5 row. The shuffle masks are kept in registers here.
.four_tap:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rdx, dword ptr arg(3)           ; output_pitch

.four_tap_row:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]     ; 16 bytes starting at p[-2]

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                      ; shuf2b: (p[x],   p[x+2]) pairs
    pshufb      xmm2, xmm3                      ; shuf3b: (p[x-1], p[x+1]) pairs
    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]                ; next source row

    paddsw      xmm1, xmm7                      ; + rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1           ; 4 output pixels

    add         rdi, rdx                        ; next destination row
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter pass, 16 pixels wide, one output row per iteration.
; Six consecutive input rows are labelled A..F, with A at src_ptr:
;   A = [rsi]          B = [rsi + pitch]      C = [rsi + 2*pitch]
;   D = [rax + 2*pitch]  E = [rsi + 4*pitch]  F = [rax + 4*pitch]
; where rax = rsi + pitch. Rows are paired as (A,F) * k0_k5, (B,D) * k1_k3
; and (C,E) * k2_k4.
; If the first dword of the selected k0_k5 row is zero, rows A and F are
; skipped (4-tap path). The 6-tap path writes each row as two 8-byte
; stores; the 4-tap path writes one aligned 16-byte store (movdqa).
;
; On 64-bit builds out_pitch is kept in r8; on 32-bit builds it is re-read
; from the argument area every row.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the tap test
    shl         rdx, 4                          ; 16 bytes per table row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row

    cmp         esi, DWORD PTR [rax]            ; k0 == k5 == 0 ?
    je          .four_tap

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr (row A)
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    lea         rax, [rsi + rdx]                ; rax = row B

.six_tap_row:
    ; ---- left 8 columns ----
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F

    pmaddubsw   xmm3, xmm6                              ; k2/k4
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7                              ; k1/k3
    pmaddubsw   xmm1, xmm5                              ; k0/k5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2                  ; columns 0..7

    ; ---- right 8 columns ----
    movq        xmm1, MMWORD PTR [rsi + 8]              ; A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ; F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2                ; columns 8..15

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: rows B..E only; rax still points at the selected k0_k5 row.
.four_tap:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    lea         rax, [rsi + rdx]                ; rax = row B

.four_tap_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ; B (right half)
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    paddsw      xmm2, [GLOBAL(rd)]                      ; + rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2                              ; columns 0..7

    punpcklbw   xmm5, xmm4                              ; B D
    punpcklbw   xmm1, xmm0                              ; C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                              ; + rounding
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5                              ; columns 8..15

    punpcklqdq  xmm2, xmm5                              ; left | right

    movdqa      XMMWORD PTR [rdi], xmm2                 ; aligned 16-byte store

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter pass, 8 pixels wide, one output row per iteration.
; Input rows A..F start at src_ptr and are paired as (A,F) * k0_k5,
; (B,D) * k1_k3 and (C,E) * k2_k4; rax is kept one row below rsi so that
; rows D and F can be addressed as [rax + 2*pitch] and [rax + 4*pitch].
; If the first dword of the selected k0_k5 row is zero, rows A and F are
; skipped (4-tap path).
;
; rdx (src_pitch), rdi (output_ptr), rcx (output_height) and, on 64-bit
; builds, r8 (out_pitch) are loaded before the tap test and shared by both
; paths.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the tap test
    shl         rdx, 4                          ; 16 bytes per table row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row

    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; k0 == k5 == 0 ?
    je          .four_tap

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr (row A)
    lea         rax, [rsi + rdx]                ; rax = row B

.six_tap_row:
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F
    movdqa      xmm4, [GLOBAL(rd)]                      ; rounding constant

    pmaddubsw   xmm3, xmm6                              ; k2/k4
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7                              ; k1/k3
    pmaddubsw   xmm1, xmm5                              ; k0/k5
    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                              ; + rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2                  ; 8 output pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: rows B..E only; rax still points at the selected k0_k5 row.
.four_tap:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm5, [GLOBAL(rd)]              ; rounding constant

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = row B

.four_tap_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                              ; + rounding
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2                  ; 8 output pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical filter pass, 4 pixels wide, one output row per iteration.
; Same row pairing as the 8-wide variant ((A,F) * k0_k5, (B,D) * k1_k3,
; (C,E) * k2_k4) but carried out in the 64-bit MMX registers, so only the
; first 8 bytes of each table row are loaded and no SAVE_XMM/RESTORE_XMM
; pair is needed.
; If the first dword of the selected k0_k5 row is zero, rows A and F are
; skipped (4-tap path).
;
; NOTE(review): mm0-mm7 are written and no emms is executed before ret;
; presumably the caller is expected to clear the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the tap test
    shl         rdx, 4                          ; 16 bytes per table row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected k0_k5 row

    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; k0 == k5 == 0 ?
    je          .four_tap

    movq        mm5, MMWORD PTR [rax]           ; k0_k5
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr (row A)
    lea         rax, [rsi + rdx]                ; rax = row B

.six_tap_row:
    movd        mm1, DWORD PTR [rsi]                    ; A
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                                ; B D
    punpcklbw   mm3, mm0                                ; C E

    movd        mm0, DWORD PTR [rax + rdx * 4]          ; F

    movq        mm4, [GLOBAL(rd)]                       ; rounding constant

    pmaddubsw   mm3, mm6                                ; k2/k4
    punpcklbw   mm1, mm0                                ; A F
    pmaddubsw   mm2, mm7                                ; k1/k3
    pmaddubsw   mm1, mm5                                ; k0/k5
    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4                                ; + rounding
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2                    ; 4 output pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: rows B..E only; rax still points at the selected k0_k5 row.
.four_tap:
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]    ; rounding constant

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = row B

.four_tap_row:
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                                ; B D
    punpcklbw   mm3, mm0                                ; C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi, rdx                                ; slide the window
    add         rax, rdx                                ; down one row

    paddsw      mm2, mm3
    paddsw      mm2, mm5                                ; + rounding
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2                    ; 4 output pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
;
; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte row
; of vp8_bilinear_filters_ssse3 (offset << 4). Three code paths:
;   both offsets non-zero : horizontal pass then vertical pass per row
;   xoffset == 0          : vertical pass only   (.vert_only)
;   yoffset == 0          : horizontal pass only (.horiz_only)
; Every loop ends when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
; Destination rows are written with movdqa and must be 16-byte aligned.
;
; NOTE(review): when xoffset == 0 the vertical-only path uses yoffset
; without testing it for zero. Entry 0 of the table is (128, 0), and
; pmaddubsw reads its second operand as signed bytes, so presumably
; callers never pass xoffset == yoffset == 0 - confirm against the caller.
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)           ; xoffset

    test        rax, rax                        ; xoffset == 0: no horizontal pass
    je          .vert_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; horizontal filter row

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal taps

    movsxd      rax, dword ptr arg(3)           ; yoffset

    test        rax, rax                        ; yoffset == 0: no vertical pass
    je          .horiz_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; vertical filter row

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rdx, dword ptr arg(1)           ; rdx now = src_pixels_per_line

    movdqa      xmm2, [rax]                     ; xmm2 = vertical taps

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; dst_pitch
%endif
    ; Horizontal pass over source row 0; result kept in xmm7.
    movq        xmm3, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm3, xmm1                      ; columns 0..7

    punpcklbw   xmm4, xmm5                      ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1                      ; columns 8..15

    paddw       xmm3, [GLOBAL(rd)]              ; + rounding
    psraw       xmm3, VP8_FILTER_SHIFT          ; >> 7

    paddw       xmm4, [GLOBAL(rd)]              ; + rounding
    psraw       xmm4, VP8_FILTER_SHIFT          ; >> 7

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                      ; 16 filtered bytes of the previous row

.two_pass_row:
    ; Horizontal pass over the next source row -> xmm6.
    movq        xmm6, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]              ; + rounding
    psraw       xmm6, VP8_FILTER_SHIFT          ; >> 7

    paddw       xmm4, [GLOBAL(rd)]              ; + rounding
    psraw       xmm4, VP8_FILTER_SHIFT          ; >> 7

    packuswb    xmm6, xmm4
    movdqa      xmm5, xmm7

    ; Vertical pass between the previous (xmm7) and current (xmm6) rows.
    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]              ; + rounding
    psraw       xmm5, VP8_FILTER_SHIFT          ; >> 7

    paddw       xmm7, [GLOBAL(rd)]              ; + rounding
    psraw       xmm7, VP8_FILTER_SHIFT          ; >> 7

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                      ; current row becomes previous row

    movdqa      [rdi], xmm5                     ; store one destination row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .two_pass_row

    jmp         .done

; Vertical pass only (xoffset == 0). Two destination rows per iteration.
; rcx still holds the filter table base on entry.
.vert_only:
    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; vertical filter row

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = vertical taps

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

    ; Prime xmm4 / xmm2 with the two halves of source row 0.
    movq        xmm4, [rsi]                     ; row 0, columns 0..7
    movq        xmm2, [rsi + 8]                 ; row 0, columns 8..15

    lea         rsi, [rsi + rax]                ; next source row
.vert_row_pair:
    movq        xmm3, [rsi]                     ; row + 1, columns 0..7
    movq        xmm5, [rsi + 8]                 ; row + 1, columns 8..15

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]               ; row + 2, columns 0..7

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]           ; row + 2, columns 8..15

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4                     ; first row of the pair
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7                      ; row + 2 primes the next iteration

    movdqa      [rdi + rdx],xmm3                ; second row of the pair
    lea         rsi, [rsi + 2*rax]

    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .vert_row_pair

    jmp         .done

; Horizontal pass only (yoffset == 0). Two destination rows per iteration.
; On entry rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch and
; xmm1 = horizontal taps.
.horiz_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line

.horiz_row_pair:
    movq        xmm2, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]                ; next source row
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2                     ; first row of the pair
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]                ; + dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]                ; next source row

    movdqa      [rdi], xmm5                     ; second row of the pair
    lea         rdi, [rdi + rdx]                ; + dst_pitch

    cmp         rdi, rcx
    jne         .horiz_row_pair

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
;
; 8x8 bilinear prediction. Nine source rows are first copied (16 bytes
; each, unaligned loads) into a 144-byte aligned scratch area on the stack;
; the filters then read from that area. Three code paths:
;   both offsets non-zero : horizontal pass then vertical pass per row
;   xoffset == 0          : vertical pass only   (.vert_only)
;   yoffset == 0          : horizontal pass only (.horiz_only)
;
; Stack discipline: rsp itself is used as the read cursor in two of the
; paths. Every path advances rsp by exactly 144 bytes in total before
; reaching .done8x8, where `pop rsp` runs. ALIGN_STACK is defined in the
; included file; presumably it leaves the original rsp on the stack for
; that pop - confirm before changing any rsp adjustment here.
;
; NOTE(review): as in the 16x16 variant, the vertical-only path uses
; yoffset without testing it for zero; presumably callers never pass
; xoffset == yoffset == 0 - confirm against the caller.
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                        ; 9 rows * 16 bytes of scratch

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line

    ; Load nine source rows (unaligned) and park them on the aligned stack.
    movdqu      xmm0, [rsi]                     ; row 0
    lea         rax, [rdx + rdx*2]              ; rax = 3 * stride
    movdqu      xmm1, [rsi+rdx]                 ; row 1
    movdqu      xmm2, [rsi+rdx*2]               ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                     ; row 3
    movdqu      xmm4, [rsi+rdx]                 ; row 4
    movdqu      xmm5, [rsi+rdx*2]               ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                     ; row 6
    movdqu      xmm7, [rsi+rdx]                 ; row 7

    movdqa      XMMWORD PTR [rsp], xmm0

    movdqu      xmm0, [rsi+rdx*2]               ; row 8 (xmm0 reused)

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)           ; xoffset
    test        rax, rax                        ; xoffset == 0: no horizontal pass
    je          .vert_only

    shl         rax, 4
    add         rax, rcx                        ; horizontal filter row

    mov         rdi, arg(4)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm0, [rax]                     ; xmm0 = horizontal taps

    movsxd      rax, dword ptr arg(3)           ; yoffset
    test        rax, rax                        ; yoffset == 0: no vertical pass
    je          .horiz_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; vertical filter row

    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = vertical taps

    ; Horizontal pass over stored row 0; result kept in xmm7.
    movdqa      xmm3, [rsp]                     ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5, xmm3

    psrldq      xmm5, 1                         ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
    lea         rsp, [rsp + 16]                 ; cursor -> row 1 (16 of 144 bytes)

    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3, xmm0

    paddw       xmm3, [GLOBAL(rd)]              ; + rounding
    psraw       xmm3, VP8_FILTER_SHIFT          ; >> 7

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm7                      ; filtered previous row, 8 bytes

.two_pass_row:
    movdqa      xmm6, [rsp]                     ; next stored row
    lea         rsp, [rsp + 16]                 ; 8 iterations * 16 = remaining 128 bytes

    movdqa      xmm5, xmm6

    psrldq      xmm5, 1

    punpcklbw   xmm6, xmm5
    pmaddubsw   xmm6, xmm0                      ; horizontal pass

    paddw       xmm6, [GLOBAL(rd)]              ; + rounding
    psraw       xmm6, VP8_FILTER_SHIFT          ; >> 7

    packuswb    xmm6, xmm6

    punpcklbw   xmm7, xmm6                      ; previous / current row pairs
    pmaddubsw   xmm7, xmm1                      ; vertical pass

    paddw       xmm7, [GLOBAL(rd)]              ; + rounding
    psraw       xmm7, VP8_FILTER_SHIFT          ; >> 7

    packuswb    xmm7, xmm7

    movq        [rdi], xmm7                     ; store one destination row
    lea         rdi, [rdi + rdx]

    movdqa      xmm7, xmm6                      ; current row becomes previous row

    cmp         rdi, rcx
    jne         .two_pass_row

    jmp         .done8x8

; Vertical pass only (xoffset == 0), fully unrolled over the nine stored
; rows. rcx still holds the filter table base on entry.
.vert_only:
    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; vertical filter row

    mov         rdi, arg(4)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; dst_pitch

    movdqa      xmm0, [rax]                     ; xmm0 = vertical taps

    movq        xmm1, XMMWORD PTR [rsp]         ; row 0
    movq        xmm2, XMMWORD PTR [rsp+16]      ; row 1

    movq        xmm3, XMMWORD PTR [rsp+32]      ; row 2
    punpcklbw   xmm1, xmm2                      ; rows 0/1

    movq        xmm4, XMMWORD PTR [rsp+48]      ; row 3
    punpcklbw   xmm2, xmm3                      ; rows 1/2

    movq        xmm5, XMMWORD PTR [rsp+64]      ; row 4
    punpcklbw   xmm3, xmm4                      ; rows 2/3

    movq        xmm6, XMMWORD PTR [rsp+80]      ; row 5
    punpcklbw   xmm4, xmm5                      ; rows 3/4

    movq        xmm7, XMMWORD PTR [rsp+96]      ; row 6
    punpcklbw   xmm5, xmm6                      ; rows 4/5

    pmaddubsw   xmm1, xmm0
    pmaddubsw   xmm2, xmm0

    pmaddubsw   xmm3, xmm0
    pmaddubsw   xmm4, xmm0

    pmaddubsw   xmm5, xmm0
    punpcklbw   xmm6, xmm7                      ; rows 5/6

    pmaddubsw   xmm6, xmm0
    paddw       xmm1, [GLOBAL(rd)]

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    psraw       xmm6, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    packuswb    xmm2, xmm2
    movq        [rdi], xmm1                     ; destination row 0

    packuswb    xmm3, xmm3
    movq        [rdi+rdx], xmm2                 ; destination row 1

    packuswb    xmm4, xmm4
    movq        xmm1, XMMWORD PTR [rsp+112]     ; row 7

    lea         rdi, [rdi + 2*rdx]
    movq        xmm2, XMMWORD PTR [rsp+128]     ; row 8

    packuswb    xmm5, xmm5
    movq        [rdi], xmm3                     ; destination row 2

    packuswb    xmm6, xmm6
    movq        [rdi+rdx], xmm4                 ; destination row 3

    lea         rdi, [rdi + 2*rdx]
    punpcklbw   xmm7, xmm1                      ; rows 6/7

    movq        [rdi], xmm5                     ; destination row 4
    pmaddubsw   xmm7, xmm0

    movq        [rdi+rdx], xmm6                 ; destination row 5
    punpcklbw   xmm1, xmm2                      ; rows 7/8

    pmaddubsw   xmm1, xmm0
    paddw       xmm7, [GLOBAL(rd)]

    psraw       xmm7, VP8_FILTER_SHIFT
    paddw       xmm1, [GLOBAL(rd)]

    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm7, xmm7

    packuswb    xmm1, xmm1
    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm7                     ; destination row 6

    movq        [rdi+rdx], xmm1                 ; destination row 7
    lea         rsp, [rsp + 144]                ; skip the whole scratch area

    jmp         .done8x8

; Horizontal pass only (yoffset == 0). Four destination rows per iteration.
; On entry rdi = dst_ptr, rdx = dst_pitch and xmm0 = horizontal taps.
.horiz_only:
    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch

.horiz_four_rows:
    movdqa      xmm1, XMMWORD PTR [rsp]
    movdqa      xmm3, XMMWORD PTR [rsp+16]

    movdqa      xmm2, xmm1
    movdqa      xmm5, XMMWORD PTR [rsp+32]

    psrldq      xmm2, 1
    movdqa      xmm7, XMMWORD PTR [rsp+48]

    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    movdqa      xmm6, xmm5
    psrldq      xmm6, 1

    punpcklbw   xmm1, xmm2
    pmaddubsw   xmm1, xmm0

    punpcklbw   xmm3, xmm4
    pmaddubsw   xmm3, xmm0

    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm0

    movdqa      xmm2, xmm7
    psrldq      xmm2, 1

    punpcklbw   xmm7, xmm2
    pmaddubsw   xmm7, xmm0

    paddw       xmm1, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm1
    packuswb    xmm3, xmm3

    packuswb    xmm5, xmm5
    movq        [rdi], xmm1

    packuswb    xmm7, xmm7
    movq        [rdi+rdx], xmm3

    lea         rdi, [rdi + 2*rdx]
    movq        [rdi], xmm5

    lea         rsp, [rsp + 64]                 ; four rows consumed (2 * 64 = 128 bytes)
    movq        [rdi+rdx], xmm7

    lea         rdi, [rdi + 2*rdx]
    cmp         rdi, rcx

    jne         .horiz_four_rows

    lea         rsp, [rsp + 16]                 ; skip the unused ninth row (total 144)

.done8x8:
    ; rsp is now 144 bytes above the start of the scratch area
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA

; pshufb masks for the 4-wide horizontal filter. They are applied to a
; 16-byte load that starts at p[-2], so byte index i holds pixel i - 2.
align 16
shuf1b:                                         ; (p[x-2], p[x+3]) pairs -> k0_k5
    db  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
shuf2b:                                         ; (p[x],   p[x+2]) pairs -> k2_k4
    db  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11
shuf3b:                                         ; (p[x-1], p[x+1]) pairs -> k1_k3
    db  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10

; pshufb masks for the 8/16-wide horizontal filters. They are applied to
; the interleaved vector  -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10.
align 16
shuf2bfrom1:                                    ; (p[x],   p[x+2]) pairs -> k2_k4
    db  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11,  9, 13
align 16
shuf3bfrom1:                                    ; (p[x-1], p[x+1]) pairs -> k1_k3
    db  2,  6,  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11

; Rounding constant: 0x40 in each of the eight words, added before the
; arithmetic shift right by VP8_FILTER_SHIFT.
align 16
rd:
    times 8 dw 0x40

; Six-tap filter coefficients as byte pairs for pmaddubsw. Each table has
; eight 16-byte rows indexed by vp8_filter_index. The three tables must
; stay contiguous: the code reaches k1_k3 and k2_k4 as k0_k5 + 128 and
; k0_k5 + 256.
align 16
k0_k5:
    times 8 db   0,   0                         ; index 0, placeholder
    times 8 db   0,   0                         ; index 1
    times 8 db   2,   1                         ; index 2
    times 8 db   0,   0                         ; index 3
    times 8 db   3,   3                         ; index 4
    times 8 db   0,   0                         ; index 5
    times 8 db   1,   2                         ; index 6
    times 8 db   0,   0                         ; index 7
k1_k3:
    times 8 db   0,   0                         ; index 0, placeholder
    times 8 db  -6,  12                         ; index 1
    times 8 db -11,  36                         ; index 2
    times 8 db  -9,  50                         ; index 3
    times 8 db -16,  77                         ; index 4
    times 8 db  -6,  93                         ; index 5
    times 8 db  -8, 108                         ; index 6
    times 8 db  -1, 123                         ; index 7
k2_k4:
    times 8 db 128,   0                         ; index 0, placeholder
    times 8 db 123,  -1                         ; index 1
    times 8 db 108,  -8                         ; index 2
    times 8 db  93,  -6                         ; index 3
    times 8 db  77, -16                         ; index 4
    times 8 db  50,  -9                         ; index 5
    times 8 db  36, -11                         ; index 6
    times 8 db  12,  -6                         ; index 7

; Bilinear filter byte pairs, one 16-byte row per offset 0..7.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128,   0                         ; offset 0
    times 8 db 112,  16                         ; offset 1
    times 8 db  96,  32                         ; offset 2
    times 8 db  80,  48                         ; offset 3
    times 8 db  64,  64                         ; offset 4
    times 8 db  48,  80                         ; offset 5
    times 8 db  32,  96                         ; offset 6
    times 8 db  16, 112                         ; offset 7