Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
12 %include "vpx_ports/x86_abi_support.asm"
;Note: tap3 and tap4 have to be applied and added after the other taps to
;avoid overflow.
;------------------------------------------------------------------------------
; GET_FILTERS_4
;   Reads the eight 16-bit taps addressed by arg(5) and fills the slots that
;   the invoking function has %define'd for APPLY_FILTER_4:
;     k0k1, k2k3, k5k4, k6k7 : two taps per slot, each tap replicated over
;                              four words; the first-named tap occupies the
;                              low quadword
;     krd                    : 0x0040 in every word (rounding term)
;     zero                   : all bits clear
;   The taps are fetched with movdqa, so the filter pointer has to be 16-byte
;   aligned.
;   Modifies: rcx, rdx, xmm0-xmm7.
;------------------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx = filter
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    ;taps 0 and 1
    pshuflw     xmm0, xmm7, 0b              ;k0 x 4
    pshuflw     xmm1, xmm7, 01010101b       ;k1 x 4
    punpcklqdq  xmm0, xmm1
    movdqa      k0k1, xmm0

    ;taps 2 and 3
    pshuflw     xmm2, xmm7, 10101010b       ;k2 x 4
    pshuflw     xmm3, xmm7, 11111111b       ;k3 x 4
    punpcklqdq  xmm2, xmm3
    movdqa      k2k3, xmm2

    psrldq      xmm7, 8                     ;k4 .. k7 now in the low quadword

    ;taps 5 and 4 (k5 in the low quadword)
    pshuflw     xmm4, xmm7, 0b              ;k4 x 4
    pshuflw     xmm5, xmm7, 01010101b       ;k5 x 4
    punpcklqdq  xmm5, xmm4
    movdqa      k5k4, xmm5

    ;taps 6 and 7
    pshuflw     xmm6, xmm7, 10101010b       ;k6 x 4
    pshuflw     xmm7, xmm7, 11111111b       ;k7 x 4, tap vector consumed
    punpcklqdq  xmm6, xmm7
    movdqa      k6k7, xmm6

    ;rounding term: broadcast the low dword 0x00400040
    mov         rcx, 0x0400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
;------------------------------------------------------------------------------
; APPLY_FILTER_4 avg
;   Filters four pixels and stores the four result bytes at [rdi].
;   In : xmm0 .. xmm7 = source bytes for taps 0 .. 7 (low four bytes of each)
;        k0k1, k2k3, k5k4, k6k7, krd, zero as written by GET_FILTERS_4
;   %1 : non-zero -> pavgb the result with the bytes already at [rdi]
;   Modifies: xmm0, xmm1, xmm2, xmm5, xmm6.
;------------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    ;two taps per register, matching the layout of the coefficient slots
    punpckldq   xmm0, xmm1                  ;tap 0 | tap 1
    punpckldq   xmm2, xmm3                  ;tap 2 | tap 3
    punpckldq   xmm5, xmm4                  ;tap 5 | tap 4
    punpckldq   xmm6, xmm7                  ;tap 6 | tap 7

    ;widen bytes to words
    punpcklbw   xmm0, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm6, zero

    ;scale by the coefficients
    pmullw      xmm0, k0k1
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4
    pmullw      xmm6, k6k7

    ;saturating accumulation; the order of the additions is significant and
    ;the tap 3 / tap 4 products are added last
    paddsw      xmm0, xmm6                  ;t0+t6 | t1+t7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1                  ;low half: (t0+t6) + (t1+t7)
    paddsw      xmm0, xmm2                  ;+ t2
    psrldq      xmm2, 8                     ;t3 down to the low half
    paddsw      xmm0, xmm5                  ;+ t5
    psrldq      xmm5, 8                     ;t4 down to the low half
    paddsw      xmm0, xmm2                  ;+ t3
    paddsw      xmm0, xmm5                  ;+ t4

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7                     ;scale back
    packuswb    xmm0, xmm0                  ;words -> bytes, unsigned saturation

%if %1
    movd        xmm1, [rdi]                 ;bytes currently in the destination
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm
;------------------------------------------------------------------------------
; GET_FILTERS
;   Loads rsi = arg(0) (src_ptr) and rdi = arg(2) (output_ptr), then reads the
;   eight 16-bit taps addressed by arg(5) and fills the slots that the
;   invoking function has %define'd for APPLY_FILTER_8:
;     k0 .. k7 : the respective tap replicated over all eight words
;     krd      : 0x0040 in every word (rounding term)
;     zero     : all bits clear
;   The taps are fetched with movdqa, so the filter pointer has to be 16-byte
;   aligned.
;   Modifies: rcx, rdx, rsi, rdi, xmm0-xmm7.
;------------------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;rsi = src_ptr
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rdx, arg(5)                 ;rdx = filter
    movdqa      xmm7, [rdx]                 ;xmm7 = k0 .. k7

    ;taps 0..3 live in the low quadword
    pshuflw     xmm0, xmm7, 0b              ;k0
    punpcklwd   xmm0, xmm0
    movdqa      k0, xmm0

    pshuflw     xmm1, xmm7, 01010101b       ;k1
    punpcklwd   xmm1, xmm1
    movdqa      k1, xmm1

    pshuflw     xmm2, xmm7, 10101010b       ;k2
    punpcklwd   xmm2, xmm2
    movdqa      k2, xmm2

    pshuflw     xmm3, xmm7, 11111111b       ;k3
    punpcklwd   xmm3, xmm3
    movdqa      k3, xmm3

    ;taps 4..7 live in the high quadword
    pshufhw     xmm4, xmm7, 0b              ;k4
    punpckhwd   xmm4, xmm4
    movdqa      k4, xmm4

    pshufhw     xmm5, xmm7, 01010101b       ;k5
    punpckhwd   xmm5, xmm5
    movdqa      k5, xmm5

    pshufhw     xmm6, xmm7, 10101010b       ;k6
    punpckhwd   xmm6, xmm6
    movdqa      k6, xmm6

    pshufhw     xmm7, xmm7, 11111111b       ;k7, tap vector consumed
    punpckhwd   xmm7, xmm7
    movdqa      k7, xmm7

    ;rounding term: broadcast the low dword 0x00400040
    mov         rcx, 0x0400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm
;------------------------------------------------------------------------------
; LOAD_VERT_8 offset
;   Loads eight bytes from each of eight consecutive rows into xmm0 .. xmm7
;   (row i -> xmm<i>), starting at [rsi + %1].
;   In : rax = row stride, rdx = 3 * rax (as set up by the callers in this file)
;   Out: rsi has been advanced by rax (one row).
;------------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    ;rows addressed from the incoming rsi
    movq        xmm0, [rsi + %1]            ;row 0
    movq        xmm1, [rsi + rax + %1]      ;row 1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;row 6

    lea         rsi, [rsi + rax]            ;step down one row

    ;rows addressed from the advanced rsi
    movq        xmm2, [rsi + rax + %1]      ;row 2
    movq        xmm3, [rsi + rax * 2 + %1]  ;row 3
    movq        xmm4, [rsi + rdx + %1]      ;row 4
    movq        xmm5, [rsi + rax * 4 + %1]  ;row 5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endm
;------------------------------------------------------------------------------
; APPLY_FILTER_8 avg, offset
;   Filters eight pixels and stores the eight result bytes at [rdi + %2].
;   In : xmm0 .. xmm7 = source bytes for taps 0 .. 7 (low eight bytes of each)
;        k0 .. k7, krd, zero as written by GET_FILTERS
;   %1 : non-zero -> pavgb the result with the bytes already at [rdi + %2]
;   %2 : byte offset applied to rdi
;   Modifies: xmm0-xmm7.
;------------------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    ;widen bytes to words
    punpcklbw   xmm0, zero
    punpcklbw   xmm1, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm3, zero
    punpcklbw   xmm4, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm6, zero
    punpcklbw   xmm7, zero

    ;scale by the coefficients
    pmullw      xmm0, k0
    pmullw      xmm1, k1
    pmullw      xmm2, k2
    pmullw      xmm3, k3
    pmullw      xmm4, k4
    pmullw      xmm5, k5
    pmullw      xmm6, k6
    pmullw      xmm7, k7

    ;saturating accumulation; the order of the additions is significant and
    ;the tap 3 / tap 4 products are added last
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7                     ;scale back
    packuswb    xmm0, xmm0                  ;words -> bytes, unsigned saturation
%if %1
    movq        xmm1, [rdi + %2]            ;bytes currently in the destination
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm
;------------------------------------------------------------------------------
;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 4 pixels wide. Every output row is built from eight
; consecutive source rows beginning at the current source row; source and
; destination then move down by one row each.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pitch
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * src_pitch
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rsi, arg(0)                 ;rsi = src_ptr

.next_row:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi, [rsi + rax]            ;source moves down one row
    movd        xmm2, [rsi + rax]           ;row 2
    movd        xmm3, [rsi + rax * 2]       ;row 3
    movd        xmm4, [rsi + rdx]           ;row 4
    movd        xmm5, [rsi + rax * 4]       ;row 5
    movd        xmm7, [rsi + rdx * 2]       ;row 7

    APPLY_FILTER_4 0

    add         rdi, rbx                    ;destination moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 8 pixels wide. Every output row is built from eight
; consecutive source rows beginning at the current source row; source and
; destination then move down by one row each.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pitch
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * src_pitch

.next_row:
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 0, 0

    add         rdi, rbx                    ;destination moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 16 pixels wide, processed as two 8-pixel halves per
; output row. Source and destination move down by one row per iteration.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pitch
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * src_pitch

.next_row:
    ;left eight pixels
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;back to the row just filtered

    ;right eight pixels
    LOAD_VERT_8 8                           ;advances rsi by one row again
    APPLY_FILTER_8 0, 8
    add         rdi, rbx                    ;destination moves down one row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d4_v8_avg_sse2
; Vertical 8-tap filter, 4 pixels wide, averaging variant: each filtered row
; is combined (pavgb) with the bytes already present in the destination.
; Arguments as read below: arg(0) src_ptr, arg(1) source stride,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source stride
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rsi, arg(0)                 ;rsi = src_ptr

.next_row:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi, [rsi + rax]            ;source moves down one row
    movd        xmm2, [rsi + rax]           ;row 2
    movd        xmm3, [rsi + rax * 2]       ;row 3
    movd        xmm4, [rsi + rdx]           ;row 4
    movd        xmm5, [rsi + rax * 4]       ;row 5
    movd        xmm7, [rsi + rdx * 2]       ;row 7

    APPLY_FILTER_4 1

    add         rdi, rbx                    ;destination moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d8_v8_avg_sse2
; Vertical 8-tap filter, 8 pixels wide, averaging variant: each filtered row
; is combined (pavgb) with the bytes already present in the destination.
; Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
; stride, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
; arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source stride

.next_row:
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 1, 0

    add         rdi, rbx                    ;destination moves down one row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d16_v8_avg_sse2
; Vertical 8-tap filter, 16 pixels wide (two 8-pixel halves per row),
; averaging variant: each filtered half-row is combined (pavgb) with the bytes
; already present in the destination.
; Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
; stride, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
; arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rbx, DWORD PTR arg(3)       ;rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride
    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source stride

.next_row:
    ;left eight pixels
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;back to the row just filtered

    ;right eight pixels
    LOAD_VERT_8 8                           ;advances rsi by one row again
    APPLY_FILTER_8 1, 8
    add         rdi, rbx                    ;destination moves down one row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 4 pixels wide. For every row, 16 bytes are read
; starting at src - 3, so the taps span byte offsets -3 .. +4 around each
; output pixel.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pixels_per_line
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rsi, arg(0)                 ;rsi = src_ptr

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    ;xmm<i> = the same window shifted down by i bytes
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 0

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 8 pixels wide. For every row, 16 bytes are read
; starting at src - 3, so the taps span byte offsets -3 .. +4 around each
; output pixel.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pixels_per_line

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    ;xmm<i> = the same window shifted down by i bytes
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 16 pixels wide, processed as two 8-pixel halves
; per row. The halves read 16 bytes from src - 3 and src + 5 respectively,
; so the taps span byte offsets -3 .. +4 around each output pixel.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = src_pixels_per_line

.next_row:
    ;left eight pixels
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    ;right eight pixels
    movdqu      xmm0, [rsi + 5]             ;tap 0 window, 8 pixels further on

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 8

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d4_h8_avg_sse2
; Horizontal 8-tap filter, 4 pixels wide, averaging variant: each filtered row
; is combined (pavgb) with the bytes already present in the destination.
; For every row, 16 bytes are read starting at src - 3.
; Arguments as read below: arg(0) src_ptr, arg(1) source stride,
; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rsi, arg(0)                 ;rsi = src_ptr

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    ;xmm<i> = the same window shifted down by i bytes
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 1

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d8_h8_avg_sse2
; Horizontal 8-tap filter, 8 pixels wide, averaging variant: each filtered row
; is combined (pavgb) with the bytes already present in the destination.
; For every row, 16 bytes are read starting at src - 3.
; Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
; stride, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
; arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride

.next_row:
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    ;xmm<i> = the same window shifted down by i bytes
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;------------------------------------------------------------------------------
; vp9_filter_block1d16_h8_avg_sse2
; Horizontal 8-tap filter, 16 pixels wide (two 8-pixel halves per row),
; averaging variant: each filtered half-row is combined (pavgb) with the bytes
; already present in the destination. The halves read 16 bytes from src - 3
; and src + 5 respectively.
; Arguments as read here and in GET_FILTERS: arg(0) src_ptr, arg(1) source
; stride, arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height,
; arg(5) filter.
;------------------------------------------------------------------------------
global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi/rdi from the args

    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height (row counter)
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ;rax = source stride

.next_row:
    ;left eight pixels
    movdqu      xmm0, [rsi - 3]             ;tap 0 window (unaligned load)

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0

    ;right eight pixels
    movdqu      xmm0, [rsi + 5]             ;tap 0 window, 8 pixels further on

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 8

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ;drop the scratch slots
    pop         rsp                         ;pairs with ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret