Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
;void vp8_copy_mem16x16_sse2(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
;
; Copies sixteen rows of 16 bytes from src to dst.
; Source rows are fetched with unaligned loads (movdqu); destination rows are
; written with aligned stores (movdqa), so every dst row address must be
; 16-byte aligned.
; Rows move in five batches of three (all three loads, then all three
; stores) followed by one final row. xmm0-xmm2 and xmm3-xmm5 serve
; alternate batches.
; Clobbers rax, rcx, xmm0-xmm5 and flags; rsi and rdi are saved/restored.
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movsxd      rax, dword ptr arg(1)       ; rax = src_stride
    mov         rdi, arg(2)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

    ; rows 0-2
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax                    ; src += 3 * src_stride
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx                    ; dst += 3 * dst_stride

    ; rows 3-5
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rax]
    movdqu      xmm5, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm3
    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 6-8
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 9-11
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rax]
    movdqu      xmm5, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm3
    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 12-14
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; row 15
    movdqu      xmm3, [rsi]
    movdqa      [rdi], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_dc_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for an 8x8 block: adds the 8 bytes at above to the 8 bytes at
; left[i * left_stride] (i = 0..7), computes (sum + 8) >> 4 and writes that
; byte value into eight 8-byte rows of dst.
; Uses mm0/mm1 and does not execute emms itself.
; Clobbers rax, rcx, rdx, flags; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    mov         rdi, arg(2)                 ; rdi = above

    ; top row: psadbw against zero leaves the sum of the 8 bytes in the
    ; low word of mm1
    pxor        mm0, mm0
    movq        mm1, [rdi]
    psadbw      mm1, mm0
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride

    ; left column: ecx accumulates the 8 bytes, four rows at a time
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]            ; rsi -> left row 4
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx

    ; dc = (top + left + 8) >> 4, then replicate into all 8 bytes of mm1
    pextrw      edx, mm1, 0x0
    lea         edx, [edx+ecx+8]
    sar         edx, 4
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0
    packuswb    mm1, mm1

    ; write eight rows: rdi covers rows 0-3, rdx rows 4-7
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride
    lea         rdx, [rdi+rcx*4]
    movq        [rdi], mm1
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax], mm1
    movq        [rdx], mm1
    movq        [rdx+rcx], mm1
    movq        [rdx+rcx*2], mm1
    movq        [rdx+rax], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_dctop_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for an 8x8 block using only the row above: computes
; (sum of the 8 bytes at above + 4) >> 3 and writes that byte value into
; eight 8-byte rows of dst. left and left_stride are not read.
; Uses mm0/mm1 and does not execute emms itself.
; Clobbers rax, rcx; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; arg(3), arg(4) not used

    mov         rsi, arg(2)                 ; rsi = above
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride

    ; low word of mm1 = sum of the 8 bytes above
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0

    ; round, scale, and replicate the result into all 8 bytes
    paddw       mm1, [GLOBAL(dc_4)]
    psraw       mm1, 3
    pshufw      mm1, mm1, 0x0
    packuswb    mm1, mm1

    ; rows 0-3
    movq        [rdi], mm1
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax], mm1

    ; rows 4-7
    lea         rdi, [rdi+rcx*4]
    movq        [rdi], mm1
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_dcleft_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for an 8x8 block using only the left column: computes
; (sum of left[i * left_stride], i = 0..7, + 4) >> 3 and writes that byte
; value into eight 8-byte rows of dst. above is not read.
; Uses mm1 and does not execute emms itself.
; Clobbers rax, rcx, rdx, flags; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride

    ; left rows 0-3, accumulated in ecx
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx

    ; left rows 4-7; the last byte is folded in together with the rounding
    ; constant by the lea
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    lea         edx, [ecx+edx+4]

    ; dc = (sum + 4) >> 3, replicated into all 8 bytes of mm1
    shr         edx, 3
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0
    packuswb    mm1, mm1

    ; write eight rows
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride
    movq        [rdi], mm1
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi], mm1
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_dc128_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills eight 8-byte rows of dst with the first 8 bytes of the dc_128
; constant (every byte is 128). above, left and left_stride are not read.
; Uses mm1 and does not execute emms itself. Clobbers rax, rcx, rdx.
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ; arg(2), arg(3), arg(4) not used

    mov         rax, arg(0)                 ; rax = dst
    movsxd      rdx, dword ptr arg(1)       ; rdx = dst_stride
    lea         rcx, [rdx*3]                ; rcx = 3 * dst_stride
    movq        mm1, [GLOBAL(dc_128)]

    ; rows 0-3
    movq        [rax], mm1
    movq        [rax+rdx], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx], mm1

    ; rows 4-7
    lea         rax, [rax+rdx*4]
    movq        [rax], mm1
    movq        [rax+rdx], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_tm_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; For an 8x8 block, writes
;   dst[r][c] = saturate_u8(above[c] + left[r * left_stride] - above[-1])
; for r, c = 0..7. The arithmetic is done on 16-bit words and packuswb
; applies the unsigned saturation.
; The macro argument picks how a single byte is widened and broadcast to
; eight words:
;   sse2  - punpcklbw with zero, pshuflw, punpcklqdq
;   ssse3 - one pshufb with the dc_1024 mask (selects source byte 0 for the
;           low byte and source byte 4, which movd has zeroed, for the high
;           byte of every word)
; Clobbers rax, rcx, rdx, xmm0-xmm3, xmm5, flags; rsi/rdi are saved/restored.
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(2)                 ; rsi = above
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    mov         edx, 4                      ; 4 iterations x 2 rows
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm2, [GLOBAL(dc_1024)]     ; byte -> 8 words broadcast mask
%endif

    ; xmm1 = the 8 above bytes widened to words
    movq        xmm1, [rsi]
    punpcklbw   xmm1, xmm0

    ; xmm3 = top-left byte (above[-1]) broadcast to 8 words
    movd        xmm3, [rsi-1]
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    pshuflw     xmm3, xmm3, 0x0
    punpcklqdq  xmm3, xmm3
%else
    pshufb      xmm3, xmm2
%endif

    ; xmm1 = above - topleft, reused for every row
    psubw       xmm1, xmm3

    mov         rsi, arg(3)                 ; rsi = left

.row_pair_loop:
    ; broadcast the left byte of two consecutive rows
    movd        xmm3, [rsi]
    movd        xmm5, [rsi+rax]
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm3, xmm3, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm3, xmm2
    pshufb      xmm5, xmm2
%endif

    ; add (above - topleft) and pack both rows into one register
    paddw       xmm3, xmm1
    paddw       xmm5, xmm1
    packuswb    xmm3, xmm5
    movq        [rdi], xmm3                 ; low 8 bytes  -> first row
    movhps      [rdi+rcx], xmm3             ; high 8 bytes -> second row

    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .row_pair_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3
;void vp8_intra_pred_uv_ve_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Copies the 8 bytes at above into each of eight 8-byte rows of dst.
; left and left_stride are not read.
; Uses mm1 and does not execute emms itself. Clobbers rax, rcx, rdx.
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; arg(3), arg(4) not used

    movsxd      rdx, dword ptr arg(1)       ; rdx = dst_stride
    lea         rcx, [rdx*3]                ; rcx = 3 * dst_stride

    ; fetch the row above
    mov         rax, arg(2)                 ; rax = above
    movq        mm1, [rax]

    mov         rax, arg(0)                 ; rax = dst

    ; rows 0-3
    movq        [rax], mm1
    movq        [rax+rdx], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx], mm1

    ; rows 4-7
    lea         rax, [rax+rdx*4]
    movq        [rax], mm1
    movq        [rax+rdx], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_uv_ho_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; For an 8x8 block, fills row r of dst (r = 0..7) with eight copies of the
; byte at left[r * left_stride]. above is not read.
; The macro argument selects the variant:
;   mmx2  - loop of 4 iterations, two rows each; bytes are broadcast with
;           punpcklbw + pshufw. Does not execute emms itself.
;   ssse3 - straight-line, two batches of four rows; the bytes of two rows
;           are interleaved with punpcklbw and broadcast with pshufb using
;           the dc_00001111 mask (low 8 bytes <- byte 0, high 8 <- byte 1).
;           rbx doubles as 3 * left_stride, so it is saved and restored.
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
    ; rbx is overwritten below. NOTE(review): presumably GET_GOT saves it
    ; itself when GET_GOT_SAVE_ARG is defined - that macro is not in this file.
    push        rbx
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    ; arg(2) not used

    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride

%ifidn %1, mmx2
    mov         edx, 4                      ; 4 iterations x 2 rows
.row_pair_loop:
    movd        mm0, [rsi]
    movd        mm1, [rsi+rax]
    punpcklbw   mm0, mm0                    ; low word = left byte twice
    punpcklbw   mm1, mm1
    pshufw      mm0, mm0, 0x0               ; replicate low word x4
    pshufw      mm1, mm1, 0x0
    movq        [rdi], mm0
    movq        [rdi+rcx], mm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .row_pair_loop
%else
    ; load the mask before rbx is reused: GLOBAL() may address through rbx
    movdqa      xmm2, [GLOBAL(dc_00001111)]
    lea         rdx, [rcx*3]                ; rdx = 3 * dst_stride
    lea         rbx, [rax*3]                ; rbx = 3 * left_stride

    ; rows 0-3
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3                  ; byte0 = row 0, byte1 = row 1
    punpcklbw   xmm1, xmm4                  ; byte0 = row 2, byte1 = row 3
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi], xmm0
    movhps      [rdi+rcx], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx], xmm1

    ; rows 4-7
    lea         rsi, [rsi+rax*4]
    lea         rdi, [rdi+rcx*4]
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3
    punpcklbw   xmm1, xmm4
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi], xmm0
    movhps      [rdi+rcx], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx], xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
;void vp8_intra_pred_y_dc_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for a 16x16 block: adds the 16 bytes at above to the 16 bytes at
; left[i * left_stride] (i = 0..15), computes (sum + 16) >> 5 and writes that
; byte value into sixteen 16-byte rows of dst.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm0-xmm2, flags; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi, arg(2)                 ; rdi = above
    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride

    ; psadbw against zero yields one byte-sum per 8-byte half; the two
    ; halves are then added so the low word of xmm1 holds the full sum
    pxor        xmm0, xmm0
    movdqa      xmm1, [rdi]
    psadbw      xmm1, xmm0
    movq        xmm2, xmm1
    punpckhqdq  xmm1, xmm1
    paddw       xmm1, xmm2

    ; from left: ecx accumulates 16 bytes, four rows at a time
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride

    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx

    ; add up: dc = (top + left + 16) >> 5, replicated into 16 bytes
    pextrw      edx, xmm1, 0x0
    lea         edx, [edx+ecx+16]
    sar         edx, 5
    movd        xmm1, edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations of 8 rows
    mov         rsi, 2
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride

.write_8_rows:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    dec         rsi
    jnz         .write_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_y_dctop_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for a 16x16 block using only the row above: computes
; (sum of the 16 bytes at above + 8) >> 4 and writes that byte value into
; sixteen 16-byte rows of dst. left and left_stride are not read.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm0-xmm2, flags; rsi is saved/restored.
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ; arg(3), arg(4) not used

    ; from top: psadbw gives one byte-sum per 8-byte half; add the halves
    mov         rcx, arg(2)                 ; rcx = above
    pxor        xmm0, xmm0
    movdqa      xmm1, [rcx]
    psadbw      xmm1, xmm0
    movdqa      xmm2, xmm1
    punpckhqdq  xmm1, xmm1
    paddw       xmm1, xmm2

    ; add up: dc = (sum + 8) >> 4, replicated into 16 bytes
    paddw       xmm1, [GLOBAL(dc_8)]
    psraw       xmm1, 4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations of 8 rows
    mov         rsi, 2
    mov         rdx, arg(0)                 ; rdx = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride

.write_8_rows:
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx, [rdx+rcx*4]
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx, [rdx+rcx*4]
    dec         rsi
    jnz         .write_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_y_dcleft_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC fill for a 16x16 block using only the left column: computes
; (sum of left[i * left_stride], i = 0..15, + 8) >> 4 and writes that byte
; value into sixteen 16-byte rows of dst. above is not read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm1, flags; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    ; from left: ecx accumulates the bytes, four rows at a time
    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride

    lea         rdi, [rax*3]                ; rdi = 3 * left_stride
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    ; the last byte is folded in together with the rounding constant
    lea         edx, [ecx+edx+8]

    ; add up: dc = (sum + 8) >> 4, replicated into 16 bytes
    shr         edx, 4
    movd        xmm1, edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations of 8 rows
    mov         rsi, 2
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride
    lea         rax, [rcx*3]                ; rax = 3 * dst_stride

.write_8_rows:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    dec         rsi
    jnz         .write_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_y_dc128_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills sixteen 16-byte rows of dst with the dc_128 constant (every byte is
; 128). above, left and left_stride are not read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm1, flags; rsi is saved/restored.
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ; arg(2), arg(3), arg(4) not used

    ; write out: 2 iterations of 8 rows
    mov         rsi, 2
    movdqa      xmm1, [GLOBAL(dc_128)]
    mov         rax, arg(0)                 ; rax = dst
    movsxd      rdx, dword ptr arg(1)       ; rdx = dst_stride
    lea         rcx, [rdx*3]                ; rcx = 3 * dst_stride

.write_8_rows:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    dec         rsi
    jnz         .write_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_y_tm_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; For a 16x16 block, writes
;   dst[r][c] = saturate_u8(above[c] + left[r * left_stride] - above[-1])
; for r, c = 0..15. The arithmetic is done on 16-bit words (xmm1 = columns
; 0-7, xmm2 = columns 8-15) and packuswb applies the unsigned saturation.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
; The macro argument picks how a single byte is widened and broadcast to
; eight words:
;   sse2  - punpcklbw with zero, pshuflw, punpcklqdq
;   ssse3 - one pshufb with the dc_1024 mask (selects source byte 0 for the
;           low byte and source byte 4, which movd has zeroed, for the high
;           byte of every word)
; xmm6/xmm7 are used inside the loop; SAVE_XMM 7 / RESTORE_XMM bracket the
; body (NOTE(review): presumably they preserve those registers where the ABI
; requires it - the macros are defined outside this file).
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    GET_GOT     rbx
    ; end prolog

    ; read top row
    mov         edx, 8                      ; 8 iterations x 2 rows
    mov         rsi, arg(2)                 ; rsi = above
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm3, [GLOBAL(dc_1024)]     ; byte -> 8 words broadcast mask
%endif
    movdqa      xmm1, [rsi]
    movdqa      xmm2, xmm1
    punpcklbw   xmm1, xmm0                  ; above[0..7]  as words
    punpckhbw   xmm2, xmm0                  ; above[8..15] as words

    ; set up left ptrs and subtract topleft
    movd        xmm4, [rsi-1]
    mov         rsi, arg(3)                 ; rsi = left
%ifidn %1, sse2
    punpcklbw   xmm4, xmm0
    pshuflw     xmm4, xmm4, 0x0
    punpcklqdq  xmm4, xmm4
%else
    pshufb      xmm4, xmm3
%endif
    psubw       xmm1, xmm4
    psubw       xmm2, xmm4

    ; set up dest ptrs
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride

    ; The label is local (leading dot) so each macro expansion scopes it to
    ; its own function symbol and nothing extra enters the file namespace.
.vp8_intra_pred_y_tm_%1_loop:
    ; broadcast the left byte of two consecutive rows
    movd        xmm4, [rsi]
    movd        xmm5, [rsi+rax]
%ifidn %1, sse2
    punpcklbw   xmm4, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm4, xmm4, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm4, xmm4
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm4, xmm3
    pshufb      xmm5, xmm3
%endif
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm5
    paddw       xmm4, xmm1                  ; first row,  columns 0-7
    paddw       xmm6, xmm2                  ; first row,  columns 8-15
    paddw       xmm5, xmm1                  ; second row, columns 0-7
    paddw       xmm7, xmm2                  ; second row, columns 8-15
    packuswb    xmm4, xmm6
    packuswb    xmm5, xmm7
    movdqa      [rdi    ], xmm4
    movdqa      [rdi+rcx], xmm5
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3
;void vp8_intra_pred_y_ve_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Copies the 16 bytes at above into each of sixteen 16-byte rows of dst.
; left and left_stride are not read.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm1, flags; rsi is saved/restored.
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ; arg(3), arg(4) not used

    mov         rax, arg(2)                 ; rax = above
    mov         rsi, 2                      ; 2 iterations of 8 rows
    movsxd      rdx, dword ptr arg(1)       ; rdx = dst_stride

    ; read from top
    movdqa      xmm1, [rax]

    ; write out
    mov         rax, arg(0)                 ; rax = dst
    lea         rcx, [rdx*3]                ; rcx = 3 * dst_stride

.write_8_rows:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    dec         rsi
    jnz         .write_8_rows

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_intra_pred_y_ho_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; For a 16x16 block, fills row r of dst (r = 0..15) with sixteen copies of
; the byte at left[r * left_stride]. above is not read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
; Clobbers rax, rcx, rdx, xmm0, xmm1, flags; rsi and rdi are saved/restored.
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    ; read from left and write out
    mov         edx, 8                      ; 8 iterations x 2 rows
    mov         rsi, arg(3)                 ; rsi = left
    movsxd      rax, dword ptr arg(4)       ; rax = left_stride
    mov         rdi, arg(0)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(1)       ; rcx = dst_stride

    ; The label is local (leading dot) so it stays scoped to this function.
.vp8_intra_pred_y_ho_sse2_loop:
    movd        xmm0, [rsi]
    movd        xmm1, [rsi+rax]
    ; FIXME use pshufb for ssse3 version
    punpcklbw   xmm0, xmm0                  ; low word = left byte twice
    punpcklbw   xmm1, xmm1
    pshuflw     xmm0, xmm0, 0x0             ; replicate low word x4
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm0, xmm0                  ; replicate low qword x2
    punpcklqdq  xmm1, xmm1
    movdqa      [rdi    ], xmm0
    movdqa      [rdi+rcx], xmm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_ho_sse2_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA

; Sixteen bytes of 128: fill value for the dc128 predictors.
align 16
dc_128:
    times 16 db 0x80

; Four words of 4: rounding term added before the >> 3 in uv_dctop.
; Sits directly after the 16 bytes of dc_128.
dc_4:
    times 4 dw 0x4

; Eight words of 8: rounding term added before the >> 4 in y_dctop.
align 16
dc_8:
    times 8 dw 0x8

; Eight words of 0x0400: pshufb mask whose byte pairs are (0, 4), i.e. each
; result word takes source byte 0 as its low byte and source byte 4 as its
; high byte.
align 16
dc_1024:
    times 8 dw 1024

; pshufb mask: low 8 result bytes take source byte 0, high 8 take source
; byte 1.
align 16
dc_00001111:
    times 8 db 0x00
    times 8 db 0x01