Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
13 extern sym(vp8_bilinear_filters_x86_8)
; Build-time constants for the filters in this file.
; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code
; visible in this file; VP8_FILTER_SHIFT is the right-shift applied after
; the rounding constant `rd` has been added to a filter sum.
%define BLOCK_HEIGHT_WIDTH  4
%define vp8_filter_weight   128     ; equals 1 << VP8_FILTER_SHIFT
%define VP8_FILTER_SHIFT    7
;-----------------------------------------------------------------------
; void vp8_filter_block1d_h6_mmx
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    pixel_step,
;     unsigned int    output_height,
;     unsigned int    output_width,
;     short          *vp8_filter
; )
;
; Horizontal 6-tap filter, four output samples per row.
;
; For every row the code reads source bytes src_ptr[-2 .. 6], multiplies
; four neighbouring groups by the six taps, accumulates with signed
; saturation, adds the rounding constant `rd`, shifts right by
; VP8_FILTER_SHIFT, clamps each result to 0..255 and stores the four
; results as 16-bit words at output_ptr.
;
; Arguments as used by the code:
;   src_pixels_per_line  added to src_ptr after every row (bytes)
;   pixel_step           not read
;   output_height        row count; the loop is do-while shaped, so it
;                        must be >= 1
;   output_width         added to output_ptr after every row, as a byte
;                        count
;   vp8_filter           six taps spaced 16 bytes apart; only the low four
;                        words of each tap are read
;
; Registers inside the loop:
;   rsi = source row, rdi = destination row, rcx = rows left,
;   rax = destination step, rdx = vp8_filter,
;   r8  = source step (64-bit builds only),
;   mm0 = zero, mm1/mm2/mm6/mm7 = taps 1/2/3/4.
;
; Clobbers rax, rcx, rdx, r8 (64-bit builds), mm0-mm7 and flags.
; No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6)              ; vp8_filter

        movq        mm1,        [rdx + 16]          ; tap 1
        movq        mm2,        [rdx + 32]          ; tap 2
        movq        mm6,        [rdx + 48]          ; tap 3
        movq        mm7,        [rdx + 64]          ; tap 4

        mov         rdi,        arg(1)              ; output_ptr
        mov         rsi,        arg(0)              ; src_ptr
        movsxd      rcx,        dword ptr arg(4)    ; output_height
        movsxd      rax,        dword ptr arg(5)    ; output_width (destination step)
%if ABI_IS_32BIT=0
        ; The source step never changes, so load it once instead of on
        ; every row.  r8 is a caller-saved register.
        movsxd      r8,         dword ptr arg(2)    ; src_pixels_per_line
%endif
        pxor        mm0,        mm0                 ; mm0 = 0, used for byte<->word unpacking

.nextrow:
        movq        mm3,        [rsi-2]             ; mm3 = bytes p-2..p5
        movq        mm4,        mm3                 ; mm4 = bytes p-2..p5
        psrlq       mm3,        8                   ; discard p-2
        punpcklbw   mm3,        mm0                 ; mm3 = words p-1..p2
        pmullw      mm3,        mm1                 ; mm3 = p-1..p2 * tap 1

        movq        mm5,        mm4                 ; mm5 = bytes p-2..p5
        punpckhbw   mm4,        mm0                 ; mm4 = words p2..p5
        pmullw      mm4,        mm7                 ; mm4 = p2..p5 * tap 4
        paddsw      mm3,        mm4                 ; mm3 += mm4

        movq        mm4,        mm5                 ; mm4 = bytes p-2..p5
        psrlq       mm5,        16                  ; discard p-2, p-1
        punpcklbw   mm5,        mm0                 ; mm5 = words p0..p3
        pmullw      mm5,        mm2                 ; mm5 = p0..p3 * tap 2
        paddsw      mm3,        mm5                 ; mm3 += mm5

        movq        mm5,        mm4                 ; mm5 = bytes p-2..p5 (kept for tap 0)
        psrlq       mm4,        24                  ; discard p-2..p0
        punpcklbw   mm4,        mm0                 ; mm4 = words p1..p4
        pmullw      mm4,        mm6                 ; mm4 = p1..p4 * tap 3
        paddsw      mm3,        mm4                 ; mm3 += mm4

        ; outer taps are multiplied straight from memory
        movd        mm4,        [rsi+3]             ; mm4 = bytes p3..p6
        punpcklbw   mm4,        mm0                 ; mm4 = words p3..p6
        pmullw      mm4,        [rdx+80]            ; mm4 = p3..p6 * tap 5
        paddsw      mm3,        mm4                 ; mm3 += mm4

        punpcklbw   mm5,        mm0                 ; mm5 = words p-2..p1
        pmullw      mm5,        [rdx]               ; mm5 = p-2..p1 * tap 0
        paddsw      mm3,        mm5                 ; mm3 += mm5

        paddsw      mm3,        [GLOBAL(rd)]        ; add rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; arithmetic shift right by 7
        packuswb    mm3,        mm0                 ; clamp each word to 0..255
        punpcklbw   mm3,        mm0                 ; widen back to four words

        movq        [rdi],      mm3                 ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,        dword ptr arg(2)    ; next source row (src_pixels_per_line)
%else
        add         rsi,        r8                  ; next source row
%endif
        add         rdi,        rax                 ; next destination row

        dec         rcx                             ; one row done
        jnz         .nextrow

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_filter_block1dc_v6_mmx
; (
;     short         *src_ptr,
;     unsigned char *output_ptr,
;     int            output_pitch,
;     unsigned int   pixels_per_line,
;     unsigned int   pixel_step,
;     unsigned int   output_height,
;     unsigned int   output_width,
;     short         *vp8_filter
; )
;
; Vertical 6-tap filter over rows of 16-bit samples, four columns wide.
;
; Every output row combines six source rows (two above the current row
; through three below it), each multiplied by one tap.  The products are
; accumulated with signed saturation, rounded with `rd`, shifted right by
; VP8_FILTER_SHIFT, clamped to 0..255 and written as four bytes.
;
; Arguments as used by the code:
;   pixels_per_line   byte distance between consecutive source rows
;   output_pitch      byte distance between consecutive output rows
;   output_height     row count; must be >= 1 (do-while loop)
;   pixel_step, output_width   not read
;   vp8_filter        six taps spaced 16 bytes apart, low four words used
;
; Registers inside the loop:
;   rsi = address of source row -2 for the current output row,
;   rdx = source step, rdi = destination, rax = destination step,
;   rcx = rows left, rbx = vp8_filter,
;   mm0 = zero, mm5 = rounding constant, mm1/mm2/mm6/mm7 = taps 1/2/3/4.
;
; Clobbers rax, rcx, rdx, mm0-mm7 and flags.  No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Fetch the rounding constant before rbx is repurposed below.
        movq        mm5,        [GLOBAL(rd)]
        pxor        mm0,        mm0                 ; mm0 = 0

        push        rbx
        mov         rbx,        arg(7)              ; vp8_filter

        movsxd      rdx,        dword ptr arg(3)    ; pixels_per_line
        mov         rsi,        arg(0)              ; src_ptr
        sub         rsi,        rdx
        sub         rsi,        rdx                 ; rsi -> row -2

        mov         rdi,        arg(1)              ; output_ptr
        movsxd      rax,        DWORD PTR arg(2)    ; output_pitch
        movsxd      rcx,        DWORD PTR arg(5)    ; output_height

        movq        mm1,        [rbx + 16]          ; tap 1
        movq        mm2,        [rbx + 32]          ; tap 2
        movq        mm6,        [rbx + 48]          ; tap 3
        movq        mm7,        [rbx + 64]          ; tap 4

.next_output_row:
        movq        mm3,        [rsi+rdx]           ; row -1, four words
        pmullw      mm3,        mm1                 ; * tap 1

        movq        mm4,        [rsi + 4*rdx]       ; row 2
        pmullw      mm4,        mm7                 ; * tap 4
        paddsw      mm3,        mm4

        movq        mm4,        [rsi + 2*rdx]       ; row 0
        pmullw      mm4,        mm2                 ; * tap 2
        paddsw      mm3,        mm4

        movq        mm4,        [rsi]               ; row -2
        pmullw      mm4,        [rbx]               ; * tap 0
        paddsw      mm3,        mm4

        ; Stepping rsi down one row here lets rows 1 and 3 be addressed
        ; with 2x/4x scaling (no 3*pitch needed) and also serves as the
        ; per-iteration source advance.
        add         rsi,        rdx
        movq        mm4,        [rsi + 2*rdx]       ; row 1
        pmullw      mm4,        mm6                 ; * tap 3
        paddsw      mm3,        mm4

        movq        mm4,        [rsi + 4*rdx]       ; row 3
        pmullw      mm4,        [rbx +80]           ; * tap 5
        paddsw      mm3,        mm4

        paddsw      mm3,        mm5                 ; add rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; arithmetic shift right by 7
        packuswb    mm3,        mm0                 ; clamp to 0..255, words -> bytes

        movd        [rdi],      mm3                 ; store four output bytes

        ; Successive iterations re-read five of the six source rows just
        ; used; no attempt is made to carry them over in registers.
        lea         rdi,        [rdi+rax]           ; next destination row
        dec         rcx
        jnz         .next_output_row

        pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_bilinear_predict8x8_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction producing an 8-wide, 8-row block.
;
; Horizontal pass (per source row):
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass (per output row):
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
;
; HFilter / VFilter are entries xoffset / yoffset of
; vp8_bilinear_filters_x86_8; an entry is 32 bytes, with the second tap
; 16 bytes after the first.  Only the low four words of a tap are read.
;
; Nine source rows are read (one to prime the pipeline, then one per
; output row), nine bytes from each.  The loop ends when the destination
; pointer reaches dst_ptr + 8 * dst_pitch.
;
; Registers inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination address, rax = VFilter,
;   r8  = dst_pitch (64-bit builds only),
;   mm0 = zero, mm1/mm2 = HFilter taps,
;   mm7 = previous row's horizontal result packed to bytes.
;
; Clobbers rax, rcx, rdx, r8 (64-bit builds), mm0-mm7 and flags.
; No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        mov         rdi,        arg(4)              ; dst_ptr

        shl         rax,        5                   ; xoffset * 32 bytes per entry
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax,        rcx                 ; rax = HFilter
        mov         rsi,        arg(0)              ; src_ptr

        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
        movq        mm1,        [rax]               ; HFilter tap 0

        movq        mm2,        [rax+16]            ; HFilter tap 1
        movsxd      rax,        dword ptr arg(3)    ; yoffset

        pxor        mm0,        mm0                 ; mm0 = 0

        shl         rax,        5                   ; yoffset * 32
        add         rax,        rcx                 ; rax = VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch
%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant: keep it in r8 (caller-saved) rather
        ; than reloading it on every row.
        mov         r8,         rdx
%endif
        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

        ; horizontal pass for the first source row
        movq        mm3,        [rsi]               ; src[0..7]
        movq        mm4,        mm3

        punpcklbw   mm3,        mm0                 ; words src[0..3]
        punpckhbw   mm4,        mm0                 ; words src[4..7]

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; src[1..8]
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0                 ; words src[1..4]
        punpckhbw   mm6,        mm0                 ; words src[5..8]

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; mm7 = first row, 8 bytes

        add         rsi,        rdx                 ; next source row
.next_row_8x8:
        ; horizontal pass for the current source row
        movq        mm3,        [rsi]               ; src[0..7]
        movq        mm4,        mm3

        punpcklbw   mm3,        mm0                 ; words src[0..3]
        punpckhbw   mm4,        mm0                 ; words src[4..7]

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; src[1..8]
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        ; previous row * VFilter tap 0
        movq        mm5,        mm7
        movq        mm6,        mm7

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]
        pmullw      mm6,        [rax]

        paddw       mm3,        [GLOBAL(rd)]        ; finish horizontal pass: round
        psraw       mm3,        VP8_FILTER_SHIFT    ; and shift

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; remember this row for the next iteration

        ; current row * VFilter tap 1, plus previous-row term
        pmullw      mm3,        [rax+16]
        pmullw      mm4,        [rax+16]

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        packuswb    mm3,        mm4                 ; clamp to 0..255, 8 bytes

        movq        [rdi],      mm3                 ; store one output row

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5)    ; dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached end of block?
        jne         .next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_bilinear_predict8x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction producing an 8-wide, 4-row block.
;
; Horizontal pass (per source row):
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass (per output row):
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
;
; HFilter / VFilter are entries xoffset / yoffset of
; vp8_bilinear_filters_x86_8; an entry is 32 bytes, with the second tap
; 16 bytes after the first.  Only the low four words of a tap are read.
;
; Five source rows are read (one to prime the pipeline, then one per
; output row), nine bytes from each.  The loop ends when the destination
; pointer reaches dst_ptr + 4 * dst_pitch.
;
; Registers inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination address, rax = VFilter,
;   r8  = dst_pitch (64-bit builds only),
;   mm0 = zero, mm1/mm2 = HFilter taps,
;   mm7 = previous row's horizontal result packed to bytes.
;
; Clobbers rax, rcx, rdx, r8 (64-bit builds), mm0-mm7 and flags.
; No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        mov         rdi,        arg(4)              ; dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; xoffset * 32 bytes per entry

        mov         rsi,        arg(0)              ; src_ptr
        add         rax,        rcx                 ; rax = HFilter

        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
        movq        mm1,        [rax]               ; HFilter tap 0

        movq        mm2,        [rax+16]            ; HFilter tap 1
        movsxd      rax,        dword ptr arg(3)    ; yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; yoffset * 32

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch
%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant: keep it in r8 (caller-saved) rather
        ; than reloading it on every row.
        mov         r8,         rdx
%endif

        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

        ; horizontal pass for the first source row
        movq        mm3,        [rsi]               ; src[0..7]
        movq        mm4,        mm3

        punpcklbw   mm3,        mm0                 ; words src[0..3]
        punpckhbw   mm4,        mm0                 ; words src[4..7]

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; src[1..8]
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0                 ; words src[1..4]
        punpckhbw   mm6,        mm0                 ; words src[5..8]

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; mm7 = first row, 8 bytes

        add         rsi,        rdx                 ; next source row
.next_row_8x4:
        ; horizontal pass for the current source row
        movq        mm3,        [rsi]               ; src[0..7]
        movq        mm4,        mm3

        punpcklbw   mm3,        mm0                 ; words src[0..3]
        punpckhbw   mm4,        mm0                 ; words src[4..7]

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; src[1..8]
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        ; previous row * VFilter tap 0
        movq        mm5,        mm7
        movq        mm6,        mm7

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]
        pmullw      mm6,        [rax]

        paddw       mm3,        [GLOBAL(rd)]        ; finish horizontal pass: round
        psraw       mm3,        VP8_FILTER_SHIFT    ; and shift

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; remember this row for the next iteration

        ; current row * VFilter tap 1, plus previous-row term
        pmullw      mm3,        [rax+16]
        pmullw      mm4,        [rax+16]

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]
        psraw       mm4,        VP8_FILTER_SHIFT

        packuswb    mm3,        mm4                 ; clamp to 0..255, 8 bytes

        movq        [rdi],      mm3                 ; store one output row

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5)    ; dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached end of block?
        jne         .next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_bilinear_predict4x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction producing a 4-wide, 4-row block.
;
; Horizontal pass (per source row):
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass (per output row):
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
;
; HFilter / VFilter are entries xoffset / yoffset of
; vp8_bilinear_filters_x86_8; an entry is 32 bytes, with the second tap
; 16 bytes after the first.  Only the low four words of a tap are read.
;
; Five source rows are read (one to prime the pipeline, then one per
; output row), five bytes from each.  The loop ends when the destination
; pointer reaches dst_ptr + 4 * dst_pitch.
;
; Registers inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination address, rax = VFilter,
;   r8  = dst_pitch (64-bit builds only),
;   mm0 = zero, mm1/mm2 = HFilter taps,
;   mm7 = previous row's horizontal result packed to bytes.
;
; Clobbers rax, rcx, rdx, r8 (64-bit builds), mm0-mm3, mm5, mm7 and
; flags.  No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        mov         rdi,        arg(4)              ; dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; xoffset * 32 bytes per entry

        add         rax,        rcx                 ; rax = HFilter
        mov         rsi,        arg(0)              ; src_ptr

        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
        movq        mm1,        [rax]               ; HFilter tap 0

        movq        mm2,        [rax+16]            ; HFilter tap 1
        movsxd      rax,        dword ptr arg(3)    ; yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; yoffset * 32

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch
%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant: keep it in r8 (caller-saved) rather
        ; than reloading it on every row.
        mov         r8,         rdx
%endif

        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

        ; horizontal pass for the first source row
        movd        mm3,        [rsi]               ; src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; src[1..4]

        punpcklbw   mm5,        mm0
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        movq        mm7,        mm3
        packuswb    mm7,        mm0                 ; mm7 = first row, 4 bytes

        add         rsi,        rdx                 ; next source row
.next_row_4x4:
        ; horizontal pass for the current source row
        movd        mm3,        [rsi]               ; src[0..3]
        punpcklbw   mm3,        mm0

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; src[1..4]

        punpcklbw   mm5,        mm0
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5

        ; previous row * VFilter tap 0
        movq        mm5,        mm7
        punpcklbw   mm5,        mm0

        pmullw      mm5,        [rax]
        paddw       mm3,        [GLOBAL(rd)]        ; finish horizontal pass: round

        psraw       mm3,        VP8_FILTER_SHIFT    ; and shift
        movq        mm7,        mm3

        packuswb    mm7,        mm0                 ; remember this row for the next iteration

        ; current row * VFilter tap 1, plus previous-row term
        pmullw      mm3,        [rax+16]
        paddw       mm3,        mm5

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        packuswb    mm3,        mm0                 ; clamp to 0..255
        movd        [rdi],      mm3                 ; store four output bytes

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5)    ; dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif

        cmp         rdi,        rcx                 ; reached end of block?
        jne         .next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; Rounding constant added before the VP8_FILTER_SHIFT right-shift:
; four words of 0x40 (64), i.e. half of 1 << 7.
rd:
    times 4 dw 0x40

; Emits one filter row: each of the six taps replicated into eight
; consecutive words (16 bytes per tap, 96 bytes per row).
%macro SIX_TAP_ROW 6
  %rep 6
    times 8 dw %1
    %rotate 1
  %endrep
%endmacro

align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
; Eight six-tap filter rows; the taps of every row sum to 128.
sym(vp8_six_tap_mmx):
    ;            tap0 tap1 tap2 tap3 tap4 tap5
    SIX_TAP_ROW    0,   0, 128,   0,   0,   0
    SIX_TAP_ROW    0,  -6, 123,  12,  -1,   0
    SIX_TAP_ROW    2, -11, 108,  36,  -8,   1
    SIX_TAP_ROW    0,  -9,  93,  50,  -6,   0
    SIX_TAP_ROW    3, -16,  77,  77, -16,   3
    SIX_TAP_ROW    0,  -6,  50,  93,  -9,   0
    SIX_TAP_ROW    1,  -8,  36, 108, -11,   2
    SIX_TAP_ROW    0,  -1,  12, 123,  -6,   0