Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
;macro in deblock functions
;
; FIRST_2_ROWS - first half of the deblock kernel.
;   In:   xmm0   = current pixels (16 bytes)
;         xmm1   = first neighbour, xmm3 = second neighbour
;         flimit = 16-byte memory operand, %defined by the invoking code
;   Out:  xmm5   = pavgb(xmm1, xmm3)
;         xmm7   = 0xFF in each byte where |xmm0-xmm1| >= flimit
;                  or |xmm0-xmm3| >= flimit, 0x00 elsewhere
;         xmm0   is left untouched
;   Clobbers: xmm1 (ends up zero), xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
        ; rounded average of the two neighbours
        movdqa      xmm5, xmm1
        pavgb       xmm5, xmm3

        ; xmm4 = |xmm0 - xmm1|  (two saturating subtractions, summed)
        movdqa      xmm4, xmm0
        psubusb     xmm4, xmm1
        psubusb     xmm1, xmm0
        paddusb     xmm4, xmm1

        ; xmm6 = |xmm0 - xmm3|
        movdqa      xmm6, xmm0
        psubusb     xmm6, xmm3
        psubusb     xmm3, xmm0
        paddusb     xmm6, xmm3

        ; fetch the threshold twice and clear xmm1 for the compares
        movdqa      xmm7, flimit
        movdqa      xmm2, xmm7
        pxor        xmm1, xmm1

        ; (flimit -sat- diff) == 0  <=>  diff >= flimit
        psubusb     xmm7, xmm6
        pcmpeqb     xmm7, xmm1
        psubusb     xmm2, xmm4
        pcmpeqb     xmm2, xmm1

        ; combine both masks
        por         xmm7, xmm2
%endmacro
; SECOND_2_ROWS - second half of the deblock kernel; must follow FIRST_2_ROWS.
;   In:   xmm0   = current pixels (unchanged since FIRST_2_ROWS)
;         xmm1   = third neighbour, xmm3 = fourth neighbour
;         xmm5   = neighbour average produced by FIRST_2_ROWS
;         xmm7   = "keep original" mask produced by FIRST_2_ROWS
;         flimit = 16-byte memory operand, %defined by the invoking code
;   Out:  xmm0   = per byte: original pixel where any of the four neighbour
;                  differences is >= flimit, otherwise the averaged value
;   Clobbers: xmm1 (ends up zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
        ; keep a copy of the third neighbour, then average the new pair
        movdqa      xmm2, xmm1
        pavgb       xmm1, xmm3

        ; fold the new pair average into the running average
        pavgb       xmm5, xmm1

        ; xmm6 = |xmm0 - third neighbour|
        movdqa      xmm6, xmm0
        psubusb     xmm6, xmm2
        psubusb     xmm2, xmm0
        paddusb     xmm6, xmm2

        ; xmm4 = |xmm0 - fourth neighbour|
        movdqa      xmm4, xmm0
        psubusb     xmm4, xmm3
        psubusb     xmm3, xmm0
        paddusb     xmm4, xmm3

        ; final averaging step with the current pixels
        pavgb       xmm5, xmm0

        ; threshold copies and a zero register for the compares
        movdqa      xmm2, flimit
        movdqa      xmm3, xmm2
        pxor        xmm1, xmm1

        ; extend the mask with the two new difference tests
        psubusb     xmm2, xmm6
        pcmpeqb     xmm2, xmm1
        por         xmm7, xmm2

        psubusb     xmm3, xmm4
        pcmpeqb     xmm3, xmm1
        por         xmm7, xmm3

        ; select: original where mask is set, filtered value elsewhere
        pand        xmm0, xmm7
        pandn       xmm7, xmm5
        paddusb     xmm0, xmm7
%endmacro
; UPDATE_FLIMIT - copy the next 16 threshold bytes to the stack slot at [rsp].
;   In/Out: rbx = cursor into the threshold array; advanced by 16.
;           The array is read with movdqa, so rbx must be 16-byte aligned.
;   Clobbers: xmm2, flags
%macro UPDATE_FLIMIT 0
        add         rbx, 16                         ; step the cursor first ...
        movdqa      xmm2, XMMWORD PTR [rbx - 16]    ; ... then read the entry it pointed at
        movdqa      [rsp], xmm2
%endmacro
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows: filters vertically from src into dst (16 columns
; per step), then filters the freshly written dst row horizontally in place.
;
; Register roles inside the loops:
;   rsi = src cursor          rdi = dst cursor (row start during "across")
;   rax = src stride          rbx = flimits cursor (16 bytes per 16 columns)
;   rcx = rows remaining      rdx = column offset
;
; Notes derived from the code below:
;   * flimits is read with movdqa and therefore must be 16-byte aligned.
;   * the across pass writes 8 bytes left of the row and 8 bytes right of
;     column `cols` in dst, and reads 2 rows above/below src.
;   * mm0/mm1 are used and no emms is executed here; a caller that uses x87
;     afterwards has to clear the MMX state itself.
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; put flimit on stack
    mov         rbx, arg(5)           ;flimits ptr
    UPDATE_FLIMIT

%define flimit [rsp]

    mov         rsi, arg(0)           ;src_ptr
    mov         rdi, arg(1)           ;dst_ptr

    movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
    xor         rdx, rdx              ;col
.nextcol:
    ;load current and next 2 rows
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

    FIRST_2_ROWS

    ;load above 2 rows
    neg         rax                   ; negative stride
    movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax]

    SECOND_2_ROWS

    movdqu      XMMWORD PTR [rdi], xmm0

    neg         rax                   ; positive stride
    add         rsi, 16
    add         rdi, 16

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .downdone
    UPDATE_FLIMIT
    jmp         .nextcol

.downdone:
    ; done with all the cols, start the across filtering in place
    sub         rsi, rdx              ; rewind both cursors to the row start
    sub         rdi, rdx

    mov         rbx, arg(5)           ; flimits
    UPDATE_FLIMIT

    ; dup the first byte into the left border 8 times
    movq        mm1, [rdi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    mov         rdx, -8
    movq        [rdi+rdx], mm1

    ; dup the last byte into the right border
    movsxd      rdx, dword arg(4)
    movq        mm1, [rdi + rdx + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rdi+rdx], mm1

    ; mm0/mm1 hold the 16 output bytes of the previous step; they are written
    ; back one step late so the filter keeps reading unfiltered neighbours.
    xor         rdx, rdx
    movq        mm0, QWORD PTR [rdi-16]
    movq        mm1, QWORD PTR [rdi-8]

.acrossnextcol:
    movdqu      xmm0, XMMWORD PTR [rdi + rdx]
    movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

    FIRST_2_ROWS

    movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

    SECOND_2_ROWS

    movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
    movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    movdq2q     mm0, xmm0                    ; stash this step's low 8 bytes
    psrldq      xmm0, 8
    movdq2q     mm1, xmm0                    ; ... and its high 8 bytes

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .acrossdone
    UPDATE_FLIMIT
    jmp         .acrossnextcol

.acrossdone:
    ; last 16 pixels
    movq        QWORD PTR [rdi+rdx-16], mm0

    ; the upper 8 bytes are only stored when cols is a multiple of 16
    cmp         edx, dword arg(4)
    jne         .throw_last_8
    movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
    ; done with this row
    add         rsi, rax              ;next src line
    ; Sign-extend both strides, matching the initial movsxd load of
    ; src_pixels_per_line; a zero-extending load would turn a negative
    ; stride into a large positive offset on 64-bit targets.
    movsxd      rax, dword arg(3)     ;dst_pixels_per_line
    add         rdi, rax              ;next destination
    movsxd      rax, dword arg(2)     ;src_pixels_per_line

    mov         rbx, arg(5)           ;flimits
    UPDATE_FLIMIT

    dec         rcx                   ;decrement count
    jnz         .nextrow              ;next row

    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                              int pitch, int rows, int cols,int flimit)
;
; Processes the plane in strips of 8 columns. For every strip a running sum
; (xmm5, 8 words) and running sum of squares (xmm6/xmm7, 8 dwords) over a
; 15-row vertical window are maintained; where 15*sumsq - sum*sum < flimit
; the pixel is replaced by (pixel + sum + vp8_rv[...]) >> 4.
;
; Register roles inside the loops:
;   rsi = row leaving the window (s - 8*pitch); also where delayed output goes
;   rdi = row entering the window (s + 7*pitch)
;   rax = pitch     rdx = row counter     rcx = scratch / ring index
;   r8  = &vp8_rv (64-bit builds only)
;
; Stack frame: d[16][8] ring buffer of results at [rsp], flimit x4 at [rsp+128].
; The 8 rows above and below the plane are overwritten (border replication and
; delayed stores). arg(0), arg(2) and arg(3) shadow slots are modified.
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi, arg(0)           ; s
    pxor        xmm0, xmm0            ; xmm0 stays zero for the unpacks

    movsxd      rax, dword ptr arg(1) ;pitch

    ; this copies the last row down into the border 8 rows
    mov         rdi, rsi
    ; rows is an int: load exactly 32 bits and sign-extend. A full-width
    ; load would pick up whatever the caller left in the upper half of the
    ; argument register.
    movsxd      rdx, dword arg(2)
    sub         rdx, 9                ; (rows+8) - 9 = index of the last row
    imul        rdx, rax
    lea         rdi, [rdi+rdx]
    movq        xmm1, QWORD ptr[rdi]  ; last row
    mov         rcx, 8
.init_borderd:                        ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_borderd

    neg         rax                   ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov         rdi, rsi
    movq        xmm1, QWORD ptr[rdi]  ; first row
    mov         rcx, 8
.init_border:                         ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_border

    lea         rsi, [rsi + rax*8]    ; rsi = s[-pitch*8]
    neg         rax                   ; rax = pitch again

    pxor        xmm5, xmm5            ; sum
    pxor        xmm6, xmm6            ; sumsq, columns 0-3
    pxor        xmm7, xmm7            ; sumsq, columns 4-7
    mov         rdi, rsi

    mov         rcx, 15

    ; accumulate sum / sumsq over the first 15 rows of the window
.loop_initvar:
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0

    paddw       xmm5, xmm1
    pmullw      xmm1, xmm1

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm0

    punpckhwd   xmm2, xmm0
    paddd       xmm6, xmm1

    paddd       xmm7, xmm2
    lea         rdi, [rdi+rax]

    dec         rcx
    jne         .loop_initvar
    ;save the var and sum
    xor         rdx, rdx
.loop_row:
    movq        xmm1, QWORD PTR [rsi] ; [s-pitch*8]
    movq        xmm2, QWORD PTR [rdi] ; [s+pitch*7]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    ; slide the window: add the entering row, drop the leaving row
    paddw       xmm5, xmm2
    psubw       xmm5, xmm1

    pmullw      xmm2, xmm2
    movdqa      xmm4, xmm2

    punpcklwd   xmm2, xmm0
    punpckhwd   xmm4, xmm0

    paddd       xmm6, xmm2
    paddd       xmm7, xmm4

    pmullw      xmm1, xmm1
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm0
    psubd       xmm6, xmm1

    punpckhwd   xmm2, xmm0
    psubd       xmm7, xmm2

    ; xmm3 = sumsq*16 - sumsq = sumsq*15 (columns 0-3)
    movdqa      xmm3, xmm6
    pslld       xmm3, 4

    psubd       xmm3, xmm6
    movdqa      xmm1, xmm5

    ; 32-bit sum*sum from the low/high halves of the 16-bit product
    movdqa      xmm4, xmm5
    pmullw      xmm1, xmm1

    pmulhw      xmm4, xmm4
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm4
    punpckhwd   xmm2, xmm4

    ; xmm4 = sumsq*15 (columns 4-7)
    movdqa      xmm4, xmm7
    pslld       xmm4, 4

    psubd       xmm4, xmm7

    psubd       xmm3, xmm1
    psubd       xmm4, xmm2

    psubd       xmm3, flimit4
    psubd       xmm4, flimit4

    ; all-ones where sumsq*15 - sum*sum - flimit < 0
    psrad       xmm3, 31
    psrad       xmm4, 31

    packssdw    xmm3, xmm4
    packsswb    xmm3, xmm0

    movq        xmm1, QWORD PTR [rsi+rax*8]  ; current row s

    movq        xmm2, xmm1            ; keep the unfiltered pixels
    punpcklbw   xmm1, xmm0

    paddw       xmm1, xmm5
    mov         rcx, rdx

    and         rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp8_rv))]
    movdqu      xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
%endif

    paddw       xmm1, xmm4
    ;paddw     xmm1,       eight8s
    psraw       xmm1, 4

    packuswb    xmm1, xmm0
    pand        xmm1, xmm3            ; filtered value where the mask is set

    pandn       xmm3, xmm2            ; original value elsewhere
    por         xmm1, xmm3

    and         rcx, 15
    movq        QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    ; Write back the result computed 8 rows ago. xmm1 is free here (it is
    ; reloaded at the top of the loop), so it is used instead of an MMX
    ; register; the routine then leaves the x87/MMX state untouched and
    ; needs no emms.
    movq        xmm1, QWORD PTR [rsp + rcx*8] ;d[rcx*8]

    movq        QWORD PTR [rsi], xmm1
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2) ;rows
    jl          .loop_row

    ; s += 8 -- done at full pointer width through a register; a dword add
    ; on the shadow slot would drop the carry into the upper half of a
    ; 64-bit pointer. rcx is scratch at this point.
    mov         rcx, 8
    add         arg(0), rcx
    sub         dword arg(3), 8 ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                   int pitch, int rows, int cols,int flimit)
;
; In-place horizontal pass. For every row a running sum (xmm6) and running
; sum of squares (xmm7) over a 15-pixel window are maintained, 4 pixels per
; iteration; where 15*sumsq - sum*sum < flimit the pixel is replaced by
; (pixel + sum + 8) >> 4.
;
; Register roles:
;   rsi = start of the current row     rcx = column offset (sum while priming)
;   rdx = cols + 8 (sumsq while priming)
;   rdi = window index while priming   rax = scratch
;   mm0/mm1 = results of the two previous iterations, stored 8 bytes behind
;
; Notes derived from the code below:
;   * 8 bytes left of each row and 8 bytes from column `cols` on are
;     overwritten; reads extend further right of column `cols`.
;   * arg(0) and arg(2) shadow slots are modified.
;   * mm0/mm1 are used and no emms is executed here; a caller that uses x87
;     afterwards has to clear the MMX state itself.
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx, rdx ;sumsq=0;
    xor         rcx, rcx ;sum=0;
    mov         rsi, arg(0); s


    ; dup the first byte into the left border 8 times
    movq        mm1, [rsi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1

    mov         rdi, -8
    movq        [rsi+rdi], mm1

    ; dup the last byte into the right border
    ; The column count goes into rax, not rdx: rdx already holds sumsq = 0
    ; for the accumulation loop below and must not be disturbed.
    movsxd      rax, dword arg(3)
    movq        mm1, [rsi + rax + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rsi+rax], mm1

.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;   sumsq += s[i]*s[i];
    ;   sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al                    ; ax = al*al; upper half of eax is still 0
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop


    ;mov         rax,    sumsq
    ;movd        xmm7,   rax
    movd        xmm7, edx

    ;mov         rax,    sum
    ;movd        xmm6,   rax
    movd        xmm6, ecx

    mov         rsi, arg(0) ;s
    xor         rcx, rcx

    movsxd      rdx, dword arg(3) ;cols
    add         rdx, 8                ; 8 extra columns flush the delayed stores
    pxor        mm0, mm0
    pxor        mm1, mm1

    pxor        xmm0, xmm0
.nextcol4:

    movd        xmm1, DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
    movd        xmm2, DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

    punpcklbw   xmm1, xmm0                    ; expanding
    punpcklbw   xmm2, xmm0                    ; expanding

    punpcklwd   xmm1, xmm0                    ; expanding to dwords
    punpcklwd   xmm2, xmm0                    ; expanding to dwords

    psubd       xmm2, xmm1                    ; 7--8   8--7   9--6 10--5
    paddd       xmm1, xmm1                    ; -8*2   -7*2   -6*2 -5*2

    paddd       xmm1, xmm2                    ; 7+-8   8+-7   9+-6 10+-5
    pmaddwd     xmm1, xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5

    paddd       xmm6, xmm2
    paddd       xmm7, xmm1

    pshufd      xmm6, xmm6, 0                 ; duplicate the last ones
    pshufd      xmm7, xmm7, 0                 ; duplicate the last ones

    psrldq      xmm1, 4                       ; 8--7   9--6 10--5  0000
    psrldq      xmm2, 4                       ; 8--7   9--6 10--5  0000

    pshufd      xmm3, xmm1, 3                 ; 0000  8--7   8--7   8--7 squared
    pshufd      xmm4, xmm2, 3                 ; 0000  8--7   8--7   8--7

    paddd       xmm6, xmm4
    paddd       xmm7, xmm3

    pshufd      xmm3, xmm1, 01011111b         ; 0000  0000   9--6   9--6 squared
    pshufd      xmm4, xmm2, 01011111b         ; 0000  0000   9--6   9--6

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    pshufd      xmm3, xmm1, 10111111b         ; 0000  0000   0000  10--5 squared
    pshufd      xmm4, xmm2, 10111111b         ; 0000  0000   0000  10--5

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    ; xmm5 = sumsq*15 - sum*sum - flimit, reduced to a sign mask
    movdqa      xmm3, xmm6
    pmaddwd     xmm3, xmm3

    movdqa      xmm5, xmm7
    pslld       xmm5, 4

    psubd       xmm5, xmm7
    psubd       xmm5, xmm3

    psubd       xmm5, flimit4
    psrad       xmm5, 31

    packssdw    xmm5, xmm0
    packsswb    xmm5, xmm0

    movd        xmm1, DWORD PTR [rsi+rcx]
    movq        xmm2, xmm1                    ; keep the unfiltered pixels

    punpcklbw   xmm1, xmm0
    punpcklwd   xmm1, xmm0

    paddd       xmm1, xmm6
    paddd       xmm1, [GLOBAL(four8s)]

    psrad       xmm1, 4
    packssdw    xmm1, xmm0

    packuswb    xmm1, xmm0
    pand        xmm1, xmm5                    ; filtered value where the mask is set

    pandn       xmm5, xmm2                    ; original value elsewhere
    por         xmm5, xmm1

    ; results are written 8 bytes behind so the window keeps reading
    ; unfiltered pixels
    movd        [rsi+rcx-8], mm0
    movq        mm0, mm1

    movdq2q     mm1, xmm5
    psrldq      xmm7, 12                      ; carry the last sumsq into lane 0

    psrldq      xmm6, 12                      ; carry the last sum into lane 0
    add         rcx, 4

    cmp         rcx, rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax, dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1 ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height lines: picks a start offset rand() & 0xff into the noise
; buffer, then for every 16-byte group clamps the pixels and adds the noise
; bytes in place.
;
; Register roles inside the loops:
;   rsi = current line     rdi = noise + random offset
;   rdx = blackclamp       rcx = Width     rax = byte offset within the line
;
; The clamp vectors are addressed as [rdx], [rdx+16], [rdx+32] directly as
; memory operands of psubusb/paddusb.
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_line:
    call        sym(rand) WRT_PLT
    and         rax, 0xff             ; keep the low 8 bits of the random value
    mov         rdi, arg(1)           ;noise
    add         rdi, rax              ; rdi = noise + random offset

    ; every register below is (re)loaded after the call, since rand() is
    ; free to trash the volatile ones
    mov         rsi, arg(0)           ;Pos
    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order
    mov         rdx, arg(2)           ; blackclamp
    movsxd      rcx, dword arg(5)     ;[Width]
    xor         rax, rax

.next_16:
    movdqu      xmm2, [rdi+rax]       ; noise for these 16 pixels
    movdqu      xmm1, [rsi+rax]       ; the pixels themselves

    ; clamp both sides so we don't outrange adding noise
    psubusb     xmm1, [rdx]           ;blackclamp
    paddusb     xmm1, [rdx+32]        ;bothclamp
    psubusb     xmm1, [rdx+16]        ;whiteclamp

    paddb       xmm1, xmm2            ; add the noise in
    movdqu      [rsi+rax], xmm1       ; store the result

    add         rax, 16               ; next group of 16 pixels
    cmp         rax, rcx
    jl          .next_16

    movsxd      rax, dword arg(7)     ; Pitch
    add         arg(0), rax           ; Start += Pitch
    sub         dword arg(6), 1       ; Height -= 1
    jg          .next_line

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; Four dwords of 8, 16-byte aligned so the table can be used directly as an
; SSE memory operand.
align 16
four8s:
    dd 8, 8, 8, 8