;
; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
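;
; Per-pixel operation, as a sketch reconstructed from the code below
; (taking MFQE_PRECISION to be 4, since tMFQE is eight words of 0x10,
; i.e. 1 << MFQE_PRECISION, and tMFQE_round is eight words of 0x08):
;
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;
; i.e. a rounded weighted blend of src into dst.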
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
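    ; (xmm0 now holds src_weight broadcast across all eight word lanes,
    ; ready for the packed multiplies below.)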
    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
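;
; Same rounded weighted blend as the 16x16 version above, applied to an
; 8x8 block: each iteration loads one 8-pixel row with movq and widens
; it to words in a single register.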
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int stride1,                  1
;    unsigned char *src2,          2
;    int stride2,                  3
;    unsigned int *variance,       4
;    unsigned int *sad,            5
;)
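;
; Outputs, as a sketch reconstructed from the arithmetic below. Both
; results are scaled down by the pixel count (256) with rounding, so
; they are per-pixel averages rather than raw sums:
;
;   *sad      = (SUM |src1[i] - src2[i]| + 128) >> 8
;   *variance = (SUM src2[i]^2 - (SUM src2[i])^2 / 256 + 128) >> 8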
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so use
    ; unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. Instead,
    ; it expects one signed and one unsigned value, so we zero-extend
    ; the bytes and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1
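    ; (After pmaddwd of the widened words against themselves, each dword
    ; lane holds the sum of two adjacent squared pixels, so xmm5 is
    ; accumulating SUM(src2[i]^2) across the block.)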
    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words, and the two psadbw
    ; partial sums sit in separate qwords, so combine them with a shift
    ; and add instead.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)                 ; sad
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4

    ; Square the sum of src2. pmuludq produces a 64-bit product;
    ; the high dword is ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. Promote to doubles, accumulate,
    ; shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)                 ; variance
    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
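; (As read from the guards above: yasm accepts ddq for a 128-bit constant,
; NASM does not, so under NASM the value is emitted as two quadwords whose
; order is selected by CONFIG_BIG_ENDIAN.)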
align 16
tMFQE:                                      ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round:                                ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08