Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
; Right-shift applied (via psraw) after every filter pass. It pairs with the
; rounding constant xmm_bi_rd (64 = 1 << 6) and with the filter taps in
; vp8_bilinear_filters_ssse3, each pair of which sums to 128 (= 1 << 7).
14 %define xmm_filter_shift 7
17 ;void vp8_filter_block2d_bil_var_ssse3
18 ;(
19 ; unsigned char *ref_ptr,
20 ; int ref_pixels_per_line,
21 ; unsigned char *src_ptr,
22 ; int src_pixels_per_line,
23 ; unsigned int Height,
24 ; int xoffset,
25 ; int yoffset,
26 ; int *sum,
27 ; unsigned int *sumsquared
28 ;
29 ;)
;
; Purpose:
;   For a block 16 pixels wide and Height rows tall, bilinearly filter the
;   reference pixels at ref_ptr by (xoffset, yoffset), subtract the source
;   pixels at src_ptr (filtered_ref - src), and store
;     *sum        = sum of all differences
;     *sumsquared = sum of all squared differences
;
; Parameters:
;   ref_ptr / ref_pixels_per_line : reference block and its row stride (bytes)
;   src_ptr / src_pixels_per_line : source block and its row stride (bytes)
;   Height                        : number of rows to process
;   xoffset, yoffset              : index (scaled by 16 bytes) into
;                                   vp8_bilinear_filters_ssse3; not range
;                                   checked here
;   sum, sumsquared               : output locations, one 32-bit store each
;
; Code paths (selected by which offsets are zero):
;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass
;   xoffset == 0, yoffset != 0 : .._sp_only    (vertical pass only)
;   xoffset != 0, yoffset == 0 : .._fp_only    (horizontal pass only)
;   xoffset == 0, yoffset == 0 : .._full_pixel (no filtering)
;
; Register roles common to all paths:
;   rsi = current ref row, rdi = current src row, rcx = rows remaining
;   xmm6 = eight 16-bit running sums of differences
;   xmm7 = four 32-bit running sums of squared differences
;   rax / rdx hold a filter pointer or a row stride depending on the path.
;
; Memory access: the paths with a horizontal pass load 16 bytes from both
; [rsi] and [rsi+1], i.e. 17 bytes per ref row. The two-pass and
; vertical-only paths load one ref row before their loop and one more per
; iteration, i.e. Height+1 ref rows.
;
; NOTE(review): every loop is tested at the bottom (sub rcx, 1 / jnz), so
; Height == 0 is not handled -- confirm callers never pass 0.
; NOTE(review): differences are accumulated in 16-bit lanes (paddw on xmm6),
; two per lane per row, until the final reduction. That looks safe only for
; modest heights (roughly <= 64 rows by my arithmetic) -- confirm against
; the callers.
; NOTE(review): sym, PRIVATE, arg, GLOBAL, SHADOW_ARGS_TO_STACK, SAVE_XMM,
; GET_GOT and their RESTORE_/UN counterparts are not defined in this file;
; presumably they come from vpx_ports/x86_abi_support.asm.
;
30 ;Note: The filter coefficient at offset=0 is 128. Since pmaddubsw reads its
31 ;second operand as signed bytes, the zero-offset cases are computed separately.
32 global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
33 sym(vp8_filter_block2d_bil_var_ssse3):
34 push rbp
35 mov rbp, rsp
36 SHADOW_ARGS_TO_STACK 9
37 SAVE_XMM 7
38 GET_GOT rbx
39 push rsi
40 push rdi
41 ; end prolog
; Clear both accumulators: xmm6 = sum of differences, xmm7 = sum of squares.
43 pxor xmm6, xmm6
44 pxor xmm7, xmm7
; rcx = base of the filter table; it is reused as the row counter once the
; filter pointers have been formed.
46 lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
47 movsxd rax, dword ptr arg(5) ; xoffset
49 cmp rax, 0 ; skip first_pass filter if xoffset=0
50 je .filter_block2d_bil_var_ssse3_sp_only
52 shl rax, 4 ; xoffset * 16 = byte offset of the table entry
53 lea rax, [rax + rcx] ; HFilter
55 movsxd rdx, dword ptr arg(6) ; yoffset
57 cmp rdx, 0 ; skip second_pass filter if yoffset=0
58 je .filter_block2d_bil_var_ssse3_fp_only
60 shl rdx, 4
61 lea rdx, [rdx + rcx] ; VFilter
;
; ---- Two-pass path: rax = HFilter, rdx = VFilter ----
;
63 mov rsi, arg(0) ;ref_ptr
64 mov rdi, arg(2) ;src_ptr
65 movsxd rcx, dword ptr arg(4) ;Height
; Horizontally filter ref row 0. Each pixel is interleaved with its right
; neighbour so pmaddubsw yields p[i]*tap0 + p[i+1]*tap1 per pixel.
67 movdqu xmm0, XMMWORD PTR [rsi]
68 movdqu xmm1, XMMWORD PTR [rsi+1]
69 movdqa xmm2, xmm0
71 punpcklbw xmm0, xmm1
72 punpckhbw xmm2, xmm1
73 pmaddubsw xmm0, [rax]
74 pmaddubsw xmm2, [rax]
; Round (+64) and shift (>>7), then pack back to 16 bytes: xmm0 = filtered
; row 0, kept as the "previous row" for the vertical pass.
76 paddw xmm0, [GLOBAL(xmm_bi_rd)]
77 paddw xmm2, [GLOBAL(xmm_bi_rd)]
78 psraw xmm0, xmm_filter_shift
79 psraw xmm2, xmm_filter_shift
81 packuswb xmm0, xmm2
; Advance to ref row 1. The 64-bit build also caches both strides in r8/r9;
; the 32-bit build re-reads them from the argument slots each time.
83 %if ABI_IS_32BIT
84 add rsi, dword ptr arg(1) ;ref_pixels_per_line
85 %else
86 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
87 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
88 lea rsi, [rsi + r8]
89 %endif
91 .filter_block2d_bil_var_ssse3_loop:
; Horizontal pass on the current ref row -> xmm1 (16 packed bytes).
92 movdqu xmm1, XMMWORD PTR [rsi]
93 movdqu xmm2, XMMWORD PTR [rsi+1]
94 movdqa xmm3, xmm1
96 punpcklbw xmm1, xmm2
97 punpckhbw xmm3, xmm2
98 pmaddubsw xmm1, [rax]
99 pmaddubsw xmm3, [rax]
101 paddw xmm1, [GLOBAL(xmm_bi_rd)]
102 paddw xmm3, [GLOBAL(xmm_bi_rd)]
103 psraw xmm1, xmm_filter_shift
104 psraw xmm3, xmm_filter_shift
105 packuswb xmm1, xmm3
; Vertical pass: interleave previous row (xmm0) with current row (xmm1) and
; apply VFilter. xmm0 is replaced by the current row for the next iteration.
; Results stay as words: xmm2 = pixels 0-7, xmm3 = pixels 8-15.
107 movdqa xmm2, xmm0
108 movdqa xmm0, xmm1
109 movdqa xmm3, xmm2
111 punpcklbw xmm2, xmm1
112 punpckhbw xmm3, xmm1
113 pmaddubsw xmm2, [rdx]
114 pmaddubsw xmm3, [rdx]
116 paddw xmm2, [GLOBAL(xmm_bi_rd)]
117 paddw xmm3, [GLOBAL(xmm_bi_rd)]
118 psraw xmm2, xmm_filter_shift
119 psraw xmm3, xmm_filter_shift
; Load 16 src pixels as two 8-byte halves and zero-extend them to words.
121 movq xmm1, QWORD PTR [rdi]
122 pxor xmm4, xmm4
123 punpcklbw xmm1, xmm4
124 movq xmm5, QWORD PTR [rdi+8]
125 punpcklbw xmm5, xmm4
; diff = filtered ref - src; accumulate diffs in xmm6. pmaddwd of a register
; with itself squares each word and adds adjacent pairs; accumulate in xmm7.
127 psubw xmm2, xmm1
128 psubw xmm3, xmm5
129 paddw xmm6, xmm2
130 paddw xmm6, xmm3
131 pmaddwd xmm2, xmm2
132 pmaddwd xmm3, xmm3
133 paddd xmm7, xmm2
134 paddd xmm7, xmm3
; Step both row pointers by their strides.
136 %if ABI_IS_32BIT
137 add rsi, dword ptr arg(1) ;ref_pixels_per_line
138 add rdi, dword ptr arg(3) ;src_pixels_per_line
139 %else
140 lea rsi, [rsi + r8]
141 lea rdi, [rdi + r9]
142 %endif
144 sub rcx, 1
145 jnz .filter_block2d_bil_var_ssse3_loop
147 jmp .filter_block2d_bil_variance
;
; ---- Vertical-only path (xoffset == 0): rdx = VFilter, rax = ref stride ----
;
149 .filter_block2d_bil_var_ssse3_sp_only:
150 movsxd rdx, dword ptr arg(6) ; yoffset
152 cmp rdx, 0 ; Both xoffset =0 and yoffset=0
153 je .filter_block2d_bil_var_ssse3_full_pixel
; rcx still holds the filter table base loaded in the common entry code.
155 shl rdx, 4
156 lea rdx, [rdx + rcx] ; VFilter
158 mov rsi, arg(0) ;ref_ptr
159 mov rdi, arg(2) ;src_ptr
160 movsxd rcx, dword ptr arg(4) ;Height
161 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
; xmm1 = ref row 0 (the "previous row"). The copy into xmm0 is overwritten
; at the top of the loop before it is read.
163 movdqu xmm1, XMMWORD PTR [rsi]
164 movdqa xmm0, xmm1
166 %if ABI_IS_32BIT=0
167 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
168 %endif
170 lea rsi, [rsi + rax]
172 .filter_block2d_bil_sp_only_loop:
; xmm3 = current ref row; xmm0 keeps a copy so it can become the previous
; row after this iteration. Interleave previous/current and apply VFilter.
173 movdqu xmm3, XMMWORD PTR [rsi]
174 movdqa xmm2, xmm1
175 movdqa xmm0, xmm3
177 punpcklbw xmm1, xmm3
178 punpckhbw xmm2, xmm3
179 pmaddubsw xmm1, [rdx]
180 pmaddubsw xmm2, [rdx]
182 paddw xmm1, [GLOBAL(xmm_bi_rd)]
183 paddw xmm2, [GLOBAL(xmm_bi_rd)]
184 psraw xmm1, xmm_filter_shift
185 psraw xmm2, xmm_filter_shift
; Widen 16 src pixels to words (xmm3 = pixels 0-7, xmm5 = pixels 8-15).
187 movq xmm3, QWORD PTR [rdi]
188 pxor xmm4, xmm4
189 punpcklbw xmm3, xmm4
190 movq xmm5, QWORD PTR [rdi+8]
191 punpcklbw xmm5, xmm4
; Accumulate differences (xmm6) and squared differences (xmm7).
193 psubw xmm1, xmm3
194 psubw xmm2, xmm5
195 paddw xmm6, xmm1
196 paddw xmm6, xmm2
197 pmaddwd xmm1, xmm1
198 pmaddwd xmm2, xmm2
199 paddd xmm7, xmm1
200 paddd xmm7, xmm2
; Current row becomes the previous row; step both pointers.
202 movdqa xmm1, xmm0
203 lea rsi, [rsi + rax] ;ref_pixels_per_line
205 %if ABI_IS_32BIT
206 add rdi, dword ptr arg(3) ;src_pixels_per_line
207 %else
208 lea rdi, [rdi + r9]
209 %endif
211 sub rcx, 1
212 jnz .filter_block2d_bil_sp_only_loop
214 jmp .filter_block2d_bil_variance
;
; ---- Full-pixel path (both offsets 0): rax = ref stride, rdx = src stride ----
;
216 .filter_block2d_bil_var_ssse3_full_pixel:
217 mov rsi, arg(0) ;ref_ptr
218 mov rdi, arg(2) ;src_ptr
219 movsxd rcx, dword ptr arg(4) ;Height
220 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
221 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
; xmm0 = zero, used to zero-extend bytes to words.
222 pxor xmm0, xmm0
224 .filter_block2d_bil_full_pixel_loop:
; Widen 16 ref pixels (xmm1, xmm2) and 16 src pixels (xmm3, xmm4) to words.
225 movq xmm1, QWORD PTR [rsi]
226 punpcklbw xmm1, xmm0
227 movq xmm2, QWORD PTR [rsi+8]
228 punpcklbw xmm2, xmm0
230 movq xmm3, QWORD PTR [rdi]
231 punpcklbw xmm3, xmm0
232 movq xmm4, QWORD PTR [rdi+8]
233 punpcklbw xmm4, xmm0
; Accumulate differences (xmm6) and squared differences (xmm7).
235 psubw xmm1, xmm3
236 psubw xmm2, xmm4
237 paddw xmm6, xmm1
238 paddw xmm6, xmm2
239 pmaddwd xmm1, xmm1
240 pmaddwd xmm2, xmm2
241 paddd xmm7, xmm1
242 paddd xmm7, xmm2
244 lea rsi, [rsi + rax] ;ref_pixels_per_line
245 lea rdi, [rdi + rdx] ;src_pixels_per_line
246 sub rcx, 1
247 jnz .filter_block2d_bil_full_pixel_loop
249 jmp .filter_block2d_bil_variance
;
; ---- Horizontal-only path (yoffset == 0): rax = HFilter, rdx = ref stride ----
;
251 .filter_block2d_bil_var_ssse3_fp_only:
252 mov rsi, arg(0) ;ref_ptr
253 mov rdi, arg(2) ;src_ptr
254 movsxd rcx, dword ptr arg(4) ;Height
255 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
; xmm0 is not read in this loop (xmm4 serves as the zero register), and the
; reduction code clears it again.
257 pxor xmm0, xmm0
259 %if ABI_IS_32BIT=0
260 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
261 %endif
263 .filter_block2d_bil_fp_only_loop:
; Horizontal filter of the current ref row; results stay as words
; (xmm1 = pixels 0-7, xmm3 = pixels 8-15), no pack step is needed.
264 movdqu xmm1, XMMWORD PTR [rsi]
265 movdqu xmm2, XMMWORD PTR [rsi+1]
266 movdqa xmm3, xmm1
268 punpcklbw xmm1, xmm2
269 punpckhbw xmm3, xmm2
270 pmaddubsw xmm1, [rax]
271 pmaddubsw xmm3, [rax]
273 paddw xmm1, [GLOBAL(xmm_bi_rd)]
274 paddw xmm3, [GLOBAL(xmm_bi_rd)]
275 psraw xmm1, xmm_filter_shift
276 psraw xmm3, xmm_filter_shift
; Widen 16 src pixels to words.
; NOTE(review): the first movq below is written with XMMWORD PTR while every
; other movq load in this function uses QWORD PTR -- confirm this is only a
; cosmetic inconsistency.
278 movq xmm2, XMMWORD PTR [rdi]
279 pxor xmm4, xmm4
280 punpcklbw xmm2, xmm4
281 movq xmm5, QWORD PTR [rdi+8]
282 punpcklbw xmm5, xmm4
; Accumulate differences (xmm6) and squared differences (xmm7).
284 psubw xmm1, xmm2
285 psubw xmm3, xmm5
286 paddw xmm6, xmm1
287 paddw xmm6, xmm3
288 pmaddwd xmm1, xmm1
289 pmaddwd xmm3, xmm3
290 paddd xmm7, xmm1
291 paddd xmm7, xmm3
293 lea rsi, [rsi + rdx]
294 %if ABI_IS_32BIT
295 add rdi, dword ptr arg(3) ;src_pixels_per_line
296 %else
297 lea rdi, [rdi + r9]
298 %endif
300 sub rcx, 1
301 jnz .filter_block2d_bil_fp_only_loop
303 jmp .filter_block2d_bil_variance
;
; ---- Common tail: reduce the accumulators to scalars and store them ----
;
305 .filter_block2d_bil_variance:
306 pxor xmm0, xmm0
307 pxor xmm1, xmm1
308 pxor xmm5, xmm5
; Sign-extend the eight word sums to dwords: interleaving with zero places
; each word in the upper half of a dword, and psrad 16 shifts it back down
; arithmetically. xmm0 then holds four partial dword sums (copied to xmm1).
310 punpcklwd xmm0, xmm6
311 punpckhwd xmm1, xmm6
312 psrad xmm0, 16
313 psrad xmm1, 16
314 paddd xmm0, xmm1
315 movdqa xmm1, xmm0
; Fold the four squared-difference dwords of xmm7 into xmm6 (dwords 0 and 2).
317 movdqa xmm6, xmm7
318 punpckldq xmm6, xmm5
319 punpckhdq xmm7, xmm5
320 paddd xmm6, xmm7
; Same fold for the difference sums in xmm0/xmm1.
322 punpckldq xmm0, xmm5
323 punpckhdq xmm1, xmm5
324 paddd xmm0, xmm1
; Add the upper 8 bytes onto the lower 8 so the low dword holds the total.
326 movdqa xmm7, xmm6
327 movdqa xmm1, xmm0
329 psrldq xmm7, 8
330 psrldq xmm1, 8
332 paddd xmm6, xmm7
333 paddd xmm0, xmm1
335 mov rsi, arg(7) ;[Sum]
336 mov rdi, arg(8) ;[SSE]
; Store the low 32 bits of each total: *sum and *sumsquared.
338 movd [rsi], xmm0
339 movd [rdi], xmm6
341 ; begin epilog
; Undo the prolog in reverse order.
342 pop rdi
343 pop rsi
344 RESTORE_GOT
345 RESTORE_XMM
346 UNSHADOW_ARGS
347 pop rbp
348 ret
351 SECTION_RODATA
352 align 16
; Rounding term added before the shift by xmm_filter_shift: the value 64 in
; each of eight word lanes (one full 16-byte vector).
353 xmm_bi_rd:
354 times 8 dw 64
355 align 16
; Bilinear filter table, one 16-byte entry per sub-pixel offset (entry N is
; located at N * 16, matching the "shl reg, 4" indexing in the code above).
; Each entry repeats the byte pair (tap0, tap1) eight times so that a single
; pmaddubsw applies both taps to eight interleaved pixel pairs. The two taps
; of every entry sum to 128.
; The first entry (128, 0) is never loaded by the filter code in this file:
; a zero offset branches to a path that skips that pass, because 128 does
; not fit in the signed-byte operand of pmaddubsw.
356 vp8_bilinear_filters_ssse3:
357 times 8 db 128, 0
358 times 8 db 112, 16
359 times 8 db 96, 32
360 times 8 db 80, 48
361 times 8 db 64, 64
362 times 8 db 48, 80
363 times 8 db 32, 96
364 times 8 db 16, 112