media/libvpx/vp8/common/x86/variance_impl_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %define xmm_filter_shift 7
michael@0 15
michael@0 16
michael@0 17 ;void vp8_filter_block2d_bil_var_ssse3
michael@0 18 ;(
michael@0 19 ; unsigned char *ref_ptr,
michael@0 20 ; int ref_pixels_per_line,
michael@0 21 ; unsigned char *src_ptr,
michael@0 22 ; int src_pixels_per_line,
michael@0 23 ; unsigned int Height,
michael@0 24 ; int xoffset,
michael@0 25 ; int yoffset,
michael@0 26 ; int *sum,
michael@0 27 ; unsigned int *sumsquared
michael@0 28 ;
michael@0 29 ;)
michael@0 30 ;Note: The filter coefficient at offset=0 is 128. Since the second operand
michael@0 31 ;of pmaddubsw is treated as signed bytes, we must handle zero offset separately.
michael@0 32 global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
michael@0 33 sym(vp8_filter_block2d_bil_var_ssse3):
michael@0 34 push rbp
michael@0 35 mov rbp, rsp ; set up frame pointer
michael@0 36 SHADOW_ARGS_TO_STACK 9 ; 9 matches the number of C arguments in the prototype
michael@0 37 SAVE_XMM 7 ; xmm6/xmm7 are used as accumulators below
michael@0 38 GET_GOT rbx
michael@0 39 push rsi
michael@0 40 push rdi
michael@0 41 ; end prolog
michael@0 42 ; Dispatch: both offsets != 0 -> two-pass loop; xoffset == 0 -> sp_only; yoffset == 0 -> fp_only; both 0 -> full_pixel
michael@0 43 pxor xmm6, xmm6 ; xmm6 = running sum of differences (8 x 16-bit lanes)
michael@0 44 pxor xmm7, xmm7 ; xmm7 = running sum of squared differences (4 x 32-bit lanes)
michael@0 45
michael@0 46 lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] ; rcx = base of the filter table
michael@0 47 movsxd rax, dword ptr arg(5) ; xoffset
michael@0 48
michael@0 49 cmp rax, 0 ; skip first_pass filter if xoffset=0
michael@0 50 je .filter_block2d_bil_var_ssse3_sp_only
michael@0 51
michael@0 52 shl rax, 4 ; point to filter coeff with xoffset
michael@0 53 lea rax, [rax + rcx] ; HFilter
michael@0 54
michael@0 55 movsxd rdx, dword ptr arg(6) ; yoffset
michael@0 56
michael@0 57 cmp rdx, 0 ; skip second_pass filter if yoffset=0
michael@0 58 je .filter_block2d_bil_var_ssse3_fp_only
michael@0 59
michael@0 60 shl rdx, 4 ; 16 bytes per table entry
michael@0 61 lea rdx, [rdx + rcx] ; VFilter
michael@0 62
michael@0 63 mov rsi, arg(0) ;ref_ptr
michael@0 64 mov rdi, arg(2) ;src_ptr
michael@0 65 movsxd rcx, dword ptr arg(4) ;Height
michael@0 66 ; Prime the loop: horizontally filter the first ref row into xmm0
michael@0 67 movdqu xmm0, XMMWORD PTR [rsi] ; ref row, 16 pixels
michael@0 68 movdqu xmm1, XMMWORD PTR [rsi+1] ; same row shifted by one pixel
michael@0 69 movdqa xmm2, xmm0
michael@0 70
michael@0 71 punpcklbw xmm0, xmm1 ; interleave neighbouring pixels (low 8)
michael@0 72 punpckhbw xmm2, xmm1 ; interleave neighbouring pixels (high 8)
michael@0 73 pmaddubsw xmm0, [rax] ; 2-tap horizontal filter -> 16-bit results
michael@0 74 pmaddubsw xmm2, [rax]
michael@0 75
michael@0 76 paddw xmm0, [GLOBAL(xmm_bi_rd)] ; add rounding constant
michael@0 77 paddw xmm2, [GLOBAL(xmm_bi_rd)]
michael@0 78 psraw xmm0, xmm_filter_shift ; scale back down
michael@0 79 psraw xmm2, xmm_filter_shift
michael@0 80
michael@0 81 packuswb xmm0, xmm2 ; xmm0 = filtered first row as 16 bytes
michael@0 82
michael@0 83 %if ABI_IS_32BIT
michael@0 84 add rsi, dword ptr arg(1) ;ref_pixels_per_line
michael@0 85 %else
michael@0 86 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
michael@0 87 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 88 lea rsi, [rsi + r8] ; advance to the next ref row
michael@0 89 %endif
michael@0 90
michael@0 91 .filter_block2d_bil_var_ssse3_loop:
michael@0 92 movdqu xmm1, XMMWORD PTR [rsi] ; next ref row
michael@0 93 movdqu xmm2, XMMWORD PTR [rsi+1] ; next ref row shifted by one pixel
michael@0 94 movdqa xmm3, xmm1
michael@0 95
michael@0 96 punpcklbw xmm1, xmm2
michael@0 97 punpckhbw xmm3, xmm2
michael@0 98 pmaddubsw xmm1, [rax] ; horizontal filter on the new row
michael@0 99 pmaddubsw xmm3, [rax]
michael@0 100
michael@0 101 paddw xmm1, [GLOBAL(xmm_bi_rd)]
michael@0 102 paddw xmm3, [GLOBAL(xmm_bi_rd)]
michael@0 103 psraw xmm1, xmm_filter_shift
michael@0 104 psraw xmm3, xmm_filter_shift
michael@0 105 packuswb xmm1, xmm3 ; xmm1 = filtered current row as 16 bytes
michael@0 106
michael@0 107 movdqa xmm2, xmm0 ; xmm2 = filtered previous row
michael@0 108 movdqa xmm0, xmm1 ; keep current row for the next iteration
michael@0 109 movdqa xmm3, xmm2
michael@0 110
michael@0 111 punpcklbw xmm2, xmm1 ; interleave previous/current rows (low 8)
michael@0 112 punpckhbw xmm3, xmm1 ; interleave previous/current rows (high 8)
michael@0 113 pmaddubsw xmm2, [rdx] ; 2-tap vertical filter
michael@0 114 pmaddubsw xmm3, [rdx]
michael@0 115
michael@0 116 paddw xmm2, [GLOBAL(xmm_bi_rd)]
michael@0 117 paddw xmm3, [GLOBAL(xmm_bi_rd)]
michael@0 118 psraw xmm2, xmm_filter_shift
michael@0 119 psraw xmm3, xmm_filter_shift
michael@0 120
michael@0 121 movq xmm1, QWORD PTR [rdi] ; low 8 src pixels
michael@0 122 pxor xmm4, xmm4 ; zero for byte -> word widening
michael@0 123 punpcklbw xmm1, xmm4
michael@0 124 movq xmm5, QWORD PTR [rdi+8] ; high 8 src pixels
michael@0 125 punpcklbw xmm5, xmm4
michael@0 126
michael@0 127 psubw xmm2, xmm1 ; diff = filtered ref - src
michael@0 128 psubw xmm3, xmm5
michael@0 129 paddw xmm6, xmm2 ; accumulate differences
michael@0 130 paddw xmm6, xmm3
michael@0 131 pmaddwd xmm2, xmm2 ; square and add adjacent pairs -> 32-bit
michael@0 132 pmaddwd xmm3, xmm3
michael@0 133 paddd xmm7, xmm2 ; accumulate squared differences
michael@0 134 paddd xmm7, xmm3
michael@0 135
michael@0 136 %if ABI_IS_32BIT
michael@0 137 add rsi, dword ptr arg(1) ;ref_pixels_per_line
michael@0 138 add rdi, dword ptr arg(3) ;src_pixels_per_line
michael@0 139 %else
michael@0 140 lea rsi, [rsi + r8]
michael@0 141 lea rdi, [rdi + r9]
michael@0 142 %endif
michael@0 143
michael@0 144 sub rcx, 1 ; one row done
michael@0 145 jnz .filter_block2d_bil_var_ssse3_loop
michael@0 146
michael@0 147 jmp .filter_block2d_bil_variance
michael@0 148
michael@0 149 .filter_block2d_bil_var_ssse3_sp_only:
michael@0 150 movsxd rdx, dword ptr arg(6) ; yoffset
michael@0 151
michael@0 152 cmp rdx, 0 ; Both xoffset =0 and yoffset=0
michael@0 153 je .filter_block2d_bil_var_ssse3_full_pixel
michael@0 154
michael@0 155 shl rdx, 4 ; 16 bytes per table entry
michael@0 156 lea rdx, [rdx + rcx] ; VFilter
michael@0 157
michael@0 158 mov rsi, arg(0) ;ref_ptr
michael@0 159 mov rdi, arg(2) ;src_ptr
michael@0 160 movsxd rcx, dword ptr arg(4) ;Height
michael@0 161 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
michael@0 162
michael@0 163 movdqu xmm1, XMMWORD PTR [rsi] ; first ref row (no horizontal filtering in this path)
michael@0 164 movdqa xmm0, xmm1 ; xmm0 is overwritten at the top of the loop before being read
michael@0 165
michael@0 166 %if ABI_IS_32BIT=0
michael@0 167 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 168 %endif
michael@0 169
michael@0 170 lea rsi, [rsi + rax] ; advance to the second ref row
michael@0 171
michael@0 172 .filter_block2d_bil_sp_only_loop:
michael@0 173 movdqu xmm3, XMMWORD PTR [rsi] ; current ref row
michael@0 174 movdqa xmm2, xmm1 ; xmm1 = previous ref row
michael@0 175 movdqa xmm0, xmm3 ; save current row for the next iteration
michael@0 176
michael@0 177 punpcklbw xmm1, xmm3 ; interleave previous/current rows (low 8)
michael@0 178 punpckhbw xmm2, xmm3 ; interleave previous/current rows (high 8)
michael@0 179 pmaddubsw xmm1, [rdx] ; 2-tap vertical filter
michael@0 180 pmaddubsw xmm2, [rdx]
michael@0 181
michael@0 182 paddw xmm1, [GLOBAL(xmm_bi_rd)]
michael@0 183 paddw xmm2, [GLOBAL(xmm_bi_rd)]
michael@0 184 psraw xmm1, xmm_filter_shift
michael@0 185 psraw xmm2, xmm_filter_shift
michael@0 186
michael@0 187 movq xmm3, QWORD PTR [rdi] ; low 8 src pixels
michael@0 188 pxor xmm4, xmm4
michael@0 189 punpcklbw xmm3, xmm4
michael@0 190 movq xmm5, QWORD PTR [rdi+8] ; high 8 src pixels
michael@0 191 punpcklbw xmm5, xmm4
michael@0 192
michael@0 193 psubw xmm1, xmm3 ; diff = filtered ref - src
michael@0 194 psubw xmm2, xmm5
michael@0 195 paddw xmm6, xmm1 ; accumulate differences
michael@0 196 paddw xmm6, xmm2
michael@0 197 pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> 32-bit
michael@0 198 pmaddwd xmm2, xmm2
michael@0 199 paddd xmm7, xmm1 ; accumulate squared differences
michael@0 200 paddd xmm7, xmm2
michael@0 201
michael@0 202 movdqa xmm1, xmm0 ; current row becomes the previous row
michael@0 203 lea rsi, [rsi + rax] ;ref_pixels_per_line
michael@0 204
michael@0 205 %if ABI_IS_32BIT
michael@0 206 add rdi, dword ptr arg(3) ;src_pixels_per_line
michael@0 207 %else
michael@0 208 lea rdi, [rdi + r9]
michael@0 209 %endif
michael@0 210
michael@0 211 sub rcx, 1
michael@0 212 jnz .filter_block2d_bil_sp_only_loop
michael@0 213
michael@0 214 jmp .filter_block2d_bil_variance
michael@0 215
michael@0 216 .filter_block2d_bil_var_ssse3_full_pixel:
michael@0 217 mov rsi, arg(0) ;ref_ptr
michael@0 218 mov rdi, arg(2) ;src_ptr
michael@0 219 movsxd rcx, dword ptr arg(4) ;Height
michael@0 220 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
michael@0 221 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
michael@0 222 pxor xmm0, xmm0 ; zero for byte -> word widening
michael@0 223
michael@0 224 .filter_block2d_bil_full_pixel_loop:
michael@0 225 movq xmm1, QWORD PTR [rsi] ; low 8 ref pixels, unfiltered
michael@0 226 punpcklbw xmm1, xmm0
michael@0 227 movq xmm2, QWORD PTR [rsi+8] ; high 8 ref pixels
michael@0 228 punpcklbw xmm2, xmm0
michael@0 229
michael@0 230 movq xmm3, QWORD PTR [rdi] ; low 8 src pixels
michael@0 231 punpcklbw xmm3, xmm0
michael@0 232 movq xmm4, QWORD PTR [rdi+8] ; high 8 src pixels
michael@0 233 punpcklbw xmm4, xmm0
michael@0 234
michael@0 235 psubw xmm1, xmm3 ; diff = ref - src
michael@0 236 psubw xmm2, xmm4
michael@0 237 paddw xmm6, xmm1 ; accumulate differences
michael@0 238 paddw xmm6, xmm2
michael@0 239 pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> 32-bit
michael@0 240 pmaddwd xmm2, xmm2
michael@0 241 paddd xmm7, xmm1 ; accumulate squared differences
michael@0 242 paddd xmm7, xmm2
michael@0 243
michael@0 244 lea rsi, [rsi + rax] ;ref_pixels_per_line
michael@0 245 lea rdi, [rdi + rdx] ;src_pixels_per_line
michael@0 246 sub rcx, 1
michael@0 247 jnz .filter_block2d_bil_full_pixel_loop
michael@0 248
michael@0 249 jmp .filter_block2d_bil_variance
michael@0 250
michael@0 251 .filter_block2d_bil_var_ssse3_fp_only:
michael@0 252 mov rsi, arg(0) ;ref_ptr
michael@0 253 mov rdi, arg(2) ;src_ptr
michael@0 254 movsxd rcx, dword ptr arg(4) ;Height
michael@0 255 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
michael@0 256
michael@0 257 pxor xmm0, xmm0 ; xmm0 is not read in this loop (xmm4 is the zero register here)
michael@0 258
michael@0 259 %if ABI_IS_32BIT=0
michael@0 260 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 261 %endif
michael@0 262
michael@0 263 .filter_block2d_bil_fp_only_loop:
michael@0 264 movdqu xmm1, XMMWORD PTR [rsi] ; ref row, 16 pixels
michael@0 265 movdqu xmm2, XMMWORD PTR [rsi+1] ; same row shifted by one pixel
michael@0 266 movdqa xmm3, xmm1
michael@0 267
michael@0 268 punpcklbw xmm1, xmm2 ; interleave neighbouring pixels (low 8)
michael@0 269 punpckhbw xmm3, xmm2 ; interleave neighbouring pixels (high 8)
michael@0 270 pmaddubsw xmm1, [rax] ; 2-tap horizontal filter (rax = HFilter)
michael@0 271 pmaddubsw xmm3, [rax]
michael@0 272
michael@0 273 paddw xmm1, [GLOBAL(xmm_bi_rd)]
michael@0 274 paddw xmm3, [GLOBAL(xmm_bi_rd)]
michael@0 275 psraw xmm1, xmm_filter_shift
michael@0 276 psraw xmm3, xmm_filter_shift
michael@0 277
michael@0 278 movq xmm2, QWORD PTR [rdi] ; low 8 src pixels (movq is a 64-bit load)
michael@0 279 pxor xmm4, xmm4
michael@0 280 punpcklbw xmm2, xmm4
michael@0 281 movq xmm5, QWORD PTR [rdi+8] ; high 8 src pixels
michael@0 282 punpcklbw xmm5, xmm4
michael@0 283
michael@0 284 psubw xmm1, xmm2 ; diff = filtered ref - src
michael@0 285 psubw xmm3, xmm5
michael@0 286 paddw xmm6, xmm1 ; accumulate differences
michael@0 287 paddw xmm6, xmm3
michael@0 288 pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> 32-bit
michael@0 289 pmaddwd xmm3, xmm3
michael@0 290 paddd xmm7, xmm1 ; accumulate squared differences
michael@0 291 paddd xmm7, xmm3
michael@0 292
michael@0 293 lea rsi, [rsi + rdx] ; next ref row
michael@0 294 %if ABI_IS_32BIT
michael@0 295 add rdi, dword ptr arg(3) ;src_pixels_per_line
michael@0 296 %else
michael@0 297 lea rdi, [rdi + r9]
michael@0 298 %endif
michael@0 299
michael@0 300 sub rcx, 1
michael@0 301 jnz .filter_block2d_bil_fp_only_loop
michael@0 302
michael@0 303 jmp .filter_block2d_bil_variance
michael@0 304
michael@0 305 .filter_block2d_bil_variance:
michael@0 306 pxor xmm0, xmm0
michael@0 307 pxor xmm1, xmm1
michael@0 308 pxor xmm5, xmm5 ; zero used for the dword interleaves below
michael@0 309
michael@0 310 punpcklwd xmm0, xmm6 ; place each 16-bit sum in the high half of a dword
michael@0 311 punpckhwd xmm1, xmm6
michael@0 312 psrad xmm0, 16 ; arithmetic shift sign-extends the sums to 32 bits
michael@0 313 psrad xmm1, 16
michael@0 314 paddd xmm0, xmm1 ; 4 partial sums
michael@0 315 movdqa xmm1, xmm0
michael@0 316
michael@0 317 movdqa xmm6, xmm7
michael@0 318 punpckldq xmm6, xmm5 ; spread squared-difference partials into 64-bit lanes
michael@0 319 punpckhdq xmm7, xmm5
michael@0 320 paddd xmm6, xmm7 ; 2 partial sums of squares
michael@0 321
michael@0 322 punpckldq xmm0, xmm5 ; same spreading for the difference sums
michael@0 323 punpckhdq xmm1, xmm5
michael@0 324 paddd xmm0, xmm1 ; 2 partial sums
michael@0 325
michael@0 326 movdqa xmm7, xmm6
michael@0 327 movdqa xmm1, xmm0
michael@0 328
michael@0 329 psrldq xmm7, 8 ; bring the upper 64 bits down
michael@0 330 psrldq xmm1, 8
michael@0 331
michael@0 332 paddd xmm6, xmm7 ; low dword of xmm6 = total sum of squares
michael@0 333 paddd xmm0, xmm1 ; low dword of xmm0 = total sum
michael@0 334
michael@0 335 mov rsi, arg(7) ;[Sum]
michael@0 336 mov rdi, arg(8) ;[SSE]
michael@0 337
michael@0 338 movd [rsi], xmm0 ; store low 32 bits to *sum
michael@0 339 movd [rdi], xmm6 ; store low 32 bits to *sumsquared
michael@0 340
michael@0 341 ; begin epilog
michael@0 342 pop rdi
michael@0 343 pop rsi
michael@0 344 RESTORE_GOT
michael@0 345 RESTORE_XMM
michael@0 346 UNSHADOW_ARGS
michael@0 347 pop rbp
michael@0 348 ret
michael@0 349
michael@0 350
michael@0 351 SECTION_RODATA
michael@0 352 align 16
michael@0 353 xmm_bi_rd: ; rounding constant added before the shift by xmm_filter_shift
michael@0 354 times 8 dw 64 ; 8 words of 64 (half of 1 << 7)
michael@0 355 align 16
michael@0 356 vp8_bilinear_filters_ssse3: ; one 16-byte entry per offset; the code indexes it with offset << 4
michael@0 357 times 8 db 128, 0 ; offset 0: the function above branches away when an offset is 0 instead of loading this entry
michael@0 358 times 8 db 112, 16 ; offset 1
michael@0 359 times 8 db 96, 32 ; offset 2
michael@0 360 times 8 db 80, 48 ; offset 3
michael@0 361 times 8 db 64, 64 ; offset 4
michael@0 362 times 8 db 48, 80 ; offset 5
michael@0 363 times 8 db 32, 96 ; offset 6
michael@0 364 times 8 db 16, 112 ; offset 7

mercurial