media/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:59:08 +0100
branch:      TOR_BUG_9701
changeset:   10:ac0c01689b40
permissions: -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %macro PROCESS_16X2X3 1
michael@0 15 %if %1
michael@0 16 movdqa xmm0, XMMWORD PTR [rsi]
michael@0 17 lddqu xmm5, XMMWORD PTR [rdi]
michael@0 18 lddqu xmm6, XMMWORD PTR [rdi+1]
michael@0 19 lddqu xmm7, XMMWORD PTR [rdi+2]
michael@0 20
michael@0 21 psadbw xmm5, xmm0
michael@0 22 psadbw xmm6, xmm0
michael@0 23 psadbw xmm7, xmm0
michael@0 24 %else
michael@0 25 movdqa xmm0, XMMWORD PTR [rsi]
michael@0 26 lddqu xmm1, XMMWORD PTR [rdi]
michael@0 27 lddqu xmm2, XMMWORD PTR [rdi+1]
michael@0 28 lddqu xmm3, XMMWORD PTR [rdi+2]
michael@0 29
michael@0 30 psadbw xmm1, xmm0
michael@0 31 psadbw xmm2, xmm0
michael@0 32 psadbw xmm3, xmm0
michael@0 33
michael@0 34 paddw xmm5, xmm1
michael@0 35 paddw xmm6, xmm2
michael@0 36 paddw xmm7, xmm3
michael@0 37 %endif
michael@0 38 movdqa xmm0, XMMWORD PTR [rsi+rax]
michael@0 39 lddqu xmm1, XMMWORD PTR [rdi+rdx]
michael@0 40 lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
michael@0 41 lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
michael@0 42
michael@0 43 lea rsi, [rsi+rax*2]
michael@0 44 lea rdi, [rdi+rdx*2]
michael@0 45
michael@0 46 psadbw xmm1, xmm0
michael@0 47 psadbw xmm2, xmm0
michael@0 48 psadbw xmm3, xmm0
michael@0 49
michael@0 50 paddw xmm5, xmm1
michael@0 51 paddw xmm6, xmm2
michael@0 52 paddw xmm7, xmm3
michael@0 53 %endmacro
michael@0 54
michael@0 55 %macro PROCESS_16X2X3_OFFSET 2
michael@0 56 %if %1
michael@0 57 movdqa xmm0, XMMWORD PTR [rsi]
michael@0 58 movdqa xmm4, XMMWORD PTR [rdi]
michael@0 59 movdqa xmm7, XMMWORD PTR [rdi+16]
michael@0 60
michael@0 61 movdqa xmm5, xmm7
michael@0 62 palignr xmm5, xmm4, %2
michael@0 63
michael@0 64 movdqa xmm6, xmm7
michael@0 65 palignr xmm6, xmm4, (%2+1)
michael@0 66
michael@0 67 palignr xmm7, xmm4, (%2+2)
michael@0 68
michael@0 69 psadbw xmm5, xmm0
michael@0 70 psadbw xmm6, xmm0
michael@0 71 psadbw xmm7, xmm0
michael@0 72 %else
michael@0 73 movdqa xmm0, XMMWORD PTR [rsi]
michael@0 74 movdqa xmm4, XMMWORD PTR [rdi]
michael@0 75 movdqa xmm3, XMMWORD PTR [rdi+16]
michael@0 76
michael@0 77 movdqa xmm1, xmm3
michael@0 78 palignr xmm1, xmm4, %2
michael@0 79
michael@0 80 movdqa xmm2, xmm3
michael@0 81 palignr xmm2, xmm4, (%2+1)
michael@0 82
michael@0 83 palignr xmm3, xmm4, (%2+2)
michael@0 84
michael@0 85 psadbw xmm1, xmm0
michael@0 86 psadbw xmm2, xmm0
michael@0 87 psadbw xmm3, xmm0
michael@0 88
michael@0 89 paddw xmm5, xmm1
michael@0 90 paddw xmm6, xmm2
michael@0 91 paddw xmm7, xmm3
michael@0 92 %endif
michael@0 93 movdqa xmm0, XMMWORD PTR [rsi+rax]
michael@0 94 movdqa xmm4, XMMWORD PTR [rdi+rdx]
michael@0 95 movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
michael@0 96
michael@0 97 movdqa xmm1, xmm3
michael@0 98 palignr xmm1, xmm4, %2
michael@0 99
michael@0 100 movdqa xmm2, xmm3
michael@0 101 palignr xmm2, xmm4, (%2+1)
michael@0 102
michael@0 103 palignr xmm3, xmm4, (%2+2)
michael@0 104
michael@0 105 lea rsi, [rsi+rax*2]
michael@0 106 lea rdi, [rdi+rdx*2]
michael@0 107
michael@0 108 psadbw xmm1, xmm0
michael@0 109 psadbw xmm2, xmm0
michael@0 110 psadbw xmm3, xmm0
michael@0 111
michael@0 112 paddw xmm5, xmm1
michael@0 113 paddw xmm6, xmm2
michael@0 114 paddw xmm7, xmm3
michael@0 115 %endmacro
michael@0 116
michael@0 117 %macro PROCESS_16X16X3_OFFSET 2
michael@0 118 %2_aligned_by_%1:
michael@0 119
michael@0 120 sub rdi, %1
michael@0 121
michael@0 122 PROCESS_16X2X3_OFFSET 1, %1
michael@0 123 PROCESS_16X2X3_OFFSET 0, %1
michael@0 124 PROCESS_16X2X3_OFFSET 0, %1
michael@0 125 PROCESS_16X2X3_OFFSET 0, %1
michael@0 126 PROCESS_16X2X3_OFFSET 0, %1
michael@0 127 PROCESS_16X2X3_OFFSET 0, %1
michael@0 128 PROCESS_16X2X3_OFFSET 0, %1
michael@0 129 PROCESS_16X2X3_OFFSET 0, %1
michael@0 130
michael@0 131 jmp %2_store_off
michael@0 132
michael@0 133 %endmacro
michael@0 134
michael@0 135 %macro PROCESS_16X8X3_OFFSET 2
michael@0 136 %2_aligned_by_%1:
michael@0 137
michael@0 138 sub rdi, %1
michael@0 139
michael@0 140 PROCESS_16X2X3_OFFSET 1, %1
michael@0 141 PROCESS_16X2X3_OFFSET 0, %1
michael@0 142 PROCESS_16X2X3_OFFSET 0, %1
michael@0 143 PROCESS_16X2X3_OFFSET 0, %1
michael@0 144
michael@0 145 jmp %2_store_off
michael@0 146
michael@0 147 %endmacro
michael@0 148
michael@0 149 ;void int vp9_sad16x16x3_ssse3(
michael@0 150 ; unsigned char *src_ptr,
michael@0 151 ; int src_stride,
michael@0 152 ; unsigned char *ref_ptr,
michael@0 153 ; int ref_stride,
michael@0 154 ; int *results)
michael@0 155 global sym(vp9_sad16x16x3_ssse3) PRIVATE
michael@0 156 sym(vp9_sad16x16x3_ssse3):
michael@0 157 push rbp
michael@0 158 mov rbp, rsp
michael@0 159 SHADOW_ARGS_TO_STACK 5
michael@0 160 SAVE_XMM 7
michael@0 161 push rsi
michael@0 162 push rdi
michael@0 163 push rcx
michael@0 164 ; end prolog
michael@0 165
michael@0 166 mov rsi, arg(0) ;src_ptr
michael@0 167 mov rdi, arg(2) ;ref_ptr
michael@0 168
michael@0 169 mov rdx, 0xf
michael@0 170 and rdx, rdi
michael@0 171
michael@0 172 jmp .vp9_sad16x16x3_ssse3_skiptable
michael@0 173 .vp9_sad16x16x3_ssse3_jumptable:
michael@0 174 dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 175 dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 176 dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 177 dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 178 dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 179 dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 180 dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 181 dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 182 dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 183 dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 184 dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 185 dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 186 dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 187 dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 188 dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 189 dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
michael@0 190 .vp9_sad16x16x3_ssse3_skiptable:
michael@0 191
michael@0 192 call .vp9_sad16x16x3_ssse3_do_jump
michael@0 193 .vp9_sad16x16x3_ssse3_do_jump:
michael@0 194 pop rcx ; get the address of do_jump
michael@0 195 mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
michael@0 196 add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
michael@0 197
michael@0 198 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
michael@0 199 add rcx, rax
michael@0 200
michael@0 201 movsxd rax, dword ptr arg(1) ;src_stride
michael@0 202 movsxd rdx, dword ptr arg(3) ;ref_stride
michael@0 203
michael@0 204 jmp rcx
michael@0 205
michael@0 206 PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
michael@0 207 PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
michael@0 208 PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
michael@0 209 PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
michael@0 210 PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
michael@0 211 PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
michael@0 212 PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
michael@0 213 PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
michael@0 214 PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
michael@0 215 PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
michael@0 216 PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
michael@0 217 PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
michael@0 218 PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
michael@0 219 PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
michael@0 220 PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
michael@0 221
michael@0 222 .vp9_sad16x16x3_ssse3_aligned_by_15:
michael@0 223 PROCESS_16X2X3 1
michael@0 224 PROCESS_16X2X3 0
michael@0 225 PROCESS_16X2X3 0
michael@0 226 PROCESS_16X2X3 0
michael@0 227 PROCESS_16X2X3 0
michael@0 228 PROCESS_16X2X3 0
michael@0 229 PROCESS_16X2X3 0
michael@0 230 PROCESS_16X2X3 0
michael@0 231
michael@0 232 .vp9_sad16x16x3_ssse3_store_off:
michael@0 233 mov rdi, arg(4) ;Results
michael@0 234
michael@0 235 movq xmm0, xmm5
michael@0 236 psrldq xmm5, 8
michael@0 237
michael@0 238 paddw xmm0, xmm5
michael@0 239 movd [rdi], xmm0
michael@0 240 ;-
michael@0 241 movq xmm0, xmm6
michael@0 242 psrldq xmm6, 8
michael@0 243
michael@0 244 paddw xmm0, xmm6
michael@0 245 movd [rdi+4], xmm0
michael@0 246 ;-
michael@0 247 movq xmm0, xmm7
michael@0 248 psrldq xmm7, 8
michael@0 249
michael@0 250 paddw xmm0, xmm7
michael@0 251 movd [rdi+8], xmm0
michael@0 252
michael@0 253 ; begin epilog
michael@0 254 pop rcx
michael@0 255 pop rdi
michael@0 256 pop rsi
michael@0 257 RESTORE_XMM
michael@0 258 UNSHADOW_ARGS
michael@0 259 pop rbp
michael@0 260 ret
michael@0 261
michael@0 262 ;void int vp9_sad16x8x3_ssse3(
michael@0 263 ; unsigned char *src_ptr,
michael@0 264 ; int src_stride,
michael@0 265 ; unsigned char *ref_ptr,
michael@0 266 ; int ref_stride,
michael@0 267 ; int *results)
michael@0 268 global sym(vp9_sad16x8x3_ssse3) PRIVATE
michael@0 269 sym(vp9_sad16x8x3_ssse3):
michael@0 270 push rbp
michael@0 271 mov rbp, rsp
michael@0 272 SHADOW_ARGS_TO_STACK 5
michael@0 273 SAVE_XMM 7
michael@0 274 push rsi
michael@0 275 push rdi
michael@0 276 push rcx
michael@0 277 ; end prolog
michael@0 278
michael@0 279 mov rsi, arg(0) ;src_ptr
michael@0 280 mov rdi, arg(2) ;ref_ptr
michael@0 281
michael@0 282 mov rdx, 0xf
michael@0 283 and rdx, rdi
michael@0 284
michael@0 285 jmp .vp9_sad16x8x3_ssse3_skiptable
michael@0 286 .vp9_sad16x8x3_ssse3_jumptable:
michael@0 287 dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 288 dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 289 dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 290 dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 291 dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 292 dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 293 dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 294 dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 295 dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 296 dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 297 dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 298 dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 299 dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 300 dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 301 dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 302 dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
michael@0 303 .vp9_sad16x8x3_ssse3_skiptable:
michael@0 304
michael@0 305 call .vp9_sad16x8x3_ssse3_do_jump
michael@0 306 .vp9_sad16x8x3_ssse3_do_jump:
michael@0 307 pop rcx ; get the address of do_jump
michael@0 308 mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
michael@0 309 add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
michael@0 310
michael@0 311 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
michael@0 312 add rcx, rax
michael@0 313
michael@0 314 movsxd rax, dword ptr arg(1) ;src_stride
michael@0 315 movsxd rdx, dword ptr arg(3) ;ref_stride
michael@0 316
michael@0 317 jmp rcx
michael@0 318
michael@0 319 PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
michael@0 320 PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
michael@0 321 PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
michael@0 322 PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
michael@0 323 PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
michael@0 324 PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
michael@0 325 PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
michael@0 326 PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
michael@0 327 PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
michael@0 328 PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
michael@0 329 PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
michael@0 330 PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
michael@0 331 PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
michael@0 332 PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
michael@0 333 PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
michael@0 334
michael@0 335 .vp9_sad16x8x3_ssse3_aligned_by_15:
michael@0 336
michael@0 337 PROCESS_16X2X3 1
michael@0 338 PROCESS_16X2X3 0
michael@0 339 PROCESS_16X2X3 0
michael@0 340 PROCESS_16X2X3 0
michael@0 341
michael@0 342 .vp9_sad16x8x3_ssse3_store_off:
michael@0 343 mov rdi, arg(4) ;Results
michael@0 344
michael@0 345 movq xmm0, xmm5
michael@0 346 psrldq xmm5, 8
michael@0 347
michael@0 348 paddw xmm0, xmm5
michael@0 349 movd [rdi], xmm0
michael@0 350 ;-
michael@0 351 movq xmm0, xmm6
michael@0 352 psrldq xmm6, 8
michael@0 353
michael@0 354 paddw xmm0, xmm6
michael@0 355 movd [rdi+4], xmm0
michael@0 356 ;-
michael@0 357 movq xmm0, xmm7
michael@0 358 psrldq xmm7, 8
michael@0 359
michael@0 360 paddw xmm0, xmm7
michael@0 361 movd [rdi+8], xmm0
michael@0 362
michael@0 363 ; begin epilog
michael@0 364 pop rcx
michael@0 365 pop rdi
michael@0 366 pop rsi
michael@0 367 RESTORE_XMM
michael@0 368 UNSHADOW_ARGS
michael@0 369 pop rbp
michael@0 370 ret

mercurial