media/libvpx/vp8/common/x86/mfqe_sse2.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 15 Jan 2015 15:59:08 +0100
branch       TOR_BUG_9701
changeset    10:ac0c01689b40
permissions  -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
this solves Tor bug #9701, complying with the disk avoidance requirements
documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
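; For reference, the operation implemented below corresponds to this C
; sketch (illustrative only; the helper name and the MFQE_PRECISION
; value of 4 are assumptions inferred from the tMFQE tables at the end
; of this file). Each output pixel is a rounded Q4 fixed-point blend of
; src and dst:
;
;   static void filter_by_weight_c(unsigned char *src, int src_stride,
;                                  unsigned char *dst, int dst_stride,
;                                  int block_size, int src_weight)
;   {
;       /* MFQE_PRECISION == 4, so the weights sum to 1 << 4 == 16. */
;       int dst_weight = (1 << 4) - src_weight;
;       int rounding = 1 << (4 - 1);
;       int r, c;
;
;       for (r = 0; r < block_size; r++) {
;           for (c = 0; c < block_size; c++) {
;               dst[c] = (src[c] * src_weight +
;                         dst[c] * dst_weight + rounding) >> 4;
;           }
;           src += src_stride;
;           dst += dst_stride;
;       }
;   }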
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight
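    ; The weights are complementary in Q4: src_weight + dst_weight
    ; == 16 (tMFQE == 1 << MFQE_PRECISION), so the loop below forms a
    ; convex combination of the two frames.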

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4
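    ; Worked example of the rounding: with src_weight = 9 (so
    ; dst_weight = 7), src = 200 and dst = 40, the result is
    ; (200*9 + 40*7 + 8) >> 4 = 2088 >> 4 = 130.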

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
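; Same Q4 blend as the 16x16 version above, but each row is only 8
; pixels wide, so one 64-bit load/store per row suffices and no
; high-half unpack is needed.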
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

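    ; xmm4 is zero, so packuswb simply narrows the eight result words
    ; to bytes in the low half; only those 8 bytes are stored.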
    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
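; For reference, a C sketch of what this routine computes (illustrative
; only; the helper name is an assumption). Both outputs are rounded and
; scaled down by the 256-pixel block size:
;
;   static void variance_and_sad_16x16_c(unsigned char *src1, int stride1,
;                                        unsigned char *src2, int stride2,
;                                        unsigned int *variance,
;                                        unsigned int *sad)
;   {
;       unsigned int sad_acc = 0, sum = 0, sse = 0;
;       int r, c;
;
;       for (r = 0; r < 16; r++) {
;           for (c = 0; c < 16; c++) {
;               int a = src1[r * stride1 + c];
;               int b = src2[r * stride2 + c];
;               sad_acc += (a > b) ? (a - b) : (b - a);
;               sum += b;       /* for E[x]   */
;               sse += b * b;   /* for E[x^2] */
;           }
;       }
;       *sad = (sad_acc + 128) >> 8;
;       /* 256 * variance == sse - sum * sum / 256; the final shift
;          rescales to a per-pixel value, rounding to nearest. */
;       *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
;   }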
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
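    ; (pmaddwd multiplies word pairs and sums adjacent products, so
    ; squaring the zero-extended words yields four dword partial sums
    ; of squares per register.)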
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8
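    ; The value stored below is (SAD + 128) >> 8: the sum of absolute
    ; differences over all 256 pixels, rounded and rescaled to a
    ; per-pixel average.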

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum of src2; pmuludq's high dword result is ignored
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddw could be used to sum adjacent values, but we want
    ; all the values summed. Promote to doublewords, accumulate,
    ; shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0
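    ; xmm1 now holds SSE(src2) - SUM(src2)^2 / 256, i.e. the variance
    ; of src2 scaled up by the 256-pixel count.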

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
