media/libvpx/vp8/common/x86/postproc_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk-avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %define VP8_FILTER_WEIGHT 128
michael@0 15 %define VP8_FILTER_SHIFT 7
michael@0 16 ; Vertical post-filter, one 4-pixel-wide column strip at a time: a pixel becomes (pixel + sum + vp8_rv term) >> 4 when 15*sumsq - sum*sum < flimit, otherwise it is kept; sum/sumsq cover a sliding 15-row window.
michael@0 17 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
michael@0 18 ; int pitch, int rows, int cols,int flimit)
michael@0 19 extern sym(vp8_rv)
michael@0 20 global sym(vp8_mbpost_proc_down_mmx) PRIVATE
michael@0 21 sym(vp8_mbpost_proc_down_mmx):
michael@0 22 push rbp
michael@0 23 mov rbp, rsp
michael@0 24 SHADOW_ARGS_TO_STACK 5 ; the five arguments are read and updated through arg(0)..arg(4) below
michael@0 25 GET_GOT rbx
michael@0 26 push rsi
michael@0 27 push rdi
michael@0 28 ; end prolog
michael@0 29 ; locals: 136 bytes = 128 for the d[] row buffer + 8 for flimit2
michael@0 30 ALIGN_STACK 16, rax
michael@0 31 sub rsp, 136
michael@0 32
michael@0 33 ; unsigned char d[16][8] at [rsp] (this routine indexes it as 16 four-byte slots: [rsp+rcx*4])
michael@0 34 ; create flimit2 at [rsp+128]: flimit stored in two adjacent dwords for the packed psubd below
michael@0 35 mov eax, dword ptr arg(4) ;flimit
michael@0 36 mov [rsp+128], eax
michael@0 37 mov [rsp+128+4], eax
michael@0 38 %define flimit2 [rsp+128]
michael@0 39
michael@0 40 %if ABI_IS_32BIT=0
michael@0 41 lea r8, [GLOBAL(sym(vp8_rv))] ; 64-bit builds keep the vp8_rv base address in r8 for the whole call
michael@0 42 %endif
michael@0 43
michael@0 44 ;rows +=8; (the 8 extra iterations of .loop_row flush the delayed write-back)
michael@0 45 add dword ptr arg(2), 8
michael@0 46
michael@0 47 ;for(c=0; c<cols; c+=4)
michael@0 48 .loop_col:
michael@0 49 mov rsi, arg(0) ;s
michael@0 50 pxor mm0, mm0 ; mm0 = 0, the zero operand for every unpack/pack below
michael@0 51
michael@0 52 movsxd rax, dword ptr arg(1) ;pitch ;
michael@0 53
michael@0 54 ; this copies the last row down into the border 8 rows
michael@0 55 mov rdi, rsi
michael@0 56 movsxd rdx, dword ptr arg(2) ; rows (already +8); read as a 32-bit int so undefined upper bits of the argument slot never reach the address math
michael@0 57 sub rdx, 9 ; (rows + 8) - 9 = index of the last real row
michael@0 58 imul rdx, rax
michael@0 59 lea rdi, [rdi+rdx]
michael@0 60 movq mm1, QWORD ptr[rdi] ; last row (8 bytes are copied although the strip is 4 pixels wide)
michael@0 61 mov rcx, 8
michael@0 62 .init_borderd: ; initialize borders
michael@0 63 lea rdi, [rdi + rax]
michael@0 64 movq [rdi], mm1
michael@0 65
michael@0 66 dec rcx
michael@0 67 jne .init_borderd
michael@0 68
michael@0 69 neg rax ; rax = -pitch
michael@0 70
michael@0 71 ; this copies the first row up into the border 8 rows
michael@0 72 mov rdi, rsi
michael@0 73 movq mm1, QWORD ptr[rdi] ; first row
michael@0 74 mov rcx, 8
michael@0 75 .init_border: ; initialize borders
michael@0 76 lea rdi, [rdi + rax]
michael@0 77 movq [rdi], mm1
michael@0 78
michael@0 79 dec rcx
michael@0 80 jne .init_border
michael@0 81
michael@0 82
michael@0 83 lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
michael@0 84 neg rax ; rax = +pitch again
michael@0 85
michael@0 86
michael@0 87 pxor mm5, mm5 ; mm5 = running sums, one word per column
michael@0 88 pxor mm6, mm6 ; mm6 = running sums of squares, columns 0-1 (dwords)
michael@0 89
michael@0 90 pxor mm7, mm7 ; mm7 = running sums of squares, columns 2-3 (dwords)
michael@0 91 mov rdi, rsi
michael@0 92
michael@0 93 mov rcx, 15 ; prime the window with the 15 rows -8 .. 6
michael@0 94
michael@0 95 .loop_initvar:
michael@0 96 movd mm1, DWORD PTR [rdi];
michael@0 97 punpcklbw mm1, mm0 ;
michael@0 98
michael@0 99 paddw mm5, mm1 ;
michael@0 100 pmullw mm1, mm1 ;
michael@0 101
michael@0 102 movq mm2, mm1 ;
michael@0 103 punpcklwd mm1, mm0 ;
michael@0 104
michael@0 105 punpckhwd mm2, mm0 ;
michael@0 106 paddd mm6, mm1 ;
michael@0 107
michael@0 108 paddd mm7, mm2 ;
michael@0 109 lea rdi, [rdi+rax] ;
michael@0 110
michael@0 111 dec rcx
michael@0 112 jne .loop_initvar
michael@0 113 ;save the var and sum
michael@0 114 xor rdx, rdx ; rdx = row counter r; on entry rsi = s - 8*pitch and rdi = s + 7*pitch
michael@0 115 .loop_row:
michael@0 116 movd mm1, DWORD PTR [rsi] ; [s-pitch*8]: row r-8, leaving the window
michael@0 117 movd mm2, DWORD PTR [rdi] ; [s+pitch*7]: row r+7, entering the window
michael@0 118
michael@0 119 punpcklbw mm1, mm0
michael@0 120 punpcklbw mm2, mm0
michael@0 121
michael@0 122 paddw mm5, mm2
michael@0 123 psubw mm5, mm1
michael@0 124
michael@0 125 pmullw mm2, mm2
michael@0 126 movq mm4, mm2
michael@0 127
michael@0 128 punpcklwd mm2, mm0
michael@0 129 punpckhwd mm4, mm0
michael@0 130
michael@0 131 paddd mm6, mm2
michael@0 132 paddd mm7, mm4
michael@0 133
michael@0 134 pmullw mm1, mm1
michael@0 135 movq mm2, mm1
michael@0 136
michael@0 137 punpcklwd mm1, mm0
michael@0 138 psubd mm6, mm1
michael@0 139
michael@0 140 punpckhwd mm2, mm0
michael@0 141 psubd mm7, mm2
michael@0 142
michael@0 143 ; mm3/mm4 = 15*sumsq - sum*sum - flimit per column; the sign bit becomes the filter mask
michael@0 144 movq mm3, mm6
michael@0 145 pslld mm3, 4
michael@0 146
michael@0 147 psubd mm3, mm6 ; 16*sumsq - sumsq = 15*sumsq (columns 0-1)
michael@0 148 movq mm1, mm5
michael@0 149
michael@0 150 movq mm4, mm5
michael@0 151 pmullw mm1, mm1
michael@0 152
michael@0 153 pmulhw mm4, mm4
michael@0 154 movq mm2, mm1
michael@0 155
michael@0 156 punpcklwd mm1, mm4 ; low/high product halves interleaved: sum*sum as dwords, columns 0-1
michael@0 157 punpckhwd mm2, mm4 ; sum*sum as dwords, columns 2-3
michael@0 158
michael@0 159 movq mm4, mm7
michael@0 160 pslld mm4, 4
michael@0 161
michael@0 162 psubd mm4, mm7 ; 15*sumsq (columns 2-3)
michael@0 163
michael@0 164 psubd mm3, mm1
michael@0 165 psubd mm4, mm2
michael@0 166
michael@0 167 psubd mm3, flimit2
michael@0 168 psubd mm4, flimit2
michael@0 169
michael@0 170 psrad mm3, 31 ; all ones where the value is negative, i.e. 15*sumsq - sum*sum < flimit
michael@0 171 psrad mm4, 31
michael@0 172
michael@0 173 packssdw mm3, mm4
michael@0 174 packsswb mm3, mm0 ; one 0xFF/0x00 mask byte per column in the low 4 bytes
michael@0 175
michael@0 176 movd mm1, DWORD PTR [rsi+rax*8] ; current row r (rsi points 8 rows above it)
michael@0 177
michael@0 178 movq mm2, mm1 ; keep the unfiltered pixels for the masked-out columns
michael@0 179 punpcklbw mm1, mm0
michael@0 180
michael@0 181 paddw mm1, mm5
michael@0 182 mov rcx, rdx
michael@0 183
michael@0 184 and rcx, 127 ; row & 127 selects the vp8_rv entry
michael@0 185 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 186 push rax
michael@0 187 lea rax, [GLOBAL(sym(vp8_rv))]
michael@0 188 movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
michael@0 189 pop rax
michael@0 190 %elif ABI_IS_32BIT=0
michael@0 191 movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
michael@0 192 %else
michael@0 193 movq mm4, [sym(vp8_rv) + rcx*2]
michael@0 194 %endif
michael@0 195 paddw mm1, mm4
michael@0 196 psraw mm1, 4 ; (pixel + sum + vp8_rv term) >> 4
michael@0 197
michael@0 198 packuswb mm1, mm0
michael@0 199 pand mm1, mm3 ; filtered value where the mask is set
michael@0 200
michael@0 201 pandn mm3, mm2 ; original pixel elsewhere
michael@0 202 por mm1, mm3
michael@0 203
michael@0 204 and rcx, 15 ; slot r & 15
michael@0 205 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
michael@0 206 ; write-back is delayed by 8 rows so the window above only ever reads unfiltered pixels
michael@0 207 mov rcx, rdx
michael@0 208 sub rcx, 8
michael@0 209
michael@0 210 and rcx, 15 ; slot (r - 8) & 15
michael@0 211 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
michael@0 212
michael@0 213 movd [rsi], mm1 ; store row r-8 (for r < 8 this lands in the border rows above the image)
michael@0 214 lea rsi, [rsi+rax]
michael@0 215
michael@0 216 lea rdi, [rdi+rax]
michael@0 217 add rdx, 1
michael@0 218
michael@0 219 cmp edx, dword arg(2) ;rows
michael@0 220 jl .loop_row
michael@0 221
michael@0 222 mov rcx, 4 ; advance s at full pointer width: a dword add would lose the carry into the upper half of a 64-bit pointer
michael@0 223 add arg(0), rcx ; s += 4
michael@0 224 sub dword arg(3), 4 ; cols -= 4
michael@0 225 cmp dword arg(3), 0
michael@0 226 jg .loop_col
michael@0 227
michael@0 228 add rsp, 136
michael@0 229 pop rsp ; NOTE(review): presumably restores the rsp saved by ALIGN_STACK - confirm against x86_abi_support.asm
michael@0 230
michael@0 231 ; begin epilog
michael@0 232 pop rdi
michael@0 233 pop rsi
michael@0 234 RESTORE_GOT
michael@0 235 UNSHADOW_ARGS
michael@0 236 pop rbp
michael@0 237 ret
michael@0 238 %undef flimit2
michael@0 239
michael@0 240 ; Adds noise to a plane row by row: each row is clamped with the black/both/white vectors, then 8 noise bytes at a time are added, starting at a rand()-chosen offset (0..255) into the noise buffer.
michael@0 241 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
michael@0 242 ; unsigned char blackclamp[16],
michael@0 243 ; unsigned char whiteclamp[16],
michael@0 244 ; unsigned char bothclamp[16],
michael@0 245 ; unsigned int Width, unsigned int Height, int Pitch)
michael@0 246 extern sym(rand)
michael@0 247 global sym(vp8_plane_add_noise_mmx) PRIVATE
michael@0 248 sym(vp8_plane_add_noise_mmx):
michael@0 249 push rbp
michael@0 250 mov rbp, rsp
michael@0 251 SHADOW_ARGS_TO_STACK 8 ; the eight arguments are read and updated through arg(0)..arg(7) below
michael@0 252 GET_GOT rbx
michael@0 253 push rsi
michael@0 254 push rdi
michael@0 255 ; end prolog
michael@0 256
michael@0 257 .addnoise_loop:
michael@0 258 call sym(rand) WRT_PLT
michael@0 259 mov rcx, arg(1) ;noise
michael@0 260 and rax, 0xff ; keep only the low byte of the rand() result: offset 0..255
michael@0 261 add rcx, rax ; rcx = noise + offset
michael@0 262
michael@0 263 ; we rely on the fact that the clamping vectors are stored contiguously
michael@0 264 ; in black/white/both order. Note that we have to reload this here because
michael@0 265 ; rdx could be trashed by rand()
michael@0 266 mov rdx, arg(2) ; blackclamp
michael@0 267
michael@0 268
michael@0 269 mov rdi, rcx ; rdi = noise pointer for this row
michael@0 270 movsxd rcx, dword arg(5) ;[Width]
michael@0 271 mov rsi, arg(0) ;Pos
michael@0 272 xor rax,rax ; rax = byte offset within the row
michael@0 273
michael@0 274 .addnoise_nextset:
michael@0 275 movq mm1,[rsi+rax] ; get the source
michael@0 276 ; each MMX memory operand below reads 8 bytes, i.e. the first half of a 16-byte clamp vector
michael@0 277 psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
michael@0 278 paddusb mm1, [rdx+32] ;bothclamp
michael@0 279 psubusb mm1, [rdx+16] ;whiteclamp
michael@0 280
michael@0 281 movq mm2,[rdi+rax] ; get the noise for this line
michael@0 282 paddb mm1,mm2 ; add it in
michael@0 283 movq [rsi+rax],mm1 ; store the result
michael@0 284
michael@0 285 add rax,8 ; advance to the next 8 bytes of this row
michael@0 286 ; the row is handled in whole 8-byte groups, so a Width that is not a multiple of 8 is rounded up
michael@0 287 cmp rax, rcx
michael@0 288 jl .addnoise_nextset
michael@0 289
michael@0 290 movsxd rax, dword arg(7) ; Pitch
michael@0 291 add arg(0), rax ; Start += Pitch
michael@0 292 sub dword arg(6), 1 ; Height -= 1
michael@0 293 jg .addnoise_loop ; uses the flags of the sub above: loop while Height is still > 0
michael@0 294
michael@0 295 ; begin epilog
michael@0 296 pop rdi
michael@0 297 pop rsi
michael@0 298 RESTORE_GOT
michael@0 299 UNSHADOW_ARGS
michael@0 300 pop rbp
michael@0 301 ret
michael@0 302
michael@0 303 ; Read-only constant tables; neither label is referenced by the code visible in this listing.
michael@0 304 SECTION_RODATA
michael@0 305 align 16
michael@0 306 Blur: ; 48 words in total: 16 x 16, 8 x 64, 16 x 16, 8 x 0
michael@0 307 times 16 dw 16
michael@0 308 times 8 dw 64
michael@0 309 times 16 dw 16
michael@0 310 times 8 dw 0
michael@0 311
michael@0 312 rd: ; four words of 0x40 (64)
michael@0 313 times 4 dw 0x40

mercurial