Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | ; |
michael@0 | 2 | ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
michael@0 | 3 | ; |
michael@0 | 4 | ; Use of this source code is governed by a BSD-style license |
michael@0 | 5 | ; that can be found in the LICENSE file in the root of the source |
michael@0 | 6 | ; tree. An additional intellectual property rights grant can be found |
michael@0 | 7 | ; in the file PATENTS. All contributing project authors may |
michael@0 | 8 | ; be found in the AUTHORS file in the root of the source tree. |
michael@0 | 9 | ; |
michael@0 | 10 | |
michael@0 | 11 | |
michael@0 | 12 | %include "vpx_ports/x86_abi_support.asm" |
michael@0 | 13 | |
;-----------------------------------------------------------------------
; unsigned int vp9_get_mb_ss_sse2(short *src_ptr)
;
; Sums the squares of 256 contiguous int16 coefficients (a 16x16 block).
; In:  arg(0) = src_ptr, 16-byte aligned (movdqa loads)
; Out: eax    = sum of squares
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ; src_ptr
    mov         rcx, 8                  ; 8 iterations x 64 bytes = 256 words
    pxor        xmm4, xmm4              ; running sum of squares (4 dword lanes)

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, pair-sum into dwords
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    jnz         .NEXTROW                ; FIX: was 'ja' — dec leaves CF unchanged,
                                        ; so 'ja' (CF=0 && ZF=0) only worked because
                                        ; the preceding add happened to clear CF;
                                        ; jnz tests ZF, which dec does define

    ; horizontal add of the four dword lanes of xmm4
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movd        eax, xmm4               ; FIX: was 'movq rax' — return type is
                                        ; unsigned int and bits 32..63 held a stale
                                        ; partial sum; movd zero-extends cleanly

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 | 70 | |
michael@0 | 71 | |
;-----------------------------------------------------------------------
; unsigned int vp9_get16x16var_sse2(unsigned char *src_ptr,
;                                   int source_stride,
;                                   unsigned char *ref_ptr,
;                                   int recon_stride,
;                                   unsigned int *SSE,
;                                   int *Sum)
;
; Computes, over a 16x16 block, the sum of (src - ref) differences
; (*Sum, signed) and the sum of squared differences (*SSE).
; Loads are unaligned (movdqu); strides may be arbitrary.
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; src_ptr
    mov         rdi, arg(2)             ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)   ; source_stride
    movsxd      rdx, DWORD PTR arg(3)   ; recon_stride

    ; prefetch the first 8 rows of source and reference
    lea         rcx, [rax+rax*2]
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    lea         rcx, [rdx+rdx*2]
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; constant zero for byte->word unpacking
    pxor        xmm7, xmm7              ; word accumulator: sum of differences
    pxor        xmm6, xmm6              ; dword accumulator: sum of squares
    mov         rcx, 16                 ; 16 rows

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi] ; 16 source pixels
    movdqu      xmm2, XMMWORD PTR [rdi] ; 16 reference pixels

    prefetcht0  [rsi+rax*8]             ; stay 8 rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0              ; low 8 pixels -> words
    punpckhbw   xmm3, xmm0              ; high 8 pixels -> words

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2              ; diff, low half
    psubw       xmm3, xmm4              ; diff, high half

    paddw       xmm7, xmm1              ; accumulate sum of diffs
    pmaddwd     xmm1, xmm1              ; square + pair-add for SSE

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax                ; next row
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop

    ; Reduce xmm7 (8 signed word sums) and xmm6 (4 dword SSE lanes) to scalars.
    ; Signed words are widened by unpacking into the HIGH half of each dword
    ; and arithmetic-shifting right by 16, preserving the sign.
    movdqa      xmm1, xmm6
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; xmm7 low dword = total Sum
    paddd       xmm1, xmm2              ; xmm1 low dword = total SSE

    mov         rax, arg(5)             ; Sum
    mov         rdi, arg(4)             ; SSE

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 | 212 | |
michael@0 | 213 | |
michael@0 | 214 | |
michael@0 | 215 | |
;-----------------------------------------------------------------------
; unsigned int vp9_get8x8var_sse2(unsigned char *src_ptr,
;                                 int source_stride,
;                                 unsigned char *ref_ptr,
;                                 int recon_stride,
;                                 unsigned int *SSE,
;                                 int *Sum)
;
; Computes, over an 8x8 block, the sum of (src - ref) differences
; (*Sum, signed) and the sum of squared differences (*SSE).
; The 8 rows are fully unrolled; xmm7 accumulates word diffs, xmm1
; accumulates dword squared diffs.
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ; src_ptr
    mov         rdi, arg(2)             ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)   ; source_stride
    movsxd      rdx, DWORD PTR arg(3)   ; recon_stride

    pxor        xmm0, xmm0              ; constant zero for byte->word unpacking
    pxor        xmm7, xmm7              ; word accumulator: sum of differences

    ; row 0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1              ; xmm1 = running SSE (dwords)

    ; row 1
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 2
    movq        xmm2, QWORD PTR[rsi + rax * 2]
    movq        xmm3, QWORD PTR[rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; rows 3-4: advance base pointers by two rows first
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; rows 5-6
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 7
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Reduction. The Sum path widens words by zero-extension and keeps
    ; adding with paddw, so only the low 16 bits of each lane stay
    ; meaningful; the final movsx below restores the correct sign.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ; Sum
    mov         rdi, arg(4)             ; SSE

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit sum

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 | 400 | |
;-----------------------------------------------------------------------
; void vp9_half_horiz_vert_variance8x_h_sse2(unsigned char *ref_ptr,
;                                            int ref_pixels_per_line,
;                                            unsigned char *src_ptr,
;                                            int src_pixels_per_line,
;                                            unsigned int Height,
;                                            int *sum,
;                                            unsigned int *sumsquared)
;
; Half-pixel (horizontal + vertical) interpolated variance over 8-wide
; rows: each reference row is averaged with its right neighbor (pavgb),
; then with the following row's average, before differencing with src.
;
; NOTE(review): this routine uses MMX registers (movdq2q) and does not
; execute emms; callers are presumably expected to clear MMX/x87 state
; afterwards (e.g. via the library's clear-system-state helper) — confirm.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; sum-of-differences accumulator
    pxor        xmm7, xmm7              ; sum-of-squares accumulator
    mov         rsi, arg(0)             ; ref_ptr

    mov         rdi, arg(2)             ; src_ptr
    movsxd      rcx, dword ptr arg(4)   ; Height (row count)
    movsxd      rax, dword ptr arg(1)   ; ref_pixels_per_line (unused below; kept)

    pxor        xmm0, xmm0              ; constant zero for unpacking

    ; prime the pipeline: horizontal average of row 0
    movq        xmm5, QWORD PTR [rsi]   ; s0..s7
    movq        xmm3, QWORD PTR [rsi+1] ; s1..s8
    pavgb       xmm5, xmm3              ; horizontal half-pel average, row 0

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ; ref_pixels_per_line: next row
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; horizontal half-pel average, row i+1

    pavgb       xmm5, xmm1              ; vertical average of rows i and i+1
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; d0..d7
    punpcklbw   xmm3, xmm0

    psubw       xmm5, xmm3              ; predicted - source
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square + pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

    movdqa      xmm5, xmm1              ; carry row i+1's average to next pass

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ; next reference row
    add         edi, dword ptr arg(3)   ; next source row
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; fold the two xmm halves together in MMX registers
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    ; widen signed word sums: unpack into high halves, psrad restores sign
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; final signed sum
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; final sum of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 | 522 | |
;-----------------------------------------------------------------------
; void vp9_half_vert_variance8x_h_sse2(unsigned char *ref_ptr,
;                                      int ref_pixels_per_line,
;                                      unsigned char *src_ptr,
;                                      int src_pixels_per_line,
;                                      unsigned int Height,
;                                      int *sum,
;                                      unsigned int *sumsquared)
;
; Half-pixel vertical interpolated variance over 8-wide rows: each
; reference row is averaged with the row below it (pavgb) before
; differencing with the source row.
;
; NOTE(review): uses MMX registers (movdq2q) without emms; callers are
; presumably expected to clear MMX/x87 state afterwards — confirm.
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; sum-of-differences accumulator
    pxor        xmm7, xmm7              ; sum-of-squares accumulator
    mov         rsi, arg(0)             ; ref_ptr

    mov         rdi, arg(2)             ; src_ptr
    movsxd      rcx, dword ptr arg(4)   ; Height (row count)
    movsxd      rax, dword ptr arg(1)   ; ref_pixels_per_line (row-below offset)

    pxor        xmm0, xmm0              ; constant zero for unpacking
.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]     ; row i
    movq        xmm3, QWORD PTR [rsi+rax] ; row i+1

    pavgb       xmm5, xmm3              ; vertical half-pel average
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; source row
    punpcklbw   xmm3, xmm0

    psubw       xmm5, xmm3              ; predicted - source
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square + pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ; next reference row
    add         edi, dword ptr arg(3)   ; next source row
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    ; fold the two xmm halves together in MMX registers
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    ; widen signed word sums: unpack into high halves, psrad restores sign
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; final signed sum
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; final sum of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 | 629 | |
michael@0 | 630 | |
;-----------------------------------------------------------------------
; void vp9_half_horiz_variance8x_h_sse2(unsigned char *ref_ptr,
;                                       int ref_pixels_per_line,
;                                       unsigned char *src_ptr,
;                                       int src_pixels_per_line,
;                                       unsigned int Height,
;                                       int *sum,
;                                       unsigned int *sumsquared)
;
; Half-pixel horizontal interpolated variance over 8-wide rows: each
; reference row is averaged with its right neighbor (pavgb) before
; differencing with the source row.
;
; NOTE(review): uses MMX registers (movdq2q) without emms; callers are
; presumably expected to clear MMX/x87 state afterwards — confirm.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; sum-of-differences accumulator
    pxor        xmm7, xmm7              ; sum-of-squares accumulator
    mov         rsi, arg(0)             ; ref_ptr

    mov         rdi, arg(2)             ; src_ptr
    movsxd      rcx, dword ptr arg(4)   ; Height (row count)

    pxor        xmm0, xmm0              ; constant zero for unpacking
.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; s0..s7
    movq        xmm3, QWORD PTR [rsi+1] ; s1..s8

    pavgb       xmm5, xmm3              ; horizontal half-pel average
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; source row
    punpcklbw   xmm3, xmm0

    psubw       xmm5, xmm3              ; predicted - source
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square + pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ; next reference row
    add         edi, dword ptr arg(3)   ; next source row
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    ; fold the two xmm halves together in MMX registers
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    ; widen signed word sums: unpack into high halves, psrad restores sign
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; final signed sum
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; final sum of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret