media/libvpx/vp8/common/x86/variance_impl_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %define xmm_filter_shift 7
michael@0 15
;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
; Sum of squares of the 256 contiguous 16-bit values of a macroblock.
; In:  arg(0) = src_ptr (16-byte aligned; movdqa loads below require this)
; Out: rax    = sum of src[i]*src[i] for i in [0,256)
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations x 64 bytes = 512 bytes = 256 shorts
    pxor        xmm4, xmm4              ; xmm4 = four running dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]             ; load 4x8 shorts (aligned)
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, pairwise-add into dwords
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40               ; advance 64 bytes; leaves CF=0 ...
    dec         rcx                     ; ... and dec preserves CF, so the
    ja          .NEXTROW                ; 'ja' below behaves exactly like jnz

    ; horizontal add of the four dword partial sums in xmm4
    movdqa      xmm3,xmm4
    psrldq      xmm4,8
    paddd       xmm4,xmm3
    movdqa      xmm3,xmm4
    psrldq      xmm4,4
    paddd       xmm4,xmm3
    movq        rax,xmm4                ; result in eax (caller reads unsigned int)

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 72
michael@0 73
;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char  *  src_ptr,
;    int               source_stride,
;    unsigned char  *  ref_ptr,
;    int               recon_stride,
;    unsigned int   *  SSE,
;    int            *  Sum
;)
; 16x16 block: writes sum of (src-ref) differences to *Sum and the sum of
; squared differences to *SSE.  Uses unaligned loads, so no alignment is
; required of src_ptr/ref_ptr.
; Register roles in the main loop:
;   rsi/rdi = src/ref row pointers, rax/rdx = strides, rcx = row counter,
;   xmm0 = zero, xmm7 = word diff accumulator, xmm6 = dword SSE accumulator.
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

    mov         rsi,            arg(0) ;[src_ptr]
    mov         rdi,            arg(2) ;[ref_ptr]

    movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
    movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

    ; Prefetch the first 8 rows of each input (rcx = 3*stride for row 3)
    lea             rcx,    [rax+rax*2]
    prefetcht0      [rsi]
    prefetcht0      [rsi+rax]
    prefetcht0      [rsi+rax*2]
    prefetcht0      [rsi+rcx]
    lea             rbx,    [rsi+rax*4]
    prefetcht0      [rbx]
    prefetcht0      [rbx+rax]
    prefetcht0      [rbx+rax*2]
    prefetcht0      [rbx+rcx]

    lea             rcx,    [rdx+rdx*2]
    prefetcht0      [rdi]
    prefetcht0      [rdi+rdx]
    prefetcht0      [rdi+rdx*2]
    prefetcht0      [rdi+rcx]
    lea             rbx,    [rdi+rdx*4]
    prefetcht0      [rbx]
    prefetcht0      [rbx+rdx]
    prefetcht0      [rbx+rdx*2]
    prefetcht0      [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; 16 rows

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi] ; 16 src bytes
    movdqu      xmm2, XMMWORD PTR [rdi] ; 16 ref bytes

    prefetcht0      [rsi+rax*8]         ; stay 8 rows ahead
    prefetcht0      [rdi+rdx*8]

    movdqa      xmm3, xmm1              ; copies for the high halves
    movdqa      xmm4, xmm2

    ; widen bytes to words: xmm1/xmm2 = low 8, xmm3/xmm4 = high 8
    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2              ; per-pixel word differences
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1              ; accumulate diffs as words
    pmaddwd     xmm1, xmm1              ; square + pairwise add -> dwords

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1              ; accumulate SSE as dwords
    paddd       xmm6, xmm3

    add         rsi, rax                ; next src row
    add         rdi, rdx                ; next ref row

    sub         rcx, 1
    jnz         .var16loop

    ; Reduce the word diff accumulator (signed) and the dword SSE accumulator.
    movdqa      xmm1, xmm6              ; xmm1 = SSE dwords
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; words into the high half of each dword ...

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16                ; ... then arithmetic shift = sign-extend

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; xmm6 = four signed dword diff sums

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0              ; spread SSE dwords into qword lanes

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0              ; same spread for the diff sums

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8                 ; fold high qword onto low
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; xmm7 low dword = Sum
    paddd       xmm1, xmm2              ; xmm1 low dword = SSE

    mov         rax, arg(5) ;[Sum]
    mov         rdi, arg(4) ;[SSE]

    movd DWORD PTR [rax], xmm7
    movd DWORD PTR [rdi], xmm1

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 214
michael@0 215
michael@0 216
michael@0 217
;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char  *  src_ptr,
;    int               source_stride,
;    unsigned char  *  ref_ptr,
;    int               recon_stride,
;    unsigned int   *  SSE,
;    int            *  Sum
;)
; 8x8 block: writes sum of (src-ref) differences to *Sum and the sum of
; squared differences to *SSE.  The 8 rows are fully unrolled; rsi/rdi are
; re-based with lea every two rows so each row is reached as [ptr+stride]
; or [ptr+stride*2].
; Accumulators: xmm7 = word diff sums, xmm1 = dword SSE sums, xmm0 = zero.
global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT rbx
    push rsi
    push rdi
    sub  rsp, 16
    ; end prolog

    mov         rsi,            arg(0) ;[src_ptr]
    mov         rdi,            arg(2) ;[ref_ptr]

    movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
    movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; row 0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0              ; bytes -> words
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2              ; diff (saturating; cannot saturate for 8-bit inputs)
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1              ; xmm1 starts the SSE accumulator

    ; row 1
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 2
    movq        xmm2, QWORD PTR[rsi + rax * 2]
    movq        xmm3, QWORD PTR[rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; advance base pointers two rows; row 3 = [ptr+stride]
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 4
    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; advance two rows; row 5 = [ptr+stride]
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]


    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 6
    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; advance two rows; row 7 = [ptr+stride]
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Reduction.  The diff sums are folded with word adds (paddw); only the
    ; low 16 bits of the final lane are used and re-sign-extended below.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0              ; spread SSE dwords into qword lanes

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8                 ; fold high qword onto low
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5) ;[Sum]
    mov         rdi, arg(4) ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit diff sum to int

    mov  dword ptr [rax], ecx
    movd DWORD PTR [rdi], xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 402
;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
; Bilinear sub-pixel variance over an 8-wide column of Height rows.
; Applies the first-pass (horizontal, xoffset) and/or second-pass
; (vertical, yoffset) bilinear filter to ref, subtracts src, and
; accumulates *sum and *sumsquared.  Dispatches to one of four paths:
;   both offsets     -> filter_block2d_bil_var_sse2_loop (two-pass)
;   yoffset only     -> filter_block2d_bil_var_sse2_sp_only
;   xoffset only     -> filter_block2d_bil_var_sse2_fp_only
;   neither          -> filter_block2d_bil_var_sse2_full_pixel
; All paths join at filter_block2d_bil_variance for the final reduction.
; Loop register roles: xmm0 = zero, xmm4 = rounding constant, xmm5 = the
; previous filtered row (two-pass/sp paths), xmm6 = word diff accumulator,
; xmm7 = dword SSE accumulator; rax/rdx = HFilter/VFilter coeff pointers.
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT rbx
    push rsi
    push rdi
    push rbx
    ; end prolog

    pxor            xmm6,           xmm6                    ; diff accumulator
    pxor            xmm7,           xmm7                    ; SSE accumulator

    lea             rsi,            [GLOBAL(xmm_bi_rd)]     ; rounding
    movdqa          xmm4,           XMMWORD PTR [rsi]

    lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
    movsxd          rax,            dword ptr arg(5)        ; xoffset

    cmp             rax,            0                       ; skip first_pass filter if xoffset=0
    je              filter_block2d_bil_var_sse2_sp_only

    shl             rax,            5                       ; 32 bytes per filter pair
    lea             rax,            [rax + rcx]             ; HFilter

    movsxd          rdx,            dword ptr arg(6)        ; yoffset

    cmp             rdx,            0                       ; skip second_pass filter if yoffset=0
    je              filter_block2d_bil_var_sse2_fp_only

    shl             rdx,            5
    lea             rdx,            [rdx + rcx]             ; VFilter

    mov             rsi,            arg(0)                  ; ref_ptr
    mov             rdi,            arg(2)                  ; src_ptr
    movsxd          rcx,            dword ptr arg(4)        ; Height

    ; Prime xmm5 with the horizontally filtered row 0 for the vertical pass.
    pxor            xmm0,           xmm0
    movq            xmm1,           QWORD PTR [rsi]
    movq            xmm3,           QWORD PTR [rsi+1]

    punpcklbw       xmm1,           xmm0
    pmullw          xmm1,           [rax]                   ; * HFilter tap 0
    punpcklbw       xmm3,           xmm0
    pmullw          xmm3,           [rax+16]                ; * HFilter tap 1

    paddw           xmm1,           xmm3
    paddw           xmm1,           xmm4                    ; + rounding
    psraw           xmm1,           xmm_filter_shift
    movdqa          xmm5,           xmm1

    movsxd          rbx,            dword ptr arg(1)        ; ref_pixels_per_line
    lea             rsi,            [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd          r9,             dword ptr arg(3)        ; src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; first pass: horizontal bilinear filter of the current ref row
    movq            xmm1,           QWORD PTR [rsi]
    movq            xmm3,           QWORD PTR [rsi+1]

    punpcklbw       xmm1,           xmm0
    pmullw          xmm1,           [rax]
    punpcklbw       xmm3,           xmm0
    pmullw          xmm3,           [rax+16]

    paddw           xmm1,           xmm3
    paddw           xmm1,           xmm4
    psraw           xmm1,           xmm_filter_shift

    ; second pass: blend with the previous filtered row (kept in xmm5)
    movdqa          xmm3,           xmm5
    movdqa          xmm5,           xmm1                    ; save for next iteration

    pmullw          xmm3,           [rdx]                   ; * VFilter tap 0
    pmullw          xmm1,           [rdx+16]                ; * VFilter tap 1
    paddw           xmm1,           xmm3
    paddw           xmm1,           xmm4
    psraw           xmm1,           xmm_filter_shift

    ; subtract src and accumulate
    movq            xmm3,           QWORD PTR [rdi]
    punpcklbw       xmm3,           xmm0

    psubw           xmm1,           xmm3
    paddw           xmm6,           xmm1

    pmaddwd         xmm1,           xmm1
    paddd           xmm7,           xmm1

    lea             rsi,            [rsi + rbx]             ; ref_pixels_per_line
%if ABI_IS_32BIT
    add             rdi,            dword ptr arg(3)        ; src_pixels_per_line
%else
    lea             rdi,            [rdi + r9]
%endif

    sub             rcx,            1
    jnz             filter_block2d_bil_var_sse2_loop

    jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    ; xoffset == 0: vertical-only path (rcx still holds the filter table base)
    movsxd          rdx,            dword ptr arg(6)        ; yoffset

    cmp             rdx,            0                       ; skip all if both xoffset=0 and yoffset=0
    je              filter_block2d_bil_var_sse2_full_pixel

    shl             rdx,            5
    lea             rdx,            [rdx + rcx]             ; VFilter

    mov             rsi,            arg(0)                  ; ref_ptr
    mov             rdi,            arg(2)                  ; src_ptr
    movsxd          rcx,            dword ptr arg(4)        ; Height
    movsxd          rax,            dword ptr arg(1)        ; ref_pixels_per_line

    ; prime xmm1 with row 0 (unfiltered; vertical blend starts at row 1)
    pxor            xmm0,           xmm0
    movq            xmm1,           QWORD PTR [rsi]
    punpcklbw       xmm1,           xmm0

    movsxd          rbx,            dword ptr arg(3)        ; src_pixels_per_line
    lea             rsi,            [rsi + rax]

filter_block2d_bil_sp_only_loop:
    movq            xmm3,           QWORD PTR [rsi]
    punpcklbw       xmm3,           xmm0
    movdqa          xmm5,           xmm3                    ; save raw row for next iteration

    pmullw          xmm1,           [rdx]                   ; prev row * VFilter tap 0
    pmullw          xmm3,           [rdx+16]                ; curr row * VFilter tap 1
    paddw           xmm1,           xmm3
    paddw           xmm1,           xmm4
    psraw           xmm1,           xmm_filter_shift

    movq            xmm3,           QWORD PTR [rdi]
    punpcklbw       xmm3,           xmm0

    psubw           xmm1,           xmm3
    paddw           xmm6,           xmm1

    pmaddwd         xmm1,           xmm1
    paddd           xmm7,           xmm1

    movdqa          xmm1,           xmm5                    ; curr row becomes prev
    lea             rsi,            [rsi + rax]             ; ref_pixels_per_line
    lea             rdi,            [rdi + rbx]             ; src_pixels_per_line

    sub             rcx,            1
    jnz             filter_block2d_bil_sp_only_loop

    jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    ; no filtering at all: plain (ref - src) variance
    mov             rsi,            arg(0)                  ; ref_ptr
    mov             rdi,            arg(2)                  ; src_ptr
    movsxd          rcx,            dword ptr arg(4)        ; Height
    movsxd          rax,            dword ptr arg(1)        ; ref_pixels_per_line
    movsxd          rbx,            dword ptr arg(3)        ; src_pixels_per_line
    pxor            xmm0,           xmm0

filter_block2d_bil_full_pixel_loop:
    movq            xmm1,           QWORD PTR [rsi]
    punpcklbw       xmm1,           xmm0

    movq            xmm2,           QWORD PTR [rdi]
    punpcklbw       xmm2,           xmm0

    psubw           xmm1,           xmm2
    paddw           xmm6,           xmm1

    pmaddwd         xmm1,           xmm1
    paddd           xmm7,           xmm1

    lea             rsi,            [rsi + rax]             ; ref_pixels_per_line
    lea             rdi,            [rdi + rbx]             ; src_pixels_per_line

    sub             rcx,            1
    jnz             filter_block2d_bil_full_pixel_loop

    jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; yoffset == 0: horizontal-only path (rax already points at HFilter)
    mov             rsi,            arg(0)                  ; ref_ptr
    mov             rdi,            arg(2)                  ; src_ptr
    movsxd          rcx,            dword ptr arg(4)        ; Height
    movsxd          rdx,            dword ptr arg(1)        ; ref_pixels_per_line

    pxor            xmm0,           xmm0
    movsxd          rbx,            dword ptr arg(3)        ; src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq            xmm1,           QWORD PTR [rsi]
    movq            xmm3,           QWORD PTR [rsi+1]

    punpcklbw       xmm1,           xmm0
    pmullw          xmm1,           [rax]
    punpcklbw       xmm3,           xmm0
    pmullw          xmm3,           [rax+16]

    paddw           xmm1,           xmm3
    paddw           xmm1,           xmm4
    psraw           xmm1,           xmm_filter_shift

    movq            xmm3,           QWORD PTR [rdi]
    punpcklbw       xmm3,           xmm0

    psubw           xmm1,           xmm3
    paddw           xmm6,           xmm1

    pmaddwd         xmm1,           xmm1
    paddd           xmm7,           xmm1
    lea             rsi,            [rsi + rdx]
    lea             rdi,            [rdi + rbx]             ; src_pixels_per_line

    sub             rcx,            1
    jnz             filter_block2d_bil_fp_only_loop

    jmp             filter_block2d_bil_variance

filter_block2d_bil_variance:
    ; Final reduction via MMX: fold the 128-bit accumulators to 64 bits,
    ; sign-extend the word diff sums, and store the two scalar results.
    movdq2q         mm6,            xmm6
    movdq2q         mm7,            xmm7

    psrldq          xmm6,           8
    psrldq          xmm7,           8

    movdq2q         mm2,            xmm6
    movdq2q         mm3,            xmm7

    paddw           mm6,            mm2
    paddd           mm7,            mm3

    pxor            mm3,            mm3
    pxor            mm2,            mm2

    punpcklwd       mm2,            mm6                     ; words -> high halves of dwords ...
    punpckhwd       mm3,            mm6

    paddd           mm2,            mm3
    movq            mm6,            mm2

    psrlq           mm6,            32
    paddd           mm2,            mm6

    psrad           mm2,            16                      ; ... psrad = sign-extension
    movq            mm4,            mm7

    psrlq           mm4,            32
    paddd           mm4,            mm7

    mov             rsi,            arg(7)                  ; sum
    mov             rdi,            arg(8)                  ; sumsquared

    movd            [rsi],          mm2                     ; xsum
    movd            [rdi],          mm4                     ; xxsum

    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 681
michael@0 682
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; Half-pixel (xoffset = yoffset = 4) variance, 8 pixels wide x Height rows:
; each ref pixel is the pavgb of a horizontal pair, then the pavgb of that
; with the previous row's horizontal average.  Diffs vs src accumulate into
; xmm6 (sum, words) and xmm7 (SSE, dwords).
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8,     dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9,     dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor            xmm6,   xmm6                ; error accumulator
    pxor            xmm7,   xmm7                ; sse accumulator
    mov             rsi,    arg(0)              ;ref_ptr

    mov             rdi,    arg(2)              ;src_ptr
    movsxd          rcx,    dword ptr arg(4)    ;Height
    movsxd          rax,    dword ptr arg(1)    ;ref_pixels_per_line

    pxor            xmm0,   xmm0                ; zero, for byte->word unpack

    movq            xmm5,   QWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s8
    movq            xmm3,   QWORD PTR [rsi+1]   ; xmm3 = s1,s2,s3..s9
    pavgb           xmm5,   xmm3                ; xmm5 = horizontal average, line 0

%if ABI_IS_32BIT
    add             rsi,    dword ptr arg(1)    ;ref_pixels_per_line ; next source
%else
    add             rsi,    r8
%endif

vp8_half_horiz_vert_variance8x_h_1:

    movq            xmm1,   QWORD PTR [rsi]
    movq            xmm2,   QWORD PTR [rsi+1]
    pavgb           xmm1,   xmm2                ; horizontal average, line i+1

    pavgb           xmm5,   xmm1                ; vertical average of the above
    punpcklbw       xmm5,   xmm0                ; widen to words

    movq            xmm3,   QWORD PTR [rdi]     ; xmm3 = d0,d1,d2..d7
    punpcklbw       xmm3,   xmm0                ; widen to words

    psubw           xmm5,   xmm3                ; xmm5 -= xmm3
    paddw           xmm6,   xmm5                ; accumulate column differences
    pmaddwd         xmm5,   xmm5                ; square + pairwise add
    paddd           xmm7,   xmm5                ; accumulate squared differences

    movdqa          xmm5,   xmm1                ; save line i+1's horizontal avg for next row

%if ABI_IS_32BIT
    add             esi,    dword ptr arg(1)    ;ref_pixels_per_line ; next source
    add             edi,    dword ptr arg(3)    ;src_pixels_per_line ; next destination
%else
    add             rsi,    r8
    add             rdi,    r9
%endif

    sub             rcx,    1
    jnz             vp8_half_horiz_vert_variance8x_h_1

    ; MMX reduction: fold accumulators, sign-extend the word diff sums
    movdq2q         mm6,    xmm6
    movdq2q         mm7,    xmm7

    psrldq          xmm6,   8
    psrldq          xmm7,   8

    movdq2q         mm2,    xmm6
    movdq2q         mm3,    xmm7

    paddw           mm6,    mm2
    paddd           mm7,    mm3

    pxor            mm3,    mm3
    pxor            mm2,    mm2

    punpcklwd       mm2,    mm6                 ; words -> high halves of dwords ...
    punpckhwd       mm3,    mm6

    paddd           mm2,    mm3
    movq            mm6,    mm2

    psrlq           mm6,    32
    paddd           mm2,    mm6

    psrad           mm2,    16                  ; ... psrad = sign-extension
    movq            mm4,    mm7

    psrlq           mm4,    32
    paddd           mm4,    mm7

    mov             rsi,    arg(5)              ; sum
    mov             rdi,    arg(6)              ; sumsquared

    movd            [rsi],  mm2
    movd            [rdi],  mm4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 804
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; 16-wide variant of the half-horizontal+vertical variance: pavgb of each
; horizontal pair, then pavgb with the previous row's result, 16 pixels per
; row.  Low/high byte halves are widened separately (xmm5/xmm4).
; Accumulators: xmm6 = word diff sums, xmm7 = dword SSE sums.
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

    pxor            xmm6,   xmm6                ; error accumulator
    pxor            xmm7,   xmm7                ; sse accumulator
    mov             rsi,    arg(0)              ;ref_ptr

    mov             rdi,    arg(2)              ;src_ptr
    movsxd          rcx,    dword ptr arg(4)    ;Height
    movsxd          rax,    dword ptr arg(1)    ;ref_pixels_per_line
    movsxd          rdx,    dword ptr arg(3)    ;src_pixels_per_line

    pxor            xmm0,   xmm0                ; zero, for byte->word unpack

    movdqu          xmm5,   XMMWORD PTR [rsi]
    movdqu          xmm3,   XMMWORD PTR [rsi+1]
    pavgb           xmm5,   xmm3                ; horizontal average, line 0

    lea             rsi,    [rsi + rax]

vp8_half_horiz_vert_variance16x_h_1:
    movdqu          xmm1,   XMMWORD PTR [rsi]
    movdqu          xmm2,   XMMWORD PTR [rsi+1]
    pavgb           xmm1,   xmm2                ; horizontal average, line i+1

    pavgb           xmm5,   xmm1                ; vertical average of the above

    movdqa          xmm4,   xmm5
    punpcklbw       xmm5,   xmm0                ; low 8 pixels -> words
    punpckhbw       xmm4,   xmm0                ; high 8 pixels -> words

    movq            xmm3,   QWORD PTR [rdi]     ; xmm3 = d0,d1,d2..d7
    punpcklbw       xmm3,   xmm0
    psubw           xmm5,   xmm3                ; low-half diff

    movq            xmm3,   QWORD PTR [rdi+8]   ; d8..d15
    punpcklbw       xmm3,   xmm0
    psubw           xmm4,   xmm3                ; high-half diff

    paddw           xmm6,   xmm5                ; accumulate column differences
    paddw           xmm6,   xmm4
    pmaddwd         xmm5,   xmm5                ; square + pairwise add
    pmaddwd         xmm4,   xmm4
    paddd           xmm7,   xmm5                ; accumulate squared differences
    paddd           xmm7,   xmm4

    movdqa          xmm5,   xmm1                ; save line i+1's horizontal avg for next row

    lea             rsi,    [rsi + rax]
    lea             rdi,    [rdi + rdx]

    sub             rcx,    1
    jnz             vp8_half_horiz_vert_variance16x_h_1

    ; SSE2-only reduction (no MMX here): sign-extend word diff sums via
    ; punpck + psrad, then fold both accumulators to scalars.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6                      ; words into high halves of dwords ...
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                        ; ... arithmetic shift = sign-extend
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7                      ; low dword = SSE
    paddd       xmm0, xmm1                      ; low dword = Sum

    mov         rsi, arg(5) ;[Sum]
    mov         rdi, arg(6) ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 919
michael@0 920
;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; Half-pixel vertical-only (yoffset = 4) variance, 8 pixels wide x Height
; rows: each filtered ref pixel is the pavgb of a pixel and the pixel one
; row below.  Diffs vs src accumulate into xmm6 (sum, words) and xmm7
; (SSE, dwords).
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
    push rbp
    mov  rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8,     dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9,     dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor            xmm6,   xmm6                ; error accumulator
    pxor            xmm7,   xmm7                ; sse accumulator
    mov             rsi,    arg(0)              ;ref_ptr

    mov             rdi,    arg(2)              ;src_ptr
    movsxd          rcx,    dword ptr arg(4)    ;Height
    movsxd          rax,    dword ptr arg(1)    ;ref_pixels_per_line

    pxor            xmm0,   xmm0                ; zero, for byte->word unpack
vp8_half_vert_variance8x_h_1:
    movq            xmm5,   QWORD PTR [rsi]     ; current row: s0,s1,s2..s7
    movq            xmm3,   QWORD PTR [rsi+rax] ; next row (one line below)

    pavgb           xmm5,   xmm3                ; vertical average of the two rows
    punpcklbw       xmm5,   xmm0                ; widen to words

    movq            xmm3,   QWORD PTR [rdi]     ; xmm3 = d0,d1,d2..d7
    punpcklbw       xmm3,   xmm0                ; widen to words

    psubw           xmm5,   xmm3                ; xmm5 -= xmm3
    paddw           xmm6,   xmm5                ; accumulate column differences
    pmaddwd         xmm5,   xmm5                ; square + pairwise add
    paddd           xmm7,   xmm5                ; accumulate squared differences

%if ABI_IS_32BIT
    add             esi,    dword ptr arg(1)    ;ref_pixels_per_line ; next source
    add             edi,    dword ptr arg(3)    ;src_pixels_per_line ; next destination
%else
    add             rsi,    r8
    add             rdi,    r9
%endif

    sub             rcx,    1
    jnz             vp8_half_vert_variance8x_h_1

    ; MMX reduction: fold accumulators, sign-extend the word diff sums
    movdq2q         mm6,    xmm6
    movdq2q         mm7,    xmm7

    psrldq          xmm6,   8
    psrldq          xmm7,   8

    movdq2q         mm2,    xmm6
    movdq2q         mm3,    xmm7

    paddw           mm6,    mm2
    paddd           mm7,    mm3

    pxor            mm3,    mm3
    pxor            mm2,    mm2

    punpcklwd       mm2,    mm6                 ; words -> high halves of dwords ...
    punpckhwd       mm3,    mm6

    paddd           mm2,    mm3
    movq            mm6,    mm2

    psrlq           mm6,    32
    paddd           mm2,    mm6

    psrad           mm2,    16                  ; ... psrad = sign-extension
    movq            mm4,    mm7

    psrlq           mm4,    32
    paddd           mm4,    mm7

    mov             rsi,    arg(5)              ; sum
    mov             rdi,    arg(6)              ; sumsquared

    movd            [rsi],  mm2
    movd            [rdi],  mm4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
michael@0 1027
michael@0 1028 ;void vp8_half_vert_variance16x_h_sse2
michael@0 1029 ;(
michael@0 1030 ; unsigned char *ref_ptr,
michael@0 1031 ; int ref_pixels_per_line,
michael@0 1032 ; unsigned char *src_ptr,
michael@0 1033 ; int src_pixels_per_line,
michael@0 1034 ; unsigned int Height,
michael@0 1035 ; int *sum,
michael@0 1036 ; unsigned int *sumsquared
michael@0 1037 ;)
michael@0 1038 global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
michael@0 1039 sym(vp8_half_vert_variance16x_h_sse2):
michael@0 1040 push rbp
michael@0 1041 mov rbp, rsp
michael@0 1042 SHADOW_ARGS_TO_STACK 7
michael@0 1043 SAVE_XMM 7
michael@0 1044 GET_GOT rbx
michael@0 1045 push rsi
michael@0 1046 push rdi
michael@0 1047 ; end prolog
michael@0 1048
michael@0 1049 pxor xmm6, xmm6 ; error accumulator
michael@0 1050 pxor xmm7, xmm7 ; sse eaccumulator
michael@0 1051 mov rsi, arg(0) ;ref_ptr
michael@0 1052
michael@0 1053 mov rdi, arg(2) ;src_ptr
michael@0 1054 movsxd rcx, dword ptr arg(4) ;Height
michael@0 1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
michael@0 1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
michael@0 1057
michael@0 1058 movdqu xmm5, XMMWORD PTR [rsi]
michael@0 1059 lea rsi, [rsi + rax ]
michael@0 1060 pxor xmm0, xmm0
michael@0 1061
michael@0 1062 vp8_half_vert_variance16x_h_1:
michael@0 1063 movdqu xmm3, XMMWORD PTR [rsi]
michael@0 1064
michael@0 1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
michael@0 1066 movdqa xmm4, xmm5
michael@0 1067 punpcklbw xmm5, xmm0
michael@0 1068 punpckhbw xmm4, xmm0
michael@0 1069
michael@0 1070 movq xmm2, QWORD PTR [rdi]
michael@0 1071 punpcklbw xmm2, xmm0
michael@0 1072 psubw xmm5, xmm2
michael@0 1073 movq xmm2, QWORD PTR [rdi+8]
michael@0 1074 punpcklbw xmm2, xmm0
michael@0 1075 psubw xmm4, xmm2
michael@0 1076
michael@0 1077 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
michael@0 1078 paddw xmm6, xmm4
michael@0 1079 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
michael@0 1080 pmaddwd xmm4, xmm4
michael@0 1081 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
michael@0 1082 paddd xmm7, xmm4
michael@0 1083
michael@0 1084 movdqa xmm5, xmm3
michael@0 1085
michael@0 1086 lea rsi, [rsi + rax]
michael@0 1087 lea rdi, [rdi + rdx]
michael@0 1088
michael@0 1089 sub rcx, 1
michael@0 1090 jnz vp8_half_vert_variance16x_h_1
michael@0 1091
michael@0 1092 pxor xmm1, xmm1
michael@0 1093 pxor xmm5, xmm5
michael@0 1094
michael@0 1095 punpcklwd xmm0, xmm6
michael@0 1096 punpckhwd xmm1, xmm6
michael@0 1097 psrad xmm0, 16
michael@0 1098 psrad xmm1, 16
michael@0 1099 paddd xmm0, xmm1
michael@0 1100 movdqa xmm1, xmm0
michael@0 1101
michael@0 1102 movdqa xmm6, xmm7
michael@0 1103 punpckldq xmm6, xmm5
michael@0 1104 punpckhdq xmm7, xmm5
michael@0 1105 paddd xmm6, xmm7
michael@0 1106
michael@0 1107 punpckldq xmm0, xmm5
michael@0 1108 punpckhdq xmm1, xmm5
michael@0 1109 paddd xmm0, xmm1
michael@0 1110
michael@0 1111 movdqa xmm7, xmm6
michael@0 1112 movdqa xmm1, xmm0
michael@0 1113
michael@0 1114 psrldq xmm7, 8
michael@0 1115 psrldq xmm1, 8
michael@0 1116
michael@0 1117 paddd xmm6, xmm7
michael@0 1118 paddd xmm0, xmm1
michael@0 1119
michael@0 1120 mov rsi, arg(5) ;[Sum]
michael@0 1121 mov rdi, arg(6) ;[SSE]
michael@0 1122
michael@0 1123 movd [rsi], xmm0
michael@0 1124 movd [rdi], xmm6
michael@0 1125
michael@0 1126 ; begin epilog
michael@0 1127 pop rdi
michael@0 1128 pop rsi
michael@0 1129 RESTORE_GOT
michael@0 1130 RESTORE_XMM
michael@0 1131 UNSHADOW_ARGS
michael@0 1132 pop rbp
michael@0 1133 ret
michael@0 1134
michael@0 1135
;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;-----------------------------------------------------------------------
; For an 8-pixel-wide block of Height rows, forms the half-pel
; horizontally interpolated reference (pavgb of each ref pixel with its
; right neighbour, via an unaligned load at [rsi+1]) and accumulates,
; against src, the signed difference total into *sum and the sum of
; squared differences into *sumsquared.
; The final reduction runs in MMX registers (movdq2q).
; NOTE(review): no emms is executed before ret; presumably callers are
; expected to clear MMX state (vp8_clear_system_state) — TODO confirm.
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    ; 64-bit: keep both strides in spare volatile regs for the loop
    movsxd      r8, dword ptr arg(1)            ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6                      ; diff accumulator (8 x int16)
    pxor        xmm7, xmm7                      ; sse accumulator (4 x int32)
    mov         rsi, arg(0)                     ; ref_ptr

    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height (loop counter)

    pxor        xmm0, xmm0                      ; constant zero for unpacks
vp8_half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]           ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1]         ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3                      ; half-pel horizontal average
    punpcklbw   xmm5, xmm0                      ; bytes -> words

    movq        xmm3, QWORD PTR [rdi]           ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0                      ; bytes -> words

    psubw       xmm5, xmm3                      ; per-pixel difference
    paddw       xmm6, xmm5                      ; accumulate column differences
    pmaddwd     xmm5, xmm5                      ; square + pairwise add
    paddd       xmm7, xmm5                      ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)           ; advance to next ref row
    add         edi, dword ptr arg(3)           ; advance to next src row
%else
    add         rsi, r8                         ; advance to next ref row
    add         rdi, r9                         ; advance to next src row
%endif
    sub         rcx, 1
    jnz         vp8_half_horiz_variance8x_h_1

    ; ---- reduce accumulators in MMX registers ----
    movdq2q     mm6, xmm6                       ; low halves
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8                         ; bring high halves down
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6                       ; high halves
    movdq2q     mm3, xmm7

    paddw       mm6, mm2                        ; fold halves together
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                        ; words -> high halves of dwords
    punpckhwd   mm3, mm6                        ; (sign restored by psrad below)

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32                         ; fold upper dword onto lower
    paddd       mm2, mm6

    psrad       mm2, 16                         ; arithmetic shift: signed sum
    movq        mm4, mm7

    psrlq       mm4, 32                         ; fold sse dwords
    paddd       mm4, mm7

    mov         rsi, arg(5)                     ; sum
    mov         rdi, arg(6)                     ; sumsquared

    movd        [rsi], mm2                      ; *sum = total diff
    movd        [rdi], mm4                      ; *sumsquared = total sse


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 1240
;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;-----------------------------------------------------------------------
; 16-pixel-wide variant of the half-pel horizontal variance kernel:
; averages each ref pixel with its right neighbour (unaligned movdqu at
; [rsi+1]), accumulates signed differences vs. src as 16-bit words in
; xmm6 and squared differences as 32-bit dwords in xmm7, then reduces
; horizontally and stores the totals to *sum and *sumsquared.
; SSE2 only; no MMX state is touched here.
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                      ; diff accumulator (8 x int16)
    pxor        xmm7, xmm7                      ; sse accumulator (4 x int32)
    mov         rsi, arg(0)                     ; ref_ptr

    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height (loop counter)
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line

    pxor        xmm0, xmm0                      ; constant zero for unpacks

vp8_half_horiz_variance16x_h_1:
    movdqu      xmm5, XMMWORD PTR [rsi]         ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]       ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3                      ; half-pel horizontal average
    movdqa      xmm1, xmm5                      ; keep copy for high 8 pixels
    punpcklbw   xmm5, xmm0                      ; low 8 avg pixels -> words
    punpckhbw   xmm1, xmm0                      ; high 8 avg pixels -> words

    movq        xmm3, QWORD PTR [rdi]           ; src pixels 0..7
    punpcklbw   xmm3, xmm0                      ; -> words
    movq        xmm2, QWORD PTR [rdi+8]         ; src pixels 8..15
    punpcklbw   xmm2, xmm0                      ; -> words

    psubw       xmm5, xmm3                      ; low diffs
    psubw       xmm1, xmm2                      ; high diffs
    paddw       xmm6, xmm5                      ; accumulate column differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5                      ; square + pairwise add
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5                      ; accumulate squared differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]                ; next ref row
    lea         rdi, [rdi + rdx]                ; next src row

    sub         rcx, 1
    jnz         vp8_half_horiz_variance16x_h_1

    ; ---- horizontal reduction of the two accumulators ----
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5                      ; zero for dword unpacks

    punpcklwd   xmm0, xmm6                      ; words -> high halves of dwords
    punpckhwd   xmm1, xmm6                      ; (xmm0 is still zero here)
    psrad       xmm0, 16                        ; arithmetic shift restores sign
    psrad       xmm1, 16
    paddd       xmm0, xmm1                      ; xmm0 = 4 partial diff sums
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7                      ; widen sse dwords
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5                      ; same widening for diff sums
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8                         ; fold upper half onto lower
    psrldq      xmm1, 8

    paddd       xmm6, xmm7                      ; xmm6 low dword = total sse
    paddd       xmm0, xmm1                      ; xmm0 low dword = total sum

    mov         rsi, arg(5)                     ; [Sum]
    mov         rdi, arg(6)                     ; [SSE]

    movd        [rsi], xmm0                     ; *sum = total diff
    movd        [rdi], xmm6                     ; *sumsquared = total sse

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 1344
SECTION_RODATA
; Rounding constant for the bilinear filter stage:
; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
; 64 = half of 2^xmm_filter_shift (128), added before the >> 7.
align 16
xmm_bi_rd:
    times 8 dw 64
align 16
; Bilinear filter tap table, one 32-word row per eighth-pel offset
; 0..7.  Each row holds 8 copies of the first tap followed by 8 copies
; of the second tap; the two taps always sum to 128, i.e. 1.0 in the
; Q7 fixed-point format implied by xmm_filter_shift (7).
vp8_bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0
    dw 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16
    dw  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32
    dw  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48
    dw  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64
    dw  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80
    dw  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96
    dw  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112

mercurial