media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ;unsigned int vp9_get_mb_ss_sse2
michael@0 15 ;(
michael@0 16 ; short *src_ptr
michael@0 17 ;)
; Sum of squares of a 16x16 block of 16-bit residuals (256 int16 values,
; 512 bytes, assumed 16-byte aligned — movdqa loads).  Result is returned
; as an unsigned int in eax.
michael@0 18 global sym(vp9_get_mb_ss_sse2) PRIVATE
michael@0 19 sym(vp9_get_mb_ss_sse2):
michael@0 20 push rbp
michael@0 21 mov rbp, rsp
michael@0 22 SHADOW_ARGS_TO_STACK 1
michael@0 23 GET_GOT rbx
michael@0 24 push rsi
michael@0 25 push rdi
michael@0 26 sub rsp, 16 ; scratch/alignment; released in epilog
michael@0 27 ; end prolog
michael@0 28
michael@0 29
michael@0 30 mov rax, arg(0) ;[src_ptr]
michael@0 31 mov rcx, 8 ; 8 iterations x 64 bytes = 512 bytes = 256 shorts
michael@0 32 pxor xmm4, xmm4 ; xmm4 = dword sum-of-squares accumulator
michael@0 33
michael@0 34 .NEXTROW:
; Each pass squares 32 shorts: pmaddwd squares adjacent word pairs and
; adds them, producing 4 dwords per register.
michael@0 35 movdqa xmm0, [rax]
michael@0 36 movdqa xmm1, [rax+16]
michael@0 37 movdqa xmm2, [rax+32]
michael@0 38 movdqa xmm3, [rax+48]
michael@0 39 pmaddwd xmm0, xmm0
michael@0 40 pmaddwd xmm1, xmm1
michael@0 41 pmaddwd xmm2, xmm2
michael@0 42 pmaddwd xmm3, xmm3
michael@0 43
michael@0 44 paddd xmm0, xmm1
michael@0 45 paddd xmm2, xmm3
michael@0 46 paddd xmm4, xmm0
michael@0 47 paddd xmm4, xmm2
michael@0 48
michael@0 49 add rax, 0x40
; NOTE(review): `ja` after `dec` is unconventional — dec does not write CF,
; so CF here comes from the preceding `add rax, 0x40` (never set for valid
; pointers) and the branch is effectively `jnz`.  `jnz` would be clearer.
michael@0 50 dec rcx
michael@0 51 ja .NEXTROW
michael@0 52
; Horizontal fold of the 4 dword partial sums down to the low dword.
michael@0 53 movdqa xmm3,xmm4
michael@0 54 psrldq xmm4,8
michael@0 55 paddd xmm4,xmm3
michael@0 56 movdqa xmm3,xmm4
michael@0 57 psrldq xmm4,4
michael@0 58 paddd xmm4,xmm3
; movq copies 64 bits: bits 32..63 of rax hold fold residue, but the
; function returns unsigned int so callers read only eax.
michael@0 59 movq rax,xmm4
michael@0 60
michael@0 61
michael@0 62 ; begin epilog
michael@0 63 add rsp, 16
michael@0 64 pop rdi
michael@0 65 pop rsi
michael@0 66 RESTORE_GOT
michael@0 67 UNSHADOW_ARGS
michael@0 68 pop rbp
michael@0 69 ret
michael@0 70
michael@0 71
michael@0 72 ;unsigned int vp9_get16x16var_sse2
michael@0 73 ;(
michael@0 74 ; unsigned char * src_ptr,
michael@0 75 ; int source_stride,
michael@0 76 ; unsigned char * ref_ptr,
michael@0 77 ; int recon_stride,
michael@0 78 ; unsigned int * SSE,
michael@0 79 ; int * Sum
michael@0 80 ;)
; Computes, over a 16x16 block, the sum of (src - ref) differences (*Sum)
; and the sum of squared differences (*SSE).  Unaligned loads (movdqu)
; are used, so src/ref need no alignment.
michael@0 81 global sym(vp9_get16x16var_sse2) PRIVATE
michael@0 82 sym(vp9_get16x16var_sse2):
michael@0 83 push rbp
michael@0 84 mov rbp, rsp
michael@0 85 SHADOW_ARGS_TO_STACK 6
michael@0 86 SAVE_XMM 7
michael@0 87 push rbx
michael@0 88 push rsi
michael@0 89 push rdi
michael@0 90 ; end prolog
michael@0 91
michael@0 92 mov rsi, arg(0) ;[src_ptr]
michael@0 93 mov rdi, arg(2) ;[ref_ptr]
michael@0 94
michael@0 95 movsxd rax, DWORD PTR arg(1) ;[source_stride]
michael@0 96 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
michael@0 97
michael@0 98 ; Prefetch data
; Warm the first 8 rows of both source and reference into cache.
michael@0 99 lea rcx, [rax+rax*2] ; rcx = 3 * source_stride
michael@0 100 prefetcht0 [rsi]
michael@0 101 prefetcht0 [rsi+rax]
michael@0 102 prefetcht0 [rsi+rax*2]
michael@0 103 prefetcht0 [rsi+rcx]
michael@0 104 lea rbx, [rsi+rax*4]
michael@0 105 prefetcht0 [rbx]
michael@0 106 prefetcht0 [rbx+rax]
michael@0 107 prefetcht0 [rbx+rax*2]
michael@0 108 prefetcht0 [rbx+rcx]
michael@0 109
michael@0 110 lea rcx, [rdx+rdx*2] ; rcx = 3 * recon_stride
michael@0 111 prefetcht0 [rdi]
michael@0 112 prefetcht0 [rdi+rdx]
michael@0 113 prefetcht0 [rdi+rdx*2]
michael@0 114 prefetcht0 [rdi+rcx]
michael@0 115 lea rbx, [rdi+rdx*4]
michael@0 116 prefetcht0 [rbx]
michael@0 117 prefetcht0 [rbx+rdx]
michael@0 118 prefetcht0 [rbx+rdx*2]
michael@0 119 prefetcht0 [rbx+rcx]
michael@0 120
michael@0 121 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 122 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
michael@0 123
michael@0 124 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
michael@0 125 mov rcx, 16 ; 16 rows
michael@0 126
; Per row: widen 16 bytes of src and ref to words, subtract, accumulate
; signed word diffs into xmm7 and pmaddwd squares into xmm6 (dwords).
; Overflow headroom: each word lane of xmm7 sums 32 diffs of |d|<=255,
; max |8160|, well within int16.
michael@0 127 .var16loop:
michael@0 128 movdqu xmm1, XMMWORD PTR [rsi]
michael@0 129 movdqu xmm2, XMMWORD PTR [rdi]
michael@0 130
michael@0 131 prefetcht0 [rsi+rax*8] ; stay 8 rows ahead of the loads
michael@0 132 prefetcht0 [rdi+rdx*8]
michael@0 133
michael@0 134 movdqa xmm3, xmm1
michael@0 135 movdqa xmm4, xmm2
michael@0 136
michael@0 137
michael@0 138 punpcklbw xmm1, xmm0 ; low 8 src bytes -> words
michael@0 139 punpckhbw xmm3, xmm0 ; high 8 src bytes -> words
michael@0 140
michael@0 141 punpcklbw xmm2, xmm0
michael@0 142 punpckhbw xmm4, xmm0
michael@0 143
michael@0 144
michael@0 145 psubw xmm1, xmm2
michael@0 146 psubw xmm3, xmm4
michael@0 147
michael@0 148 paddw xmm7, xmm1
michael@0 149 pmaddwd xmm1, xmm1
michael@0 150
michael@0 151 paddw xmm7, xmm3
michael@0 152 pmaddwd xmm3, xmm3
michael@0 153
michael@0 154 paddd xmm6, xmm1
michael@0 155 paddd xmm6, xmm3
michael@0 156
michael@0 157 add rsi, rax
michael@0 158 add rdi, rdx
michael@0 159
michael@0 160 sub rcx, 1
michael@0 161 jnz .var16loop
michael@0 162
michael@0 163
michael@0 164 movdqa xmm1, xmm6 ; xmm1 = SSE dword partials
michael@0 165 pxor xmm6, xmm6
michael@0 166
; Sign-extend the 8 signed word sums in xmm7 to dwords: unpack them into
; the HIGH half of each dword (low half zero), then psrad 16 shifts the
; word down while replicating its sign bit.
michael@0 167 pxor xmm5, xmm5
michael@0 168 punpcklwd xmm6, xmm7
michael@0 169
michael@0 170 punpckhwd xmm5, xmm7
michael@0 171 psrad xmm5, 16
michael@0 172
michael@0 173 psrad xmm6, 16
michael@0 174 paddd xmm6, xmm5 ; xmm6 = 4 dword Sum partials
michael@0 175
; Fold both accumulators 4 dwords -> 2 -> 1 (xmm0 is still zero here).
michael@0 176 movdqa xmm2, xmm1
michael@0 177 punpckldq xmm1, xmm0
michael@0 178
michael@0 179 punpckhdq xmm2, xmm0
michael@0 180 movdqa xmm7, xmm6
michael@0 181
michael@0 182 paddd xmm1, xmm2
michael@0 183 punpckldq xmm6, xmm0
michael@0 184
michael@0 185 punpckhdq xmm7, xmm0
michael@0 186 paddd xmm6, xmm7
michael@0 187
michael@0 188 movdqa xmm2, xmm1
michael@0 189 movdqa xmm7, xmm6
michael@0 190
michael@0 191 psrldq xmm1, 8
michael@0 192 psrldq xmm6, 8
michael@0 193
michael@0 194 paddd xmm7, xmm6
michael@0 195 paddd xmm1, xmm2
michael@0 196
michael@0 197 mov rax, arg(5) ;[Sum]
michael@0 198 mov rdi, arg(4) ;[SSE]
michael@0 199
michael@0 200 movd DWORD PTR [rax], xmm7
michael@0 201 movd DWORD PTR [rdi], xmm1
michael@0 202
michael@0 203
michael@0 204 ; begin epilog
michael@0 205 pop rdi
michael@0 206 pop rsi
michael@0 207 pop rbx
michael@0 208 RESTORE_XMM
michael@0 209 UNSHADOW_ARGS
michael@0 210 pop rbp
michael@0 211 ret
michael@0 212
michael@0 213
michael@0 214
michael@0 215
michael@0 216 ;unsigned int vp9_get8x8var_sse2
michael@0 217 ;(
michael@0 218 ; unsigned char * src_ptr,
michael@0 219 ; int source_stride,
michael@0 220 ; unsigned char * ref_ptr,
michael@0 221 ; int recon_stride,
michael@0 222 ; unsigned int * SSE,
michael@0 223 ; int * Sum
michael@0 224 ;)
; 8x8 version of get16x16var: writes sum of (src - ref) diffs to *Sum and
; sum of squared diffs to *SSE.  The 8 rows are fully unrolled; rows are
; addressed as base, base+stride, base+2*stride, with the base advanced by
; 2 strides between row pairs.  xmm1 accumulates squared diffs (dwords),
; xmm7 accumulates signed word diffs.  psubsw saturation never triggers
; because operands are zero-extended bytes (diffs fit in int16 exactly).
michael@0 225 global sym(vp9_get8x8var_sse2) PRIVATE
michael@0 226 sym(vp9_get8x8var_sse2):
michael@0 227 push rbp
michael@0 228 mov rbp, rsp
michael@0 229 SHADOW_ARGS_TO_STACK 6
michael@0 230 SAVE_XMM 7
michael@0 231 GET_GOT rbx
michael@0 232 push rsi
michael@0 233 push rdi
michael@0 234 sub rsp, 16 ; scratch/alignment; released in epilog
michael@0 235 ; end prolog
michael@0 236
michael@0 237 mov rsi, arg(0) ;[src_ptr]
michael@0 238 mov rdi, arg(2) ;[ref_ptr]
michael@0 239
michael@0 240 movsxd rax, DWORD PTR arg(1) ;[source_stride]
michael@0 241 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
michael@0 242
michael@0 243 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 244 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
michael@0 245
; Row 0
michael@0 246 movq xmm1, QWORD PTR [rsi]
michael@0 247 movq xmm2, QWORD PTR [rdi]
michael@0 248
michael@0 249 punpcklbw xmm1, xmm0
michael@0 250 punpcklbw xmm2, xmm0
michael@0 251
michael@0 252 psubsw xmm1, xmm2
michael@0 253 paddw xmm7, xmm1
michael@0 254
michael@0 255 pmaddwd xmm1, xmm1 ; xmm1 doubles as the SSE accumulator from here on
michael@0 256
; Row 1
michael@0 257 movq xmm2, QWORD PTR[rsi + rax]
michael@0 258 movq xmm3, QWORD PTR[rdi + rdx]
michael@0 259
michael@0 260 punpcklbw xmm2, xmm0
michael@0 261 punpcklbw xmm3, xmm0
michael@0 262
michael@0 263 psubsw xmm2, xmm3
michael@0 264 paddw xmm7, xmm2
michael@0 265
michael@0 266 pmaddwd xmm2, xmm2
michael@0 267 paddd xmm1, xmm2
michael@0 268
michael@0 269
; Row 2
michael@0 270 movq xmm2, QWORD PTR[rsi + rax * 2]
michael@0 271 movq xmm3, QWORD PTR[rdi + rdx * 2]
michael@0 272
michael@0 273 punpcklbw xmm2, xmm0
michael@0 274 punpcklbw xmm3, xmm0
michael@0 275
michael@0 276 psubsw xmm2, xmm3
michael@0 277 paddw xmm7, xmm2
michael@0 278
michael@0 279 pmaddwd xmm2, xmm2
michael@0 280 paddd xmm1, xmm2
michael@0 281
michael@0 282
; Advance base pointers by two rows; row 3 = new base + stride.
michael@0 283 lea rsi, [rsi + rax * 2]
michael@0 284 lea rdi, [rdi + rdx * 2]
michael@0 285 movq xmm2, QWORD PTR[rsi + rax]
michael@0 286 movq xmm3, QWORD PTR[rdi + rdx]
michael@0 287
michael@0 288 punpcklbw xmm2, xmm0
michael@0 289 punpcklbw xmm3, xmm0
michael@0 290
michael@0 291 psubsw xmm2, xmm3
michael@0 292 paddw xmm7, xmm2
michael@0 293
michael@0 294 pmaddwd xmm2, xmm2
michael@0 295 paddd xmm1, xmm2
michael@0 296
; Row 4
michael@0 297 movq xmm2, QWORD PTR[rsi + rax *2]
michael@0 298 movq xmm3, QWORD PTR[rdi + rdx *2]
michael@0 299
michael@0 300 punpcklbw xmm2, xmm0
michael@0 301 punpcklbw xmm3, xmm0
michael@0 302
michael@0 303 psubsw xmm2, xmm3
michael@0 304 paddw xmm7, xmm2
michael@0 305
michael@0 306 pmaddwd xmm2, xmm2
michael@0 307 paddd xmm1, xmm2
michael@0 308
michael@0 309
michael@0 310 lea rsi, [rsi + rax * 2]
michael@0 311 lea rdi, [rdi + rdx * 2]
michael@0 312
michael@0 313
; Row 5
michael@0 314 movq xmm2, QWORD PTR[rsi + rax]
michael@0 315 movq xmm3, QWORD PTR[rdi + rdx]
michael@0 316
michael@0 317 punpcklbw xmm2, xmm0
michael@0 318 punpcklbw xmm3, xmm0
michael@0 319
michael@0 320 psubsw xmm2, xmm3
michael@0 321 paddw xmm7, xmm2
michael@0 322
michael@0 323 pmaddwd xmm2, xmm2
michael@0 324 paddd xmm1, xmm2
michael@0 325
; Row 6
michael@0 326 movq xmm2, QWORD PTR[rsi + rax *2]
michael@0 327 movq xmm3, QWORD PTR[rdi + rdx *2]
michael@0 328
michael@0 329 punpcklbw xmm2, xmm0
michael@0 330 punpcklbw xmm3, xmm0
michael@0 331
michael@0 332 psubsw xmm2, xmm3
michael@0 333 paddw xmm7, xmm2
michael@0 334
michael@0 335 pmaddwd xmm2, xmm2
michael@0 336 paddd xmm1, xmm2
michael@0 337
michael@0 338
michael@0 339 lea rsi, [rsi + rax * 2]
michael@0 340 lea rdi, [rdi + rdx * 2]
michael@0 341
; Row 7
michael@0 342 movq xmm2, QWORD PTR[rsi + rax]
michael@0 343 movq xmm3, QWORD PTR[rdi + rdx]
michael@0 344
michael@0 345 punpcklbw xmm2, xmm0
michael@0 346 punpcklbw xmm3, xmm0
michael@0 347
michael@0 348 psubsw xmm2, xmm3
michael@0 349 paddw xmm7, xmm2
michael@0 350
michael@0 351 pmaddwd xmm2, xmm2
michael@0 352 paddd xmm1, xmm2
michael@0 353
michael@0 354
; Horizontal fold.  The Sum fold deliberately stays at word precision
; (punpck*wd zero-extends the words, then paddw adds low words mod 2^16):
; |Sum| <= 64 * 255 = 16320 fits in int16, so the final sign-extend of
; the low 16 bits (movsx below) recovers the exact signed total.
michael@0 355 movdqa xmm6, xmm7
michael@0 356 punpcklwd xmm6, xmm0
michael@0 357
michael@0 358 punpckhwd xmm7, xmm0
michael@0 359 movdqa xmm2, xmm1
michael@0 360
michael@0 361 paddw xmm6, xmm7
michael@0 362 punpckldq xmm1, xmm0
michael@0 363
michael@0 364 punpckhdq xmm2, xmm0
michael@0 365 movdqa xmm7, xmm6
michael@0 366
michael@0 367 paddd xmm1, xmm2
michael@0 368 punpckldq xmm6, xmm0
michael@0 369
michael@0 370 punpckhdq xmm7, xmm0
michael@0 371 paddw xmm6, xmm7
michael@0 372
michael@0 373 movdqa xmm2, xmm1
michael@0 374 movdqa xmm7, xmm6
michael@0 375
michael@0 376 psrldq xmm1, 8
michael@0 377 psrldq xmm6, 8
michael@0 378
michael@0 379 paddw xmm7, xmm6
michael@0 380 paddd xmm1, xmm2
michael@0 381
michael@0 382 mov rax, arg(5) ;[Sum]
michael@0 383 mov rdi, arg(4) ;[SSE]
michael@0 384
michael@0 385 movq rdx, xmm7
michael@0 386 movsx rcx, dx ; sign-extend the 16-bit Sum to 32/64 bits
michael@0 387
michael@0 388 mov dword ptr [rax], ecx
michael@0 389 movd DWORD PTR [rdi], xmm1
michael@0 390
michael@0 391 ; begin epilog
michael@0 392 add rsp, 16
michael@0 393 pop rdi
michael@0 394 pop rsi
michael@0 395 RESTORE_GOT
michael@0 396 RESTORE_XMM
michael@0 397 UNSHADOW_ARGS
michael@0 398 pop rbp
michael@0 399 ret
michael@0 400
michael@0 400
michael@0 401 ;void vp9_half_horiz_vert_variance8x_h_sse2
michael@0 402 ;(
michael@0 403 ; unsigned char *ref_ptr,
michael@0 404 ; int ref_pixels_per_line,
michael@0 405 ; unsigned char *src_ptr,
michael@0 406 ; int src_pixels_per_line,
michael@0 407 ; unsigned int Height,
michael@0 408 ; int *sum,
michael@0 409 ; unsigned int *sumsquared
michael@0 410 ;)
; Sum and SSE of (half-pel filtered ref - src) over Height 8-pixel rows.
; The half-pel sample is the pavgb average of horizontal neighbours,
; averaged again with the row below (horizontal + vertical half-pel).
; NOTE(review): the tail uses MMX registers (movdq2q) and does not emms;
; presumably the caller clears MMX state (vpx_clear_system_state) — confirm.
michael@0 411 global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
michael@0 412 sym(vp9_half_horiz_vert_variance8x_h_sse2):
michael@0 413 push rbp
michael@0 414 mov rbp, rsp
michael@0 415 SHADOW_ARGS_TO_STACK 7
michael@0 416 SAVE_XMM 7
michael@0 417 GET_GOT rbx
michael@0 418 push rsi
michael@0 419 push rdi
michael@0 420 ; end prolog
michael@0 421
michael@0 422 %if ABI_IS_32BIT=0
michael@0 423 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
michael@0 424 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 425 %endif
michael@0 426
michael@0 427 pxor xmm6, xmm6 ; error accumulator
michael@0 428 pxor xmm7, xmm7 ; sse accumulator
michael@0 429 mov rsi, arg(0) ;ref_ptr ;
michael@0 430
michael@0 431 mov rdi, arg(2) ;src_ptr ;
michael@0 432 movsxd rcx, dword ptr arg(4) ;Height ;
michael@0 433 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line (loaded but unused in this variant)
michael@0 434
michael@0 435 pxor xmm0, xmm0 ;
michael@0 436
; Prime the pipeline: horizontal half-pel average of row 0.
michael@0 437 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
michael@0 438 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
michael@0 439 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
michael@0 440
michael@0 441 %if ABI_IS_32BIT
michael@0 442 add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
michael@0 443 %else
michael@0 444 add rsi, r8
michael@0 445 %endif
michael@0 446
michael@0 447 .half_horiz_vert_variance8x_h_1:
michael@0 448
michael@0 449 movq xmm1, QWORD PTR [rsi] ;
michael@0 450 movq xmm2, QWORD PTR [rsi+1] ;
michael@0 451 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
michael@0 452
michael@0 453 pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
michael@0 454 punpcklbw xmm5, xmm0 ; xmm5 = words of above
michael@0 455
michael@0 456 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
michael@0 457 punpcklbw xmm3, xmm0 ; xmm3 = words of above
michael@0 458
michael@0 459 psubw xmm5, xmm3 ; xmm5 -= xmm3
michael@0 460 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
michael@0 461 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
michael@0 462 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
michael@0 463
michael@0 464 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
michael@0 465
michael@0 466 %if ABI_IS_32BIT
michael@0 467 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
michael@0 468 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
michael@0 469 %else
michael@0 470 add rsi, r8
michael@0 471 add rdi, r9
michael@0 472 %endif
michael@0 473
michael@0 474 sub rcx, 1 ;
michael@0 475 jnz .half_horiz_vert_variance8x_h_1 ;
michael@0 476
; Fold: move both xmm halves into MMX regs and add, then reduce the
; word sums (sign-extended via punpck + psrad 16) and the dword SSE.
michael@0 477 movdq2q mm6, xmm6 ;
michael@0 478 movdq2q mm7, xmm7 ;
michael@0 479
michael@0 480 psrldq xmm6, 8
michael@0 481 psrldq xmm7, 8
michael@0 482
michael@0 483 movdq2q mm2, xmm6
michael@0 484 movdq2q mm3, xmm7
michael@0 485
michael@0 486 paddw mm6, mm2
michael@0 487 paddd mm7, mm3
michael@0 488
michael@0 489 pxor mm3, mm3 ;
michael@0 490 pxor mm2, mm2 ;
michael@0 491
michael@0 492 punpcklwd mm2, mm6 ;
michael@0 493 punpckhwd mm3, mm6 ;
michael@0 494
michael@0 495 paddd mm2, mm3 ;
michael@0 496 movq mm6, mm2 ;
michael@0 497
michael@0 498 psrlq mm6, 32 ;
michael@0 499 paddd mm2, mm6 ;
michael@0 500
michael@0 501 psrad mm2, 16 ; completes the word -> dword sign extension
michael@0 502 movq mm4, mm7 ;
michael@0 503
michael@0 504 psrlq mm4, 32 ;
michael@0 505 paddd mm4, mm7 ;
michael@0 506
michael@0 507 mov rsi, arg(5) ; sum
michael@0 508 mov rdi, arg(6) ; sumsquared
michael@0 509
michael@0 510 movd [rsi], mm2 ;
michael@0 511 movd [rdi], mm4 ;
michael@0 512
michael@0 513
michael@0 514 ; begin epilog
michael@0 515 pop rdi
michael@0 516 pop rsi
michael@0 517 RESTORE_GOT
michael@0 518 RESTORE_XMM
michael@0 519 UNSHADOW_ARGS
michael@0 520 pop rbp
michael@0 521 ret
michael@0 522
michael@0 522
michael@0 523 ;void vp9_half_vert_variance8x_h_sse2
michael@0 524 ;(
michael@0 525 ; unsigned char *ref_ptr,
michael@0 526 ; int ref_pixels_per_line,
michael@0 527 ; unsigned char *src_ptr,
michael@0 528 ; int src_pixels_per_line,
michael@0 529 ; unsigned int Height,
michael@0 530 ; int *sum,
michael@0 531 ; unsigned int *sumsquared
michael@0 532 ;)
; Sum and SSE of (vertical half-pel filtered ref - src) over Height
; 8-pixel rows: each ref row is pavgb-averaged with the row below it.
; NOTE(review): MMX tail without emms — presumably the caller clears
; MMX state (vpx_clear_system_state); confirm as for the sibling above.
michael@0 533 global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
michael@0 534 sym(vp9_half_vert_variance8x_h_sse2):
michael@0 535 push rbp
michael@0 536 mov rbp, rsp
michael@0 537 SHADOW_ARGS_TO_STACK 7
michael@0 538 SAVE_XMM 7
michael@0 539 GET_GOT rbx
michael@0 540 push rsi
michael@0 541 push rdi
michael@0 542 ; end prolog
michael@0 543
michael@0 544 %if ABI_IS_32BIT=0
michael@0 545 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
michael@0 546 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 547 %endif
michael@0 548
michael@0 549 pxor xmm6, xmm6 ; error accumulator
michael@0 550 pxor xmm7, xmm7 ; sse accumulator
michael@0 551 mov rsi, arg(0) ;ref_ptr ;
michael@0 552
michael@0 553 mov rdi, arg(2) ;src_ptr ;
michael@0 554 movsxd rcx, dword ptr arg(4) ;Height ;
michael@0 555 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line (used for the row-below load)
michael@0 556
michael@0 557 pxor xmm0, xmm0 ;
michael@0 558 .half_vert_variance8x_h_1:
michael@0 559 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7 of row i
michael@0 560 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = same 8 pixels of row i+1
michael@0 561
michael@0 562 pavgb xmm5, xmm3 ; xmm5 = vertical half-pel average
michael@0 563 punpcklbw xmm5, xmm0 ; xmm5 = words of above
michael@0 564
michael@0 565 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
michael@0 566 punpcklbw xmm3, xmm0 ; xmm3 = words of above
michael@0 567
michael@0 568 psubw xmm5, xmm3 ; xmm5 -= xmm3
michael@0 569 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
michael@0 570 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
michael@0 571 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
michael@0 572
michael@0 573 %if ABI_IS_32BIT
michael@0 574 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
michael@0 575 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
michael@0 576 %else
michael@0 577 add rsi, r8
michael@0 578 add rdi, r9
michael@0 579 %endif
michael@0 580
michael@0 581 sub rcx, 1 ;
michael@0 582 jnz .half_vert_variance8x_h_1 ;
michael@0 583
; Fold accumulators (same scheme as the horiz_vert variant): word sums
; are sign-extended via punpck + psrad 16; SSE folded as dwords.
michael@0 584 movdq2q mm6, xmm6 ;
michael@0 585 movdq2q mm7, xmm7 ;
michael@0 586
michael@0 587 psrldq xmm6, 8
michael@0 588 psrldq xmm7, 8
michael@0 589
michael@0 590 movdq2q mm2, xmm6
michael@0 591 movdq2q mm3, xmm7
michael@0 592
michael@0 593 paddw mm6, mm2
michael@0 594 paddd mm7, mm3
michael@0 595
michael@0 596 pxor mm3, mm3 ;
michael@0 597 pxor mm2, mm2 ;
michael@0 598
michael@0 599 punpcklwd mm2, mm6 ;
michael@0 600 punpckhwd mm3, mm6 ;
michael@0 601
michael@0 602 paddd mm2, mm3 ;
michael@0 603 movq mm6, mm2 ;
michael@0 604
michael@0 605 psrlq mm6, 32 ;
michael@0 606 paddd mm2, mm6 ;
michael@0 607
michael@0 608 psrad mm2, 16 ; completes the word -> dword sign extension
michael@0 609 movq mm4, mm7 ;
michael@0 610
michael@0 611 psrlq mm4, 32 ;
michael@0 612 paddd mm4, mm7 ;
michael@0 613
michael@0 614 mov rsi, arg(5) ; sum
michael@0 615 mov rdi, arg(6) ; sumsquared
michael@0 616
michael@0 617 movd [rsi], mm2 ;
michael@0 618 movd [rdi], mm4 ;
michael@0 619
michael@0 620
michael@0 621 ; begin epilog
michael@0 622 pop rdi
michael@0 623 pop rsi
michael@0 624 RESTORE_GOT
michael@0 625 RESTORE_XMM
michael@0 626 UNSHADOW_ARGS
michael@0 627 pop rbp
michael@0 628 ret
michael@0 629
michael@0 629
michael@0 630
michael@0 631 ;void vp9_half_horiz_variance8x_h_sse2
michael@0 632 ;(
michael@0 633 ; unsigned char *ref_ptr,
michael@0 634 ; int ref_pixels_per_line,
michael@0 635 ; unsigned char *src_ptr,
michael@0 636 ; int src_pixels_per_line,
michael@0 637 ; unsigned int Height,
michael@0 638 ; int *sum,
michael@0 639 ; unsigned int *sumsquared
michael@0 640 ;)
; Sum and SSE of (horizontal half-pel filtered ref - src) over Height
; 8-pixel rows: each ref pixel is pavgb-averaged with its right neighbour.
; NOTE(review): MMX tail without emms — presumably the caller clears
; MMX state (vpx_clear_system_state); confirm as for the siblings above.
michael@0 641 global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
michael@0 642 sym(vp9_half_horiz_variance8x_h_sse2):
michael@0 643 push rbp
michael@0 644 mov rbp, rsp
michael@0 645 SHADOW_ARGS_TO_STACK 7
michael@0 646 SAVE_XMM 7
michael@0 647 GET_GOT rbx
michael@0 648 push rsi
michael@0 649 push rdi
michael@0 650 ; end prolog
michael@0 651
michael@0 652 %if ABI_IS_32BIT=0
michael@0 653 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
michael@0 654 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
michael@0 655 %endif
michael@0 656
michael@0 657 pxor xmm6, xmm6 ; error accumulator
michael@0 658 pxor xmm7, xmm7 ; sse accumulator
michael@0 659 mov rsi, arg(0) ;ref_ptr ;
michael@0 660
michael@0 661 mov rdi, arg(2) ;src_ptr ;
michael@0 662 movsxd rcx, dword ptr arg(4) ;Height ;
michael@0 663
michael@0 664 pxor xmm0, xmm0 ;
michael@0 665 .half_horiz_variance8x_h_1:
michael@0 666 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
michael@0 667 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
michael@0 668
michael@0 669 pavgb xmm5, xmm3 ; xmm5 = horizontal half-pel average
michael@0 670 punpcklbw xmm5, xmm0 ; xmm5 = words of above
michael@0 671
michael@0 672 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
michael@0 673 punpcklbw xmm3, xmm0 ; xmm3 = words of above
michael@0 674
michael@0 675 psubw xmm5, xmm3 ; xmm5 -= xmm3
michael@0 676 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
michael@0 677 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
michael@0 678 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
michael@0 679
michael@0 680 %if ABI_IS_32BIT
michael@0 681 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
michael@0 682 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
michael@0 683 %else
michael@0 684 add rsi, r8
michael@0 685 add rdi, r9
michael@0 686 %endif
michael@0 687 sub rcx, 1 ;
michael@0 688 jnz .half_horiz_variance8x_h_1 ;
michael@0 689
; Fold accumulators (same scheme as the other half-pel variants).
michael@0 690 movdq2q mm6, xmm6 ;
michael@0 691 movdq2q mm7, xmm7 ;
michael@0 692
michael@0 693 psrldq xmm6, 8
michael@0 694 psrldq xmm7, 8
michael@0 695
michael@0 696 movdq2q mm2, xmm6
michael@0 697 movdq2q mm3, xmm7
michael@0 698
michael@0 699 paddw mm6, mm2
michael@0 700 paddd mm7, mm3
michael@0 701
michael@0 702 pxor mm3, mm3 ;
michael@0 703 pxor mm2, mm2 ;
michael@0 704
michael@0 705 punpcklwd mm2, mm6 ;
michael@0 706 punpckhwd mm3, mm6 ;
michael@0 707
michael@0 708 paddd mm2, mm3 ;
michael@0 709 movq mm6, mm2 ;
michael@0 710
michael@0 711 psrlq mm6, 32 ;
michael@0 712 paddd mm2, mm6 ;
michael@0 713
michael@0 714 psrad mm2, 16 ; completes the word -> dword sign extension
michael@0 715 movq mm4, mm7 ;
michael@0 716
michael@0 717 psrlq mm4, 32 ;
michael@0 718 paddd mm4, mm7 ;
michael@0 719
michael@0 720 mov rsi, arg(5) ; sum
michael@0 721 mov rdi, arg(6) ; sumsquared
michael@0 722
michael@0 723 movd [rsi], mm2 ;
michael@0 724 movd [rdi], mm4 ;
michael@0 725
michael@0 726
michael@0 727 ; begin epilog
michael@0 728 pop rdi
michael@0 729 pop rsi
michael@0 730 RESTORE_GOT
michael@0 731 RESTORE_XMM
michael@0 732 UNSHADOW_ARGS
michael@0 733 pop rbp
michael@0 734 ret

mercurial