; media/libvpx/vp8/common/x86/sad_sse3.asm

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

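; STACK_FRAME_CREATE_X3 / STACK_FRAME_DESTROY_X3 map the five C arguments of the
; single-reference SAD functions onto named registers for each ABI (32-bit stack
; arguments, Win64, or SysV x86-64).  result_ptr, max_sad and height all alias
; the fifth argument; each function refers only to the name that applies to it.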
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define ref_ptr rdi
%define ref_stride rdx
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
%define max_sad arg(4)
%define height dword ptr arg(4)
        push rbp
        mov rbp, rsp
        push rsi
        push rdi
        push rbx

        mov rsi, arg(0)              ; src_ptr
        mov rdi, arg(2)              ; ref_ptr

        movsxd rax, dword ptr arg(1) ; src_stride
        movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%if LIBVPX_YASM_WIN64
        SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_sad [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
%define ref_ptr rdx
%define ref_stride rcx
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_sad r8
%define height r8
%endif
%endif

%endmacro

%macro STACK_FRAME_DESTROY_X3 0
%define src_ptr
%define src_stride
%define ref_ptr
%define ref_stride
%define end_ptr
%define ret_var
%define result_ptr
%define max_sad
%define height

%if ABI_IS_32BIT
        pop rbx
        pop rdi
        pop rsi
        pop rbp
%else
%if LIBVPX_YASM_WIN64
        RESTORE_XMM
%endif
%endif
        ret
%endmacro

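; STACK_FRAME_CREATE_X4 / STACK_FRAME_DESTROY_X4 set up the frame for the x4d
; functions, where arg(2) is a pointer to an array of four reference pointers
; that LOAD_X4_ADDRESSES expands into r0_ptr..r3_ptr.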
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define r0_ptr rcx
%define r1_ptr rdx
%define r2_ptr rbx
%define r3_ptr rdi
%define ref_stride rbp
%define result_ptr arg(4)
        push rbp
        mov rbp, rsp
        push rsi
        push rdi
        push rbx

        push rbp
        mov rdi, arg(2)              ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov rsi, arg(0)              ; src_ptr

        movsxd rbx, dword ptr arg(1) ; src_stride
        movsxd rbp, dword ptr arg(3) ; ref_stride

        xchg rbx, rax
%else
%if LIBVPX_YASM_WIN64
        SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define r0_ptr rsi
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
%define result_ptr [rsp+xmm_stack_space+16+4*8]
        push rsi

        LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%else
%define src_ptr rdi
%define src_stride rsi
%define r0_ptr r9
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr rdx
%define ref_stride rcx
%define result_ptr r8

        LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

%endif
%endif
%endmacro

%macro STACK_FRAME_DESTROY_X4 0
%define src_ptr
%define src_stride
%define r0_ptr
%define r1_ptr
%define r2_ptr
%define r3_ptr
%define ref_stride
%define result_ptr

%if ABI_IS_32BIT
        pop rbx
        pop rdi
        pop rsi
        pop rbp
%else
%if LIBVPX_YASM_WIN64
        pop rsi
        RESTORE_XMM
%endif
%endif
        ret
%endmacro

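; PROCESS_16X2X3 accumulates the SADs of two 16-byte source rows against the
; reference rows at byte offsets 0, +1 and +2 into xmm5/xmm6/xmm7.
; %1 selects the phase: 0 initializes the accumulators, 1 is a middle
; iteration, 2 is the final iteration (the pointers are not advanced).
; %2 = src_ptr, %3 = ref_ptr, %4 = src_stride, %5 = ref_stride.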
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa xmm0, XMMWORD PTR [%2]
        lddqu xmm5, XMMWORD PTR [%3]
        lddqu xmm6, XMMWORD PTR [%3+1]
        lddqu xmm7, XMMWORD PTR [%3+2]

        psadbw xmm5, xmm0
        psadbw xmm6, xmm0
        psadbw xmm7, xmm0
%else
        movdqa xmm0, XMMWORD PTR [%2]
        lddqu xmm1, XMMWORD PTR [%3]
        lddqu xmm2, XMMWORD PTR [%3+1]
        lddqu xmm3, XMMWORD PTR [%3+2]

        psadbw xmm1, xmm0
        psadbw xmm2, xmm0
        psadbw xmm3, xmm0

        paddw xmm5, xmm1
        paddw xmm6, xmm2
        paddw xmm7, xmm3
%endif
        movdqa xmm0, XMMWORD PTR [%2+%4]
        lddqu xmm1, XMMWORD PTR [%3+%5]
        lddqu xmm2, XMMWORD PTR [%3+%5+1]
        lddqu xmm3, XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea %2, [%2+%4*2]
        lea %3, [%3+%5*2]
%endif

        psadbw xmm1, xmm0
        psadbw xmm2, xmm0
        psadbw xmm3, xmm0

        paddw xmm5, xmm1
        paddw xmm6, xmm2
        paddw xmm7, xmm3
%endmacro

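; PROCESS_8X2X3 is the MMX variant of PROCESS_16X2X3 for 8-byte-wide blocks;
; it accumulates into mm5/mm6/mm7 with the same phase argument in %1.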
%macro PROCESS_8X2X3 5
%if %1==0
        movq mm0, QWORD PTR [%2]
        movq mm5, QWORD PTR [%3]
        movq mm6, QWORD PTR [%3+1]
        movq mm7, QWORD PTR [%3+2]

        psadbw mm5, mm0
        psadbw mm6, mm0
        psadbw mm7, mm0
%else
        movq mm0, QWORD PTR [%2]
        movq mm1, QWORD PTR [%3]
        movq mm2, QWORD PTR [%3+1]
        movq mm3, QWORD PTR [%3+2]

        psadbw mm1, mm0
        psadbw mm2, mm0
        psadbw mm3, mm0

        paddw mm5, mm1
        paddw mm6, mm2
        paddw mm7, mm3
%endif
        movq mm0, QWORD PTR [%2+%4]
        movq mm1, QWORD PTR [%3+%5]
        movq mm2, QWORD PTR [%3+%5+1]
        movq mm3, QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea %2, [%2+%4*2]
        lea %3, [%3+%5*2]
%endif

        psadbw mm1, mm0
        psadbw mm2, mm0
        psadbw mm3, mm0

        paddw mm5, mm1
        paddw mm6, mm2
        paddw mm7, mm3
%endmacro

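; LOAD_X4_ADDRESSES reads four reference pointers from the pointer array at %1
; into the registers %2..%5.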
%macro LOAD_X4_ADDRESSES 5
        mov %2, [%1+REG_SZ_BYTES*0]
        mov %3, [%1+REG_SZ_BYTES*1]

        mov %4, [%1+REG_SZ_BYTES*2]
        mov %5, [%1+REG_SZ_BYTES*3]
%endmacro

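; PROCESS_16X2X4 accumulates the SADs of two 16-byte source rows against the
; corresponding rows of four independent reference blocks (%3..%6) into
; xmm4..xmm7.  %1 is the phase as in PROCESS_16X2X3; %7/%8 are the strides.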
%macro PROCESS_16X2X4 8
%if %1==0
        movdqa xmm0, XMMWORD PTR [%2]
        lddqu xmm4, XMMWORD PTR [%3]
        lddqu xmm5, XMMWORD PTR [%4]
        lddqu xmm6, XMMWORD PTR [%5]
        lddqu xmm7, XMMWORD PTR [%6]

        psadbw xmm4, xmm0
        psadbw xmm5, xmm0
        psadbw xmm6, xmm0
        psadbw xmm7, xmm0
%else
        movdqa xmm0, XMMWORD PTR [%2]
        lddqu xmm1, XMMWORD PTR [%3]
        lddqu xmm2, XMMWORD PTR [%4]
        lddqu xmm3, XMMWORD PTR [%5]

        psadbw xmm1, xmm0
        psadbw xmm2, xmm0
        psadbw xmm3, xmm0

        paddw xmm4, xmm1
        lddqu xmm1, XMMWORD PTR [%6]
        paddw xmm5, xmm2
        paddw xmm6, xmm3

        psadbw xmm1, xmm0
        paddw xmm7, xmm1
%endif
        movdqa xmm0, XMMWORD PTR [%2+%7]
        lddqu xmm1, XMMWORD PTR [%3+%8]
        lddqu xmm2, XMMWORD PTR [%4+%8]
        lddqu xmm3, XMMWORD PTR [%5+%8]

        psadbw xmm1, xmm0
        psadbw xmm2, xmm0
        psadbw xmm3, xmm0

        paddw xmm4, xmm1
        lddqu xmm1, XMMWORD PTR [%6+%8]
        paddw xmm5, xmm2
        paddw xmm6, xmm3

%if %1==0 || %1==1
        lea %2, [%2+%7*2]
        lea %3, [%3+%8*2]

        lea %4, [%4+%8*2]
        lea %5, [%5+%8*2]

        lea %6, [%6+%8*2]
%endif
        psadbw xmm1, xmm0
        paddw xmm7, xmm1

%endmacro

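; PROCESS_8X2X4 is the MMX variant of PROCESS_16X2X4 for 8-byte-wide blocks;
; it accumulates into mm4..mm7.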
%macro PROCESS_8X2X4 8
%if %1==0
        movq mm0, QWORD PTR [%2]
        movq mm4, QWORD PTR [%3]
        movq mm5, QWORD PTR [%4]
        movq mm6, QWORD PTR [%5]
        movq mm7, QWORD PTR [%6]

        psadbw mm4, mm0
        psadbw mm5, mm0
        psadbw mm6, mm0
        psadbw mm7, mm0
%else
        movq mm0, QWORD PTR [%2]
        movq mm1, QWORD PTR [%3]
        movq mm2, QWORD PTR [%4]
        movq mm3, QWORD PTR [%5]

        psadbw mm1, mm0
        psadbw mm2, mm0
        psadbw mm3, mm0

        paddw mm4, mm1
        movq mm1, QWORD PTR [%6]
        paddw mm5, mm2
        paddw mm6, mm3

        psadbw mm1, mm0
        paddw mm7, mm1
%endif
        movq mm0, QWORD PTR [%2+%7]
        movq mm1, QWORD PTR [%3+%8]
        movq mm2, QWORD PTR [%4+%8]
        movq mm3, QWORD PTR [%5+%8]

        psadbw mm1, mm0
        psadbw mm2, mm0
        psadbw mm3, mm0

        paddw mm4, mm1
        movq mm1, QWORD PTR [%6+%8]
        paddw mm5, mm2
        paddw mm6, mm3

%if %1==0 || %1==1
        lea %2, [%2+%7*2]
        lea %3, [%3+%8*2]

        lea %4, [%4+%8*2]
        lea %5, [%5+%8*2]

        lea %6, [%6+%8*2]
%endif
        psadbw mm1, mm0
        paddw mm7, mm1

%endmacro

;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
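; For each of the x3 functions below, results[0..2] receive the SADs against
; ref_ptr, ref_ptr+1 and ref_ptr+2.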
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov rcx, result_ptr

        movq xmm0, xmm5
        psrldq xmm5, 8

        paddw xmm0, xmm5
        movd [rcx], xmm0
;-
        movq xmm0, xmm6
        psrldq xmm6, 8

        paddw xmm0, xmm6
        movd [rcx+4], xmm0
;-
        movq xmm0, xmm7
        psrldq xmm7, 8

        paddw xmm0, xmm7
        movd [rcx+8], xmm0

        STACK_FRAME_DESTROY_X3

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov rcx, result_ptr

        movq xmm0, xmm5
        psrldq xmm5, 8

        paddw xmm0, xmm5
        movd [rcx], xmm0
;-
        movq xmm0, xmm6
        psrldq xmm6, 8

        paddw xmm0, xmm6
        movd [rcx+4], xmm0
;-
        movq xmm0, xmm7
        psrldq xmm7, 8

        paddw xmm0, xmm7
        movd [rcx+8], xmm0

        STACK_FRAME_DESTROY_X3

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov rcx, result_ptr

        punpckldq mm5, mm6

        movq [rcx], mm5
        movd [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov rcx, result_ptr

        punpckldq mm5, mm6

        movq [rcx], mm5
        movd [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
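; The 4x4 case packs two 4-byte rows into each MMX register with punpcklbw so
; that one psadbw covers two rows at a time.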
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

        STACK_FRAME_CREATE_X3

        movd mm0, DWORD PTR [src_ptr]
        movd mm1, DWORD PTR [ref_ptr]

        movd mm2, DWORD PTR [src_ptr+src_stride]
        movd mm3, DWORD PTR [ref_ptr+ref_stride]

        punpcklbw mm0, mm2
        punpcklbw mm1, mm3

        movd mm4, DWORD PTR [ref_ptr+1]
        movd mm5, DWORD PTR [ref_ptr+2]

        movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
        movd mm3, DWORD PTR [ref_ptr+ref_stride+2]

        psadbw mm1, mm0

        punpcklbw mm4, mm2
        punpcklbw mm5, mm3

        psadbw mm4, mm0
        psadbw mm5, mm0

        lea src_ptr, [src_ptr+src_stride*2]
        lea ref_ptr, [ref_ptr+ref_stride*2]

        movd mm0, DWORD PTR [src_ptr]
        movd mm2, DWORD PTR [ref_ptr]

        movd mm3, DWORD PTR [src_ptr+src_stride]
        movd mm6, DWORD PTR [ref_ptr+ref_stride]

        punpcklbw mm0, mm3
        punpcklbw mm2, mm6

        movd mm3, DWORD PTR [ref_ptr+1]
        movd mm7, DWORD PTR [ref_ptr+2]

        psadbw mm2, mm0

        paddw mm1, mm2

        movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
        movd mm6, DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw mm3, mm2
        punpcklbw mm7, mm6

        psadbw mm3, mm0
        psadbw mm7, mm0

        paddw mm3, mm4
        paddw mm7, mm5

        mov rcx, result_ptr

        punpckldq mm1, mm3

        movq [rcx], mm1
        movd [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int max_sad)
;%define lddqu movdqu
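; Plain 16x16 SAD, returned in rax.  Note that this version does not use the
; max_sad argument for early termination.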
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

        STACK_FRAME_CREATE_X3

        mov end_ptr, 4
        pxor xmm7, xmm7

.vp8_sad16x16_sse3_loop:
        movdqa xmm0, XMMWORD PTR [src_ptr]
        movdqu xmm1, XMMWORD PTR [ref_ptr]
        movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
        movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]

        lea src_ptr, [src_ptr+src_stride*2]
        lea ref_ptr, [ref_ptr+ref_stride*2]

        movdqa xmm4, XMMWORD PTR [src_ptr]
        movdqu xmm5, XMMWORD PTR [ref_ptr]
        movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]

        psadbw xmm0, xmm1

        movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]

        psadbw xmm2, xmm3
        psadbw xmm4, xmm5
        psadbw xmm6, xmm1

        lea src_ptr, [src_ptr+src_stride*2]
        lea ref_ptr, [ref_ptr+ref_stride*2]

        paddw xmm7, xmm0
        paddw xmm7, xmm2
        paddw xmm7, xmm4
        paddw xmm7, xmm6

        sub end_ptr, 1
        jne .vp8_sad16x16_sse3_loop

        movq xmm0, xmm7
        psrldq xmm7, 8
        paddw xmm0, xmm7
        movq rax, xmm0

        STACK_FRAME_DESTROY_X3

;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *dst_ptr,
;    int dst_stride,
;    int height);
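; Copies a 32-byte-wide block of 'height' rows, four rows per iteration plus a
; single-row tail loop.  Loads are unaligned (movdqu) but stores use movdqa, so
; the destination rows are expected to be 16-byte aligned.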
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

        STACK_FRAME_CREATE_X3

.block_copy_sse3_loopx4:
        lea end_ptr, [src_ptr+src_stride*2]

        movdqu xmm0, XMMWORD PTR [src_ptr]
        movdqu xmm1, XMMWORD PTR [src_ptr + 16]
        movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
        movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu xmm4, XMMWORD PTR [end_ptr]
        movdqu xmm5, XMMWORD PTR [end_ptr + 16]
        movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
        movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]

        lea src_ptr, [src_ptr+src_stride*4]

        lea end_ptr, [ref_ptr+ref_stride*2]

        movdqa XMMWORD PTR [ref_ptr], xmm0
        movdqa XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa XMMWORD PTR [end_ptr], xmm4
        movdqa XMMWORD PTR [end_ptr + 16], xmm5
        movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea ref_ptr, [ref_ptr+ref_stride*4]

        sub height, 4
        cmp height, 4
        jge .block_copy_sse3_loopx4

; Check to see if there are more rows that need to be copied.
        cmp height, 0
        je .copy_is_done

.block_copy_sse3_loop:
        movdqu xmm0, XMMWORD PTR [src_ptr]
        movdqu xmm1, XMMWORD PTR [src_ptr + 16]
        lea src_ptr, [src_ptr+src_stride]

        movdqa XMMWORD PTR [ref_ptr], xmm0
        movdqa XMMWORD PTR [ref_ptr + 16], xmm1
        lea ref_ptr, [ref_ptr+ref_stride]

        sub height, 1
        jne .block_copy_sse3_loop

.copy_is_done:
        STACK_FRAME_DESTROY_X3

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr_base,
;    int ref_stride,
;    int *results)
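; For the x4d functions, ref_ptr_base points to an array of four reference
; pointers; results[0..3] receive one SAD per reference block.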
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop rbp
%endif
        mov rcx, result_ptr

        movq xmm0, xmm4
        psrldq xmm4, 8

        paddw xmm0, xmm4
        movd [rcx], xmm0
;-
        movq xmm0, xmm5
        psrldq xmm5, 8

        paddw xmm0, xmm5
        movd [rcx+4], xmm0
;-
        movq xmm0, xmm6
        psrldq xmm6, 8

        paddw xmm0, xmm6
        movd [rcx+8], xmm0
;-
        movq xmm0, xmm7
        psrldq xmm7, 8

        paddw xmm0, xmm7
        movd [rcx+12], xmm0

        STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr_base,
;    int ref_stride,
;    int *results)
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop rbp
%endif
        mov rcx, result_ptr

        movq xmm0, xmm4
        psrldq xmm4, 8

        paddw xmm0, xmm4
        movd [rcx], xmm0
;-
        movq xmm0, xmm5
        psrldq xmm5, 8

        paddw xmm0, xmm5
        movd [rcx+4], xmm0
;-
        movq xmm0, xmm6
        psrldq xmm6, 8

        paddw xmm0, xmm6
        movd [rcx+8], xmm0
;-
        movq xmm0, xmm7
        psrldq xmm7, 8

        paddw xmm0, xmm7
        movd [rcx+12], xmm0

        STACK_FRAME_DESTROY_X4

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop rbp
%endif
        mov rcx, result_ptr

        punpckldq mm4, mm5
        punpckldq mm6, mm7

        movq [rcx], mm4
        movq [rcx+8], mm6

        STACK_FRAME_DESTROY_X4

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop rbp
%endif
        mov rcx, result_ptr

        punpckldq mm4, mm5
        punpckldq mm6, mm7

        movq [rcx], mm4
        movq [rcx+8], mm6

        STACK_FRAME_DESTROY_X4

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    int *results)
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

        STACK_FRAME_CREATE_X4

        movd mm0, DWORD PTR [src_ptr]
        movd mm1, DWORD PTR [r0_ptr]

        movd mm2, DWORD PTR [src_ptr+src_stride]
        movd mm3, DWORD PTR [r0_ptr+ref_stride]

        punpcklbw mm0, mm2
        punpcklbw mm1, mm3

        movd mm4, DWORD PTR [r1_ptr]
        movd mm5, DWORD PTR [r2_ptr]

        movd mm6, DWORD PTR [r3_ptr]
        movd mm2, DWORD PTR [r1_ptr+ref_stride]

        movd mm3, DWORD PTR [r2_ptr+ref_stride]
        movd mm7, DWORD PTR [r3_ptr+ref_stride]

        psadbw mm1, mm0

        punpcklbw mm4, mm2
        punpcklbw mm5, mm3

        punpcklbw mm6, mm7
        psadbw mm4, mm0

        psadbw mm5, mm0
        psadbw mm6, mm0

        lea src_ptr, [src_ptr+src_stride*2]
        lea r0_ptr, [r0_ptr+ref_stride*2]

        lea r1_ptr, [r1_ptr+ref_stride*2]
        lea r2_ptr, [r2_ptr+ref_stride*2]

        lea r3_ptr, [r3_ptr+ref_stride*2]

        movd mm0, DWORD PTR [src_ptr]
        movd mm2, DWORD PTR [r0_ptr]

        movd mm3, DWORD PTR [src_ptr+src_stride]
        movd mm7, DWORD PTR [r0_ptr+ref_stride]

        punpcklbw mm0, mm3
        punpcklbw mm2, mm7

        movd mm3, DWORD PTR [r1_ptr]
        movd mm7, DWORD PTR [r2_ptr]

        psadbw mm2, mm0
%if ABI_IS_32BIT
        mov rax, rbp

        pop rbp
%define ref_stride rax
%endif
        mov rsi, result_ptr

        paddw mm1, mm2
        movd [rsi], mm1

        movd mm2, DWORD PTR [r1_ptr+ref_stride]
        movd mm1, DWORD PTR [r2_ptr+ref_stride]

        punpcklbw mm3, mm2
        punpcklbw mm7, mm1

        psadbw mm3, mm0
        psadbw mm7, mm0

        movd mm2, DWORD PTR [r3_ptr]
        movd mm1, DWORD PTR [r3_ptr+ref_stride]

        paddw mm3, mm4
        paddw mm7, mm5

        movd [rsi+4], mm3
        punpcklbw mm2, mm1

        movd [rsi+8], mm7
        psadbw mm2, mm0

        paddw mm2, mm6
        movd [rsi+12], mm2

        STACK_FRAME_DESTROY_X4

