media/libvpx/vp8/common/x86/recon_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13 ; Copy 16 rows of 16 bytes from src to dst: unaligned loads (movdqu), aligned stores (movdqa, so each dst row must be 16-byte aligned).
michael@0 14 ;void vp8_copy_mem16x16_sse2(
michael@0 15 ; unsigned char *src,
michael@0 16 ; int src_stride,
michael@0 17 ; unsigned char *dst,
michael@0 18 ; int dst_stride
michael@0 19 ; )
michael@0 20 global sym(vp8_copy_mem16x16_sse2) PRIVATE
michael@0 21 sym(vp8_copy_mem16x16_sse2):
michael@0 22 push rbp
michael@0 23 mov rbp, rsp
michael@0 24 SHADOW_ARGS_TO_STACK 4
michael@0 25 push rsi
michael@0 26 push rdi
michael@0 27 ; end prolog
michael@0 28
michael@0 29 mov rsi, arg(0) ;src;
michael@0 30 movdqu xmm0, [rsi] ; load src row 0
michael@0 31
michael@0 32 movsxd rax, dword ptr arg(1) ;src_stride;
michael@0 33 mov rdi, arg(2) ;dst;
michael@0 34
michael@0 35 movdqu xmm1, [rsi+rax] ; load src row 1
michael@0 36 movdqu xmm2, [rsi+rax*2] ; load src row 2
michael@0 37
michael@0 38 movsxd rcx, dword ptr arg(3) ;dst_stride
michael@0 39 lea rsi, [rsi+rax*2]
michael@0 40
michael@0 41 movdqa [rdi], xmm0 ; store dst row 0
michael@0 42 add rsi, rax ; rsi -> src row 3 (lea above + this add = 3 strides)
michael@0 43
michael@0 44 movdqa [rdi+rcx], xmm1 ; store dst row 1
michael@0 45 movdqa [rdi+rcx*2],xmm2 ; store dst row 2
michael@0 46
michael@0 47 lea rdi, [rdi+rcx*2]
michael@0 48 movdqu xmm3, [rsi] ; load src row 3
michael@0 49
michael@0 50 add rdi, rcx ; rdi -> dst row 3
michael@0 51 movdqu xmm4, [rsi+rax] ; load src row 4
michael@0 52
michael@0 53 movdqu xmm5, [rsi+rax*2] ; load src row 5
michael@0 54 lea rsi, [rsi+rax*2]
michael@0 55
michael@0 56 movdqa [rdi], xmm3 ; store dst row 3
michael@0 57 add rsi, rax ; rsi -> src row 6
michael@0 58
michael@0 59 movdqa [rdi+rcx], xmm4 ; store dst row 4
michael@0 60 movdqa [rdi+rcx*2],xmm5 ; store dst row 5
michael@0 61
michael@0 62 lea rdi, [rdi+rcx*2]
michael@0 63 movdqu xmm0, [rsi] ; load src row 6
michael@0 64
michael@0 65 add rdi, rcx ; rdi -> dst row 6
michael@0 66 movdqu xmm1, [rsi+rax] ; load src row 7
michael@0 67
michael@0 68 movdqu xmm2, [rsi+rax*2] ; load src row 8
michael@0 69 lea rsi, [rsi+rax*2]
michael@0 70
michael@0 71 movdqa [rdi], xmm0 ; store dst row 6
michael@0 72 add rsi, rax ; rsi -> src row 9
michael@0 73
michael@0 74 movdqa [rdi+rcx], xmm1 ; store dst row 7
michael@0 75
michael@0 76 movdqa [rdi+rcx*2], xmm2 ; store dst row 8
michael@0 77 movdqu xmm3, [rsi] ; load src row 9
michael@0 78
michael@0 79 movdqu xmm4, [rsi+rax] ; load src row 10
michael@0 80 lea rdi, [rdi+rcx*2]
michael@0 81
michael@0 82 add rdi, rcx ; rdi -> dst row 9
michael@0 83 movdqu xmm5, [rsi+rax*2] ; load src row 11
michael@0 84
michael@0 85 lea rsi, [rsi+rax*2]
michael@0 86 movdqa [rdi], xmm3 ; store dst row 9
michael@0 87
michael@0 88 add rsi, rax ; rsi -> src row 12
michael@0 89 movdqa [rdi+rcx], xmm4 ; store dst row 10
michael@0 90
michael@0 91 movdqa [rdi+rcx*2],xmm5 ; store dst row 11
michael@0 92 movdqu xmm0, [rsi] ; load src row 12
michael@0 93
michael@0 94 lea rdi, [rdi+rcx*2]
michael@0 95 movdqu xmm1, [rsi+rax] ; load src row 13
michael@0 96
michael@0 97 add rdi, rcx ; rdi -> dst row 12
michael@0 98 movdqu xmm2, [rsi+rax*2] ; load src row 14
michael@0 99
michael@0 100 lea rsi, [rsi+rax*2] ; rsi -> src row 14 (only two strides this time)
michael@0 101 movdqa [rdi], xmm0 ; store dst row 12
michael@0 102
michael@0 103 movdqa [rdi+rcx], xmm1 ; store dst row 13
michael@0 104 movdqa [rdi+rcx*2],xmm2 ; store dst row 14
michael@0 105
michael@0 106 movdqu xmm3, [rsi+rax] ; load src row 15
michael@0 107 lea rdi, [rdi+rcx*2] ; rdi -> dst row 14
michael@0 108
michael@0 109 movdqa [rdi+rcx], xmm3 ; store dst row 15 (last row)
michael@0 110
michael@0 111 ; begin epilog
michael@0 112 pop rdi
michael@0 113 pop rsi
michael@0 114 UNSHADOW_ARGS
michael@0 115 pop rbp
michael@0 116 ret
michael@0 117
michael@0 118 ; Fill 8 rows of 8 bytes at dst with (sum of 8 above bytes + sum of 8 left bytes + 8) >> 4.
michael@0 119 ;void vp8_intra_pred_uv_dc_mmx2(
michael@0 120 ; unsigned char *dst,
michael@0 121 ; int dst_stride,
michael@0 122 ; unsigned char *above,
michael@0 123 ; unsigned char *left,
michael@0 124 ; int left_stride
michael@0 125 ; )
michael@0 126 global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
michael@0 127 sym(vp8_intra_pred_uv_dc_mmx2):
michael@0 128 push rbp
michael@0 129 mov rbp, rsp
michael@0 130 SHADOW_ARGS_TO_STACK 5
michael@0 131 push rsi
michael@0 132 push rdi
michael@0 133 ; end prolog
michael@0 134
michael@0 135 ; from top
michael@0 136 mov rdi, arg(2) ;above;
michael@0 137 mov rsi, arg(3) ;left;
michael@0 138 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 139 pxor mm0, mm0 ; zero operand for psadbw
michael@0 140 movq mm1, [rdi] ; 8 bytes of the above row
michael@0 141 lea rdi, [rax*3] ; rdi = 3*left_stride (above pointer no longer needed)
michael@0 142 psadbw mm1, mm0 ; mm1 word 0 = sum of the 8 above bytes
michael@0 143 ; from left
michael@0 144 movzx ecx, byte [rsi] ; ecx accumulates the left column; left row 0
michael@0 145 movzx edx, byte [rsi+rax*1] ; left row 1
michael@0 146 add ecx, edx
michael@0 147 movzx edx, byte [rsi+rax*2] ; left row 2
michael@0 148 add ecx, edx
michael@0 149
michael@0 150 movzx edx, byte [rsi+rdi] ; left row 3
michael@0 151 lea rsi, [rsi+rax*4] ; rsi -> left row 4
michael@0 152 add ecx, edx
michael@0 153 movzx edx, byte [rsi] ; left row 4
michael@0 154 add ecx, edx
michael@0 155 movzx edx, byte [rsi+rax] ; left row 5
michael@0 156 add ecx, edx
michael@0 157 movzx edx, byte [rsi+rax*2] ; left row 6
michael@0 158 add ecx, edx
michael@0 159 movzx edx, byte [rsi+rdi] ; left row 7
michael@0 160 add ecx, edx
michael@0 161
michael@0 162 ; add up
michael@0 163 pextrw edx, mm1, 0x0 ; edx = above sum
michael@0 164 lea edx, [edx+ecx+8] ; above + left + 8 (rounding term)
michael@0 165 sar edx, 4 ; divide by 16 samples
michael@0 166 movd mm1, edx
michael@0 167 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 168 pshufw mm1, mm1, 0x0 ; broadcast word 0 to all four words
michael@0 169 mov rdi, arg(0) ;dst;
michael@0 170 packuswb mm1, mm1 ; narrow to bytes: 8 copies of the DC value
michael@0 171
michael@0 172 ; write out
michael@0 173 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 174 lea rdx, [rdi+rcx*4] ; rdx -> dst row 4
michael@0 175
michael@0 176 movq [rdi ], mm1
michael@0 177 movq [rdi+rcx ], mm1
michael@0 178 movq [rdi+rcx*2], mm1
michael@0 179 movq [rdi+rax ], mm1
michael@0 180 movq [rdx ], mm1
michael@0 181 movq [rdx+rcx ], mm1
michael@0 182 movq [rdx+rcx*2], mm1
michael@0 183 movq [rdx+rax ], mm1
michael@0 184
michael@0 185 ; begin epilog
michael@0 186 pop rdi
michael@0 187 pop rsi
michael@0 188 UNSHADOW_ARGS
michael@0 189 pop rbp
michael@0 190 ret
michael@0 191 ; Fill 8 rows of 8 bytes at dst with (sum of 8 above bytes + 4) >> 3; the left column is ignored.
michael@0 192 ;void vp8_intra_pred_uv_dctop_mmx2(
michael@0 193 ; unsigned char *dst,
michael@0 194 ; int dst_stride,
michael@0 195 ; unsigned char *above,
michael@0 196 ; unsigned char *left,
michael@0 197 ; int left_stride
michael@0 198 ; )
michael@0 199 global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
michael@0 200 sym(vp8_intra_pred_uv_dctop_mmx2):
michael@0 201 push rbp
michael@0 202 mov rbp, rsp
michael@0 203 SHADOW_ARGS_TO_STACK 5
michael@0 204 GET_GOT rbx
michael@0 205 push rsi
michael@0 206 push rdi
michael@0 207 ; end prolog
michael@0 208
michael@0 209 ;arg(3), arg(4) not used
michael@0 210
michael@0 211 ; from top
michael@0 212 mov rsi, arg(2) ;above;
michael@0 213 pxor mm0, mm0 ; zero operand for psadbw
michael@0 214 movq mm1, [rsi] ; 8 bytes of the above row
michael@0 215 psadbw mm1, mm0 ; mm1 word 0 = sum of the 8 above bytes
michael@0 216
michael@0 217 ; add up
michael@0 218 paddw mm1, [GLOBAL(dc_4)] ; + 4 (rounding term)
michael@0 219 psraw mm1, 3 ; divide by 8 samples
michael@0 220 pshufw mm1, mm1, 0x0 ; broadcast word 0 to all four words
michael@0 221 packuswb mm1, mm1 ; narrow to bytes: 8 copies of the DC value
michael@0 222
michael@0 223 ; write out
michael@0 224 mov rdi, arg(0) ;dst;
michael@0 225 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 226 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 227
michael@0 228 movq [rdi ], mm1
michael@0 229 movq [rdi+rcx ], mm1
michael@0 230 movq [rdi+rcx*2], mm1
michael@0 231 movq [rdi+rax ], mm1
michael@0 232 lea rdi, [rdi+rcx*4] ; rdi -> dst row 4
michael@0 233 movq [rdi ], mm1
michael@0 234 movq [rdi+rcx ], mm1
michael@0 235 movq [rdi+rcx*2], mm1
michael@0 236 movq [rdi+rax ], mm1
michael@0 237
michael@0 238 ; begin epilog
michael@0 239 pop rdi
michael@0 240 pop rsi
michael@0 241 RESTORE_GOT
michael@0 242 UNSHADOW_ARGS
michael@0 243 pop rbp
michael@0 244 ret
michael@0 245 ; Fill 8 rows of 8 bytes at dst with (sum of 8 left bytes + 4) >> 3; the above row is ignored.
michael@0 246 ;void vp8_intra_pred_uv_dcleft_mmx2(
michael@0 247 ; unsigned char *dst,
michael@0 248 ; int dst_stride,
michael@0 249 ; unsigned char *above,
michael@0 250 ; unsigned char *left,
michael@0 251 ; int left_stride
michael@0 252 ; )
michael@0 253 global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
michael@0 254 sym(vp8_intra_pred_uv_dcleft_mmx2):
michael@0 255 push rbp
michael@0 256 mov rbp, rsp
michael@0 257 SHADOW_ARGS_TO_STACK 5
michael@0 258 push rsi
michael@0 259 push rdi
michael@0 260 ; end prolog
michael@0 261
michael@0 262 ;arg(2) not used
michael@0 263
michael@0 264 ; from left
michael@0 265 mov rsi, arg(3) ;left;
michael@0 266 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 267 lea rdi, [rax*3] ; rdi = 3*left_stride
michael@0 268 movzx ecx, byte [rsi] ; ecx accumulates the left column; left row 0
michael@0 269 movzx edx, byte [rsi+rax] ; left row 1
michael@0 270 add ecx, edx
michael@0 271 movzx edx, byte [rsi+rax*2] ; left row 2
michael@0 272 add ecx, edx
michael@0 273 movzx edx, byte [rsi+rdi] ; left row 3
michael@0 274 add ecx, edx
michael@0 275 lea rsi, [rsi+rax*4] ; rsi -> left row 4
michael@0 276 movzx edx, byte [rsi] ; left row 4
michael@0 277 add ecx, edx
michael@0 278 movzx edx, byte [rsi+rax] ; left row 5
michael@0 279 add ecx, edx
michael@0 280 movzx edx, byte [rsi+rax*2] ; left row 6
michael@0 281 add ecx, edx
michael@0 282 movzx edx, byte [rsi+rdi] ; left row 7
michael@0 283 lea edx, [ecx+edx+4] ; final add folded with the +4 rounding term
michael@0 284
michael@0 285 ; add up
michael@0 286 shr edx, 3 ; divide by 8 samples
michael@0 287 movd mm1, edx
michael@0 288 pshufw mm1, mm1, 0x0 ; broadcast word 0 to all four words
michael@0 289 packuswb mm1, mm1 ; narrow to bytes: 8 copies of the DC value
michael@0 290
michael@0 291 ; write out
michael@0 292 mov rdi, arg(0) ;dst;
michael@0 293 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 294 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 295
michael@0 296 movq [rdi ], mm1
michael@0 297 movq [rdi+rcx ], mm1
michael@0 298 movq [rdi+rcx*2], mm1
michael@0 299 movq [rdi+rax ], mm1
michael@0 300 lea rdi, [rdi+rcx*4] ; rdi -> dst row 4
michael@0 301 movq [rdi ], mm1
michael@0 302 movq [rdi+rcx ], mm1
michael@0 303 movq [rdi+rcx*2], mm1
michael@0 304 movq [rdi+rax ], mm1
michael@0 305
michael@0 306 ; begin epilog
michael@0 307 pop rdi
michael@0 308 pop rsi
michael@0 309 UNSHADOW_ARGS
michael@0 310 pop rbp
michael@0 311 ret
michael@0 312 ; Fill 8 rows of 8 bytes at dst with the constant byte 128 (first 8 bytes of dc_128); above and left are ignored.
michael@0 313 ;void vp8_intra_pred_uv_dc128_mmx(
michael@0 314 ; unsigned char *dst,
michael@0 315 ; int dst_stride,
michael@0 316 ; unsigned char *above,
michael@0 317 ; unsigned char *left,
michael@0 318 ; int left_stride
michael@0 319 ; )
michael@0 320 global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
michael@0 321 sym(vp8_intra_pred_uv_dc128_mmx):
michael@0 322 push rbp
michael@0 323 mov rbp, rsp
michael@0 324 SHADOW_ARGS_TO_STACK 5
michael@0 325 GET_GOT rbx
michael@0 326 ; end prolog
michael@0 327
michael@0 328 ;arg(2), arg(3), arg(4) not used
michael@0 329
michael@0 330 ; write out
michael@0 331 movq mm1, [GLOBAL(dc_128)] ; 8 bytes of 128
michael@0 332 mov rax, arg(0) ;dst;
michael@0 333 movsxd rdx, dword ptr arg(1) ;dst_stride
michael@0 334 lea rcx, [rdx*3] ; rcx = 3*dst_stride
michael@0 335
michael@0 336 movq [rax ], mm1
michael@0 337 movq [rax+rdx ], mm1
michael@0 338 movq [rax+rdx*2], mm1
michael@0 339 movq [rax+rcx ], mm1
michael@0 340 lea rax, [rax+rdx*4] ; rax -> dst row 4
michael@0 341 movq [rax ], mm1
michael@0 342 movq [rax+rdx ], mm1
michael@0 343 movq [rax+rdx*2], mm1
michael@0 344 movq [rax+rcx ], mm1
michael@0 345
michael@0 346 ; begin epilog
michael@0 347 RESTORE_GOT
michael@0 348 UNSHADOW_ARGS
michael@0 349 pop rbp
michael@0 350 ret
michael@0 351 ; 8x8 predictor: dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]); the macro emits an sse2 and an ssse3 variant.
michael@0 352 ;void vp8_intra_pred_uv_tm_sse2(
michael@0 353 ; unsigned char *dst,
michael@0 354 ; int dst_stride,
michael@0 355 ; unsigned char *above,
michael@0 356 ; unsigned char *left,
michael@0 357 ; int left_stride
michael@0 358 ; )
michael@0 359 %macro vp8_intra_pred_uv_tm 1
michael@0 360 global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
michael@0 361 sym(vp8_intra_pred_uv_tm_%1):
michael@0 362 push rbp
michael@0 363 mov rbp, rsp
michael@0 364 SHADOW_ARGS_TO_STACK 5
michael@0 365 GET_GOT rbx
michael@0 366 push rsi
michael@0 367 push rdi
michael@0 368 ; end prolog
michael@0 369
michael@0 370 ; read top row
michael@0 371 mov edx, 4 ; loop counter: 4 iterations, 2 rows each
michael@0 372 mov rsi, arg(2) ;above
michael@0 373 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 374 pxor xmm0, xmm0 ; zero, used to widen bytes to words
michael@0 375 %ifidn %1, ssse3
michael@0 376 movdqa xmm2, [GLOBAL(dc_1024)] ; pshufb mask used to spread byte 0 into every word
michael@0 377 %endif
michael@0 378 movq xmm1, [rsi] ; 8 above bytes
michael@0 379 punpcklbw xmm1, xmm0 ; widen to 8 words
michael@0 380
michael@0 381 ; set up left ptrs and subtract topleft
michael@0 382 movd xmm3, [rsi-1] ; byte 0 = above[-1] (top-left sample)
michael@0 383 mov rsi, arg(3) ;left;
michael@0 384 %ifidn %1, sse2
michael@0 385 punpcklbw xmm3, xmm0
michael@0 386 pshuflw xmm3, xmm3, 0x0
michael@0 387 punpcklqdq xmm3, xmm3 ; top-left broadcast to all 8 words
michael@0 388 %else
michael@0 389 pshufb xmm3, xmm2 ; top-left broadcast to all 8 words
michael@0 390 %endif
michael@0 391 psubw xmm1, xmm3 ; xmm1 = above[c] - above[-1], loop-invariant
michael@0 392
michael@0 393 ; set up dest ptrs
michael@0 394 mov rdi, arg(0) ;dst;
michael@0 395 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 396
michael@0 397 .vp8_intra_pred_uv_tm_%1_loop:
michael@0 398 movd xmm3, [rsi] ; byte 0 = left sample for the first row of the pair
michael@0 399 movd xmm5, [rsi+rax] ; byte 0 = left sample for the second row of the pair
michael@0 400 %ifidn %1, sse2
michael@0 401 punpcklbw xmm3, xmm0
michael@0 402 punpcklbw xmm5, xmm0
michael@0 403 pshuflw xmm3, xmm3, 0x0
michael@0 404 pshuflw xmm5, xmm5, 0x0
michael@0 405 punpcklqdq xmm3, xmm3
michael@0 406 punpcklqdq xmm5, xmm5
michael@0 407 %else
michael@0 408 pshufb xmm3, xmm2
michael@0 409 pshufb xmm5, xmm2
michael@0 410 %endif
michael@0 411 paddw xmm3, xmm1 ; left + (above - topleft), first row
michael@0 412 paddw xmm5, xmm1 ; same for the second row
michael@0 413 packuswb xmm3, xmm5 ; saturate to bytes: low 8 = first row, high 8 = second row
michael@0 414 movq [rdi ], xmm3
michael@0 415 movhps[rdi+rcx], xmm3
michael@0 416 lea rsi, [rsi+rax*2] ; advance left by two rows
michael@0 417 lea rdi, [rdi+rcx*2] ; advance dst by two rows
michael@0 418 dec edx
michael@0 419 jnz .vp8_intra_pred_uv_tm_%1_loop
michael@0 420
michael@0 421 ; begin epilog
michael@0 422 pop rdi
michael@0 423 pop rsi
michael@0 424 RESTORE_GOT
michael@0 425 UNSHADOW_ARGS
michael@0 426 pop rbp
michael@0 427 ret
michael@0 428 %endmacro
michael@0 429
michael@0 430 vp8_intra_pred_uv_tm sse2
michael@0 431 vp8_intra_pred_uv_tm ssse3
michael@0 432 ; Copy the 8 bytes at above into each of 8 rows at dst; left is ignored.
michael@0 433 ;void vp8_intra_pred_uv_ve_mmx(
michael@0 434 ; unsigned char *dst,
michael@0 435 ; int dst_stride,
michael@0 436 ; unsigned char *above,
michael@0 437 ; unsigned char *left,
michael@0 438 ; int left_stride
michael@0 439 ; )
michael@0 440 global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
michael@0 441 sym(vp8_intra_pred_uv_ve_mmx):
michael@0 442 push rbp
michael@0 443 mov rbp, rsp
michael@0 444 SHADOW_ARGS_TO_STACK 5
michael@0 445 ; end prolog
michael@0 446
michael@0 447 ; arg(3), arg(4) not used
michael@0 448
michael@0 449 ; read from top
michael@0 450 mov rax, arg(2) ;above;
michael@0 451
michael@0 452 movq mm1, [rax] ; 8 bytes of the above row
michael@0 453
michael@0 454 ; write out
michael@0 455 mov rax, arg(0) ;dst;
michael@0 456 movsxd rdx, dword ptr arg(1) ;dst_stride
michael@0 457 lea rcx, [rdx*3] ; rcx = 3*dst_stride
michael@0 458
michael@0 459 movq [rax ], mm1
michael@0 460 movq [rax+rdx ], mm1
michael@0 461 movq [rax+rdx*2], mm1
michael@0 462 movq [rax+rcx ], mm1
michael@0 463 lea rax, [rax+rdx*4] ; rax -> dst row 4
michael@0 464 movq [rax ], mm1
michael@0 465 movq [rax+rdx ], mm1
michael@0 466 movq [rax+rdx*2], mm1
michael@0 467 movq [rax+rcx ], mm1
michael@0 468
michael@0 469 ; begin epilog
michael@0 470 UNSHADOW_ARGS
michael@0 471 pop rbp
michael@0 472 ret
michael@0 473 ; 8x8 predictor: every dst row r is filled with 8 copies of left[r]; the macro emits an mmx2 (looped) and an ssse3 (unrolled) variant.
michael@0 474 ;void vp8_intra_pred_uv_ho_mmx2(
michael@0 475 ; unsigned char *dst,
michael@0 476 ; int dst_stride,
michael@0 477 ; unsigned char *above,
michael@0 478 ; unsigned char *left,
michael@0 479 ; int left_stride
michael@0 480 ; )
michael@0 481 %macro vp8_intra_pred_uv_ho 1
michael@0 482 global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
michael@0 483 sym(vp8_intra_pred_uv_ho_%1):
michael@0 484 push rbp
michael@0 485 mov rbp, rsp
michael@0 486 SHADOW_ARGS_TO_STACK 5
michael@0 487 push rsi
michael@0 488 push rdi
michael@0 489 %ifidn %1, ssse3
michael@0 490 %ifndef GET_GOT_SAVE_ARG
michael@0 491 push rbx ; rbx is used as scratch below, so it is saved here; NOTE(review): presumably GET_GOT saves it itself when GET_GOT_SAVE_ARG is defined - confirm in x86_abi_support.asm
michael@0 492 %endif
michael@0 493 GET_GOT rbx
michael@0 494 %endif
michael@0 495 ; end prolog
michael@0 496
michael@0 497 ;arg(2) not used
michael@0 498
michael@0 499 ; read from left and write out
michael@0 500 %ifidn %1, mmx2
michael@0 501 mov edx, 4 ; loop counter: 4 iterations, 2 rows each
michael@0 502 %endif
michael@0 503 mov rsi, arg(3) ;left
michael@0 504 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 505 mov rdi, arg(0) ;dst;
michael@0 506 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 507 %ifidn %1, ssse3
michael@0 508 lea rdx, [rcx*3] ; rdx = 3*dst_stride
michael@0 509 movdqa xmm2, [GLOBAL(dc_00001111)] ; pshufb mask: byte 0 -> low 8 bytes, byte 1 -> high 8 bytes
michael@0 510 lea rbx, [rax*3] ; rbx = 3*left_stride (the only GLOBAL() reference is on the previous line)
michael@0 511 %endif
michael@0 512
michael@0 513 %ifidn %1, mmx2
michael@0 514 .vp8_intra_pred_uv_ho_%1_loop:
michael@0 515 movd mm0, [rsi] ; byte 0 = left sample, first row of the pair
michael@0 516 movd mm1, [rsi+rax] ; byte 0 = left sample, second row of the pair
michael@0 517 punpcklbw mm0, mm0 ; word 0 = sample duplicated in both bytes
michael@0 518 punpcklbw mm1, mm1
michael@0 519 pshufw mm0, mm0, 0x0 ; broadcast word 0: 8 copies of the sample
michael@0 520 pshufw mm1, mm1, 0x0
michael@0 521 movq [rdi ], mm0
michael@0 522 movq [rdi+rcx], mm1
michael@0 523 lea rsi, [rsi+rax*2] ; advance left by two rows
michael@0 524 lea rdi, [rdi+rcx*2] ; advance dst by two rows
michael@0 525 dec edx
michael@0 526 jnz .vp8_intra_pred_uv_ho_%1_loop
michael@0 527 %else
michael@0 528 movd xmm0, [rsi] ; left row 0
michael@0 529 movd xmm3, [rsi+rax] ; left row 1
michael@0 530 movd xmm1, [rsi+rax*2] ; left row 2
michael@0 531 movd xmm4, [rsi+rbx] ; left row 3
michael@0 532 punpcklbw xmm0, xmm3 ; byte 0 = row 0 sample, byte 1 = row 1 sample
michael@0 533 punpcklbw xmm1, xmm4 ; byte 0 = row 2 sample, byte 1 = row 3 sample
michael@0 534 pshufb xmm0, xmm2 ; low 8 bytes = row 0 fill, high 8 bytes = row 1 fill
michael@0 535 pshufb xmm1, xmm2 ; low 8 bytes = row 2 fill, high 8 bytes = row 3 fill
michael@0 536 movq [rdi ], xmm0
michael@0 537 movhps [rdi+rcx], xmm0
michael@0 538 movq [rdi+rcx*2], xmm1
michael@0 539 movhps [rdi+rdx], xmm1
michael@0 540 lea rsi, [rsi+rax*4] ; rsi -> left row 4
michael@0 541 lea rdi, [rdi+rcx*4] ; rdi -> dst row 4
michael@0 542 movd xmm0, [rsi] ; left row 4
michael@0 543 movd xmm3, [rsi+rax] ; left row 5
michael@0 544 movd xmm1, [rsi+rax*2] ; left row 6
michael@0 545 movd xmm4, [rsi+rbx] ; left row 7
michael@0 546 punpcklbw xmm0, xmm3
michael@0 547 punpcklbw xmm1, xmm4
michael@0 548 pshufb xmm0, xmm2
michael@0 549 pshufb xmm1, xmm2
michael@0 550 movq [rdi ], xmm0
michael@0 551 movhps [rdi+rcx], xmm0
michael@0 552 movq [rdi+rcx*2], xmm1
michael@0 553 movhps [rdi+rdx], xmm1
michael@0 554 %endif
michael@0 555
michael@0 556 ; begin epilog
michael@0 557 %ifidn %1, ssse3
michael@0 558 RESTORE_GOT
michael@0 559 %ifndef GET_GOT_SAVE_ARG
michael@0 560 pop rbx
michael@0 561 %endif
michael@0 562 %endif
michael@0 563 pop rdi
michael@0 564 pop rsi
michael@0 565 UNSHADOW_ARGS
michael@0 566 pop rbp
michael@0 567 ret
michael@0 568 %endmacro
michael@0 569
michael@0 570 vp8_intra_pred_uv_ho mmx2
michael@0 571 vp8_intra_pred_uv_ho ssse3
michael@0 572 ; Fill 16 rows of 16 bytes at dst with (sum of 16 above bytes + sum of 16 left bytes + 16) >> 5. above is read and dst rows are written with movdqa, so both must be 16-byte aligned.
michael@0 573 ;void vp8_intra_pred_y_dc_sse2(
michael@0 574 ; unsigned char *dst,
michael@0 575 ; int dst_stride,
michael@0 576 ; unsigned char *above,
michael@0 577 ; unsigned char *left,
michael@0 578 ; int left_stride
michael@0 579 ; )
michael@0 580 global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
michael@0 581 sym(vp8_intra_pred_y_dc_sse2):
michael@0 582 push rbp
michael@0 583 mov rbp, rsp
michael@0 584 SHADOW_ARGS_TO_STACK 5
michael@0 585 push rsi
michael@0 586 push rdi
michael@0 587 ; end prolog
michael@0 588
michael@0 589 ; from top
michael@0 590 mov rdi, arg(2) ;above
michael@0 591 mov rsi, arg(3) ;left
michael@0 592 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 593
michael@0 594 pxor xmm0, xmm0 ; zero operand for psadbw
michael@0 595 movdqa xmm1, [rdi] ; 16 bytes of the above row
michael@0 596 psadbw xmm1, xmm0 ; one partial sum per 8-byte half
michael@0 597 movq xmm2, xmm1 ; xmm2 = low-half sum
michael@0 598 punpckhqdq xmm1, xmm1 ; bring the high-half sum down
michael@0 599 paddw xmm1, xmm2 ; word 0 = sum of all 16 above bytes
michael@0 600
michael@0 601 ; from left
michael@0 602 lea rdi, [rax*3] ; rdi = 3*left_stride
michael@0 603
michael@0 604 movzx ecx, byte [rsi] ; ecx accumulates the left column; rows 0-3
michael@0 605 movzx edx, byte [rsi+rax]
michael@0 606 add ecx, edx
michael@0 607 movzx edx, byte [rsi+rax*2]
michael@0 608 add ecx, edx
michael@0 609 movzx edx, byte [rsi+rdi]
michael@0 610 add ecx, edx
michael@0 611 lea rsi, [rsi+rax*4]
michael@0 612
michael@0 613 movzx edx, byte [rsi] ; rows 4-7
michael@0 614 add ecx, edx
michael@0 615 movzx edx, byte [rsi+rax]
michael@0 616 add ecx, edx
michael@0 617 movzx edx, byte [rsi+rax*2]
michael@0 618 add ecx, edx
michael@0 619 movzx edx, byte [rsi+rdi]
michael@0 620 add ecx, edx
michael@0 621 lea rsi, [rsi+rax*4]
michael@0 622
michael@0 623 movzx edx, byte [rsi] ; rows 8-11
michael@0 624 add ecx, edx
michael@0 625 movzx edx, byte [rsi+rax]
michael@0 626 add ecx, edx
michael@0 627 movzx edx, byte [rsi+rax*2]
michael@0 628 add ecx, edx
michael@0 629 movzx edx, byte [rsi+rdi]
michael@0 630 add ecx, edx
michael@0 631 lea rsi, [rsi+rax*4]
michael@0 632
michael@0 633 movzx edx, byte [rsi] ; rows 12-15
michael@0 634 add ecx, edx
michael@0 635 movzx edx, byte [rsi+rax]
michael@0 636 add ecx, edx
michael@0 637 movzx edx, byte [rsi+rax*2]
michael@0 638 add ecx, edx
michael@0 639 movzx edx, byte [rsi+rdi]
michael@0 640 add ecx, edx
michael@0 641
michael@0 642 ; add up
michael@0 643 pextrw edx, xmm1, 0x0 ; edx = above sum
michael@0 644 lea edx, [edx+ecx+16] ; above + left + 16 (rounding term)
michael@0 645 sar edx, 5 ; divide by 32 samples
michael@0 646 movd xmm1, edx
michael@0 647 ; FIXME use pshufb for ssse3 version
michael@0 648 pshuflw xmm1, xmm1, 0x0
michael@0 649 punpcklqdq xmm1, xmm1 ; DC value in all 8 words
michael@0 650 packuswb xmm1, xmm1 ; narrow to bytes: 16 copies of the DC value
michael@0 651
michael@0 652 ; write out
michael@0 653 mov rsi, 2 ; loop counter: 2 iterations, 8 rows each
michael@0 654 mov rdi, arg(0) ;dst;
michael@0 655 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 656 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 657
michael@0 658 .label: ; trailing colon avoids NASM's orphan-label warning
michael@0 659 movdqa [rdi ], xmm1
michael@0 660 movdqa [rdi+rcx ], xmm1
michael@0 661 movdqa [rdi+rcx*2], xmm1
michael@0 662 movdqa [rdi+rax ], xmm1
michael@0 663 lea rdi, [rdi+rcx*4]
michael@0 664 movdqa [rdi ], xmm1
michael@0 665 movdqa [rdi+rcx ], xmm1
michael@0 666 movdqa [rdi+rcx*2], xmm1
michael@0 667 movdqa [rdi+rax ], xmm1
michael@0 668 lea rdi, [rdi+rcx*4]
michael@0 669 dec rsi
michael@0 670 jnz .label
michael@0 671
michael@0 672 ; begin epilog
michael@0 673 pop rdi
michael@0 674 pop rsi
michael@0 675 UNSHADOW_ARGS
michael@0 676 pop rbp
michael@0 677 ret
michael@0 678 ; Fill 16 rows of 16 bytes at dst with (sum of 16 above bytes + 8) >> 4; left is ignored. above and dst rows are accessed with movdqa (16-byte alignment required).
michael@0 679 ;void vp8_intra_pred_y_dctop_sse2(
michael@0 680 ; unsigned char *dst,
michael@0 681 ; int dst_stride,
michael@0 682 ; unsigned char *above,
michael@0 683 ; unsigned char *left,
michael@0 684 ; int left_stride
michael@0 685 ; )
michael@0 686 global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
michael@0 687 sym(vp8_intra_pred_y_dctop_sse2):
michael@0 688 push rbp
michael@0 689 mov rbp, rsp
michael@0 690 SHADOW_ARGS_TO_STACK 5
michael@0 691 push rsi
michael@0 692 GET_GOT rbx
michael@0 693 ; end prolog
michael@0 694
michael@0 695 ;arg(3), arg(4) not used
michael@0 696
michael@0 697 ; from top
michael@0 698 mov rcx, arg(2) ;above;
michael@0 699 pxor xmm0, xmm0 ; zero operand for psadbw
michael@0 700 movdqa xmm1, [rcx] ; 16 bytes of the above row
michael@0 701 psadbw xmm1, xmm0 ; one partial sum per 8-byte half
michael@0 702 movdqa xmm2, xmm1
michael@0 703 punpckhqdq xmm1, xmm1 ; bring the high-half sum down
michael@0 704 paddw xmm1, xmm2 ; word 0 = sum of all 16 above bytes
michael@0 705
michael@0 706 ; add up
michael@0 707 paddw xmm1, [GLOBAL(dc_8)] ; + 8 (rounding term)
michael@0 708 psraw xmm1, 4 ; divide by 16 samples
michael@0 709 ; FIXME use pshufb for ssse3 version
michael@0 710 pshuflw xmm1, xmm1, 0x0
michael@0 711 punpcklqdq xmm1, xmm1 ; DC value in all 8 words
michael@0 712 packuswb xmm1, xmm1 ; narrow to bytes: 16 copies of the DC value
michael@0 713
michael@0 714 ; write out
michael@0 715 mov rsi, 2 ; loop counter: 2 iterations, 8 rows each
michael@0 716 mov rdx, arg(0) ;dst;
michael@0 717 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 718 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 719
michael@0 720 .label: ; trailing colon avoids NASM's orphan-label warning
michael@0 721 movdqa [rdx ], xmm1
michael@0 722 movdqa [rdx+rcx ], xmm1
michael@0 723 movdqa [rdx+rcx*2], xmm1
michael@0 724 movdqa [rdx+rax ], xmm1
michael@0 725 lea rdx, [rdx+rcx*4]
michael@0 726 movdqa [rdx ], xmm1
michael@0 727 movdqa [rdx+rcx ], xmm1
michael@0 728 movdqa [rdx+rcx*2], xmm1
michael@0 729 movdqa [rdx+rax ], xmm1
michael@0 730 lea rdx, [rdx+rcx*4]
michael@0 731 dec rsi
michael@0 732 jnz .label
michael@0 733
michael@0 734 ; begin epilog
michael@0 735 RESTORE_GOT
michael@0 736 pop rsi
michael@0 737 UNSHADOW_ARGS
michael@0 738 pop rbp
michael@0 739 ret
michael@0 740 ; Fill 16 rows of 16 bytes at dst with (sum of 16 left bytes + 8) >> 4; above is ignored. dst rows are written with movdqa (16-byte alignment required).
michael@0 741 ;void vp8_intra_pred_y_dcleft_sse2(
michael@0 742 ; unsigned char *dst,
michael@0 743 ; int dst_stride,
michael@0 744 ; unsigned char *above,
michael@0 745 ; unsigned char *left,
michael@0 746 ; int left_stride
michael@0 747 ; )
michael@0 748 global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
michael@0 749 sym(vp8_intra_pred_y_dcleft_sse2):
michael@0 750 push rbp
michael@0 751 mov rbp, rsp
michael@0 752 SHADOW_ARGS_TO_STACK 5
michael@0 753 push rsi
michael@0 754 push rdi
michael@0 755 ; end prolog
michael@0 756
michael@0 757 ;arg(2) not used
michael@0 758
michael@0 759 ; from left
michael@0 760 mov rsi, arg(3) ;left;
michael@0 761 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 762
michael@0 763 lea rdi, [rax*3] ; rdi = 3*left_stride
michael@0 764 movzx ecx, byte [rsi] ; ecx accumulates the left column; rows 0-3
michael@0 765 movzx edx, byte [rsi+rax]
michael@0 766 add ecx, edx
michael@0 767 movzx edx, byte [rsi+rax*2]
michael@0 768 add ecx, edx
michael@0 769 movzx edx, byte [rsi+rdi]
michael@0 770 add ecx, edx
michael@0 771 lea rsi, [rsi+rax*4]
michael@0 772 movzx edx, byte [rsi] ; rows 4-7
michael@0 773 add ecx, edx
michael@0 774 movzx edx, byte [rsi+rax]
michael@0 775 add ecx, edx
michael@0 776 movzx edx, byte [rsi+rax*2]
michael@0 777 add ecx, edx
michael@0 778 movzx edx, byte [rsi+rdi]
michael@0 779 add ecx, edx
michael@0 780 lea rsi, [rsi+rax*4]
michael@0 781 movzx edx, byte [rsi] ; rows 8-11
michael@0 782 add ecx, edx
michael@0 783 movzx edx, byte [rsi+rax]
michael@0 784 add ecx, edx
michael@0 785 movzx edx, byte [rsi+rax*2]
michael@0 786 add ecx, edx
michael@0 787 movzx edx, byte [rsi+rdi]
michael@0 788 add ecx, edx
michael@0 789 lea rsi, [rsi+rax*4]
michael@0 790 movzx edx, byte [rsi] ; rows 12-15
michael@0 791 add ecx, edx
michael@0 792 movzx edx, byte [rsi+rax]
michael@0 793 add ecx, edx
michael@0 794 movzx edx, byte [rsi+rax*2]
michael@0 795 add ecx, edx
michael@0 796 movzx edx, byte [rsi+rdi]
michael@0 797 lea edx, [ecx+edx+8] ; final add folded with the +8 rounding term
michael@0 798
michael@0 799 ; add up
michael@0 800 shr edx, 4 ; divide by 16 samples
michael@0 801 movd xmm1, edx
michael@0 802 ; FIXME use pshufb for ssse3 version
michael@0 803 pshuflw xmm1, xmm1, 0x0
michael@0 804 punpcklqdq xmm1, xmm1 ; DC value in all 8 words
michael@0 805 packuswb xmm1, xmm1 ; narrow to bytes: 16 copies of the DC value
michael@0 806
michael@0 807 ; write out
michael@0 808 mov rsi, 2 ; loop counter: 2 iterations, 8 rows each
michael@0 809 mov rdi, arg(0) ;dst;
michael@0 810 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 811 lea rax, [rcx*3] ; rax = 3*dst_stride
michael@0 812
michael@0 813 .label: ; trailing colon avoids NASM's orphan-label warning
michael@0 814 movdqa [rdi ], xmm1
michael@0 815 movdqa [rdi+rcx ], xmm1
michael@0 816 movdqa [rdi+rcx*2], xmm1
michael@0 817 movdqa [rdi+rax ], xmm1
michael@0 818 lea rdi, [rdi+rcx*4]
michael@0 819 movdqa [rdi ], xmm1
michael@0 820 movdqa [rdi+rcx ], xmm1
michael@0 821 movdqa [rdi+rcx*2], xmm1
michael@0 822 movdqa [rdi+rax ], xmm1
michael@0 823 lea rdi, [rdi+rcx*4]
michael@0 824 dec rsi
michael@0 825 jnz .label
michael@0 826
michael@0 827 ; begin epilog
michael@0 828 pop rdi
michael@0 829 pop rsi
michael@0 830 UNSHADOW_ARGS
michael@0 831 pop rbp
michael@0 832 ret
michael@0 833 ; Fill 16 rows of 16 bytes at dst with the constant byte 128 (dc_128); above and left are ignored. dst rows are written with movdqa (16-byte alignment required).
michael@0 834 ;void vp8_intra_pred_y_dc128_sse2(
michael@0 835 ; unsigned char *dst,
michael@0 836 ; int dst_stride,
michael@0 837 ; unsigned char *above,
michael@0 838 ; unsigned char *left,
michael@0 839 ; int left_stride
michael@0 840 ; )
michael@0 841 global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
michael@0 842 sym(vp8_intra_pred_y_dc128_sse2):
michael@0 843 push rbp
michael@0 844 mov rbp, rsp
michael@0 845 SHADOW_ARGS_TO_STACK 5
michael@0 846 push rsi
michael@0 847 GET_GOT rbx
michael@0 848 ; end prolog
michael@0 849
michael@0 850 ;arg(2), arg(3), arg(4) not used
michael@0 851
michael@0 852 ; write out
michael@0 853 mov rsi, 2 ; loop counter: 2 iterations, 8 rows each
michael@0 854 movdqa xmm1, [GLOBAL(dc_128)] ; 16 bytes of 128
michael@0 855 mov rax, arg(0) ;dst;
michael@0 856 movsxd rdx, dword ptr arg(1) ;dst_stride
michael@0 857 lea rcx, [rdx*3] ; rcx = 3*dst_stride
michael@0 858
michael@0 859 .label: ; trailing colon avoids NASM's orphan-label warning
michael@0 860 movdqa [rax ], xmm1
michael@0 861 movdqa [rax+rdx ], xmm1
michael@0 862 movdqa [rax+rdx*2], xmm1
michael@0 863 movdqa [rax+rcx ], xmm1
michael@0 864 lea rax, [rax+rdx*4]
michael@0 865 movdqa [rax ], xmm1
michael@0 866 movdqa [rax+rdx ], xmm1
michael@0 867 movdqa [rax+rdx*2], xmm1
michael@0 868 movdqa [rax+rcx ], xmm1
michael@0 869 lea rax, [rax+rdx*4]
michael@0 870 dec rsi
michael@0 871 jnz .label
michael@0 872
michael@0 873 ; begin epilog
michael@0 874 RESTORE_GOT
michael@0 875 pop rsi
michael@0 876 UNSHADOW_ARGS
michael@0 877 pop rbp
michael@0 878 ret
michael@0 879 ; 16x16 predictor: dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]); the macro emits an sse2 and an ssse3 variant. above and dst rows are accessed with movdqa (16-byte alignment required).
michael@0 880 ;void vp8_intra_pred_y_tm_sse2(
michael@0 881 ; unsigned char *dst,
michael@0 882 ; int dst_stride,
michael@0 883 ; unsigned char *above,
michael@0 884 ; unsigned char *left,
michael@0 885 ; int left_stride
michael@0 886 ; )
michael@0 887 %macro vp8_intra_pred_y_tm 1
michael@0 888 global sym(vp8_intra_pred_y_tm_%1) PRIVATE
michael@0 889 sym(vp8_intra_pred_y_tm_%1):
michael@0 890 push rbp
michael@0 891 mov rbp, rsp
michael@0 892 SHADOW_ARGS_TO_STACK 5
michael@0 893 SAVE_XMM 7
michael@0 894 push rsi
michael@0 895 push rdi
michael@0 896 GET_GOT rbx
michael@0 897 ; end prolog
michael@0 898
michael@0 899 ; read top row
michael@0 900 mov edx, 8 ; loop counter: 8 iterations, 2 rows each
michael@0 901 mov rsi, arg(2) ;above
michael@0 902 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 903 pxor xmm0, xmm0 ; zero, used to widen bytes to words
michael@0 904 %ifidn %1, ssse3
michael@0 905 movdqa xmm3, [GLOBAL(dc_1024)] ; pshufb mask used to spread byte 0 into every word
michael@0 906 %endif
michael@0 907 movdqa xmm1, [rsi] ; 16 above bytes
michael@0 908 movdqa xmm2, xmm1
michael@0 909 punpcklbw xmm1, xmm0 ; xmm1 = above[0..7] as words
michael@0 910 punpckhbw xmm2, xmm0 ; xmm2 = above[8..15] as words
michael@0 911
michael@0 912 ; set up left ptrs and subtract topleft
michael@0 913 movd xmm4, [rsi-1] ; byte 0 = above[-1] (top-left sample)
michael@0 914 mov rsi, arg(3) ;left
michael@0 915 %ifidn %1, sse2
michael@0 916 punpcklbw xmm4, xmm0
michael@0 917 pshuflw xmm4, xmm4, 0x0
michael@0 918 punpcklqdq xmm4, xmm4 ; top-left broadcast to all 8 words
michael@0 919 %else
michael@0 920 pshufb xmm4, xmm3 ; top-left broadcast to all 8 words
michael@0 921 %endif
michael@0 922 psubw xmm1, xmm4 ; above[0..7] - topleft, loop-invariant
michael@0 923 psubw xmm2, xmm4 ; above[8..15] - topleft, loop-invariant
michael@0 924
michael@0 925 ; set up dest ptrs
michael@0 926 mov rdi, arg(0) ;dst;
michael@0 927 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 928 .vp8_intra_pred_y_tm_%1_loop: ; function-local label, matching the uv_tm/uv_ho loops
michael@0 929 movd xmm4, [rsi] ; byte 0 = left sample, first row of the pair
michael@0 930 movd xmm5, [rsi+rax] ; byte 0 = left sample, second row of the pair
michael@0 931 %ifidn %1, sse2
michael@0 932 punpcklbw xmm4, xmm0
michael@0 933 punpcklbw xmm5, xmm0
michael@0 934 pshuflw xmm4, xmm4, 0x0
michael@0 935 pshuflw xmm5, xmm5, 0x0
michael@0 936 punpcklqdq xmm4, xmm4
michael@0 937 punpcklqdq xmm5, xmm5
michael@0 938 %else
michael@0 939 pshufb xmm4, xmm3
michael@0 940 pshufb xmm5, xmm3
michael@0 941 %endif
michael@0 942 movdqa xmm6, xmm4 ; copies for the right 8 columns
michael@0 943 movdqa xmm7, xmm5
michael@0 944 paddw xmm4, xmm1 ; first row, columns 0-7
michael@0 945 paddw xmm6, xmm2 ; first row, columns 8-15
michael@0 946 paddw xmm5, xmm1 ; second row, columns 0-7
michael@0 947 paddw xmm7, xmm2 ; second row, columns 8-15
michael@0 948 packuswb xmm4, xmm6 ; saturate to 16 bytes (first row)
michael@0 949 packuswb xmm5, xmm7 ; saturate to 16 bytes (second row)
michael@0 950 movdqa [rdi ], xmm4
michael@0 951 movdqa [rdi+rcx], xmm5
michael@0 952 lea rsi, [rsi+rax*2] ; advance left by two rows
michael@0 953 lea rdi, [rdi+rcx*2] ; advance dst by two rows
michael@0 954 dec edx
michael@0 955 jnz .vp8_intra_pred_y_tm_%1_loop
michael@0 956
michael@0 957 ; begin epilog
michael@0 958 RESTORE_GOT
michael@0 959 pop rdi
michael@0 960 pop rsi
michael@0 961 RESTORE_XMM
michael@0 962 UNSHADOW_ARGS
michael@0 963 pop rbp
michael@0 964 ret
michael@0 965 %endmacro
michael@0 966
michael@0 967 vp8_intra_pred_y_tm sse2
michael@0 968 vp8_intra_pred_y_tm ssse3
michael@0 969 ; Copy the 16 bytes at above into each of 16 rows at dst; left is ignored. above and dst rows are accessed with movdqa (16-byte alignment required).
michael@0 970 ;void vp8_intra_pred_y_ve_sse2(
michael@0 971 ; unsigned char *dst,
michael@0 972 ; int dst_stride,
michael@0 973 ; unsigned char *above,
michael@0 974 ; unsigned char *left,
michael@0 975 ; int left_stride
michael@0 976 ; )
michael@0 977 global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
michael@0 978 sym(vp8_intra_pred_y_ve_sse2):
michael@0 979 push rbp
michael@0 980 mov rbp, rsp
michael@0 981 SHADOW_ARGS_TO_STACK 5
michael@0 982 push rsi
michael@0 983 ; end prolog
michael@0 984
michael@0 985 ;arg(3), arg(4) not used
michael@0 986
michael@0 987 mov rax, arg(2) ;above;
michael@0 988 mov rsi, 2 ; loop counter: 2 iterations, 8 rows each
michael@0 989 movsxd rdx, dword ptr arg(1) ;dst_stride
michael@0 990
michael@0 991 ; read from top
michael@0 992 movdqa xmm1, [rax] ; 16 bytes of the above row
michael@0 993
michael@0 994 ; write out
michael@0 995 mov rax, arg(0) ;dst;
michael@0 996 lea rcx, [rdx*3] ; rcx = 3*dst_stride
michael@0 997
michael@0 998 .label: ; trailing colon avoids NASM's orphan-label warning
michael@0 999 movdqa [rax ], xmm1
michael@0 1000 movdqa [rax+rdx ], xmm1
michael@0 1001 movdqa [rax+rdx*2], xmm1
michael@0 1002 movdqa [rax+rcx ], xmm1
michael@0 1003 lea rax, [rax+rdx*4]
michael@0 1004 movdqa [rax ], xmm1
michael@0 1005 movdqa [rax+rdx ], xmm1
michael@0 1006 movdqa [rax+rdx*2], xmm1
michael@0 1007 movdqa [rax+rcx ], xmm1
michael@0 1008 lea rax, [rax+rdx*4]
michael@0 1009 dec rsi
michael@0 1010 jnz .label
michael@0 1011
michael@0 1012 ; begin epilog
michael@0 1013 pop rsi
michael@0 1014 UNSHADOW_ARGS
michael@0 1015 pop rbp
michael@0 1016 ret
michael@0 1017 ; 16x16 predictor: every dst row r is filled with 16 copies of left[r]; above is ignored. dst rows are written with movdqa (16-byte alignment required).
michael@0 1018 ;void vp8_intra_pred_y_ho_sse2(
michael@0 1019 ; unsigned char *dst,
michael@0 1020 ; int dst_stride,
michael@0 1021 ; unsigned char *above,
michael@0 1022 ; unsigned char *left,
michael@0 1023 ; int left_stride
michael@0 1024 ; )
michael@0 1025 global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
michael@0 1026 sym(vp8_intra_pred_y_ho_sse2):
michael@0 1027 push rbp
michael@0 1028 mov rbp, rsp
michael@0 1029 SHADOW_ARGS_TO_STACK 5
michael@0 1030 push rsi
michael@0 1031 push rdi
michael@0 1032 ; end prolog
michael@0 1033
michael@0 1034 ;arg(2) not used
michael@0 1035
michael@0 1036 ; read from left and write out
michael@0 1037 mov edx, 8 ; loop counter: 8 iterations, 2 rows each
michael@0 1038 mov rsi, arg(3) ;left;
michael@0 1039 movsxd rax, dword ptr arg(4) ;left_stride;
michael@0 1040 mov rdi, arg(0) ;dst;
michael@0 1041 movsxd rcx, dword ptr arg(1) ;dst_stride
michael@0 1042
michael@0 1043 .vp8_intra_pred_y_ho_sse2_loop: ; function-local label, matching the uv_ho loop
michael@0 1044 movd xmm0, [rsi] ; byte 0 = left sample, first row of the pair
michael@0 1045 movd xmm1, [rsi+rax] ; byte 0 = left sample, second row of the pair
michael@0 1046 ; FIXME use pshufb for ssse3 version
michael@0 1047 punpcklbw xmm0, xmm0 ; word 0 = sample duplicated in both bytes
michael@0 1048 punpcklbw xmm1, xmm1
michael@0 1049 pshuflw xmm0, xmm0, 0x0 ; broadcast word 0 across the low 4 words
michael@0 1050 pshuflw xmm1, xmm1, 0x0
michael@0 1051 punpcklqdq xmm0, xmm0 ; duplicate low half: 16 copies of the sample
michael@0 1052 punpcklqdq xmm1, xmm1
michael@0 1053 movdqa [rdi ], xmm0
michael@0 1054 movdqa [rdi+rcx], xmm1
michael@0 1055 lea rsi, [rsi+rax*2] ; advance left by two rows
michael@0 1056 lea rdi, [rdi+rcx*2] ; advance dst by two rows
michael@0 1057 dec edx
michael@0 1058 jnz .vp8_intra_pred_y_ho_sse2_loop
michael@0 1059
michael@0 1060 ; begin epilog
michael@0 1061 pop rdi
michael@0 1062 pop rsi
michael@0 1063 UNSHADOW_ARGS
michael@0 1064 pop rbp
michael@0 1065 ret
michael@0 1066 ; Read-only constants referenced through GLOBAL() by the predictors in this file.
michael@0 1067 SECTION_RODATA
michael@0 1068 align 16
michael@0 1069 dc_128:
michael@0 1070 times 16 db 128 ; fill value for the dc128 predictors (first 8 bytes via movq, all 16 via movdqa)
michael@0 1071 dc_4:
michael@0 1072 times 4 dw 4 ; rounding term added with an MMX paddw in uv_dctop
michael@0 1073 align 16
michael@0 1074 dc_8:
michael@0 1075 times 8 dw 8 ; rounding term added with an SSE2 paddw in y_dctop
michael@0 1076 align 16
michael@0 1077 dc_1024:
michael@0 1078 times 8 dw 0x400 ; pshufb mask bytes 00,04 per word: source byte 0 -> low byte, source byte 4 -> high byte
michael@0 1079 align 16
michael@0 1080 dc_00001111:
michael@0 1081 times 8 db 0 ; pshufb mask: low 8 result bytes = source byte 0
michael@0 1082 times 8 db 1 ; pshufb mask: high 8 result bytes = source byte 1

mercurial