media/libvpx/vp8/common/x86/postproc_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ; FIRST_2_ROWS: first half of the deblock kernel (xmm0 = centre pixels, xmm1/xmm3 = two neighbours)
michael@0 15 %macro FIRST_2_ROWS 0
michael@0 16 ; leaves: xmm5 = avg(xmm1, xmm3), xmm7 = per-byte "keep original" mask, xmm0 untouched
michael@0 17 movdqa xmm5, xmm1
michael@0 18 pavgb xmm5, xmm3 ; rounded average of the two neighbours
michael@0 19 ; xmm4 = |xmm0 - xmm1|, built from the two saturating differences
michael@0 20 movdqa xmm4, xmm0
michael@0 21 psubusb xmm4, xmm1
michael@0 22 psubusb xmm1, xmm0
michael@0 23 paddusb xmm4, xmm1
michael@0 24 ; xmm6 = |xmm0 - xmm3|, same construction
michael@0 25 movdqa xmm6, xmm0
michael@0 26 psubusb xmm6, xmm3
michael@0 27 psubusb xmm3, xmm0
michael@0 28 paddusb xmm6, xmm3
michael@0 29
michael@0 30 ; a byte of (flimit -sat |diff|) is zero exactly when |diff| >= flimit
michael@0 31 pxor xmm1, xmm1
michael@0 32 movdqa xmm2, flimit
michael@0 33 movdqa xmm7, xmm2
michael@0 34 psubusb xmm2, xmm4
michael@0 35 pcmpeqb xmm2, xmm1 ; 0xFF where the first neighbour differs too much
michael@0 36 psubusb xmm7, xmm6
michael@0 37 pcmpeqb xmm7, xmm1 ; 0xFF where the second neighbour differs too much
michael@0 38 ; merge both tests into the running mask
michael@0 39 por xmm7, xmm2
michael@0 40 %endmacro
michael@0 41
michael@0 42 %macro SECOND_2_ROWS 0
michael@0 43 ; second half of the kernel: expects xmm5/xmm7 from FIRST_2_ROWS and two new neighbours in xmm1/xmm3
michael@0 44 movdqa xmm2, xmm1 ; keep an unaveraged copy of the first neighbour
michael@0 45 pavgb xmm1, xmm3 ; average of the two new neighbours
michael@0 46 pavgb xmm5, xmm1 ; fold it into the average of the first pair
michael@0 47 pavgb xmm5, xmm0 ; and finally average with the centre pixels
michael@0 48 ; xmm6 = |xmm0 - first neighbour|
michael@0 49 movdqa xmm6, xmm0
michael@0 50 psubusb xmm6, xmm2
michael@0 51 psubusb xmm2, xmm0
michael@0 52 paddusb xmm6, xmm2
michael@0 53 ; xmm4 = |xmm0 - second neighbour|
michael@0 54 movdqa xmm4, xmm0
michael@0 55 psubusb xmm4, xmm3
michael@0 56 psubusb xmm3, xmm0
michael@0 57 paddusb xmm4, xmm3
michael@0 58
michael@0 59 ; threshold test: a mask byte becomes 0xFF where |diff| >= flimit
michael@0 60 pxor xmm1, xmm1
michael@0 61 movdqa xmm2, flimit
michael@0 62 movdqa xmm3, xmm2
michael@0 63 psubusb xmm2, xmm6
michael@0 64 pcmpeqb xmm2, xmm1
michael@0 65 por xmm7, xmm2
michael@0 66 psubusb xmm3, xmm4
michael@0 67 pcmpeqb xmm3, xmm1
michael@0 68 por xmm7, xmm3
michael@0 69
michael@0 70 ; per-byte select between the original and the filtered value:
michael@0 71 ; mask set -> keep the original byte from xmm0
michael@0 72 ; mask clear -> take the averaged byte from xmm5
michael@0 73 ; the two halves never overlap, so the saturating add acts as an OR
michael@0 74 ; result is left in xmm0
michael@0 75 pand xmm0, xmm7
michael@0 76 pandn xmm7, xmm5
michael@0 77 paddusb xmm0, xmm7
michael@0 78 %endmacro
michael@0 79
michael@0 80 %macro UPDATE_FLIMIT 0
michael@0 81 movdqa xmm2, XMMWORD PTR [rbx] ; fetch the next 16 threshold bytes (aligned load through rbx)
michael@0 82 add rbx, 16 ; step the cursor to the thresholds of the following 16 columns
michael@0 83 movdqa [rsp], xmm2 ; park them in the 16-byte slot at the top of the stack
michael@0 84 %endmacro
michael@0 85
michael@0 86 ;void vp8_post_proc_down_and_across_mb_row_sse2
michael@0 87 ;(
michael@0 88 ; unsigned char *src_ptr, ; first source row; rows -2..+2 around each row are read
michael@0 89 ; unsigned char *dst_ptr, ; first destination row; 8 bytes either side of it are written as borders
michael@0 90 ; int src_pixels_per_line,
michael@0 91 ; int dst_pixels_per_line,
michael@0 92 ; int cols,
michael@0 93 ; int *flimits, ; thresholds, fetched 16 bytes per 16 columns with an aligned load
michael@0 94 ; int size ; number of rows to process
michael@0 95 ;) Per row: a vertical pass src -> dst, then a horizontal pass in place on dst.
michael@0 96 global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
michael@0 97 sym(vp8_post_proc_down_and_across_mb_row_sse2):
michael@0 98 push rbp
michael@0 99 mov rbp, rsp
michael@0 100 SHADOW_ARGS_TO_STACK 7
michael@0 101 SAVE_XMM 7
michael@0 102 push rbx
michael@0 103 push rsi
michael@0 104 push rdi
michael@0 105 ; end prolog
michael@0 106 ALIGN_STACK 16, rax
michael@0 107 sub rsp, 16 ; 16-byte scratch slot for the current thresholds
michael@0 108
michael@0 109 ; put flimit on stack
michael@0 110 mov rbx, arg(5) ;flimits ptr
michael@0 111 UPDATE_FLIMIT
michael@0 112
michael@0 113 %define flimit [rsp]
michael@0 114
michael@0 115 mov rsi, arg(0) ;src_ptr
michael@0 116 mov rdi, arg(1) ;dst_ptr
michael@0 117
michael@0 118 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
michael@0 119 movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
michael@0 120 .nextrow:
michael@0 121 xor rdx, rdx ;col
michael@0 122 .nextcol:
michael@0 123 ;load current and next 2 rows
michael@0 124 movdqu xmm0, XMMWORD PTR [rsi]
michael@0 125 movdqu xmm1, XMMWORD PTR [rsi + rax]
michael@0 126 movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
michael@0 127
michael@0 128 FIRST_2_ROWS
michael@0 129
michael@0 130 ;load above 2 rows
michael@0 131 neg rax ; rax = -stride for the two rows above
michael@0 132 movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
michael@0 133 movdqu xmm3, XMMWORD PTR [rsi + rax]
michael@0 134
michael@0 135 SECOND_2_ROWS
michael@0 136
michael@0 137 movdqu XMMWORD PTR [rdi], xmm0 ; vertical result goes to the destination row
michael@0 138
michael@0 139 neg rax ; back to the original stride sign
michael@0 140 add rsi, 16
michael@0 141 add rdi, 16
michael@0 142
michael@0 143 add rdx, 16
michael@0 144 cmp edx, dword arg(4) ;cols
michael@0 145 jge .downdone
michael@0 146 UPDATE_FLIMIT
michael@0 147 jmp .nextcol
michael@0 148
michael@0 149 .downdone:
michael@0 150 ; done with all the cols, rewind and start the across filtering in place
michael@0 151 sub rsi, rdx
michael@0 152 sub rdi, rdx
michael@0 153
michael@0 154 mov rbx, arg(5) ; flimits
michael@0 155 UPDATE_FLIMIT
michael@0 156
michael@0 157 ; dup the first byte into the left border 8 times
michael@0 158 movq mm1, [rdi]
michael@0 159 punpcklbw mm1, mm1
michael@0 160 punpcklwd mm1, mm1
michael@0 161 punpckldq mm1, mm1
michael@0 162 mov rdx, -8
michael@0 163 movq [rdi+rdx], mm1
michael@0 164
michael@0 165 ; dup the last byte into the right border
michael@0 166 movsxd rdx, dword arg(4)
michael@0 167 movq mm1, [rdi + rdx + -1]
michael@0 168 punpcklbw mm1, mm1
michael@0 169 punpcklwd mm1, mm1
michael@0 170 punpckldq mm1, mm1
michael@0 171 movq [rdi+rdx], mm1
michael@0 172
michael@0 173 xor rdx, rdx
michael@0 174 movq mm0, QWORD PTR [rdi-16]; ; prime the delay line so the first two stores rewrite what is already there
michael@0 175 movq mm1, QWORD PTR [rdi-8];
michael@0 176
michael@0 177 .acrossnextcol:
michael@0 178 movdqu xmm0, XMMWORD PTR [rdi + rdx]
michael@0 179 movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
michael@0 180 movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
michael@0 181
michael@0 182 FIRST_2_ROWS
michael@0 183
michael@0 184 movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
michael@0 185 movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
michael@0 186
michael@0 187 SECOND_2_ROWS
michael@0 188
michael@0 189 movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes (delayed so the next block still reads unfiltered pixels)
michael@0 190 movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
michael@0 191 movdq2q mm0, xmm0 ; low half of this block's result
michael@0 192 psrldq xmm0, 8
michael@0 193 movdq2q mm1, xmm0 ; high half of this block's result
michael@0 194
michael@0 195 add rdx, 16
michael@0 196 cmp edx, dword arg(4) ;cols
michael@0 197 jge .acrossdone
michael@0 198 UPDATE_FLIMIT
michael@0 199 jmp .acrossnextcol
michael@0 200
michael@0 201 .acrossdone:
michael@0 202 ; last 16 pixels
michael@0 203 movq QWORD PTR [rdi+rdx-16], mm0
michael@0 204
michael@0 205 cmp edx, dword arg(4) ; rdx overshoots cols when cols is not a multiple of 16
michael@0 206 jne .throw_last_8
michael@0 207 movq QWORD PTR [rdi+rdx-8], mm1
michael@0 208 .throw_last_8:
michael@0 209 ; done with this row
michael@0 210 add rsi,rax ;next src line
michael@0 211 movsxd rax, dword arg(3) ;dst_pixels_per_line, sign-extended so a negative stride steps correctly
michael@0 212 add rdi,rax ;next destination
michael@0 213 movsxd rax, dword arg(2) ;src_pixels_per_line, sign-extended like the load before .nextrow
michael@0 214
michael@0 215 mov rbx, arg(5) ;flimits
michael@0 216 UPDATE_FLIMIT
michael@0 217
michael@0 218 dec rcx ;decrement count
michael@0 219 jnz .nextrow ;next row
michael@0 220
michael@0 221 add rsp, 16
michael@0 222 pop rsp
michael@0 223 ; begin epilog
michael@0 224 pop rdi
michael@0 225 pop rsi
michael@0 226 pop rbx
michael@0 227 RESTORE_XMM
michael@0 228 UNSHADOW_ARGS
michael@0 229 pop rbp
michael@0 230 ret
michael@0 231 %undef flimit
michael@0 232
michael@0 233 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
michael@0 234 ; int pitch, int rows, int cols,int flimit)
michael@0 235 extern sym(vp8_rv)
michael@0 236 global sym(vp8_mbpost_proc_down_xmm) PRIVATE
michael@0 237 sym(vp8_mbpost_proc_down_xmm):
michael@0 238 push rbp
michael@0 239 mov rbp, rsp
michael@0 240 SHADOW_ARGS_TO_STACK 5
michael@0 241 SAVE_XMM 7
michael@0 242 GET_GOT rbx
michael@0 243 push rsi
michael@0 244 push rdi
michael@0 245 ; end prolog
michael@0 246
michael@0 247 ALIGN_STACK 16, rax
michael@0 248 sub rsp, 128+16
michael@0 249
michael@0 250 ; unsigned char d[16][8] at [rsp] (ring buffer of filtered rows, written back 8 rows late)
michael@0 251 ; create flimit4 (flimit replicated into 4 dwords) at [rsp+128]
michael@0 252 mov eax, dword ptr arg(4) ;flimit
michael@0 253 mov [rsp+128], eax
michael@0 254 mov [rsp+128+4], eax
michael@0 255 mov [rsp+128+8], eax
michael@0 256 mov [rsp+128+12], eax
michael@0 257 %define flimit4 [rsp+128]
michael@0 258
michael@0 259 %if ABI_IS_32BIT=0
michael@0 260 lea r8, [GLOBAL(sym(vp8_rv))]
michael@0 261 %endif
michael@0 262
michael@0 263 ;rows +=8; (the row loop runs 8 extra iterations to flush the delayed stores)
michael@0 264 add dword arg(2), 8
michael@0 265
michael@0 266 ;for(c=0; c<cols; c+=8)
michael@0 267 .loop_col:
michael@0 268 mov rsi, arg(0) ; s
michael@0 269 pxor xmm0, xmm0 ; zero register used for unpacking
michael@0 270
michael@0 271 movsxd rax, dword ptr arg(1) ;pitch ;
michael@0 272
michael@0 273 ; this copies the last row down into the border 8 rows
michael@0 274 mov rdi, rsi
michael@0 275 movsxd rdx, dword arg(2) ; rows is an int: load 32 bits and sign-extend instead of reading the whole slot
michael@0 276 sub rdx, 9 ; rows was bumped by 8 above, so this is (original rows - 1)
michael@0 277 imul rdx, rax
michael@0 278 lea rdi, [rdi+rdx]
michael@0 279 movq xmm1, QWORD ptr[rdi] ; last row
michael@0 280 mov rcx, 8
michael@0 281 .init_borderd: ; initialize borders
michael@0 282 lea rdi, [rdi + rax]
michael@0 283 movq [rdi], xmm1
michael@0 284
michael@0 285 dec rcx
michael@0 286 jne .init_borderd
michael@0 287
michael@0 288 neg rax ; rax = -pitch
michael@0 289
michael@0 290 ; this copies the first row up into the border 8 rows
michael@0 291 mov rdi, rsi
michael@0 292 movq xmm1, QWORD ptr[rdi] ; first row
michael@0 293 mov rcx, 8
michael@0 294 .init_border: ; initialize borders
michael@0 295 lea rdi, [rdi + rax]
michael@0 296 movq [rdi], xmm1
michael@0 297
michael@0 298 dec rcx
michael@0 299 jne .init_border
michael@0 300
michael@0 301
michael@0 302
michael@0 303 lea rsi, [rsi + rax*8]; ; rsi = &s[-pitch*8]
michael@0 304 neg rax
michael@0 305
michael@0 306 pxor xmm5, xmm5 ; running sum, 8 words (one per column)
michael@0 307 pxor xmm6, xmm6 ; running sum of squares, columns 0-3 as dwords
michael@0 308
michael@0 309 pxor xmm7, xmm7 ; running sum of squares, columns 4-7 as dwords
michael@0 310 mov rdi, rsi
michael@0 311
michael@0 312 mov rcx, 15 ; prime the window with the 15 rows s[-8*pitch] .. s[6*pitch]
michael@0 313
michael@0 314 .loop_initvar:
michael@0 315 movq xmm1, QWORD PTR [rdi];
michael@0 316 punpcklbw xmm1, xmm0 ;
michael@0 317
michael@0 318 paddw xmm5, xmm1 ;
michael@0 319 pmullw xmm1, xmm1 ;
michael@0 320
michael@0 321 movdqa xmm2, xmm1 ;
michael@0 322 punpcklwd xmm1, xmm0 ;
michael@0 323
michael@0 324 punpckhwd xmm2, xmm0 ;
michael@0 325 paddd xmm6, xmm1 ;
michael@0 326
michael@0 327 paddd xmm7, xmm2 ;
michael@0 328 lea rdi, [rdi+rax] ;
michael@0 329
michael@0 330 dec rcx
michael@0 331 jne .loop_initvar
michael@0 332 ;save the var and sum
michael@0 333 xor rdx, rdx ; rdx = row counter
michael@0 334 .loop_row:
michael@0 335 movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] row leaving the window
michael@0 336 movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] row entering the window
michael@0 337
michael@0 338 punpcklbw xmm1, xmm0
michael@0 339 punpcklbw xmm2, xmm0
michael@0 340
michael@0 341 paddw xmm5, xmm2
michael@0 342 psubw xmm5, xmm1
michael@0 343
michael@0 344 pmullw xmm2, xmm2
michael@0 345 movdqa xmm4, xmm2
michael@0 346
michael@0 347 punpcklwd xmm2, xmm0
michael@0 348 punpckhwd xmm4, xmm0
michael@0 349
michael@0 350 paddd xmm6, xmm2
michael@0 351 paddd xmm7, xmm4
michael@0 352
michael@0 353 pmullw xmm1, xmm1
michael@0 354 movdqa xmm2, xmm1
michael@0 355
michael@0 356 punpcklwd xmm1, xmm0
michael@0 357 psubd xmm6, xmm1
michael@0 358
michael@0 359 punpckhwd xmm2, xmm0
michael@0 360 psubd xmm7, xmm2
michael@0 361
michael@0 362 ; xmm3/xmm4 = sumsq*15 - sum*sum - flimit for columns 0-3 / 4-7
michael@0 363 movdqa xmm3, xmm6
michael@0 364 pslld xmm3, 4
michael@0 365
michael@0 366 psubd xmm3, xmm6
michael@0 367 movdqa xmm1, xmm5
michael@0 368
michael@0 369 movdqa xmm4, xmm5
michael@0 370 pmullw xmm1, xmm1
michael@0 371
michael@0 372 pmulhw xmm4, xmm4
michael@0 373 movdqa xmm2, xmm1
michael@0 374
michael@0 375 punpcklwd xmm1, xmm4
michael@0 376 punpckhwd xmm2, xmm4
michael@0 377
michael@0 378 movdqa xmm4, xmm7
michael@0 379 pslld xmm4, 4
michael@0 380
michael@0 381 psubd xmm4, xmm7
michael@0 382
michael@0 383 psubd xmm3, xmm1
michael@0 384 psubd xmm4, xmm2
michael@0 385
michael@0 386 psubd xmm3, flimit4
michael@0 387 psubd xmm4, flimit4
michael@0 388
michael@0 389 psrad xmm3, 31 ; all ones where the test value is negative, i.e. the column gets filtered
michael@0 390 psrad xmm4, 31
michael@0 391
michael@0 392 packssdw xmm3, xmm4
michael@0 393 packsswb xmm3, xmm0 ; byte mask in the low 8 bytes
michael@0 394
michael@0 395 movq xmm1, QWORD PTR [rsi+rax*8] ; pixels of the row at the centre of the window
michael@0 396
michael@0 397 movq xmm2, xmm1 ; unfiltered copy
michael@0 398 punpcklbw xmm1, xmm0
michael@0 399
michael@0 400 paddw xmm1, xmm5
michael@0 401 mov rcx, rdx
michael@0 402
michael@0 403 and rcx, 127
michael@0 404 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 405 push rax
michael@0 406 lea rax, [GLOBAL(sym(vp8_rv))]
michael@0 407 movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
michael@0 408 pop rax
michael@0 409 %elif ABI_IS_32BIT=0
michael@0 410 movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
michael@0 411 %else
michael@0 412 movdqu xmm4, [sym(vp8_rv) + rcx*2]
michael@0 413 %endif
michael@0 414
michael@0 415 paddw xmm1, xmm4 ; add the 8 words read from vp8_rv in place of a fixed rounding term
michael@0 416 ;paddw xmm1, eight8s
michael@0 417 psraw xmm1, 4
michael@0 418
michael@0 419 packuswb xmm1, xmm0
michael@0 420 pand xmm1, xmm3 ; filtered bytes where the mask is set
michael@0 421
michael@0 422 pandn xmm3, xmm2 ; original bytes elsewhere
michael@0 423 por xmm1, xmm3
michael@0 424
michael@0 425 and rcx, 15
michael@0 426 movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
michael@0 427
michael@0 428 mov rcx, rdx
michael@0 429 sub rcx, 8
michael@0 430
michael@0 431 and rcx, 15
michael@0 432 movq mm0, [rsp + rcx*8] ;d[rcx*8] entry written 8 iterations ago
michael@0 433
michael@0 434 movq [rsi], mm0 ; write it back 8 rows behind the row being filtered
michael@0 435 lea rsi, [rsi+rax]
michael@0 436
michael@0 437 lea rdi, [rdi+rax]
michael@0 438 add rdx, 1
michael@0 439
michael@0 440 cmp edx, dword arg(2) ;rows
michael@0 441 jl .loop_row
michael@0 442 mov rcx, 8 ; rcx is free here; .loop_col reloads it before use
michael@0 443 add arg(0), rcx ; s += 8 at full pointer width (a dword add drops the carry on 64-bit)
michael@0 444 sub dword arg(3), 8 ; cols -= 8
michael@0 445 cmp dword arg(3), 0
michael@0 446 jg .loop_col
michael@0 447
michael@0 448 add rsp, 128+16
michael@0 449 pop rsp
michael@0 450
michael@0 451 ; begin epilog
michael@0 452 pop rdi
michael@0 453 pop rsi
michael@0 454 RESTORE_GOT
michael@0 455 RESTORE_XMM
michael@0 456 UNSHADOW_ARGS
michael@0 457 pop rbp
michael@0 458 ret
michael@0 459 %undef flimit4
michael@0 460
michael@0 461
michael@0 462 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
michael@0 463 ; int pitch, int rows, int cols,int flimit)
michael@0 464 global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
michael@0 465 sym(vp8_mbpost_proc_across_ip_xmm):
michael@0 466 push rbp
michael@0 467 mov rbp, rsp
michael@0 468 SHADOW_ARGS_TO_STACK 5
michael@0 469 SAVE_XMM 7
michael@0 470 GET_GOT rbx
michael@0 471 push rsi
michael@0 472 push rdi
michael@0 473 ; end prolog
michael@0 474
michael@0 475 ALIGN_STACK 16, rax
michael@0 476 sub rsp, 16
michael@0 477
michael@0 478 ; create flimit4 at [rsp]
michael@0 479 mov eax, dword ptr arg(4) ;flimit
michael@0 480 mov [rsp], eax
michael@0 481 mov [rsp+4], eax
michael@0 482 mov [rsp+8], eax
michael@0 483 mov [rsp+12], eax
michael@0 484 %define flimit4 [rsp]
michael@0 485
michael@0 486
michael@0 487 ;for(r=0;r<rows;r++)
michael@0 488 .ip_row_loop:
michael@0 489
michael@0 490 xor rdx, rdx ;sumsq=0;
michael@0 491 xor rcx, rcx ;sum=0;
michael@0 492 mov rsi, arg(0); s
michael@0 493
michael@0 494
michael@0 495 ; dup the first byte into the left border 8 times
michael@0 496 movq mm1, [rsi]
michael@0 497 punpcklbw mm1, mm1
michael@0 498 punpcklwd mm1, mm1
michael@0 499 punpckldq mm1, mm1
michael@0 500
michael@0 501 mov rdi, -8
michael@0 502 movq [rsi+rdi], mm1
michael@0 503
michael@0 504 ; dup the last byte into the right border
michael@0 505 movsxd rax, dword arg(3) ; cols goes in rax (overwritten below anyway) so rdx keeps sumsq = 0
michael@0 506 movq mm1, [rsi + rax + -1]
michael@0 507 punpcklbw mm1, mm1
michael@0 508 punpcklwd mm1, mm1
michael@0 509 punpckldq mm1, mm1
michael@0 510 movq [rsi+rax], mm1
michael@0 511
michael@0 512 .ip_var_loop:
michael@0 513 ;for(i=-8;i<=6;i++)
michael@0 514 ;{
michael@0 515 ; sumsq += s[i]*s[i];
michael@0 516 ; sum += s[i];
michael@0 517 ;}
michael@0 518 movzx eax, byte [rsi+rdi]
michael@0 519 add ecx, eax
michael@0 520 mul al ; ax = s[i]*s[i]; the upper half of eax is already zero
michael@0 521 add edx, eax
michael@0 522 add rdi, 1
michael@0 523 cmp rdi, 6
michael@0 524 jle .ip_var_loop
michael@0 525
michael@0 526
michael@0 527 ;mov rax, sumsq
michael@0 528 ;movd xmm7, rax
michael@0 529 movd xmm7, edx
michael@0 530
michael@0 531 ;mov rax, sum
michael@0 532 ;movd xmm6, rax
michael@0 533 movd xmm6, ecx
michael@0 534
michael@0 535 mov rsi, arg(0) ;s
michael@0 536 xor rcx, rcx ; rcx = column index, 4 pixels per iteration
michael@0 537
michael@0 538 movsxd rdx, dword arg(3) ;cols
michael@0 539 add rdx, 8 ; run 8 columns past the end to flush the delayed stores
michael@0 540 pxor mm0, mm0
michael@0 541 pxor mm1, mm1
michael@0 542
michael@0 543 pxor xmm0, xmm0
michael@0 544 .nextcol4:
michael@0 545
michael@0 546 movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
michael@0 547 movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
michael@0 548
michael@0 549 punpcklbw xmm1, xmm0 ; expanding
michael@0 550 punpcklbw xmm2, xmm0 ; expanding
michael@0 551
michael@0 552 punpcklwd xmm1, xmm0 ; expanding to dwords
michael@0 553 punpcklwd xmm2, xmm0 ; expanding to dwords
michael@0 554
michael@0 555 psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
michael@0 556 paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
michael@0 557
michael@0 558 paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
michael@0 559 pmaddwd xmm1, xmm2 ; (a+b)*(a-b) = a*a - b*b: change in sumsq for each of the 4 window steps
michael@0 560
michael@0 561 paddd xmm6, xmm2
michael@0 562 paddd xmm7, xmm1
michael@0 563
michael@0 564 pshufd xmm6, xmm6, 0 ; broadcast lane 0 (sum after the first step) to all lanes
michael@0 565 pshufd xmm7, xmm7, 0 ; broadcast lane 0 (sumsq after the first step) to all lanes
michael@0 566
michael@0 567 psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
michael@0 568 psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
michael@0 569
michael@0 570 pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 (sumsq deltas)
michael@0 571 pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 (sum deltas)
michael@0 572
michael@0 573 paddd xmm6, xmm4
michael@0 574 paddd xmm7, xmm3
michael@0 575
michael@0 576 pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 (sumsq deltas)
michael@0 577 pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 (sum deltas)
michael@0 578
michael@0 579 paddd xmm7, xmm3
michael@0 580 paddd xmm6, xmm4
michael@0 581
michael@0 582 pshufd xmm3, xmm1, 10111111b ; 0000 0000 0000 10--5 (sumsq delta)
michael@0 583 pshufd xmm4, xmm2, 10111111b ; 0000 0000 0000 10--5 (sum delta)
michael@0 584
michael@0 585 paddd xmm7, xmm3
michael@0 586 paddd xmm6, xmm4
michael@0 587
michael@0 588 movdqa xmm3, xmm6
michael@0 589 pmaddwd xmm3, xmm3 ; sum*sum per lane
michael@0 590
michael@0 591 movdqa xmm5, xmm7
michael@0 592 pslld xmm5, 4
michael@0 593
michael@0 594 psubd xmm5, xmm7 ; sumsq*15
michael@0 595 psubd xmm5, xmm3
michael@0 596
michael@0 597 psubd xmm5, flimit4
michael@0 598 psrad xmm5, 31 ; all ones where sumsq*15 - sum*sum < flimit, i.e. the pixel gets filtered
michael@0 599
michael@0 600 packssdw xmm5, xmm0
michael@0 601 packsswb xmm5, xmm0
michael@0 602
michael@0 603 movd xmm1, DWORD PTR [rsi+rcx]
michael@0 604 movq xmm2, xmm1 ; unfiltered copy
michael@0 605
michael@0 606 punpcklbw xmm1, xmm0
michael@0 607 punpcklwd xmm1, xmm0
michael@0 608
michael@0 609 paddd xmm1, xmm6
michael@0 610 paddd xmm1, [GLOBAL(four8s)] ; rounding term for the shift by 4
michael@0 611
michael@0 612 psrad xmm1, 4
michael@0 613 packssdw xmm1, xmm0
michael@0 614
michael@0 615 packuswb xmm1, xmm0
michael@0 616 pand xmm1, xmm5 ; filtered bytes where the mask is set
michael@0 617
michael@0 618 pandn xmm5, xmm2 ; original bytes elsewhere
michael@0 619 por xmm5, xmm1
michael@0 620
michael@0 621 movd [rsi+rcx-8], mm0 ; write the result computed two iterations (8 columns) ago
michael@0 622 movq mm0, mm1
michael@0 623
michael@0 624 movdq2q mm1, xmm5
michael@0 625 psrldq xmm7, 12 ; keep only lane 3, the running sumsq after all 4 steps
michael@0 626
michael@0 627 psrldq xmm6, 12 ; keep only lane 3, the running sum after all 4 steps
michael@0 628 add rcx, 4
michael@0 629
michael@0 630 cmp rcx, rdx
michael@0 631 jl .nextcol4
michael@0 632
michael@0 633 ;s+=pitch;
michael@0 634 movsxd rax, dword arg(1)
michael@0 635 add arg(0), rax
michael@0 636
michael@0 637 sub dword arg(2), 1 ;rows-=1
michael@0 638 cmp dword arg(2), 0
michael@0 639 jg .ip_row_loop
michael@0 640
michael@0 641 add rsp, 16
michael@0 642 pop rsp
michael@0 643
michael@0 644 ; begin epilog
michael@0 645 pop rdi
michael@0 646 pop rsi
michael@0 647 RESTORE_GOT
michael@0 648 RESTORE_XMM
michael@0 649 UNSHADOW_ARGS
michael@0 650 pop rbp
michael@0 651 ret
michael@0 652 %undef flimit4
michael@0 653
michael@0 654
michael@0 655 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
michael@0 656 ; unsigned char blackclamp[16],
michael@0 657 ; unsigned char whiteclamp[16],
michael@0 658 ; unsigned char bothclamp[16],
michael@0 659 ; unsigned int Width, unsigned int Height, int Pitch)
michael@0 660 extern sym(rand)
michael@0 661 global sym(vp8_plane_add_noise_wmt) PRIVATE
michael@0 662 sym(vp8_plane_add_noise_wmt):
michael@0 663 push rbp
michael@0 664 mov rbp, rsp
michael@0 665 SHADOW_ARGS_TO_STACK 8
michael@0 666 GET_GOT rbx
michael@0 667 push rsi
michael@0 668 push rdi
michael@0 669 ; end prolog
michael@0 670
michael@0 671 .addnoise_loop:
michael@0 672 call sym(rand) WRT_PLT
michael@0 673 and rax, 0xff ; per-row noise offset: low 8 bits of rand()
michael@0 674 mov rdi, arg(1) ;noise
michael@0 675 add rdi, rax ; rdi = noise + offset
michael@0 676
michael@0 677 ; The clamp vectors are addressed off one pointer at +0 / +16 / +32, so they
michael@0 678 ; must sit contiguously in black/white/both order. Every register is loaded
michael@0 679 ; after the call because rand() may have trashed the previous values.
michael@0 680 mov rsi, arg(0) ;Pos
michael@0 681
michael@0 682 mov rdx, arg(2) ; blackclamp
michael@0 683
michael@0 684 movsxd rcx, dword arg(5) ;[Width]
michael@0 685
michael@0 686 xor rax,rax ; rax = byte offset within the row
michael@0 687
michael@0 688 .addnoise_nextset:
michael@0 689 movdqu xmm2,[rdi+rax] ; get the noise for this chunk
michael@0 690 movdqu xmm1,[rsi+rax] ; get the source
michael@0 691
michael@0 692 ; squeeze the pixels inward so the wrapping add below stays in range
michael@0 693 psubusb xmm1, [rdx] ;blackclamp
michael@0 694 paddusb xmm1, [rdx+32] ;bothclamp
michael@0 695 psubusb xmm1, [rdx+16] ;whiteclamp
michael@0 696
michael@0 697 paddb xmm1,xmm2 ; add it in
michael@0 698 movdqu [rsi+rax],xmm1 ; store the result
michael@0 699
michael@0 700 add rax,16 ; move to the next 16 bytes
michael@0 701 cmp rax, rcx
michael@0 702 jl .addnoise_nextset
michael@0 703
michael@0 704 movsxd rax, dword arg(7) ; Pitch
michael@0 705 add arg(0), rax ; Start += Pitch
michael@0 706 sub dword arg(6), 1 ; Height -= 1
michael@0 707 jg .addnoise_loop
michael@0 708
michael@0 709 ; begin epilog
michael@0 710 pop rdi
michael@0 711 pop rsi
michael@0 712 RESTORE_GOT
michael@0 713 UNSHADOW_ARGS
michael@0 714 pop rbp
michael@0 715 ret
michael@0 716
michael@0 717
michael@0 718 SECTION_RODATA
michael@0 719 align 16
michael@0 720 four8s: ; four dwords of 8, added before the arithmetic shift right by 4
michael@0 721 dd 8, 8, 8, 8

mercurial