media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %define VP9_FILTER_WEIGHT 128
michael@0 15 %define VP9_FILTER_SHIFT 7
michael@0 16
michael@0 17 ;void vp9_post_proc_down_and_across_mmx - thresholded 5-tap blur: down the columns (src -> dst), then across each dst row in place
michael@0 18 ;(
michael@0 19 ; unsigned char *src_ptr, - source; rows -2..+2 around the current row are read
michael@0 20 ; unsigned char *dst_ptr, - destination; written by the down pass, refiltered in place by the across pass
michael@0 21 ; int src_pixels_per_line, - byte step between source rows
michael@0 22 ; int dst_pixels_per_line, - byte step between destination rows
michael@0 23 ; int rows, - outer loop count (loop is bottom-tested, so assumed > 0)
michael@0 24 ; int cols, - columns per row, processed 4 at a time
michael@0 25 ; int flimit - threshold; its low word is broadcast to four words in mm2
michael@0 26 ;)
michael@0 27 global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
michael@0 28 sym(vp9_post_proc_down_and_across_mmx):
michael@0 29 push rbp
michael@0 30 mov rbp, rsp
michael@0 31 SHADOW_ARGS_TO_STACK 7
michael@0 32 GET_GOT rbx
michael@0 33 push rsi
michael@0 34 push rdi
michael@0 35 ; end prolog
michael@0 36 ; Register roles below: rsi = src, rdi = dst, rax = src pitch, rbx = Blur table, rcx = rows left, rdx = column offset, mm0 = zero, mm2 = flimit x4
michael@0 37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 38 ; move the global rd onto the stack, since we don't have enough registers
michael@0 39 ; to do PIC addressing (rbx is reused for the Blur table below)
michael@0 40 movq mm0, [GLOBAL(rd)]
michael@0 41 sub rsp, 8
michael@0 42 movq [rsp], mm0
michael@0 43 %define RD [rsp]
michael@0 44 %else
michael@0 45 %define RD [GLOBAL(rd)]
michael@0 46 %endif
michael@0 47 ; NOTE(review): in the stack branch RD is [rsp], yet rsp moves at the 'push rbx' below and at 'push rax' before the across loop, and no matching 'add rsp, 8' is visible before the epilog pops - confirm against the x86_abi_support.asm macros
michael@0 48 push rbx
michael@0 49 lea rbx, [GLOBAL(Blur)]
michael@0 50 movd mm2, dword ptr arg(6) ;flimit
michael@0 51 punpcklwd mm2, mm2 ; duplicate the low word into both words of the low dword
michael@0 52 punpckldq mm2, mm2 ; mm2 = low word of flimit in all four words
michael@0 53
michael@0 54 mov rsi, arg(0) ;src_ptr
michael@0 55 mov rdi, arg(1) ;dst_ptr
michael@0 56
michael@0 57 movsxd rcx, DWORD PTR arg(4) ;rows
michael@0 58 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
michael@0 59 pxor mm0, mm0 ; mm0 = 00000000
michael@0 60
michael@0 61 .nextrow:
michael@0 62 ; ---- down pass: filter 4 pixels per iteration from src rows -2..+2 into dst ----
michael@0 63 xor rdx, rdx ; clear out rdx for use as loop counter
michael@0 64 .nextcol:
michael@0 65
michael@0 66 pxor mm7, mm7 ; mm7 = 00000000
michael@0 67 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
michael@0 68 movq mm3, [rsi] ; mm3 = r0 p0..p7
michael@0 69 punpcklbw mm3, mm0 ; mm3 = p0..p3
michael@0 70 movq mm1, mm3 ; mm1 = p0..p3
michael@0 71 pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
michael@0 72
michael@0 73 movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
michael@0 74 movq mm5, [rsi + rax] ; mm5 = r1 p0..p7
michael@0 75 punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
michael@0 76 pmullw mm6, mm5 ; mm6 = r1 p0..p3 * kernel 3 modifiers
michael@0 77 paddusw mm3, mm6 ; mm3 += mm6
michael@0 78
michael@0 79 ; thresholding
michael@0 80 movq mm7, mm1 ; mm7 = r0 p0..p3
michael@0 81 psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
michael@0 82 psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
michael@0 83 paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
michael@0 84 pcmpgtw mm7, mm2 ; mm7 = all-ones words where abs diff > flimit
michael@0 85
michael@0 86 movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
michael@0 87 movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7
michael@0 88 punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
michael@0 89 pmullw mm6, mm5 ; mm6 = r2 p0..p3 * kernel 4 modifiers
michael@0 90 paddusw mm3, mm6 ; mm3 += mm6
michael@0 91
michael@0 92 ; thresholding
michael@0 93 movq mm6, mm1 ; mm6 = r0 p0..p3
michael@0 94 psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
michael@0 95 psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3
michael@0 96 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
michael@0 97 pcmpgtw mm6, mm2
michael@0 98 por mm7, mm6 ; accumulate thresholds
michael@0 99
michael@0 100
michael@0 101 neg rax ; rax = -pitch, to address the rows above
michael@0 102 movq mm6, [rbx ] ; kernel 0 taps
michael@0 103 movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7
michael@0 104 punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
michael@0 105 pmullw mm6, mm5 ; mm6 = r-2 p0..p3 * kernel 0 modifiers
michael@0 106 paddusw mm3, mm6 ; mm3 += mm6
michael@0 107
michael@0 108 ; thresholding
michael@0 109 movq mm6, mm1 ; mm6 = r0 p0..p3
michael@0 110 psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
michael@0 111 psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
michael@0 112 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
michael@0 113 pcmpgtw mm6, mm2
michael@0 114 por mm7, mm6 ; accumulate thresholds
michael@0 115
michael@0 116 movq mm6, [rbx + 16] ; kernel 1 taps
michael@0 117 movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
michael@0 118 punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
michael@0 119 pmullw mm6, mm4 ; mm6 = r-1 p0..p3 * kernel 1 modifiers.
michael@0 120 paddusw mm3, mm6 ; mm3 += mm6
michael@0 121
michael@0 122 ; thresholding
michael@0 123 movq mm6, mm1 ; mm6 = r0 p0..p3
michael@0 124 psubusw mm6, mm4 ; mm6 = p0..p3 - r-1 p0..p3
michael@0 125 psubusw mm4, mm1 ; mm4 = r-1 p0..p3 - p0..p3
michael@0 126 paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
michael@0 127 pcmpgtw mm6, mm2
michael@0 128 por mm7, mm6 ; accumulate thresholds
michael@0 129
michael@0 130
michael@0 131 paddusw mm3, RD ; mm3 += round value
michael@0 132 psraw mm3, VP9_FILTER_SHIFT ; mm3 >>= 7, i.e. /= 128
michael@0 133
michael@0 134 pand mm1, mm7 ; mm1 = source pixels where some abs diff > flimit
michael@0 135 pandn mm7, mm3 ; mm7 = blurred pixels where every abs diff <= flimit
michael@0 136 paddusw mm1, mm7 ; combination
michael@0 137
michael@0 138 packuswb mm1, mm0 ; pack to bytes
michael@0 139
michael@0 140 movd [rdi], mm1 ; store the 4 result pixels to dst
michael@0 141 neg rax ; pitch is positive
michael@0 142
michael@0 143
michael@0 144 add rsi, 4
michael@0 145 add rdi, 4
michael@0 146 add rdx, 4
michael@0 147
michael@0 148 cmp edx, dword ptr arg(5) ;cols
michael@0 149 jl .nextcol
michael@0 150 ; done with the all cols, start the across filtering in place
michael@0 151 sub rsi, rdx ; rewind src to the start of the row
michael@0 152 sub rdi, rdx ; rewind dst to the start of the row
michael@0 153
michael@0 154 ; ---- across pass: same filter on dst pixels p-2..p+2; each 4-pixel result is held in eax and stored one iteration late so the taps still read unfiltered left neighbours ----
michael@0 155 push rax ; save the source pitch; rax/eax carries the delayed output below
michael@0 156 xor rdx, rdx
michael@0 157 mov rax, [rdi-4]; seed with the bytes already at dst-4, so the first delayed store rewrites them unchanged
michael@0 158
michael@0 159 .acrossnextcol:
michael@0 160 pxor mm7, mm7 ; mm7 = 00000000
michael@0 161 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
michael@0 162 movq mm4, [rdi+rdx] ; mm4 = p0..p7
michael@0 163 movq mm3, mm4 ; mm3 = p0..p7
michael@0 164 punpcklbw mm3, mm0 ; mm3 = p0..p3
michael@0 165 movq mm1, mm3 ; mm1 = p0..p3
michael@0 166 pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
michael@0 167
michael@0 168 movq mm6, [rbx + 48]
michael@0 169 psrlq mm4, 8 ; mm4 = p1..p7
michael@0 170 movq mm5, mm4 ; mm5 = p1..p7
michael@0 171 punpcklbw mm5, mm0 ; mm5 = p1..p4
michael@0 172 pmullw mm6, mm5 ; mm6 = p1..p4 * kernel 3 modifiers
michael@0 173 paddusw mm3, mm6 ; mm3 += mm6
michael@0 174
michael@0 175 ; thresholding
michael@0 176 movq mm7, mm1 ; mm7 = p0..p3
michael@0 177 psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
michael@0 178 psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
michael@0 179 paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
michael@0 180 pcmpgtw mm7, mm2
michael@0 181
michael@0 182 movq mm6, [rbx + 64 ]
michael@0 183 psrlq mm4, 8 ; mm4 = p2..p7
michael@0 184 movq mm5, mm4 ; mm5 = p2..p7
michael@0 185 punpcklbw mm5, mm0 ; mm5 = p2..p5
michael@0 186 pmullw mm6, mm5 ; mm6 = p2..p5 * kernel 4 modifiers
michael@0 187 paddusw mm3, mm6 ; mm3 += mm6
michael@0 188
michael@0 189 ; thresholding
michael@0 190 movq mm6, mm1 ; mm6 = p0..p3
michael@0 191 psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5
michael@0 192 psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3
michael@0 193 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5)
michael@0 194 pcmpgtw mm6, mm2
michael@0 195 por mm7, mm6 ; accumulate thresholds
michael@0 196
michael@0 197
michael@0 198 movq mm6, [rbx ]
michael@0 199 movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
michael@0 200 movq mm5, mm4 ; mm5 = p-2..p5
michael@0 201 punpcklbw mm5, mm0 ; mm5 = p-2..p1
michael@0 202 pmullw mm6, mm5 ; mm6 = p-2..p1 * kernel 0 modifiers
michael@0 203 paddusw mm3, mm6 ; mm3 += mm6
michael@0 204
michael@0 205 ; thresholding
michael@0 206 movq mm6, mm1 ; mm6 = p0..p3
michael@0 207 psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1
michael@0 208 psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3
michael@0 209 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1)
michael@0 210 pcmpgtw mm6, mm2
michael@0 211 por mm7, mm6 ; accumulate thresholds
michael@0 212
michael@0 213 movq mm6, [rbx + 16]
michael@0 214 psrlq mm4, 8 ; mm4 = p-1..p5
michael@0 215 punpcklbw mm4, mm0 ; mm4 = p-1..p2
michael@0 216 pmullw mm6, mm4 ; mm6 = p-1..p2 * kernel 1 modifiers.
michael@0 217 paddusw mm3, mm6 ; mm3 += mm6
michael@0 218
michael@0 219 ; thresholding
michael@0 220 movq mm6, mm1 ; mm6 = p0..p3
michael@0 221 psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2
michael@0 222 psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3
michael@0 223 paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2)
michael@0 224 pcmpgtw mm6, mm2
michael@0 225 por mm7, mm6 ; accumulate thresholds
michael@0 226
michael@0 227 paddusw mm3, RD ; mm3 += round value
michael@0 228 psraw mm3, VP9_FILTER_SHIFT ; mm3 >>= 7, i.e. /= 128
michael@0 229
michael@0 230 pand mm1, mm7 ; mm1 = unfiltered pixels where some abs diff > flimit
michael@0 231 pandn mm7, mm3 ; mm7 = blurred pixels where every abs diff <= flimit
michael@0 232 paddusw mm1, mm7 ; combination
michael@0 233
michael@0 234 packuswb mm1, mm0 ; pack to bytes
michael@0 235 mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
michael@0 236 movd eax, mm1 ; hold this result until the next iteration
michael@0 237
michael@0 238 add rdx, 4
michael@0 239 cmp edx, dword ptr arg(5) ;cols
michael@0 240 jl .acrossnextcol;
michael@0 241
michael@0 242 mov DWORD PTR [rdi+rdx-4], eax ; flush the last delayed result
michael@0 243 pop rax ; rax = source pitch again
michael@0 244
michael@0 245 ; done with this row
michael@0 246 add rsi,rax ; next line
michael@0 247 movsxd rax, dword ptr arg(3) ;dst_pixels_per_line (destination pitch)
michael@0 248 add rdi,rax ; next destination
michael@0 249 movsxd rax, dword ptr arg(2) ;src_pixels_per_line (source pitch, reloaded for the next row)
michael@0 250
michael@0 251 dec rcx ; decrement count
michael@0 252 jnz .nextrow ; next row
michael@0 253 pop rbx
michael@0 254
michael@0 255 ; begin epilog
michael@0 256 pop rdi
michael@0 257 pop rsi
michael@0 258 RESTORE_GOT
michael@0 259 UNSHADOW_ARGS
michael@0 260 pop rbp
michael@0 261 ret
michael@0 262 %undef RD
michael@0 263
michael@0 264
michael@0 265 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
michael@0 266 ; int pitch, int rows, int cols,int flimit)
michael@0 267 extern sym(vp9_rv)
michael@0 268 global sym(vp9_mbpost_proc_down_mmx) PRIVATE
michael@0 269 sym(vp9_mbpost_proc_down_mmx):
michael@0 270 push rbp
michael@0 271 mov rbp, rsp
michael@0 272 SHADOW_ARGS_TO_STACK 5
michael@0 273 GET_GOT rbx
michael@0 274 push rsi
michael@0 275 push rdi
michael@0 276 ; end prolog
michael@0 277 ; Per 4-column strip: keeps a running sum and sum of squares over a 15-row vertical window; where 15*sumsq - sum*sum < flimit the pixel is replaced by (pixel + sum + vp9_rv[..]) >> 4, otherwise it is kept. Results are written back 8 rows late through the d[] ring.
michael@0 278 ALIGN_STACK 16, rax
michael@0 279 sub rsp, 136
michael@0 280
michael@0 281 ; d: ring of 16 four-byte pixel groups, indexed as [rsp + i*4] (128 bytes are reserved below flimit2)
michael@0 282 ; create flimit2 at [rsp+128]
michael@0 283 mov eax, dword ptr arg(4) ;flimit
michael@0 284 mov [rsp+128], eax
michael@0 285 mov [rsp+128+4], eax
michael@0 286 %define flimit2 [rsp+128]
michael@0 287
michael@0 288 %if ABI_IS_32BIT=0
michael@0 289 lea r8, [GLOBAL(sym(vp9_rv))]
michael@0 290 %endif
michael@0 291
michael@0 292 ;rows +=8; (the row loop runs 8 extra iterations to flush the delayed writes)
michael@0 293 add dword ptr arg(2), 8
michael@0 294
michael@0 295 ;for(c=0; c<cols; c+=4)
michael@0 296 .loop_col:
michael@0 297 mov rsi, arg(0) ;s
michael@0 298 pxor mm0, mm0 ;
michael@0 299
michael@0 300 movsxd rax, dword ptr arg(1) ;pitch ;
michael@0 301 neg rax ; rax = -pitch
michael@0 302
michael@0 303 lea rsi, [rsi + rax*8]; ; rsi = s - pitch*8
michael@0 304 neg rax ; rax = +pitch again
michael@0 305
michael@0 306
michael@0 307 pxor mm5, mm5 ; mm5 = running sum (4 words)
michael@0 308 pxor mm6, mm6 ; mm6 = running sum of squares, pixels 0-1 (dwords)
michael@0 309
michael@0 310 pxor mm7, mm7 ; mm7 = running sum of squares, pixels 2-3 (dwords)
michael@0 311 mov rdi, rsi
michael@0 312
michael@0 313 mov rcx, 15 ; prime the window with the 15 rows starting at s - pitch*8
michael@0 314
michael@0 315 .loop_initvar:
michael@0 316 movd mm1, DWORD PTR [rdi];
michael@0 317 punpcklbw mm1, mm0 ;
michael@0 318
michael@0 319 paddw mm5, mm1 ; sum += pixels
michael@0 320 pmullw mm1, mm1 ; mm1 = pixels squared
michael@0 321
michael@0 322 movq mm2, mm1 ;
michael@0 323 punpcklwd mm1, mm0 ;
michael@0 324
michael@0 325 punpckhwd mm2, mm0 ;
michael@0 326 paddd mm6, mm1 ;
michael@0 327
michael@0 328 paddd mm7, mm2 ;
michael@0 329 lea rdi, [rdi+rax] ; next row down
michael@0 330
michael@0 331 dec rcx
michael@0 332 jne .loop_initvar
michael@0 333 ;save the var and sum
michael@0 334 xor rdx, rdx ; rdx = row counter
michael@0 335 .loop_row:
michael@0 336 movd mm1, DWORD PTR [rsi] ; [s-pitch*8] row leaving the window
michael@0 337 movd mm2, DWORD PTR [rdi] ; [s+pitch*7] row entering the window
michael@0 338
michael@0 339 punpcklbw mm1, mm0
michael@0 340 punpcklbw mm2, mm0
michael@0 341
michael@0 342 paddw mm5, mm2 ; sum += entering
michael@0 343 psubw mm5, mm1 ; sum -= leaving
michael@0 344
michael@0 345 pmullw mm2, mm2
michael@0 346 movq mm4, mm2
michael@0 347
michael@0 348 punpcklwd mm2, mm0
michael@0 349 punpckhwd mm4, mm0
michael@0 350
michael@0 351 paddd mm6, mm2 ; sumsq += entering^2
michael@0 352 paddd mm7, mm4
michael@0 353
michael@0 354 pmullw mm1, mm1
michael@0 355 movq mm2, mm1
michael@0 356
michael@0 357 punpcklwd mm1, mm0
michael@0 358 psubd mm6, mm1 ; sumsq -= leaving^2
michael@0 359
michael@0 360 punpckhwd mm2, mm0
michael@0 361 psubd mm7, mm2
michael@0 362
michael@0 363
michael@0 364 movq mm3, mm6
michael@0 365 pslld mm3, 4
michael@0 366
michael@0 367 psubd mm3, mm6 ; mm3 = 16*sumsq - sumsq = 15*sumsq (pixels 0-1)
michael@0 368 movq mm1, mm5
michael@0 369
michael@0 370 movq mm4, mm5
michael@0 371 pmullw mm1, mm1 ; low 16 bits of sum*sum
michael@0 372
michael@0 373 pmulhw mm4, mm4 ; high 16 bits of sum*sum
michael@0 374 movq mm2, mm1
michael@0 375
michael@0 376 punpcklwd mm1, mm4 ; mm1 = sum*sum as dwords, pixels 0-1
michael@0 377 punpckhwd mm2, mm4 ; mm2 = sum*sum as dwords, pixels 2-3
michael@0 378
michael@0 379 movq mm4, mm7
michael@0 380 pslld mm4, 4
michael@0 381
michael@0 382 psubd mm4, mm7 ; mm4 = 15*sumsq (pixels 2-3)
michael@0 383
michael@0 384 psubd mm3, mm1
michael@0 385 psubd mm4, mm2
michael@0 386
michael@0 387 psubd mm3, flimit2
michael@0 388 psubd mm4, flimit2
michael@0 389
michael@0 390 psrad mm3, 31 ; all-ones where 15*sumsq - sum*sum - flimit is negative
michael@0 391 psrad mm4, 31
michael@0 392
michael@0 393 packssdw mm3, mm4
michael@0 394 packsswb mm3, mm0 ; mm3 = per-pixel byte mask in the low 4 bytes
michael@0 395
michael@0 396 movd mm1, DWORD PTR [rsi+rax*8] ; pixels of the current row (8 rows below rsi)
michael@0 397
michael@0 398 movq mm2, mm1 ; mm2 = unfiltered copy
michael@0 399 punpcklbw mm1, mm0
michael@0 400
michael@0 401 paddw mm1, mm5 ; pixel + window sum
michael@0 402 mov rcx, rdx
michael@0 403
michael@0 404 and rcx, 127 ; table index = row & 127
michael@0 405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 406 push rax
michael@0 407 lea rax, [GLOBAL(sym(vp9_rv))]
michael@0 408 movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
michael@0 409 pop rax
michael@0 410 %elif ABI_IS_32BIT=0
michael@0 411 movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
michael@0 412 %else
michael@0 413 movq mm4, [sym(vp9_rv) + rcx*2]
michael@0 414 %endif
michael@0 415 paddw mm1, mm4
michael@0 416 ;paddw xmm1, eight8s
michael@0 417 psraw mm1, 4 ; (pixel + sum + vp9_rv words) >> 4
michael@0 418
michael@0 419 packuswb mm1, mm0
michael@0 420 pand mm1, mm3 ; filtered value where the mask is set
michael@0 421
michael@0 422 pandn mm3, mm2 ; original pixel where the mask is clear
michael@0 423 por mm1, mm3
michael@0 424
michael@0 425 and rcx, 15
michael@0 426 movd DWORD PTR [rsp+rcx*4], mm1 ;d[row & 15] = result for this row
michael@0 427
michael@0 428 mov rcx, rdx
michael@0 429 sub rcx, 8
michael@0 430
michael@0 431 and rcx, 15
michael@0 432 movd mm1, DWORD PTR [rsp+rcx*4] ;d[(row - 8) & 15]
michael@0 433 ; NOTE(review): for rdx = 0..7 the slot read above has not been written in this strip yet, so the store below puts stale/unset stack bytes into the 8 rows above s - confirm callers tolerate that
michael@0 434 movd [rsi], mm1 ; delayed write-back, 8 rows above the current row
michael@0 435 lea rsi, [rsi+rax]
michael@0 436
michael@0 437 lea rdi, [rdi+rax]
michael@0 438 add rdx, 1
michael@0 439
michael@0 440 cmp edx, dword arg(2) ;rows
michael@0 441 jl .loop_row
michael@0 442
michael@0 443 ; NOTE(review): the add below is a 32-bit add on the pointer argument; confirm this is acceptable when arg(0) is a 64-bit slot
michael@0 444 add dword arg(0), 4 ; s += 4
michael@0 445 sub dword arg(3), 4 ; cols -= 4
michael@0 446 cmp dword arg(3), 0
michael@0 447 jg .loop_col
michael@0 448
michael@0 449 add rsp, 136
michael@0 450 pop rsp ; presumably undoes ALIGN_STACK above - verify against x86_abi_support.asm
michael@0 451
michael@0 452 ; begin epilog
michael@0 453 pop rdi
michael@0 454 pop rsi
michael@0 455 RESTORE_GOT
michael@0 456 UNSHADOW_ARGS
michael@0 457 pop rbp
michael@0 458 ret
michael@0 459 %undef flimit2
michael@0 460
michael@0 461
michael@0 462 ;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
michael@0 463 ; unsigned char blackclamp[16],
michael@0 464 ; unsigned char whiteclamp[16],
michael@0 465 ; unsigned char bothclamp[16],
michael@0 466 ; unsigned int width, unsigned int height, int pitch)
michael@0 467 extern sym(rand)
michael@0 468 global sym(vp9_plane_add_noise_mmx) PRIVATE
michael@0 469 sym(vp9_plane_add_noise_mmx):
michael@0 470 push rbp
michael@0 471 mov rbp, rsp
michael@0 472 SHADOW_ARGS_TO_STACK 8
michael@0 473 GET_GOT rbx
michael@0 474 push rsi
michael@0 475 push rdi
michael@0 476 ; end prolog
michael@0 477 ; Per row: pick a rand()-based offset (0..255) into the noise buffer, clamp each group of 8 pixels, add the noise bytes, and store back in place
michael@0 478 .addnoise_loop:
michael@0 479 call sym(rand) WRT_PLT
michael@0 480 mov rcx, arg(1) ;noise
michael@0 481 and rax, 0xff ; keep the low 8 bits of rand() as the row's noise offset
michael@0 482 add rcx, rax
michael@0 483
michael@0 484 ; we rely on the fact that the clamping vectors are stored contiguously
michael@0 485 ; in black/white/both order. Note that we have to reload this here because
michael@0 486 ; rdx could be trashed by rand()
michael@0 487 mov rdx, arg(2) ; blackclamp (whiteclamp/bothclamp are reached as rdx+16 / rdx+32; arg(3) and arg(4) are not read)
michael@0 488
michael@0 489
michael@0 490 mov rdi, rcx ; rdi = noise + offset
michael@0 491 movsxd rcx, dword arg(5) ;[Width]
michael@0 492 mov rsi, arg(0) ;Pos
michael@0 493 xor rax,rax ; rax = byte offset within the row
michael@0 494
michael@0 495 .addnoise_nextset:
michael@0 496 movq mm1,[rsi+rax] ; get the source
michael@0 497
michael@0 498 psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
michael@0 499 paddusb mm1, [rdx+32] ;bothclamp
michael@0 500 psubusb mm1, [rdx+16] ;whiteclamp
michael@0 501
michael@0 502 movq mm2,[rdi+rax] ; get the noise for these 8 pixels
michael@0 503 paddb mm1,mm2 ; add it in
michael@0 504 movq [rsi+rax],mm1 ; store the result
michael@0 505
michael@0 506 add rax,8 ; move to the next 8 pixels
michael@0 507
michael@0 508 cmp rax, rcx
michael@0 509 jl .addnoise_nextset ; bottom-tested: at least 8 bytes are processed per row
michael@0 510
michael@0 511 movsxd rax, dword arg(7) ; Pitch
michael@0 512 add arg(0), rax ; Start += Pitch
michael@0 513 sub dword arg(6), 1 ; Height -= 1
michael@0 514 jg .addnoise_loop ; bottom-tested: the row body runs at least once
michael@0 515
michael@0 516 ; begin epilog
michael@0 517 pop rdi
michael@0 518 pop rsi
michael@0 519 RESTORE_GOT
michael@0 520 UNSHADOW_ARGS
michael@0 521 pop rbp
michael@0 522 ret
michael@0 523
michael@0 524
michael@0 525 SECTION_RODATA
michael@0 526 align 16
michael@0 527 Blur: ; filter taps as words; the code reads 4 words at byte offsets 0/16/32/48/64 for kernels 0..4
michael@0 528 times 16 dw 16 ; offsets 0 and 16: kernel 0 and kernel 1 taps
michael@0 529 times 8 dw 64 ; offset 32: kernel 2 (centre) taps
michael@0 530 times 16 dw 16 ; offsets 48 and 64: kernel 3 and kernel 4 taps
michael@0 531 times 8 dw 0 ; offset 80: zeros, not referenced by the visible code
michael@0 532 ; taps sum to 16+16+64+16+16 = 128, matching the shift by VP9_FILTER_SHIFT (7)
michael@0 533 rd: ; rounding term added before the shift
michael@0 534 times 4 dw 0x40 ; 0x40 = 64 = half of 128

mercurial