media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ;void vp9_post_proc_down_and_across_xmm
michael@0 15 ;(
michael@0 16 ; unsigned char *src_ptr,
michael@0 17 ; unsigned char *dst_ptr,
michael@0 18 ; int src_pixels_per_line,
michael@0 19 ; int dst_pixels_per_line,
michael@0 20 ; int rows,
michael@0 21 ; int cols,
michael@0 22 ; int flimit
michael@0 23 ;)
;
; Per row, two passes over groups of 8 pixels:
;   1. "down":   reads src rows r-2..r+2 and writes the vertically filtered
;                row to dst.
;   2. "across": filters that dst row horizontally in place using pixels
;                p-2..p+2 of the row just written.
; Both passes compute (4*p0 + n1 + n2 + n3 + n4 + 4) >> 3 and keep the
; unfiltered p0 wherever any |p0 - neighbour| > flimit (low 16 bits of flimit
; are broadcast and compared as signed words).
; Register roles: rsi = src, rdi = dst, rax = source pitch, rcx = rows left,
; rdx = column counter, xmm0 = zero, xmm2 = flimit x8.
; The across pass reads dst bytes [-8, cols+9] relative to the row start and
; uses MMX register mm0; no emms is executed in this routine.
michael@0 24 global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
michael@0 25 sym(vp9_post_proc_down_and_across_xmm):
michael@0 26 push rbp
michael@0 27 mov rbp, rsp
michael@0 28 SHADOW_ARGS_TO_STACK 7
michael@0 29 SAVE_XMM 7
michael@0 30 GET_GOT rbx
michael@0 31 push rsi
michael@0 32 push rdi
michael@0 33 ; end prolog
michael@0 34
michael@0 35 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 36 ALIGN_STACK 16, rax
michael@0 37 ; move the global rd onto the stack, since we don't have enough registers
michael@0 38 ; to do PIC addressing
michael@0 39 movdqa xmm0, [GLOBAL(rd42)]
michael@0 40 sub rsp, 16
michael@0 41 movdqa [rsp], xmm0
michael@0 42 %define RD42 [rsp]
michael@0 43 %else
michael@0 44 %define RD42 [GLOBAL(rd42)]
michael@0 45 %endif
michael@0 46
michael@0 47
michael@0 48 movd xmm2, dword ptr arg(6) ;flimit
michael@0 49 punpcklwd xmm2, xmm2 ; broadcast the low word of flimit ...
michael@0 50 punpckldq xmm2, xmm2 ; ... through dword, qword ...
michael@0 51 punpcklqdq xmm2, xmm2 ; ... to all 8 word lanes of xmm2
michael@0 52
michael@0 53 mov rsi, arg(0) ;src_ptr
michael@0 54 mov rdi, arg(1) ;dst_ptr
michael@0 55
michael@0 56 movsxd rcx, DWORD PTR arg(4) ;rows
michael@0 57 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch, sign-extended)
michael@0 58 pxor xmm0, xmm0 ; xmm0 = 0, used to unpack bytes to words
michael@0 59
michael@0 60 .nextrow:
michael@0 61
michael@0 62 xor rdx, rdx ; clear out rdx for use as loop counter
michael@0 63 .nextcol:
michael@0 64 movq xmm3, QWORD PTR [rsi] ; xmm3 = r0 p0..p7 (bytes)
michael@0 65 punpcklbw xmm3, xmm0 ; xmm3 = r0 p0..p7 (words)
michael@0 66 movdqa xmm1, xmm3 ; xmm1 = r0 p0..p7, kept as the unfiltered source
michael@0 67 psllw xmm3, 2 ; xmm3 = 4 * r0
michael@0 68
michael@0 69 movq xmm5, QWORD PTR [rsi + rax] ; xmm5 = r1 p0..p7 (bytes)
michael@0 70 punpcklbw xmm5, xmm0 ; xmm5 = r1 p0..p7 (words)
michael@0 71 paddusw xmm3, xmm5 ; xmm3 += r1
michael@0 72
michael@0 73 ; thresholding
michael@0 74 movdqa xmm7, xmm1 ; xmm7 = r0
michael@0 75 psubusw xmm7, xmm5 ; xmm7 = sat(r0 - r1)
michael@0 76 psubusw xmm5, xmm1 ; xmm5 = sat(r1 - r0)
michael@0 77 paddusw xmm7, xmm5 ; xmm7 = abs(r0 - r1)
michael@0 78 pcmpgtw xmm7, xmm2 ; xmm7 = 0xffff where abs(r0 - r1) > flimit
michael@0 79
michael@0 80 movq xmm5, QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
michael@0 81 punpcklbw xmm5, xmm0 ; xmm5 = r2 p0..p7 (words)
michael@0 82 paddusw xmm3, xmm5 ; xmm3 += r2
michael@0 83
michael@0 84 ; thresholding
michael@0 85 movdqa xmm6, xmm1 ; xmm6 = r0
michael@0 86 psubusw xmm6, xmm5 ; xmm6 = sat(r0 - r2)
michael@0 87 psubusw xmm5, xmm1 ; xmm5 = sat(r2 - r0)
michael@0 88 paddusw xmm6, xmm5 ; xmm6 = abs(r0 - r2)
michael@0 89 pcmpgtw xmm6, xmm2
michael@0 90 por xmm7, xmm6 ; accumulate thresholds
michael@0 91
michael@0 92
michael@0 93 neg rax ; rax = -pitch to address the rows above
michael@0 94 movq xmm5, QWORD PTR [rsi+2*rax] ; xmm5 = r-2 p0..p7 (bytes)
michael@0 95 punpcklbw xmm5, xmm0 ; xmm5 = r-2 p0..p7 (words)
michael@0 96 paddusw xmm3, xmm5 ; xmm3 += r-2
michael@0 97
michael@0 98 ; thresholding
michael@0 99 movdqa xmm6, xmm1 ; xmm6 = r0
michael@0 100 psubusw xmm6, xmm5 ; xmm6 = sat(r0 - r-2)
michael@0 101 psubusw xmm5, xmm1 ; xmm5 = sat(r-2 - r0)
michael@0 102 paddusw xmm6, xmm5 ; xmm6 = abs(r0 - r-2)
michael@0 103 pcmpgtw xmm6, xmm2
michael@0 104 por xmm7, xmm6 ; accumulate thresholds
michael@0 105
michael@0 106 movq xmm4, QWORD PTR [rsi+rax] ; xmm4 = r-1 p0..p7 (bytes)
michael@0 107 punpcklbw xmm4, xmm0 ; xmm4 = r-1 p0..p7 (words)
michael@0 108 paddusw xmm3, xmm4 ; xmm3 += r-1
michael@0 109
michael@0 110 ; thresholding
michael@0 111 movdqa xmm6, xmm1 ; xmm6 = r0
michael@0 112 psubusw xmm6, xmm4 ; xmm6 = sat(r0 - r-1)
michael@0 113 psubusw xmm4, xmm1 ; xmm4 = sat(r-1 - r0)
michael@0 114 paddusw xmm6, xmm4 ; xmm6 = abs(r0 - r-1)
michael@0 115 pcmpgtw xmm6, xmm2
michael@0 116 por xmm7, xmm6 ; accumulate thresholds
michael@0 117
michael@0 118
michael@0 119 paddusw xmm3, RD42 ; xmm3 += round value (4 per word)
michael@0 120 psraw xmm3, 3 ; xmm3 /= 8
michael@0 121
michael@0 122 pand xmm1, xmm7 ; xmm1 = source where any diff > flimit
michael@0 123 pandn xmm7, xmm3 ; xmm7 = blurred result everywhere else
michael@0 124 paddusw xmm1, xmm7 ; combination (the two masks are disjoint)
michael@0 125
michael@0 126 packuswb xmm1, xmm0 ; pack to bytes
michael@0 127 movq QWORD PTR [rdi], xmm1 ; store 8 filtered pixels to dst
michael@0 128
michael@0 129 neg rax ; pitch is positive
michael@0 130 add rsi, 8
michael@0 131 add rdi, 8
michael@0 132
michael@0 133 add rdx, 8
michael@0 134 cmp edx, dword arg(5) ;cols
michael@0 135
michael@0 136 jl .nextcol
michael@0 137
michael@0 138 ; done with the all cols, start the across filtering in place
michael@0 139 sub rsi, rdx ; rewind src to the start of the row
michael@0 140 sub rdi, rdx ; rewind dst to the start of the row
michael@0 141
michael@0 142 xor rdx, rdx
michael@0 143 movq mm0, QWORD PTR [rdi-8]; ; preload so the first delayed store rewrites [rdi-8] unchanged
michael@0 144
michael@0 145 .acrossnextcol:
michael@0 146 movq xmm7, QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5 (bytes)
michael@0 147 movd xmm4, DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9 (bytes)
michael@0 148
michael@0 149 pslldq xmm4, 8 ; move p6..p9 up to bytes 8..11
michael@0 150 por xmm4, xmm7 ; xmm4 = p-2..p9 (bytes 0..11)
michael@0 151
michael@0 152 movdqa xmm3, xmm4
michael@0 153 psrldq xmm3, 2
michael@0 154 punpcklbw xmm3, xmm0 ; xmm3 = p0..p7 (words)
michael@0 155 movdqa xmm1, xmm3 ; xmm1 = p0..p7, kept as the unfiltered source
michael@0 156 psllw xmm3, 2 ; xmm3 = 4 * p0..p7
michael@0 157
michael@0 158
michael@0 159 movdqa xmm5, xmm4
michael@0 160 psrldq xmm5, 3
michael@0 161 punpcklbw xmm5, xmm0 ; xmm5 = p1..p8
michael@0 162 paddusw xmm3, xmm5 ; xmm3 += p1..p8
michael@0 163
michael@0 164 ; thresholding
michael@0 165 movdqa xmm7, xmm1 ; xmm7 = p0..p7
michael@0 166 psubusw xmm7, xmm5 ; xmm7 = sat(p0..p7 - p1..p8)
michael@0 167 psubusw xmm5, xmm1 ; xmm5 = sat(p1..p8 - p0..p7)
michael@0 168 paddusw xmm7, xmm5 ; xmm7 = abs(p0..p7 - p1..p8)
michael@0 169 pcmpgtw xmm7, xmm2
michael@0 170
michael@0 171 movdqa xmm5, xmm4
michael@0 172 psrldq xmm5, 4
michael@0 173 punpcklbw xmm5, xmm0 ; xmm5 = p2..p9
michael@0 174 paddusw xmm3, xmm5 ; xmm3 += p2..p9
michael@0 175
michael@0 176 ; thresholding
michael@0 177 movdqa xmm6, xmm1 ; xmm6 = p0..p7
michael@0 178 psubusw xmm6, xmm5 ; xmm6 = sat(p0..p7 - p2..p9)
michael@0 179 psubusw xmm5, xmm1 ; xmm5 = sat(p2..p9 - p0..p7)
michael@0 180 paddusw xmm6, xmm5 ; xmm6 = abs(p0..p7 - p2..p9)
michael@0 181 pcmpgtw xmm6, xmm2
michael@0 182 por xmm7, xmm6 ; accumulate thresholds
michael@0 183
michael@0 184
michael@0 185 movdqa xmm5, xmm4 ; xmm5 = p-2..p9 (bytes)
michael@0 186 punpcklbw xmm5, xmm0 ; xmm5 = p-2..p5 (words)
michael@0 187 paddusw xmm3, xmm5 ; xmm3 += p-2..p5
michael@0 188
michael@0 189 ; thresholding
michael@0 190 movdqa xmm6, xmm1 ; xmm6 = p0..p7
michael@0 191 psubusw xmm6, xmm5 ; xmm6 = sat(p0..p7 - p-2..p5)
michael@0 192 psubusw xmm5, xmm1 ; xmm5 = sat(p-2..p5 - p0..p7)
michael@0 193 paddusw xmm6, xmm5 ; xmm6 = abs(p0..p7 - p-2..p5)
michael@0 194 pcmpgtw xmm6, xmm2
michael@0 195 por xmm7, xmm6 ; accumulate thresholds
michael@0 196
michael@0 197 psrldq xmm4, 1 ; xmm4 = p-1..p9 (bytes)
michael@0 198 punpcklbw xmm4, xmm0 ; xmm4 = p-1..p6 (words)
michael@0 199 paddusw xmm3, xmm4 ; xmm3 += p-1..p6
michael@0 200
michael@0 201 ; thresholding
michael@0 202 movdqa xmm6, xmm1 ; xmm6 = p0..p7
michael@0 203 psubusw xmm6, xmm4 ; xmm6 = sat(p0..p7 - p-1..p6)
michael@0 204 psubusw xmm4, xmm1 ; xmm4 = sat(p-1..p6 - p0..p7)
michael@0 205 paddusw xmm6, xmm4 ; xmm6 = abs(p0..p7 - p-1..p6)
michael@0 206 pcmpgtw xmm6, xmm2
michael@0 207 por xmm7, xmm6 ; accumulate thresholds
michael@0 208
michael@0 209 paddusw xmm3, RD42 ; xmm3 += round value (4 per word)
michael@0 210 psraw xmm3, 3 ; xmm3 /= 8
michael@0 211
michael@0 212 pand xmm1, xmm7 ; xmm1 = source where any diff > flimit
michael@0 213 pandn xmm7, xmm3 ; xmm7 = blurred result everywhere else
michael@0 214 paddusw xmm1, xmm7 ; combination (the two masks are disjoint)
michael@0 215
michael@0 216 packuswb xmm1, xmm0 ; pack to bytes
michael@0 217 movq QWORD PTR [rdi+rdx-8], mm0 ; store the previous group of 8; delayed because the next group still reads its last two unfiltered bytes
michael@0 218 movdq2q mm0, xmm1 ; hold this group's result until the next iteration
michael@0 219
michael@0 220 add rdx, 8
michael@0 221 cmp edx, dword arg(5) ;cols
michael@0 222 jl .acrossnextcol;
michael@0 223
michael@0 224 ; last 8 pixels
michael@0 225 movq QWORD PTR [rdi+rdx-8], mm0
michael@0 226
michael@0 227 ; done with this row
michael@0 228 add rsi,rax ; next line
michael@0 229 movsxd rax, dword arg(3) ;dst_pixels_per_line; sign-extend so a negative pitch stays negative in 64-bit
michael@0 230 add rdi,rax ; next destination
michael@0 231 movsxd rax, dword arg(2) ;src_pixels_per_line; reload source pitch, sign-extended like the initial load
michael@0 232
michael@0 233 dec rcx ; decrement count
michael@0 234 jnz .nextrow ; next row
michael@0 235
michael@0 236 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 237 add rsp,16 ; drop the stack copy of rd42
michael@0 238 pop rsp ; undo ALIGN_STACK (presumably it pushed the old rsp; macro is defined in the include)
michael@0 239 %endif
michael@0 240 ; begin epilog
michael@0 241 pop rdi
michael@0 242 pop rsi
michael@0 243 RESTORE_GOT
michael@0 244 RESTORE_XMM
michael@0 245 UNSHADOW_ARGS
michael@0 246 pop rbp
michael@0 247 ret
michael@0 248 %undef RD42
michael@0 249
michael@0 250
michael@0 251 ;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
michael@0 252 ; int pitch, int rows, int cols,int flimit)
;
; Vertical in-place filter, processed in strips of 8 columns.
; For every pixel a running sum (xmm5, words) and sum of squares
; (xmm6 = columns 0..3, xmm7 = columns 4..7, dwords) are kept over the 15 rows
; centred on the current row. Where 15*sumsq - sum*sum < flimit the pixel is
; replaced by (pixel + sum + vp9_rv[...]) >> 4, otherwise it is left unchanged.
; Results go through a 16-entry ring buffer d[16][8] on the stack and are
; written back 8 rows late, so rows still inside the window are read
; unfiltered. The loop runs rows+8 times to flush the buffer.
; Reads rows from s-8*pitch onward and stores to rows s-8*pitch..s-pitch as
; well; for those first 8 stores the ring-buffer slots have not yet been
; written in the current strip.
; arg(0), arg(2) and arg(3) are modified in their stack slots.
; Uses MMX register mm0; no emms is executed in this routine.
michael@0 253 extern sym(vp9_rv)
michael@0 254 global sym(vp9_mbpost_proc_down_xmm) PRIVATE
michael@0 255 sym(vp9_mbpost_proc_down_xmm):
michael@0 256 push rbp
michael@0 257 mov rbp, rsp
michael@0 258 SHADOW_ARGS_TO_STACK 5
michael@0 259 SAVE_XMM 7
michael@0 260 GET_GOT rbx
michael@0 261 push rsi
michael@0 262 push rdi
michael@0 263 ; end prolog
michael@0 264
michael@0 265 ALIGN_STACK 16, rax
michael@0 266 sub rsp, 128+16
michael@0 267
michael@0 268 ; unsigned char d[16][8] at [rsp]
michael@0 269 ; create flimit4 at [rsp+128]
michael@0 270 mov eax, dword ptr arg(4) ;flimit
michael@0 271 mov [rsp+128], eax
michael@0 272 mov [rsp+128+4], eax
michael@0 273 mov [rsp+128+8], eax
michael@0 274 mov [rsp+128+12], eax
michael@0 275 %define flimit4 [rsp+128]
michael@0 276
michael@0 277 %if ABI_IS_32BIT=0
michael@0 278 lea r8, [GLOBAL(sym(vp9_rv))] ; r8 = &vp9_rv, kept for the whole function
michael@0 279 %endif
michael@0 280
michael@0 281 ;rows +=8;
michael@0 282 add dword arg(2), 8
michael@0 283
michael@0 284 ;for(c=0; c<cols; c+=8)
michael@0 285 .loop_col:
michael@0 286 mov rsi, arg(0) ; s
michael@0 287 pxor xmm0, xmm0 ; xmm0 = 0, used for unpacking
michael@0 288
michael@0 289 movsxd rax, dword ptr arg(1) ;pitch ;
michael@0 290 neg rax ; rax = -pitch
michael@0 291
michael@0 292 lea rsi, [rsi + rax*8]; ; rsi = s - 8*pitch
michael@0 293 neg rax ; rax = +pitch again
michael@0 294
michael@0 295
michael@0 296 pxor xmm5, xmm5 ; sum (8 words)
michael@0 297 pxor xmm6, xmm6 ; sumsq, columns 0..3 (dwords)
michael@0 298
michael@0 299 pxor xmm7, xmm7 ; sumsq, columns 4..7 (dwords)
michael@0 300 mov rdi, rsi
michael@0 301
michael@0 302 mov rcx, 15 ; prime the window with 15 rows: s-8*pitch .. s+6*pitch
michael@0 303
michael@0 304 .loop_initvar:
michael@0 305 movq xmm1, QWORD PTR [rdi];
michael@0 306 punpcklbw xmm1, xmm0 ; xmm1 = 8 pixels as words
michael@0 307
michael@0 308 paddw xmm5, xmm1 ; sum += pixels
michael@0 309 pmullw xmm1, xmm1 ; xmm1 = pixels^2 (fits in 16 bits unsigned)
michael@0 310
michael@0 311 movdqa xmm2, xmm1 ;
michael@0 312 punpcklwd xmm1, xmm0 ; squares of columns 0..3 as dwords
michael@0 313
michael@0 314 punpckhwd xmm2, xmm0 ; squares of columns 4..7 as dwords
michael@0 315 paddd xmm6, xmm1 ;
michael@0 316
michael@0 317 paddd xmm7, xmm2 ;
michael@0 318 lea rdi, [rdi+rax] ; next row
michael@0 319
michael@0 320 dec rcx
michael@0 321 jne .loop_initvar
michael@0 322 ;save the var and sum
michael@0 323 xor rdx, rdx ; rdx = row counter
michael@0 324 .loop_row:
michael@0 325 movq xmm1, QWORD PTR [rsi] ; [s-pitch*8], the row leaving the window
michael@0 326 movq xmm2, QWORD PTR [rdi] ; [s+pitch*7], the row entering the window
michael@0 327
michael@0 328 punpcklbw xmm1, xmm0
michael@0 329 punpcklbw xmm2, xmm0
michael@0 330
michael@0 331 paddw xmm5, xmm2 ; sum += entering row
michael@0 332 psubw xmm5, xmm1 ; sum -= leaving row
michael@0 333
michael@0 334 pmullw xmm2, xmm2
michael@0 335 movdqa xmm4, xmm2
michael@0 336
michael@0 337 punpcklwd xmm2, xmm0
michael@0 338 punpckhwd xmm4, xmm0
michael@0 339
michael@0 340 paddd xmm6, xmm2 ; sumsq += entering^2
michael@0 341 paddd xmm7, xmm4
michael@0 342
michael@0 343 pmullw xmm1, xmm1
michael@0 344 movdqa xmm2, xmm1
michael@0 345
michael@0 346 punpcklwd xmm1, xmm0
michael@0 347 psubd xmm6, xmm1 ; sumsq -= leaving^2
michael@0 348
michael@0 349 punpckhwd xmm2, xmm0
michael@0 350 psubd xmm7, xmm2
michael@0 351
michael@0 352
michael@0 353 movdqa xmm3, xmm6
michael@0 354 pslld xmm3, 4
michael@0 355
michael@0 356 psubd xmm3, xmm6 ; xmm3 = 15 * sumsq (columns 0..3)
michael@0 357 movdqa xmm1, xmm5
michael@0 358
michael@0 359 movdqa xmm4, xmm5
michael@0 360 pmullw xmm1, xmm1 ; low 16 bits of sum*sum
michael@0 361
michael@0 362 pmulhw xmm4, xmm4 ; high 16 bits of sum*sum
michael@0 363 movdqa xmm2, xmm1
michael@0 364
michael@0 365 punpcklwd xmm1, xmm4 ; xmm1 = sum*sum as dwords, columns 0..3
michael@0 366 punpckhwd xmm2, xmm4 ; xmm2 = sum*sum as dwords, columns 4..7
michael@0 367
michael@0 368 movdqa xmm4, xmm7
michael@0 369 pslld xmm4, 4
michael@0 370
michael@0 371 psubd xmm4, xmm7 ; xmm4 = 15 * sumsq (columns 4..7)
michael@0 372
michael@0 373 psubd xmm3, xmm1 ; 15*sumsq - sum*sum
michael@0 374 psubd xmm4, xmm2
michael@0 375
michael@0 376 psubd xmm3, flimit4
michael@0 377 psubd xmm4, flimit4
michael@0 378
michael@0 379 psrad xmm3, 31 ; all ones where 15*sumsq - sum*sum < flimit
michael@0 380 psrad xmm4, 31
michael@0 381
michael@0 382 packssdw xmm3, xmm4
michael@0 383 packsswb xmm3, xmm0 ; xmm3 = per-pixel byte mask (low 8 bytes)
michael@0 384
michael@0 385 movq xmm1, QWORD PTR [rsi+rax*8] ; current row (8 rows below rsi)
michael@0 386
michael@0 387 movq xmm2, xmm1 ; xmm2 = unfiltered pixels
michael@0 388 punpcklbw xmm1, xmm0
michael@0 389
michael@0 390 paddw xmm1, xmm5 ; pixel + sum
michael@0 391 mov rcx, rdx
michael@0 392
michael@0 393 and rcx, 127 ; rcx = row & 127 indexes vp9_rv
michael@0 394 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0 395 push rax
michael@0 396 lea rax, [GLOBAL(sym(vp9_rv))]
michael@0 397 movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
michael@0 398 pop rax
michael@0 399 %elif ABI_IS_32BIT=0
michael@0 400 movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
michael@0 401 %else
michael@0 402 movdqu xmm4, [sym(vp9_rv) + rcx*2]
michael@0 403 %endif
michael@0 404
michael@0 405 paddw xmm1, xmm4 ; add the per-row values loaded from vp9_rv
michael@0 406 ;paddw xmm1, eight8s
michael@0 407 psraw xmm1, 4
michael@0 408
michael@0 409 packuswb xmm1, xmm0
michael@0 410 pand xmm1, xmm3 ; filtered value where the mask is set
michael@0 411
michael@0 412 pandn xmm3, xmm2 ; original value elsewhere
michael@0 413 por xmm1, xmm3
michael@0 414
michael@0 415 and rcx, 15
michael@0 416 movq QWORD PTR [rsp + rcx*8], xmm1 ;d[row & 15] = result for the current row
michael@0 417
michael@0 418 mov rcx, rdx
michael@0 419 sub rcx, 8
michael@0 420
michael@0 421 and rcx, 15
michael@0 422 movq mm0, [rsp + rcx*8] ;d[(row - 8) & 15], the result computed 8 rows ago
michael@0 423
michael@0 424 movq [rsi], mm0 ; write it to the row that just left the window
michael@0 425 lea rsi, [rsi+rax]
michael@0 426
michael@0 427 lea rdi, [rdi+rax]
michael@0 428 add rdx, 1
michael@0 429
michael@0 430 cmp edx, dword arg(2) ;rows
michael@0 431 jl .loop_row
michael@0 432
michael@0 433 mov rax, 8 ; rax is dead here; .loop_col reloads the pitch
add arg(0), rax ; s += 8 at pointer width (a dword add would drop the carry into the upper half in 64-bit)
michael@0 434 sub dword arg(3), 8 ; cols -= 8
michael@0 435 cmp dword arg(3), 0
michael@0 436 jg .loop_col
michael@0 437
michael@0 438 add rsp, 128+16
michael@0 439 pop rsp
michael@0 440
michael@0 441 ; begin epilog
michael@0 442 pop rdi
michael@0 443 pop rsi
michael@0 444 RESTORE_GOT
michael@0 445 RESTORE_XMM
michael@0 446 UNSHADOW_ARGS
michael@0 447 pop rbp
michael@0 448 ret
michael@0 449 %undef flimit4
michael@0 450
michael@0 451
michael@0 452 ;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
michael@0 453 ; int pitch, int rows, int cols,int flimit)
;
; Horizontal in-place filter, one row at a time, 4 pixels per iteration.
; A running sum (xmm6) and sum of squares (xmm7) are kept over the 15 pixels
; centred on each pixel. Where 15*sumsq - sum*sum < flimit the pixel is
; replaced by (pixel + sum + 8) >> 4, otherwise it is left unchanged.
; Results are written back two iterations (8 pixels) late through mm0/mm1 so
; pixels still inside the window are read unfiltered; the column loop runs to
; cols+8 to flush them. The first two stores of each row write the zeroed
; mm0/mm1 to s[-8..-1].
; Reads s[-8] .. s[cols+14] of every row. arg(0) and arg(2) are modified in
; their stack slots. Uses MMX registers; no emms is executed in this routine.
michael@0 454 global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
michael@0 455 sym(vp9_mbpost_proc_across_ip_xmm):
michael@0 456 push rbp
michael@0 457 mov rbp, rsp
michael@0 458 SHADOW_ARGS_TO_STACK 5
michael@0 459 SAVE_XMM 7
michael@0 460 GET_GOT rbx
michael@0 461 push rsi
michael@0 462 push rdi
michael@0 463 ; end prolog
michael@0 464
michael@0 465 ALIGN_STACK 16, rax
michael@0 466 sub rsp, 16
michael@0 467
michael@0 468 ; create flimit4 at [rsp]
michael@0 469 mov eax, dword ptr arg(4) ;flimit
michael@0 470 mov [rsp], eax
michael@0 471 mov [rsp+4], eax
michael@0 472 mov [rsp+8], eax
michael@0 473 mov [rsp+12], eax
michael@0 474 %define flimit4 [rsp]
michael@0 475
michael@0 476
michael@0 477 ;for(r=0;r<rows;r++)
michael@0 478 .ip_row_loop:
michael@0 479
michael@0 480 xor rdx, rdx ;sumsq=0;
michael@0 481 xor rcx, rcx ;sum=0;
michael@0 482 mov rsi, arg(0); s
michael@0 483 mov rdi, -8
michael@0 484 .ip_var_loop:
michael@0 485 ;for(i=-8;i<=6;i++)
michael@0 486 ;{
michael@0 487 ; sumsq += s[i]*s[i];
michael@0 488 ; sum += s[i];
michael@0 489 ;}
michael@0 490 movzx eax, byte [rsi+rdi]
michael@0 491 add ecx, eax
michael@0 492 mul al ; ax = al*al; upper half of eax is already zero from movzx
michael@0 493 add edx, eax
michael@0 494 add rdi, 1
michael@0 495 cmp rdi, 6
michael@0 496 jle .ip_var_loop
michael@0 497
michael@0 498
michael@0 499 ;mov rax, sumsq
michael@0 500 ;movd xmm7, rax
michael@0 501 movd xmm7, edx ; xmm7 lane 0 = sumsq, other lanes 0
michael@0 502
michael@0 503 ;mov rax, sum
michael@0 504 ;movd xmm6, rax
michael@0 505 movd xmm6, ecx ; xmm6 lane 0 = sum, other lanes 0
michael@0 506
michael@0 507 mov rsi, arg(0) ;s
michael@0 508 xor rcx, rcx ; rcx = column index
michael@0 509
michael@0 510 movsxd rdx, dword arg(3) ;cols
michael@0 511 add rdx, 8 ; loop bound = cols + 8 to flush the delayed stores
michael@0 512 pxor mm0, mm0 ; delayed-store pipeline starts out as zeros
michael@0 513 pxor mm1, mm1
michael@0 514
michael@0 515 pxor xmm0, xmm0
michael@0 516 .nextcol4:
michael@0 517
michael@0 518 movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
michael@0 519 movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
michael@0 520
michael@0 521 punpcklbw xmm1, xmm0 ; expanding
michael@0 522 punpcklbw xmm2, xmm0 ; expanding
michael@0 523
michael@0 524 punpcklwd xmm1, xmm0 ; expanding to dwords
michael@0 525 punpcklwd xmm2, xmm0 ; expanding to dwords
michael@0 526
michael@0 527 psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
michael@0 528 paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
michael@0 529
michael@0 530 paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
michael@0 531 pmaddwd xmm1, xmm2 ; (new+old)*(new-old) = new^2 - old^2 for each of the 4 pixels
michael@0 532
michael@0 533 paddd xmm6, xmm2 ; lane 0: sum for the first pixel of this group
michael@0 534 paddd xmm7, xmm1 ; lane 0: sumsq for the first pixel of this group
michael@0 535
michael@0 536 pshufd xmm6, xmm6, 0 ; broadcast lane 0 to all four lanes
michael@0 537 pshufd xmm7, xmm7, 0 ; broadcast lane 0 to all four lanes
michael@0 538
michael@0 539 psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
michael@0 540 psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
michael@0 541
michael@0 542 pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
michael@0 543 pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7
michael@0 544
michael@0 545 paddd xmm6, xmm4
michael@0 546 paddd xmm7, xmm3
michael@0 547
michael@0 548 pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
michael@0 549 pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6
michael@0 550
michael@0 551 paddd xmm7, xmm3
michael@0 552 paddd xmm6, xmm4
michael@0 553
michael@0 554 pshufd xmm3, xmm1, 10111111b ; 0000 0000 0000 10--5 squared
michael@0 555 pshufd xmm4, xmm2, 10111111b ; 0000 0000 0000 10--5
michael@0 556
michael@0 557 paddd xmm7, xmm3 ; xmm7 lanes 0..3 = sumsq for pixels c..c+3
michael@0 558 paddd xmm6, xmm4 ; xmm6 lanes 0..3 = sum for pixels c..c+3
michael@0 559
michael@0 560 movdqa xmm3, xmm6
michael@0 561 pmaddwd xmm3, xmm3 ; xmm3 = sum*sum per lane
michael@0 562
michael@0 563 movdqa xmm5, xmm7
michael@0 564 pslld xmm5, 4
michael@0 565
michael@0 566 psubd xmm5, xmm7 ; 15 * sumsq
michael@0 567 psubd xmm5, xmm3 ; 15*sumsq - sum*sum
michael@0 568
michael@0 569 psubd xmm5, flimit4
michael@0 570 psrad xmm5, 31 ; all ones where 15*sumsq - sum*sum < flimit
michael@0 571
michael@0 572 packssdw xmm5, xmm0
michael@0 573 packsswb xmm5, xmm0 ; xmm5 = per-pixel byte mask (low 4 bytes)
michael@0 574
michael@0 575 movd xmm1, DWORD PTR [rsi+rcx] ; the 4 pixels being filtered
michael@0 576 movq xmm2, xmm1 ; xmm2 = unfiltered pixels
michael@0 577
michael@0 578 punpcklbw xmm1, xmm0
michael@0 579 punpcklwd xmm1, xmm0
michael@0 580
michael@0 581 paddd xmm1, xmm6 ; pixel + sum
michael@0 582 paddd xmm1, [GLOBAL(four8s)] ; + 8 for rounding
michael@0 583
michael@0 584 psrad xmm1, 4
michael@0 585 packssdw xmm1, xmm0
michael@0 586
michael@0 587 packuswb xmm1, xmm0
michael@0 588 pand xmm1, xmm5 ; filtered value where the mask is set
michael@0 589
michael@0 590 pandn xmm5, xmm2 ; original value elsewhere
michael@0 591 por xmm5, xmm1 ; xmm5 = result for pixels c..c+3
michael@0 592
michael@0 593 movd [rsi+rcx-8], mm0 ; store the result computed two iterations ago
michael@0 594 movq mm0, mm1 ; advance the two-stage delay
michael@0 595
michael@0 596 movdq2q mm1, xmm5
michael@0 597 psrldq xmm7, 12 ; keep only the last pixel's sumsq, in lane 0
michael@0 598
michael@0 599 psrldq xmm6, 12 ; keep only the last pixel's sum, in lane 0
michael@0 600 add rcx, 4
michael@0 601
michael@0 602 cmp rcx, rdx
michael@0 603 jl .nextcol4
michael@0 604
michael@0 605 ;s+=pitch;
michael@0 606 movsxd rax, dword arg(1)
michael@0 607 add arg(0), rax
michael@0 608
michael@0 609 sub dword arg(2), 1 ;rows-=1
michael@0 610 cmp dword arg(2), 0
michael@0 611 jg .ip_row_loop
michael@0 612
michael@0 613 add rsp, 16
michael@0 614 pop rsp
michael@0 615
michael@0 616 ; begin epilog
michael@0 617 pop rdi
michael@0 618 pop rsi
michael@0 619 RESTORE_GOT
michael@0 620 RESTORE_XMM
michael@0 621 UNSHADOW_ARGS
michael@0 622 pop rbp
michael@0 623 ret
michael@0 624 %undef flimit4
michael@0 625
michael@0 626
michael@0 627 ;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
michael@0 628 ; unsigned char blackclamp[16],
michael@0 629 ; unsigned char whiteclamp[16],
michael@0 630 ; unsigned char bothclamp[16],
michael@0 631 ; unsigned int width, unsigned int height, int pitch)
;
; Adds noise to a plane in place, one row at a time, 16 bytes per step.
; For each row a start offset (rand() & 0xff) into the noise buffer is chosen.
; Each 16-byte group is clamped with saturating byte arithmetic
; (-blackclamp, +bothclamp, -whiteclamp) and the noise bytes are then added
; with a wrapping byte add.
; Only arg(2) is used to reach the clamp vectors: whiteclamp is read at
; [blackclamp+16] and bothclamp at [blackclamp+32]; arg(3) and arg(4) are
; never loaded. The clamp vectors are used as direct memory operands of
; psubusb/paddusb, which require 16-byte alignment.
; The inner loop continues while the byte offset is below width, so whole
; 16-byte groups are always processed. arg(0) and arg(6) are modified in their
; stack slots.
michael@0 632 extern sym(rand)
michael@0 633 global sym(vp9_plane_add_noise_wmt) PRIVATE
michael@0 634 sym(vp9_plane_add_noise_wmt):
michael@0 635 push rbp
michael@0 636 mov rbp, rsp
michael@0 637 SHADOW_ARGS_TO_STACK 8
michael@0 638 GET_GOT rbx
michael@0 639 push rsi
michael@0 640 push rdi
michael@0 641 ; end prolog
michael@0 642
michael@0 643 .addnoise_loop:
michael@0 644 call sym(rand) WRT_PLT
michael@0 645 mov rcx, arg(1) ;noise
michael@0 646 and rax, 0xff ; keep the low 8 bits of rand() as the noise offset
michael@0 647 add rcx, rax ; rcx = noise + offset
michael@0 648
michael@0 649 ; we rely on the fact that the clamping vectors are stored contiguously
michael@0 650 ; in black/white/both order. Note that we have to reload this here because
michael@0 651 ; rdx could be trashed by rand()
michael@0 652 mov rdx, arg(2) ; blackclamp
michael@0 653
michael@0 654
michael@0 655 mov rdi, rcx ; rdi = noise source for this row
michael@0 656 movsxd rcx, dword arg(5) ;[Width]
michael@0 657 mov rsi, arg(0) ;Pos
michael@0 658 xor rax,rax ; rax = byte offset within the row
michael@0 659
michael@0 660 .addnoise_nextset:
michael@0 661 movdqu xmm1,[rsi+rax] ; get the source
michael@0 662
michael@0 663 psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
michael@0 664 paddusb xmm1, [rdx+32] ;bothclamp
michael@0 665 psubusb xmm1, [rdx+16] ;whiteclamp
michael@0 666
michael@0 667 movdqu xmm2,[rdi+rax] ; get the noise for this line
michael@0 668 paddb xmm1,xmm2 ; add it in
michael@0 669 movdqu [rsi+rax],xmm1 ; store the result
michael@0 670
michael@0 671 add rax,16 ; move to the next 16 bytes of this line
michael@0 672
michael@0 673 cmp rax, rcx
michael@0 674 jl .addnoise_nextset
michael@0 675
michael@0 676 movsxd rax, dword arg(7) ; Pitch
michael@0 677 add arg(0), rax ; Start += Pitch
michael@0 678 sub dword arg(6), 1 ; Height -= 1
michael@0 679 jg .addnoise_loop ; uses the flags from the sub above: loop while Height > 0 (signed)
michael@0 680
michael@0 681 ; begin epilog
michael@0 682 pop rdi
michael@0 683 pop rsi
michael@0 684 RESTORE_GOT
michael@0 685 UNSHADOW_ARGS
michael@0 686 pop rbp
michael@0 687 ret
michael@0 688
michael@0 689
; Read-only constants, 16-byte aligned so they can be used as SSE memory operands.
michael@0 690 SECTION_RODATA
michael@0 691 align 16
; rd42: eight words of 4, the rounding term added before the >>3 in
; vp9_post_proc_down_and_across_xmm (referenced there as RD42).
michael@0 692 rd42:
michael@0 693 times 8 dw 0x04
; four8s: four dwords of 8, the rounding term added before the >>4 in
; vp9_mbpost_proc_across_ip_xmm.
michael@0 694 four8s:
michael@0 695 times 4 dd 8

mercurial