media/libvpx/vp8/common/x86/variance_impl_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 signed 16-bit values that start
; at src_ptr (16 iterations, 32 bytes consumed per iteration).
;
; Register roles:
;   rax : read cursor into src_ptr, advanced by 32 bytes per iteration
;   rcx : iterations remaining
;   mm4 : two 32-bit partial sums of squares
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): the prototype above declares one argument but seven are
    ; shadowed here; count left as found -- confirm against
    ; vpx_ports/x86_abi_support.asm before changing it.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                  ; 8-byte scratch slot for the final fold
    ; end prolog

        mov         rax, arg(0)         ;src_ptr
        mov         rcx, 16
        pxor        mm4, mm4            ; accumulator = 0

.NEXTROW:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0            ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec does not write CF, so the loop must be closed on ZF alone.
        ; The previous `ja` also tested the carry left over from the
        ; pointer increment above.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 64
michael@0 65
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr. Stores the sum of
; the (src - ref) differences to *Sum, the sum of their squares to *SSE, and
; returns 0.
;
; Register roles:
;   rax / rcx : src row pointer / source_stride (sign-extended)
;   rbx / rdx : ref row pointer / recon_stride (sign-extended)
;   mm1       : ref row, loaded one row ahead of the row that consumes it
;   mm5       : four 16-bit running sums of differences
;   mm6       : zero, used to widen bytes to words
;   mm7       : two 32-bit running sums of squared differences

; One 8-pixel row. Expects the ref row in mm1 and the src row at [rax];
; leaves both row pointers advanced by their strides.
%macro VP8_GET8X8VAR_ROW 0
        movq        mm0, [rax]          ; src row
        movq        mm2, mm0            ; second copy for the high half
        movq        mm3, mm1            ; second copy of the ref row

        punpcklbw   mm0, mm6            ; src  pixels 0-3 -> words
        punpcklbw   mm1, mm6            ; ref  pixels 0-3 -> words
        punpckhbw   mm2, mm6            ; src  pixels 4-7 -> words
        punpckhbw   mm3, mm6            ; ref  pixels 4-7 -> words
        psubsw      mm0, mm1            ; differences, low half
        psubsw      mm2, mm3            ; differences, high half

        paddw       mm5, mm0            ; running sum of differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0            ; squared differences, pairwise added
        pmaddwd     mm2, mm2
        paddd       mm7, mm0            ; running sum of squares
        paddd       mm7, mm2

        add         rbx, rdx            ; next ref row
        add         rax, rcx            ; next src row
%endmacro

global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; scratch: [rsp] = mm7, [rsp+8] = mm5
    ; end prolog

        pxor        mm5, mm5            ; sum accumulator = 0
        pxor        mm6, mm6            ; constant zero
        pxor        mm7, mm7            ; sum-of-squares accumulator = 0

        mov         rax, arg(0)         ;src_ptr
        mov         rbx, arg(2)         ;ref_ptr
        movsxd      rcx, dword ptr arg(1) ;source_stride
        movsxd      rdx, dword ptr arg(3) ;recon_stride

        movq        mm1, [rbx]          ; ref row for the first iteration

        ; Rows 1-7: process the row, then fetch the ref row for the next one.
%rep 7
        VP8_GET8X8VAR_ROW
        movq        mm1, [rbx]
%endrep

        ; Row 8: nothing further to fetch.
        VP8_GET8X8VAR_ROW

        ; Spill both accumulators and fold their lanes with scalar code.
        movq        QWORD PTR [rsp+8], mm5
        movq        QWORD PTR [rsp], mm7
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx            ; rdx = sum of differences
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx            ; rax = sum of squared differences
        mov         rsi, arg(4)         ;SSE
        mov         rdi, arg(5)         ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax            ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 310
michael@0 311
michael@0 312
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks four rows of four bytes from src_ptr and ref_ptr. Stores the sum of
; the (src - ref) differences to *Sum, the sum of their squares to *SSE, and
; returns 0.
;
; Rows are loaded with movd (4 bytes). Only the low four bytes of each row
; are consumed by punpcklbw, so the previous 8-byte movq loads read four
; bytes beyond every row (and beyond the block on the last row) for no
; benefit.
;
; Register roles:
;   rax / rcx : src row pointer / source_stride (sign-extended)
;   rbx / rdx : ref row pointer / recon_stride (sign-extended)
;   mm1       : ref row, loaded one row ahead of the row that consumes it
;   mm5       : four 16-bit running sums of differences
;   mm6       : zero, used to widen bytes to words
;   mm7       : two 32-bit running sums of squared differences

; One 4-pixel row. Expects the ref row in the low dword of mm1 and the src
; row at [rax]; does not move either row pointer.
%macro VP8_GET4X4VAR_ROW 0
        movd        mm0, [rax]          ; four src pixels
        punpcklbw   mm0, mm6            ; src -> words
        punpcklbw   mm1, mm6            ; ref -> words
        psubsw      mm0, mm1            ; differences
        paddw       mm5, mm0            ; running sum of differences
        pmaddwd     mm0, mm0            ; squared differences, pairwise added
        paddd       mm7, mm0            ; running sum of squares
%endmacro

global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; scratch: [rsp] = mm7, [rsp+8] = mm5
    ; end prolog

        pxor        mm5, mm5            ; sum accumulator = 0
        pxor        mm6, mm6            ; constant zero
        pxor        mm7, mm7            ; sum-of-squares accumulator = 0

        mov         rax, arg(0)         ;src_ptr
        mov         rbx, arg(2)         ;ref_ptr
        movsxd      rcx, dword ptr arg(1) ;source_stride
        movsxd      rdx, dword ptr arg(3) ;recon_stride

        movd        mm1, [rbx]          ; ref row for the first iteration

        ; Rows 1-3: process the row, step both pointers, fetch the next ref row.
%rep 3
        VP8_GET4X4VAR_ROW
        add         rbx, rdx            ; next ref row
        add         rax, rcx            ; next src row
        movd        mm1, [rbx]
%endrep

        ; Row 4: last row, no pointer update needed.
        VP8_GET4X4VAR_ROW

        ; Spill both accumulators and fold their lanes with scalar code.
        movq        QWORD PTR [rsp+8], mm5
        movq        QWORD PTR [rsp], mm7
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx            ; rdx = sum of differences
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx            ; rax = sum of squared differences
        mov         rsi, arg(4)         ;SSE
        mov         rdi, arg(5)         ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax            ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 425
michael@0 426
michael@0 427
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Sums the squared (src - ref) differences over four rows of four bytes and
; hands the total back through rax.
;
; Register roles:
;   rax / rcx : src row pointer / source_stride (sign-extended)
;   rbx / rdx : ref row pointer / recon_stride (sign-extended)
;   mm1       : ref row, loaded one row ahead of the row that consumes it
;   mm6       : zero, used to widen bytes to words
;   mm7       : two 32-bit running sums of squared differences

; One 4-pixel row. Expects the ref row in the low dword of mm1 and the src
; row at [rax]; does not move either row pointer.
%macro VP8_GET4X4SSE_ROW 0
        movd        mm0, [rax]          ; four src pixels
        punpcklbw   mm0, mm6            ; src -> words
        punpcklbw   mm1, mm6            ; ref -> words
        psubsw      mm0, mm1            ; differences
        pmaddwd     mm0, mm0            ; squared differences, pairwise added
        paddd       mm7, mm0            ; running sum of squares
%endmacro

global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        pxor        mm6, mm6            ; constant zero
        pxor        mm7, mm7            ; sum-of-squares accumulator = 0

        mov         rax, arg(0)         ;src_ptr
        mov         rbx, arg(2)         ;ref_ptr
        movsxd      rcx, dword ptr arg(1) ;source_stride
        movsxd      rdx, dword ptr arg(3) ;recon_stride

        movd        mm1, [rbx]          ; ref row for the first iteration

        ; Rows 1-3: process the row, step both pointers, fetch the next ref row.
%rep 3
        VP8_GET4X4SSE_ROW
        add         rbx, rdx            ; next ref row
        add         rax, rcx            ; next src row
        movd        mm1, [rbx]
%endrep

        ; Row 4: last row, no pointer update needed.
        VP8_GET4X4SSE_ROW

        ; Fold the two 32-bit lanes of mm7 into the low lane of mm0.
        movq        mm0, mm7
        psrlq       mm7, 32

        paddd       mm0, mm7
        movq        rax, mm0


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 511
; Right shift applied after each filter pass; paired with the mmx_bi_rd
; rounding constant that is added just before the shift.
%define mmx_filter_shift    7

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters a 4-wide block from ref_ptr horizontally and then vertically,
; subtracts the matching src_ptr row from each of the four filtered rows, and
; stores the sum of those differences to *sum and the sum of their squares to
; *sumsquared.
;
; HFilter and VFilter are each read as two groups of four words: the group at
; offset 0 weights the current pixel/row, the group at offset 8 weights the
; next pixel/row.
;
; Register roles:
;   rax / rdx : HFilter / VFilter
;   rsi / rdi : ref row pointer / src row pointer
;   rcx       : rows remaining
;   mm0       : zero, used to widen bytes to words
;   mm5       : horizontally filtered previous row
;   mm6       : four 16-bit running sums of differences
;   mm7       : two 32-bit running sums of squared differences

; Horizontal pass over the four pixels at [rsi]:
;   mm1 = (p[i]*HFilter[i] + p[i+1]*HFilter[4+i] + round) >> mmx_filter_shift
; Clobbers mm3.
%macro VP8_BIL4X4_HPASS 0
        movd        mm1, [rsi]          ; pixels 0-3
        movd        mm3, [rsi+1]        ; pixels 1-4

        punpcklbw   mm1, mm0
        pmullw      mm1, [rax]

        punpcklbw   mm3, mm0
        pmullw      mm3, [rax+8]

        paddw       mm1, mm3
        paddw       mm1, [GLOBAL(mmx_bi_rd)]

        psraw       mm1, mmx_filter_shift
%endmacro

global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        pxor        mm6, mm6            ; sum accumulator = 0
        pxor        mm7, mm7            ; sum-of-squares accumulator = 0

        mov         rax, arg(4)         ;HFilter
        mov         rdx, arg(5)         ;VFilter

        mov         rsi, arg(0)         ;ref_ptr
        mov         rdi, arg(2)         ;src_ptr

        mov         rcx, 4              ; four output rows
        pxor        mm0, mm0            ; constant zero

        ; Prime mm5 with the horizontally filtered first ref row.
        VP8_BIL4X4_HPASS
        movq        mm5, mm1

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
        add         rsi, r8
%endif

.next_row:
        VP8_BIL4X4_HPASS                ; mm1 = filtered current row

        ; Vertical pass: blend the previous row (mm5) with the current (mm1),
        ; keeping the current row in mm5 for the next iteration.
        movq        mm3, mm5
        movq        mm5, mm1

        pmullw      mm3, [rdx]
        pmullw      mm1, [rdx+8]
        paddw       mm1, mm3

        paddw       mm1, [GLOBAL(mmx_bi_rd)]
        psraw       mm1, mmx_filter_shift

        ; Difference against the src row, then accumulate.
        movd        mm3, [rdi]
        punpcklbw   mm3, mm0

        psubw       mm1, mm3
        paddw       mm6, mm1

        pmaddwd     mm1, mm1
        paddd       mm7, mm1

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1) ;ref_pixels_per_line
        add         rdi, dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
        movsxd      r9, dword ptr arg(3) ;src_pixels_per_line
        add         rsi, r8
        add         rdi, r9
%endif
        sub         rcx, 1
        jnz         .next_row

        ; Fold the four word sums in mm6: place each word in the upper half
        ; of a dword, add the dwords, then shift back down arithmetically so
        ; the total keeps its sign.
        pxor        mm3, mm3
        pxor        mm2, mm2

        punpcklwd   mm2, mm6
        punpckhwd   mm3, mm6

        paddd       mm2, mm3
        movq        mm6, mm2

        psrlq       mm6, 32
        paddd       mm2, mm6

        psrad       mm2, 16

        ; Fold the two dword sums of squares in mm7 into the low lane of mm4.
        movq        mm4, mm7

        psrlq       mm4, 32
        paddd       mm4, mm7

        mov         rdi, arg(6)         ;sum
        mov         rsi, arg(7)         ;sumsquared

        movd        dword ptr [rdi], mm2
        movd        dword ptr [rsi], mm4


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 654
michael@0 655
michael@0 656
michael@0 657
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters an 8-wide block from ref_ptr horizontally and then vertically,
; subtracts the matching src_ptr row from each of the Height filtered rows,
; and stores the sum of those differences to *sum and the sum of their
; squares to *sumsquared.
;
; HFilter and VFilter are each read as two groups of four words: the group at
; offset 0 weights the current pixel/row, the group at offset 8 weights the
; next pixel/row.
;
; Register roles:
;   rax / rdx : HFilter / VFilter
;   rsi / rdi : ref row pointer / src row pointer
;   rcx       : rows remaining (Height)
;   mm0       : zero, used to widen bytes to words
;   mm5       : horizontally filtered previous row, packed back to bytes
;   mm6       : four 16-bit running sums of differences
;   mm7       : two 32-bit running sums of squared differences

; Horizontal pass over the eight pixels at [rsi]:
;   mm1 = filtered pixels 0-3, mm2 = filtered pixels 4-7 (as words)
; Clobbers mm3 and mm4.
%macro VP8_BIL_VAR_HPASS 0
        movq        mm1, [rsi]          ; pixels 0-7
        movq        mm3, [rsi+1]        ; pixels 1-8

        movq        mm2, mm1
        movq        mm4, mm3

        punpcklbw   mm1, mm0
        punpckhbw   mm2, mm0

        pmullw      mm1, [rax]
        pmullw      mm2, [rax]

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        pmullw      mm3, [rax+8]
        pmullw      mm4, [rax+8]

        paddw       mm1, mm3
        paddw       mm2, mm4

        paddw       mm1, [GLOBAL(mmx_bi_rd)]
        psraw       mm1, mmx_filter_shift

        paddw       mm2, [GLOBAL(mmx_bi_rd)]
        psraw       mm2, mmx_filter_shift
%endmacro

global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        pxor        mm6, mm6            ; sum accumulator = 0
        pxor        mm7, mm7            ; sum-of-squares accumulator = 0

        mov         rax, arg(5)         ;HFilter
        mov         rdx, arg(6)         ;VFilter

        mov         rsi, arg(0)         ;ref_ptr
        mov         rdi, arg(2)         ;src_ptr

        movsxd      rcx, dword ptr arg(4) ;Height
        pxor        mm0, mm0            ; constant zero

        ; Prime mm5 with the horizontally filtered first ref row.
        VP8_BIL_VAR_HPASS
        movq        mm5, mm1
        packuswb    mm5, mm2

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
        add         rsi, r8
%endif

.next_row:
        VP8_BIL_VAR_HPASS               ; mm1/mm2 = filtered current row

        ; Widen the saved previous row again, then save the current row.
        movq        mm3, mm5
        movq        mm4, mm5

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        movq        mm5, mm1
        packuswb    mm5, mm2

        ; Vertical pass: blend previous row (mm3/mm4) with current (mm1/mm2).
        pmullw      mm3, [rdx]
        pmullw      mm4, [rdx]

        pmullw      mm1, [rdx+8]
        pmullw      mm2, [rdx+8]

        paddw       mm1, mm3
        paddw       mm2, mm4

        paddw       mm1, [GLOBAL(mmx_bi_rd)]
        paddw       mm2, [GLOBAL(mmx_bi_rd)]

        psraw       mm1, mmx_filter_shift
        psraw       mm2, mmx_filter_shift

        ; Difference against the src row, then accumulate.
        movq        mm3, [rdi]
        movq        mm4, mm3

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        psubw       mm1, mm3
        psubw       mm2, mm4

        paddw       mm6, mm1
        pmaddwd     mm1, mm1

        paddw       mm6, mm2
        pmaddwd     mm2, mm2

        paddd       mm7, mm1
        paddd       mm7, mm2

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1) ;ref_pixels_per_line
        add         rdi, dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
        movsxd      r9, dword ptr arg(3) ;src_pixels_per_line
        add         rsi, r8
        add         rdi, r9
%endif
        sub         rcx, 1
        jnz         .next_row

        ; Fold the four word sums in mm6: place each word in the upper half
        ; of a dword, add the dwords, then shift back down arithmetically so
        ; the total keeps its sign.
        pxor        mm3, mm3
        pxor        mm2, mm2

        punpcklwd   mm2, mm6
        punpckhwd   mm3, mm6

        paddd       mm2, mm3
        movq        mm6, mm2

        psrlq       mm6, 32
        paddd       mm2, mm6

        psrad       mm2, 16

        ; Fold the two dword sums of squares in mm7 into the low lane of mm4.
        movq        mm4, mm7

        psrlq       mm4, 32
        paddd       mm4, mm7

        mov         rdi, arg(7)         ;sum
        mov         rsi, arg(8)         ;sumsquared

        movd        dword ptr [rdi], mm2
        movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 845
michael@0 846
SECTION_RODATA
; short mmx_bi_rd[4] = { 64, 64, 64, 64 };
; Rounding term added to each word lane before the arithmetic right shift by
; mmx_filter_shift in the bilinear filter routines.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64

mercurial