media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 signed 16-bit values that start
; at src_ptr (16 loop iterations, 32 bytes = 16 words per iteration).
;
; Register use:
;   rax     - read pointer, advanced by 32 bytes per iteration
;   rcx     - iterations remaining
;   mm4     - two 32-bit partial sums of squares
;   mm0-mm3 - scratch
; No emms is executed in this routine.
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only one argument is declared above; 7 looks larger than
    ; needed - confirm against the macro definition before changing it.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                  ; 8-byte scratch slot used to spill mm4
    ; end prolog

        mov         rax, arg(0)         ;src_ptr
        mov         rcx, 16             ; 16 iterations x 16 words = 256 values
        pxor        mm4, mm4            ; clear both partial sums

.NEXTROW:
        movq        mm0, [rax]          ; 4 words each
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0            ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec leaves CF untouched, so the branch must depend on ZF only.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 64
michael@0 65
;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks 8 rows of 8 bytes from src_ptr and ref_ptr (stepping by the
; respective stride) and stores
;   *Sum = sum of (src - ref) over the block
;   *SSE = sum of (src - ref)^2 over the block
; The function's own return value is always 0.
;
; Register use:
;   rax / rcx - source pointer / source stride
;   rbx / rdx - reference pointer / reference stride
;   mm5       - four 16-bit running sums of differences
;   mm6       - zero, used to widen bytes to words
;   mm7       - two 32-bit running sums of squared differences
;   mm0-mm3   - scratch
; No emms is executed in this routine.

; Process the 8-byte row at [rax] (source) and [rbx] (reference):
; widen to words, subtract, add the differences into mm5 and the squared
; differences into mm7. Expects mm6 == 0. Clobbers mm0-mm3. Does not move
; the pointers.
%macro VP9_VAR8X8_ROW_MMX 0
        movq        mm0, [rax]          ; 8 source bytes
        movq        mm1, [rbx]          ; 8 reference bytes
        movq        mm2, mm0            ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6            ; low 4 bytes  -> 4 words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6            ; high 4 bytes -> 4 words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1            ; src - ref (low half)
        psubsw      mm2, mm3            ; src - ref (high half)

        paddw       mm5, mm0            ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0            ; square and pair-add
        pmaddwd     mm2, mm2
        paddd       mm7, mm0            ; accumulate squared differences
        paddd       mm7, mm2
%endmacro

global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; 16 bytes of scratch for mm5 / mm7
    ; end prolog

        pxor        mm5, mm5            ; clear the difference accumulator
        pxor        mm6, mm6            ; zero for byte -> word unpacking
        pxor        mm7, mm7            ; clear the squared-difference accumulator

        mov         rax, arg(0)         ;[src_ptr]
        mov         rbx, arg(2)         ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Row 1
        VP9_VAR8X8_ROW_MMX

        ; Rows 2-8: step both pointers down one row, then process it.
%rep 7
        add         rax, rcx
        add         rbx, rdx
        VP9_VAR8X8_ROW_MMX
%endrep

        ; Reduce the vector accumulators through the stack scratch area.
        movq        QWORD PTR [rsp+8], mm5  ; four 16-bit difference sums
        movq        QWORD PTR [rsp], mm7    ; two 32-bit squared sums
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx            ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx            ;XXSum
        mov         rsi, arg(4)         ;SSE
        mov         rdi, arg(5)         ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax            ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 310
michael@0 311
michael@0 312
;unsigned int
;vp9_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks 4 rows of 4 bytes from src_ptr and ref_ptr (stepping by the
; respective stride) and stores
;   *Sum = sum of (src - ref) over the block
;   *SSE = sum of (src - ref)^2 over the block
; The function's own return value is always 0.
;
; Register use:
;   rax / rcx - source pointer / source stride
;   rbx / rdx - reference pointer / reference stride
;   mm5       - four 16-bit running sums of differences
;   mm6       - zero, used to widen bytes to words
;   mm7       - two 32-bit running sums of squared differences
;   mm0, mm1  - scratch

; Process the 4-byte row at [rax] (source) and [rbx] (reference).
; Expects mm6 == 0. Clobbers mm0 and mm1. Does not move the pointers.
%macro VP9_VAR4X4_ROW_MMX 0
        movd        mm0, [rax]          ; 4 source bytes
        movd        mm1, [rbx]          ; 4 reference bytes
        punpcklbw   mm0, mm6            ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1            ; src - ref
        paddw       mm5, mm0            ; accumulate differences
        pmaddwd     mm0, mm0            ; square and pair-add
        paddd       mm7, mm0            ; accumulate squared differences
%endmacro

global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; 16 bytes of scratch for mm5 / mm7
    ; end prolog

        pxor        mm5, mm5            ; clear the difference accumulator
        pxor        mm6, mm6            ; zero for byte -> word unpacking
        pxor        mm7, mm7            ; clear the squared-difference accumulator

        mov         rax, arg(0)         ;[src_ptr]
        mov         rbx, arg(2)         ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Row 1
        VP9_VAR4X4_ROW_MMX

        ; Rows 2-4: step both pointers down one row, then process it.
%rep 3
        add         rbx, rdx
        add         rax, rcx
        VP9_VAR4X4_ROW_MMX
%endrep

        ; Reduce the vector accumulators through the stack scratch area.
        movq        QWORD PTR [rsp+8], mm5  ; four 16-bit difference sums
        movq        QWORD PTR [rsp], mm7    ; two 32-bit squared sums
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx            ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx            ;XXSum
        mov         rsi, arg(4)         ;SSE
        mov         rdi, arg(5)         ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax            ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 425
michael@0 426
michael@0 427
;unsigned int
;vp9_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of (src - ref)^2 over 4 rows of 4 bytes read from src_ptr
; and ref_ptr, stepping by the respective stride between rows.
;
; Register use:
;   rax / rcx - source pointer / source stride
;   rbx / rdx - reference pointer / reference stride
;   mm6       - zero, used to widen bytes to words
;   mm7       - two 32-bit running sums of squared differences
;   mm0, mm1  - scratch

; Process the 4-byte row at [rax] (source) and [rbx] (reference).
; Expects mm6 == 0. Clobbers mm0 and mm1. Does not move the pointers.
%macro VP9_SSE4X4_ROW_MMX 0
        movd        mm0, [rax]          ; 4 source bytes
        movd        mm1, [rbx]          ; 4 reference bytes
        punpcklbw   mm0, mm6            ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1            ; src - ref
        pmaddwd     mm0, mm0            ; square and pair-add
        paddd       mm7, mm0            ; accumulate squared differences
%endmacro

global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        pxor        mm6, mm6            ; zero for byte -> word unpacking
        pxor        mm7, mm7            ; clear the squared-difference accumulator

        mov         rax, arg(0)         ;[src_ptr]
        mov         rbx, arg(2)         ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Row 1
        VP9_SSE4X4_ROW_MMX

        ; Rows 2-4: step both pointers down one row, then process it.
%rep 3
        add         rbx, rdx
        add         rax, rcx
        VP9_SSE4X4_ROW_MMX
%endrep

        ; Fold the two 32-bit lanes of mm7 together; the total ends up in
        ; the low dword of mm0, which is what lands in eax.
        movq        mm0, mm7
        psrlq       mm7, 32

        paddd       mm0, mm7
        movq        rax, mm0


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

mercurial