media/libvpx/vp8/common/x86/idctllm_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ;void vp8_idct_dequant_0_2x_sse2
michael@0 15 ; (
michael@0 16 ; short *qcoeff - 0 ; only the shorts at byte offsets 0 and 32 are used; 4 bytes at each offset are zeroed before returning
michael@0 17 ; short *dequant - 1 ; only the first short is used, and it is applied to both blocks
michael@0 18 ; unsigned char *dst - 2 ; 4 rows of 8 bytes are read, modified and written back
michael@0 19 ; int dst_stride - 3 ; byte distance between dst rows (sign-extended to 64 bits)
michael@0 20 ; )
michael@0 21 ; DC-only path for two 4x4 blocks stored side by side in dst: each block's dequantized DC, rounded as (dc + 4) >> 3, is added to its 4 pixels of every row with unsigned-byte saturation.
michael@0 22 global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
michael@0 23 sym(vp8_idct_dequant_0_2x_sse2):
michael@0 24 push rbp
michael@0 25 mov rbp, rsp
michael@0 26 SHADOW_ARGS_TO_STACK 4 ; macro from x86_abi_support.asm; presumably makes arg(n) addressable on every ABI -- TODO confirm
michael@0 27 GET_GOT rbx ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
michael@0 28 ; end prolog
michael@0 29
michael@0 30 mov rdx, arg(1) ; dequant
michael@0 31 mov rax, arg(0) ; qcoeff
michael@0 32
michael@0 33 movd xmm4, [rax] ; word 0 = first coefficient at qcoeff (word 1 is loaded too but ends up unused)
michael@0 34 movd xmm5, [rdx] ; word 0 = first dequant factor
michael@0 35
michael@0 36 pinsrw xmm4, [rax+32], 4 ; word 4 = coefficient at byte offset 32 (start of the second block)
michael@0 37 pinsrw xmm5, [rdx], 4 ; word 4 = same first dequant factor
michael@0 38
michael@0 39 pmullw xmm4, xmm5 ; words 0 and 4 now hold the two dequantized DC values
michael@0 40
michael@0 41 ; Zero out xmm5, for use unpacking
michael@0 42 pxor xmm5, xmm5
michael@0 43
michael@0 44 ; clear coeffs (4 bytes, i.e. two shorts, at the start of each block)
michael@0 45 movd [rax], xmm5
michael@0 46 movd [rax+32], xmm5
michael@0 47 ; leftover note "pshufb": the broadcast below is done with pshuflw/pshufhw
michael@0 48 mov rax, arg(2) ; dst
michael@0 49 movsxd rdx, dword ptr arg(3) ; dst_stride
michael@0 50
michael@0 51 pshuflw xmm4, xmm4, 00000000b ; broadcast word 0 (first block DC) over words 0-3
michael@0 52 pshufhw xmm4, xmm4, 00000000b ; broadcast word 4 (second block DC) over words 4-7
michael@0 53
michael@0 54 lea rcx, [rdx + rdx*2] ; rcx = dst_stride * 3, offset of the fourth row
michael@0 55 paddw xmm4, [GLOBAL(fours)] ; rounding bias for the shift below
michael@0 56
michael@0 57 psraw xmm4, 3 ; xmm4 = (dc + 4) >> 3 in every word
michael@0 58
michael@0 59 movq xmm0, [rax] ; load 8 bytes from each of the four dst rows
michael@0 60 movq xmm1, [rax+rdx]
michael@0 61 movq xmm2, [rax+2*rdx]
michael@0 62 movq xmm3, [rax+rcx]
michael@0 63
michael@0 64 punpcklbw xmm0, xmm5 ; widen bytes to words by interleaving with zero
michael@0 65 punpcklbw xmm1, xmm5
michael@0 66 punpcklbw xmm2, xmm5
michael@0 67 punpcklbw xmm3, xmm5
michael@0 68
michael@0 69
michael@0 70 ; Add to predict buffer
michael@0 71 paddw xmm0, xmm4
michael@0 72 paddw xmm1, xmm4
michael@0 73 paddw xmm2, xmm4
michael@0 74 paddw xmm3, xmm4
michael@0 75
michael@0 76 ; pack up before storing (packuswb saturates each word to 0..255)
michael@0 77 packuswb xmm0, xmm5
michael@0 78 packuswb xmm1, xmm5
michael@0 79 packuswb xmm2, xmm5
michael@0 80 packuswb xmm3, xmm5
michael@0 81
michael@0 82 ; store blocks back out
michael@0 83 movq [rax], xmm0
michael@0 84 movq [rax + rdx], xmm1
michael@0 85
michael@0 86 lea rax, [rax + 2*rdx] ; advance to the third row
michael@0 87
michael@0 88 movq [rax], xmm2
michael@0 89 movq [rax + rdx], xmm3
michael@0 90
michael@0 91 ; begin epilog
michael@0 92 RESTORE_GOT
michael@0 93 UNSHADOW_ARGS
michael@0 94 pop rbp
michael@0 95 ret
michael@0 96
michael@0 97 ;void vp8_idct_dequant_full_2x_sse2
michael@0 98 ; (
michael@0 99 ; short *qcoeff - 0 ; 64 bytes (two blocks of 16 shorts) read with movdqa, so must be 16-byte aligned; zeroed before returning
michael@0 100 ; short *dequant - 1 ; 32 bytes used as aligned pmullw memory operands; the same factors are applied to both blocks
michael@0 101 ; unsigned char *dst - 2 ; 4 rows of 8 bytes are read, modified and written back
michael@0 102 ; int dst_stride - 3 ; byte distance between dst rows (sign-extended to 64 bits)
michael@0 103 ; )
michael@0 104 global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
michael@0 105 sym(vp8_idct_dequant_full_2x_sse2):
michael@0 106 push rbp
michael@0 107 mov rbp, rsp
michael@0 108 SHADOW_ARGS_TO_STACK 4 ; macro from x86_abi_support.asm; presumably makes arg(n) addressable on every ABI -- TODO confirm
michael@0 109 SAVE_XMM 7 ; macro from x86_abi_support.asm; paired with RESTORE_XMM in the epilog
michael@0 110 GET_GOT rbx
michael@0 111 push rsi ; saved and restored, but rsi is not otherwise referenced in this function
michael@0 112 push rdi
michael@0 113 ; end prolog
michael@0 114
michael@0 115 ; full path: all 16 coefficients of both blocks are loaded, dequantized,
michael@0 116 ; run through two butterfly passes and added to dst
michael@0 117 mov rax, arg(0) ; qcoeff
michael@0 118 mov rdx, arg(1) ; dequant
michael@0 119 mov rdi, arg(2) ; dst
michael@0 120
michael@0 121
michael@0 122 ; Zero out xmm7, for use unpacking
michael@0 123 pxor xmm7, xmm7
michael@0 124
michael@0 125
michael@0 126 ; note the swapped use of xmm1 and xmm2 in this load order, necessary for
michael@0 127 ; the repack shuffle below to produce sensible data
michael@0 128 movdqa xmm0, [rax] ; block 0, coefficients 0-7
michael@0 129 movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
michael@0 130 movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
michael@0 131 movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
michael@0 132
michael@0 133 ; Clear out coeffs
michael@0 134 movdqa [rax], xmm7
michael@0 135 movdqa [rax+16], xmm7
michael@0 136 movdqa [rax+32], xmm7
michael@0 137 movdqa [rax+48], xmm7
michael@0 138
michael@0 139 ; dequantize qcoeff buffer
michael@0 140 pmullw xmm0, [rdx]
michael@0 141 pmullw xmm2, [rdx+16]
michael@0 142 pmullw xmm1, [rdx]
michael@0 143 pmullw xmm3, [rdx+16]
michael@0 144 movsxd rdx, dword ptr arg(3) ; dst_stride
michael@0 145
michael@0 146 ; repack so block 0 row x and block 1 row x are together
michael@0 147 movdqa xmm4, xmm0
michael@0 148 punpckldq xmm0, xmm1
michael@0 149 punpckhdq xmm4, xmm1
michael@0 150
michael@0 151 pshufd xmm0, xmm0, 11011000b ; dword order 0,2,1,3: row 0 of block 0 | row 0 of block 1
michael@0 152 pshufd xmm1, xmm4, 11011000b ; row 1 of block 0 | row 1 of block 1
michael@0 153
michael@0 154 movdqa xmm4, xmm2
michael@0 155 punpckldq xmm2, xmm3
michael@0 156 punpckhdq xmm4, xmm3
michael@0 157
michael@0 158 pshufd xmm2, xmm2, 11011000b ; row 2 of both blocks
michael@0 159 pshufd xmm3, xmm4, 11011000b ; row 3 of both blocks
michael@0 160
michael@0 161 ; first pass (xmm0..xmm3 play the roles of inputs 0..3)
michael@0 162 psubw xmm0, xmm2 ; b1 = 0-2
michael@0 163 paddw xmm2, xmm2 ; 2 * input 2, so that adding b1 yields 0+2
michael@0 164
michael@0 165 movdqa xmm5, xmm1
michael@0 166 paddw xmm2, xmm0 ; a1 = 0+2
michael@0 167
michael@0 168 pmulhw xmm5, [GLOBAL(x_s1sqr2)] ; signed high-half multiply; 0x8A8C is negative as a signed word, hence the paddw of the input below
michael@0 169 lea rcx, [rdx + rdx*2] ;dst_stride * 3
michael@0 170 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
michael@0 171
michael@0 172 movdqa xmm7, xmm3
michael@0 173 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] ; constant encodes the factor minus 1; the paddw below adds the input back
michael@0 174
michael@0 175 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 176 psubw xmm7, xmm5 ; c1 (as computed here: the ip3 term minus the ip1 term)
michael@0 177
michael@0 178 movdqa xmm5, xmm1
michael@0 179 movdqa xmm4, xmm3
michael@0 180
michael@0 181 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
michael@0 182 paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 183
michael@0 184 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
michael@0 185 paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 186
michael@0 187 paddw xmm3, xmm5 ; d1
michael@0 188 movdqa xmm6, xmm2 ; a1
michael@0 189
michael@0 190 movdqa xmm4, xmm0 ; b1
michael@0 191 paddw xmm2, xmm3 ;0 = a1 + d1
michael@0 192
michael@0 193 paddw xmm4, xmm7 ;1 = b1 + c1 -- NOTE(review): the transpose comments below label this register as elements 008-011 (row 2)
michael@0 194 psubw xmm0, xmm7 ;2 = b1 - c1 -- NOTE(review): the transpose comments below label this register as elements 004-007 (row 1)
michael@0 195
michael@0 196 psubw xmm6, xmm3 ;3 = a1 - d1
michael@0 197
michael@0 198 ; transpose for the second pass (element labels: leading digit = block, last two digits = index within the block; listed high word first)
michael@0 199 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
michael@0 200 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
michael@0 201 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
michael@0 202
michael@0 203 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
michael@0 204 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
michael@0 205 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
michael@0 206
michael@0 207
michael@0 208 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
michael@0 209 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
michael@0 210 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
michael@0 211
michael@0 212 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
michael@0 213 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
michael@0 214 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
michael@0 215
michael@0 216
michael@0 217 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
michael@0 218 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
michael@0 219 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
michael@0 220
michael@0 221 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
michael@0 222 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
michael@0 223 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
michael@0 224
michael@0 225 pshufd xmm0, xmm2, 11011000b ; column 0 of block 0 | column 0 of block 1
michael@0 226 pshufd xmm2, xmm1, 11011000b ; column 2 of both blocks
michael@0 227
michael@0 228 pshufd xmm1, xmm5, 11011000b ; column 1 of both blocks
michael@0 229 pshufd xmm3, xmm7, 11011000b ; column 3 of both blocks
michael@0 230
michael@0 231 ; second pass (same butterfly, now applied to the transposed data)
michael@0 232 psubw xmm0, xmm2 ; b1 = 0-2
michael@0 233 paddw xmm2, xmm2
michael@0 234
michael@0 235 movdqa xmm5, xmm1
michael@0 236 paddw xmm2, xmm0 ; a1 = 0+2
michael@0 237
michael@0 238 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
michael@0 239 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
michael@0 240
michael@0 241 movdqa xmm7, xmm3
michael@0 242 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
michael@0 243
michael@0 244 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 245 psubw xmm7, xmm5 ; c1 (as computed here: the ip3 term minus the ip1 term)
michael@0 246
michael@0 247 movdqa xmm5, xmm1
michael@0 248 movdqa xmm4, xmm3
michael@0 249
michael@0 250 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
michael@0 251 paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 252
michael@0 253 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
michael@0 254 paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 255
michael@0 256 paddw xmm3, xmm5 ; d1
michael@0 257 paddw xmm0, [GLOBAL(fours)] ; bias b1 and a1 by 4 so all four outputs are rounded by the >> 3 below
michael@0 258
michael@0 259 paddw xmm2, [GLOBAL(fours)]
michael@0 260 movdqa xmm6, xmm2 ; a1
michael@0 261
michael@0 262 movdqa xmm4, xmm0 ; b1
michael@0 263 paddw xmm2, xmm3 ;0 = a1 + d1
michael@0 264
michael@0 265 paddw xmm4, xmm7 ;1 = b1 + c1 -- NOTE(review): labelled as elements 008-011 (row 2) in the transpose comments below
michael@0 266 psubw xmm0, xmm7 ;2 = b1 - c1 -- NOTE(review): labelled as elements 004-007 (row 1) in the transpose comments below
michael@0 267
michael@0 268 psubw xmm6, xmm3 ;3 = a1 - d1
michael@0 269 psraw xmm2, 3
michael@0 270
michael@0 271 psraw xmm0, 3
michael@0 272 psraw xmm4, 3
michael@0 273
michael@0 274 psraw xmm6, 3
michael@0 275
michael@0 276 ; transpose to save (same shuffle sequence as the first transpose)
michael@0 277 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
michael@0 278 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
michael@0 279 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
michael@0 280
michael@0 281 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
michael@0 282 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
michael@0 283 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
michael@0 284
michael@0 285
michael@0 286 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
michael@0 287 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
michael@0 288 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
michael@0 289
michael@0 290 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
michael@0 291 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
michael@0 292 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
michael@0 293
michael@0 294
michael@0 295 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
michael@0 296 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
michael@0 297 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
michael@0 298
michael@0 299 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
michael@0 300 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
michael@0 301 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
michael@0 302
michael@0 303 pshufd xmm0, xmm2, 11011000b ; xmm0..xmm3 end up holding the residual added to dst rows 0..3
michael@0 304 pshufd xmm2, xmm1, 11011000b
michael@0 305
michael@0 306 pshufd xmm1, xmm5, 11011000b
michael@0 307 pshufd xmm3, xmm7, 11011000b
michael@0 308
michael@0 309 pxor xmm7, xmm7 ; xmm7 was used as scratch above; zero it again for unpack/pack
michael@0 310
michael@0 311 ; Load up predict blocks
michael@0 312 movq xmm4, [rdi]
michael@0 313 movq xmm5, [rdi+rdx]
michael@0 314
michael@0 315 punpcklbw xmm4, xmm7 ; widen bytes to words
michael@0 316 punpcklbw xmm5, xmm7
michael@0 317
michael@0 318 paddw xmm0, xmm4
michael@0 319 paddw xmm1, xmm5
michael@0 320
michael@0 321 movq xmm4, [rdi+2*rdx]
michael@0 322 movq xmm5, [rdi+rcx]
michael@0 323
michael@0 324 punpcklbw xmm4, xmm7
michael@0 325 punpcklbw xmm5, xmm7
michael@0 326
michael@0 327 paddw xmm2, xmm4
michael@0 328 paddw xmm3, xmm5
michael@0 329
michael@0 330 .finish: ; no jump to this local label is visible in this function; execution falls through
michael@0 331
michael@0 332 ; pack up before storing (packuswb saturates each word to 0..255)
michael@0 333 packuswb xmm0, xmm7
michael@0 334 packuswb xmm1, xmm7
michael@0 335 packuswb xmm2, xmm7
michael@0 336 packuswb xmm3, xmm7
michael@0 337
michael@0 338 ; store blocks back out
michael@0 339 movq [rdi], xmm0
michael@0 340 movq [rdi + rdx], xmm1
michael@0 341 movq [rdi + rdx*2], xmm2
michael@0 342 movq [rdi + rcx], xmm3
michael@0 343
michael@0 344 ; begin epilog
michael@0 345 pop rdi
michael@0 346 pop rsi
michael@0 347 RESTORE_GOT
michael@0 348 RESTORE_XMM
michael@0 349 UNSHADOW_ARGS
michael@0 350 pop rbp
michael@0 351 ret
michael@0 352
michael@0 353 ;void vp8_idct_dequant_dc_0_2x_sse2
michael@0 354 ; (
michael@0 355 ; short *qcoeff - 0 ; pointer is loaded into rax but never dereferenced in this function
michael@0 356 ; short *dequant - 1 ; never read in this function
michael@0 357 ; unsigned char *dst - 2 ; 4 rows of 8 bytes are read, modified and written back
michael@0 358 ; int dst_stride - 3 ; byte distance between dst rows (sign-extended to 64 bits)
michael@0 359 ; short *dc - 4 ; two consecutive shorts: dc[0] for the first block, dc[1] for the second
michael@0 360 ; )
michael@0 361 global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
michael@0 362 sym(vp8_idct_dequant_dc_0_2x_sse2):
michael@0 363 push rbp
michael@0 364 mov rbp, rsp
michael@0 365 SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes arg(n) addressable on every ABI -- TODO confirm
michael@0 366 GET_GOT rbx
michael@0 367 push rdi
michael@0 368 ; end prolog
michael@0 369
michael@0 370 ; special case when 2 blocks have 0 or 1 coeffs
michael@0 371 ; dc is supplied through the dc argument, so qcoeff is not read
michael@0 372 mov rax, arg(0) ; qcoeff (rax is not used again below)
michael@0 373
michael@0 374 mov rdi, arg(2) ; dst
michael@0 375 mov rdx, arg(4) ; dc
michael@0 376
michael@0 377 ; Zero out xmm5, for use unpacking
michael@0 378 pxor xmm5, xmm5
michael@0 379
michael@0 380 ; load up 2 dc words here == 2*16 = doubleword
michael@0 381 movd xmm4, [rdx]
michael@0 382
michael@0 383 movsxd rdx, dword ptr arg(3) ; dst_stride
michael@0 384 lea rcx, [rdx + rdx*2] ; rcx = dst_stride * 3, offset of the fourth row
michael@0 385 ; Load up predict blocks
michael@0 386 movq xmm0, [rdi]
michael@0 387 movq xmm1, [rdi+rdx*1]
michael@0 388 movq xmm2, [rdi+rdx*2]
michael@0 389 movq xmm3, [rdi+rcx]
michael@0 390
michael@0 391 ; Duplicate and expand dc across
michael@0 392 punpcklwd xmm4, xmm4 ; words: dc0 dc0 dc1 dc1 ...
michael@0 393 punpckldq xmm4, xmm4 ; words 0-3 = dc0, words 4-7 = dc1
michael@0 394
michael@0 395 ; Round and downshift: (dc + 4) >> 3
michael@0 396 paddw xmm4, [GLOBAL(fours)]
michael@0 397 psraw xmm4, 3
michael@0 398
michael@0 399 ; Predict buffer needs to be expanded from bytes to words
michael@0 400 punpcklbw xmm0, xmm5
michael@0 401 punpcklbw xmm1, xmm5
michael@0 402 punpcklbw xmm2, xmm5
michael@0 403 punpcklbw xmm3, xmm5
michael@0 404
michael@0 405 ; Add to predict buffer
michael@0 406 paddw xmm0, xmm4
michael@0 407 paddw xmm1, xmm4
michael@0 408 paddw xmm2, xmm4
michael@0 409 paddw xmm3, xmm4
michael@0 410
michael@0 411 ; pack up before storing (packuswb saturates each word to 0..255)
michael@0 412 packuswb xmm0, xmm5
michael@0 413 packuswb xmm1, xmm5
michael@0 414 packuswb xmm2, xmm5
michael@0 415 packuswb xmm3, xmm5
michael@0 416
michael@0 417 ; store blocks back out
michael@0 418 movq [rdi], xmm0
michael@0 419 movq [rdi + rdx], xmm1
michael@0 420 movq [rdi + rdx*2], xmm2
michael@0 421 movq [rdi + rcx], xmm3
michael@0 422
michael@0 423 ; begin epilog
michael@0 424 pop rdi
michael@0 425 RESTORE_GOT
michael@0 426 UNSHADOW_ARGS
michael@0 427 pop rbp
michael@0 428 ret
michael@0 429 ;void vp8_idct_dequant_dc_full_2x_sse2
michael@0 430 ; (
michael@0 431 ; short *qcoeff - 0 ; 64 bytes (two blocks of 16 shorts) read with movdqa, so must be 16-byte aligned; zeroed before returning
michael@0 432 ; short *dequant - 1 ; 32 bytes used as aligned pmullw memory operands; the same factors are applied to both blocks
michael@0 433 ; unsigned char *dst - 2 ; 4 rows of 8 bytes are read, modified and written back
michael@0 434 ; int dst_stride - 3 ; byte distance between dst rows (sign-extended to 64 bits)
michael@0 435 ; short *dc - 4 ; dc[0] / dc[1] replace coefficient 0 of the first / second block after dequantization
michael@0 436 ; )
michael@0 437 global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
michael@0 438 sym(vp8_idct_dequant_dc_full_2x_sse2):
michael@0 439 push rbp
michael@0 440 mov rbp, rsp
michael@0 441 SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes arg(n) addressable on every ABI -- TODO confirm
michael@0 442 SAVE_XMM 7 ; macro from x86_abi_support.asm; paired with RESTORE_XMM in the epilog
michael@0 443 GET_GOT rbx
michael@0 444 push rdi
michael@0 445 ; end prolog
michael@0 446
michael@0 447 ; full path: all 16 coefficients of both blocks are loaded and dequantized,
michael@0 448 ; then coefficient 0 of each block is overwritten from the dc argument
michael@0 449 mov rax, arg(0) ; qcoeff
michael@0 450 mov rdx, arg(1) ; dequant
michael@0 451
michael@0 452 mov rdi, arg(2) ; dst
michael@0 453
michael@0 454 ; Zero out xmm7, for use unpacking
michael@0 455 pxor xmm7, xmm7
michael@0 456
michael@0 457
michael@0 458 ; note the swapped use of xmm1 and xmm2 in this load order, necessary for
michael@0 459 ; the repack shuffle below to produce sensible data
michael@0 460 movdqa xmm0, [rax] ; block 0, coefficients 0-7
michael@0 461 movdqa xmm2, [rax+16] ; block 0, coefficients 8-15
michael@0 462 movdqa xmm1, [rax+32] ; block 1, coefficients 0-7
michael@0 463 movdqa xmm3, [rax+48] ; block 1, coefficients 8-15
michael@0 464
michael@0 465 ; Clear out coeffs
michael@0 466 movdqa [rax], xmm7
michael@0 467 movdqa [rax+16], xmm7
michael@0 468 movdqa [rax+32], xmm7
michael@0 469 movdqa [rax+48], xmm7
michael@0 470
michael@0 471 ; dequantize qcoeff buffer
michael@0 472 pmullw xmm0, [rdx]
michael@0 473 pmullw xmm2, [rdx+16]
michael@0 474 pmullw xmm1, [rdx]
michael@0 475 pmullw xmm3, [rdx+16]
michael@0 476
michael@0 477 ; DC component (rdx is reused: it now points at the dc pair)
michael@0 478 mov rdx, arg(4)
michael@0 479
michael@0 480 ; repack so block 0 row x and block 1 row x are together
michael@0 481 movdqa xmm4, xmm0
michael@0 482 punpckldq xmm0, xmm1
michael@0 483 punpckhdq xmm4, xmm1
michael@0 484
michael@0 485 pshufd xmm0, xmm0, 11011000b ; dword order 0,2,1,3: row 0 of block 0 | row 0 of block 1
michael@0 486 pshufd xmm1, xmm4, 11011000b ; row 1 of block 0 | row 1 of block 1
michael@0 487
michael@0 488 movdqa xmm4, xmm2
michael@0 489 punpckldq xmm2, xmm3
michael@0 490 punpckhdq xmm4, xmm3
michael@0 491
michael@0 492 pshufd xmm2, xmm2, 11011000b ; row 2 of both blocks
michael@0 493 pshufd xmm3, xmm4, 11011000b ; row 3 of both blocks
michael@0 494
michael@0 495 ; insert DC component (overwrites the dequantized coefficient 0 of each block)
michael@0 496 pinsrw xmm0, [rdx], 0 ; word 0 = dc[0] (block 0)
michael@0 497 pinsrw xmm0, [rdx+2], 4 ; word 4 = dc[1] (block 1)
michael@0 498
michael@0 499 ; first pass (xmm0..xmm3 play the roles of inputs 0..3)
michael@0 500 psubw xmm0, xmm2 ; b1 = 0-2
michael@0 501 paddw xmm2, xmm2 ; 2 * input 2, so that adding b1 yields 0+2
michael@0 502
michael@0 503 movdqa xmm5, xmm1
michael@0 504 paddw xmm2, xmm0 ; a1 = 0+2
michael@0 505
michael@0 506 pmulhw xmm5, [GLOBAL(x_s1sqr2)] ; signed high-half multiply; 0x8A8C is negative as a signed word, hence the paddw of the input below
michael@0 507 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
michael@0 508
michael@0 509 movdqa xmm7, xmm3
michael@0 510 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] ; constant encodes the factor minus 1; the paddw below adds the input back
michael@0 511
michael@0 512 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 513 psubw xmm7, xmm5 ; c1 (as computed here: the ip3 term minus the ip1 term)
michael@0 514
michael@0 515 movdqa xmm5, xmm1
michael@0 516 movdqa xmm4, xmm3
michael@0 517
michael@0 518 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
michael@0 519 paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 520
michael@0 521 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
michael@0 522 paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 523
michael@0 524 paddw xmm3, xmm5 ; d1
michael@0 525 movdqa xmm6, xmm2 ; a1
michael@0 526
michael@0 527 movdqa xmm4, xmm0 ; b1
michael@0 528 paddw xmm2, xmm3 ;0 = a1 + d1
michael@0 529
michael@0 530 paddw xmm4, xmm7 ;1 = b1 + c1 -- NOTE(review): the transpose comments below label this register as elements 008-011 (row 2)
michael@0 531 psubw xmm0, xmm7 ;2 = b1 - c1 -- NOTE(review): the transpose comments below label this register as elements 004-007 (row 1)
michael@0 532
michael@0 533 psubw xmm6, xmm3 ;3 = a1 - d1
michael@0 534
michael@0 535 ; transpose for the second pass (element labels: leading digit = block, last two digits = index within the block; listed high word first)
michael@0 536 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
michael@0 537 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
michael@0 538 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
michael@0 539
michael@0 540 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
michael@0 541 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
michael@0 542 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
michael@0 543
michael@0 544
michael@0 545 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
michael@0 546 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
michael@0 547 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
michael@0 548
michael@0 549 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
michael@0 550 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
michael@0 551 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
michael@0 552
michael@0 553
michael@0 554 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
michael@0 555 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
michael@0 556 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
michael@0 557
michael@0 558 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
michael@0 559 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
michael@0 560 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
michael@0 561
michael@0 562 pshufd xmm0, xmm2, 11011000b ; column 0 of block 0 | column 0 of block 1
michael@0 563 pshufd xmm2, xmm1, 11011000b ; column 2 of both blocks
michael@0 564
michael@0 565 pshufd xmm1, xmm5, 11011000b ; column 1 of both blocks
michael@0 566 pshufd xmm3, xmm7, 11011000b ; column 3 of both blocks
michael@0 567
michael@0 568 ; second pass (same butterfly, now applied to the transposed data)
michael@0 569 psubw xmm0, xmm2 ; b1 = 0-2
michael@0 570 paddw xmm2, xmm2
michael@0 571
michael@0 572 movdqa xmm5, xmm1
michael@0 573 paddw xmm2, xmm0 ; a1 = 0+2
michael@0 574
michael@0 575 pmulhw xmm5, [GLOBAL(x_s1sqr2)]
michael@0 576 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
michael@0 577
michael@0 578 movdqa xmm7, xmm3
michael@0 579 pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
michael@0 580
michael@0 581 paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 582 psubw xmm7, xmm5 ; c1 (as computed here: the ip3 term minus the ip1 term)
michael@0 583
michael@0 584 movdqa xmm5, xmm1
michael@0 585 movdqa xmm4, xmm3
michael@0 586
michael@0 587 pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
michael@0 588 paddw xmm5, xmm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 589
michael@0 590 pmulhw xmm3, [GLOBAL(x_s1sqr2)]
michael@0 591 paddw xmm3, xmm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 592
michael@0 593 paddw xmm3, xmm5 ; d1
michael@0 594 paddw xmm0, [GLOBAL(fours)] ; bias b1 and a1 by 4 so all four outputs are rounded by the >> 3 below
michael@0 595
michael@0 596 paddw xmm2, [GLOBAL(fours)]
michael@0 597 movdqa xmm6, xmm2 ; a1
michael@0 598
michael@0 599 movdqa xmm4, xmm0 ; b1
michael@0 600 paddw xmm2, xmm3 ;0 = a1 + d1
michael@0 601
michael@0 602 paddw xmm4, xmm7 ;1 = b1 + c1 -- NOTE(review): labelled as elements 008-011 (row 2) in the transpose comments below
michael@0 603 psubw xmm0, xmm7 ;2 = b1 - c1 -- NOTE(review): labelled as elements 004-007 (row 1) in the transpose comments below
michael@0 604
michael@0 605 psubw xmm6, xmm3 ;3 = a1 - d1
michael@0 606 psraw xmm2, 3
michael@0 607
michael@0 608 psraw xmm0, 3
michael@0 609 psraw xmm4, 3
michael@0 610
michael@0 611 psraw xmm6, 3
michael@0 612
michael@0 613 ; transpose to save (same shuffle sequence as the first transpose)
michael@0 614 movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
michael@0 615 punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
michael@0 616 punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
michael@0 617
michael@0 618 movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
michael@0 619 punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
michael@0 620 punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
michael@0 621
michael@0 622
michael@0 623 movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
michael@0 624 punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
michael@0 625 punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
michael@0 626
michael@0 627 movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
michael@0 628 punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
michael@0 629 punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
michael@0 630
michael@0 631
michael@0 632 movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
michael@0 633 punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
michael@0 634 punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
michael@0 635
michael@0 636 movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
michael@0 637 punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
michael@0 638 punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
michael@0 639
michael@0 640 pshufd xmm0, xmm2, 11011000b ; xmm0..xmm3 end up holding the residual added to dst rows 0..3
michael@0 641 pshufd xmm2, xmm1, 11011000b
michael@0 642
michael@0 643 pshufd xmm1, xmm5, 11011000b
michael@0 644 pshufd xmm3, xmm7, 11011000b
michael@0 645
michael@0 646 pxor xmm7, xmm7 ; xmm7 was used as scratch above; zero it again for unpack/pack
michael@0 647
michael@0 648 ; Load up predict blocks
michael@0 649 movsxd rdx, dword ptr arg(3) ; dst_stride
michael@0 650 movq xmm4, [rdi]
michael@0 651 movq xmm5, [rdi+rdx]
michael@0 652 lea rcx, [rdx + rdx*2] ; rcx = dst_stride * 3, offset of the fourth row
michael@0 653
michael@0 654 punpcklbw xmm4, xmm7 ; widen bytes to words
michael@0 655 punpcklbw xmm5, xmm7
michael@0 656
michael@0 657 paddw xmm0, xmm4
michael@0 658 paddw xmm1, xmm5
michael@0 659
michael@0 660 movq xmm4, [rdi+rdx*2]
michael@0 661 movq xmm5, [rdi+rcx]
michael@0 662
michael@0 663 punpcklbw xmm4, xmm7
michael@0 664 punpcklbw xmm5, xmm7
michael@0 665
michael@0 666 paddw xmm2, xmm4
michael@0 667 paddw xmm3, xmm5
michael@0 668
michael@0 669 .finish: ; no jump to this local label is visible in this function; execution falls through
michael@0 670
michael@0 671 ; pack up before storing (packuswb saturates each word to 0..255)
michael@0 672 packuswb xmm0, xmm7
michael@0 673 packuswb xmm1, xmm7
michael@0 674 packuswb xmm2, xmm7
michael@0 675 packuswb xmm3, xmm7
michael@0 676
michael@0 677 ; Load destination stride before writing out.
michael@0 678 ; NOTE(review): rdx already holds dst_stride from the load above and has not been written since, so this reload is redundant
michael@0 679 movsxd rdx, dword ptr arg(3) ; dst_stride
michael@0 680
michael@0 681 ; store blocks back out
michael@0 682 movq [rdi], xmm0
michael@0 683 movq [rdi + rdx], xmm1
michael@0 684
michael@0 685 lea rdi, [rdi + 2*rdx] ; advance to the third row (rdi is restored by the pop below)
michael@0 686
michael@0 687 movq [rdi], xmm2
michael@0 688 movq [rdi + rdx], xmm3
michael@0 689
michael@0 690
michael@0 691 ; begin epilog
michael@0 692 pop rdi
michael@0 693 RESTORE_GOT
michael@0 694 RESTORE_XMM
michael@0 695 UNSHADOW_ARGS
michael@0 696 pop rbp
michael@0 697 ret
michael@0 698
michael@0 699 SECTION_RODATA
michael@0 700 align 16
michael@0 701 fours: ; rounding bias added before each arithmetic shift right by 3
michael@0 702 times 8 dw 0x0004
michael@0 703 align 16
michael@0 704 x_s1sqr2: ; 0x8A8C = 35468, about sin(pi/8)*sqrt(2) * 65536; negative as a signed word, so callers add the input back after pmulhw
michael@0 705 times 8 dw 0x8A8C
michael@0 706 align 16
michael@0 707 x_c1sqr2less1: ; 0x4E7B = 20091, about (cos(pi/8)*sqrt(2) - 1) * 65536; callers add the input back after pmulhw to restore the "+1"
michael@0 708 times 8 dw 0x4E7B

mercurial