media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm

author: Michael Schloh von Bennewitz <michael@schloh.com>
date:   Wed, 31 Dec 2014 06:09:35 +0100

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

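; The routines below implement the VP9 8-tap sub-pixel filters with SSE2.
; For every output pixel they effectively compute
;
;   dst = clip8((k0*s0 + k1*s1 + k2*s2 + k3*s3 + k4*s4 + k5*s5 + k6*s6 + k7*s7 + 64) >> 7)
;
; where s0..s7 are eight source pixels taken from consecutive rows
; (vertical filters) or from consecutive columns starting 3 pixels left of
; the output position (horizontal filters). The constant 0x0400040 packs
; 64 into both 16-bit halves of a dword, so krd holds 64 in every word lane.
;
; GET_FILTERS_4 prepares the coefficients for the 4-pixel-wide paths: each
; 16-bit tap is broadcast into a quadword and the taps are stored pairwise
; on the stack as k0k1, k2k3, k5k4 and k6k7 (note the swapped 5/4 order,
; which matches the row pairing done in APPLY_FILTER_4).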
%macro GET_FILTERS_4 0
    mov rdx, arg(5) ;filter ptr
    mov rcx, 0x0400040

    movdqa xmm7, [rdx] ;load filters
    pshuflw xmm0, xmm7, 0b ;k0
    pshuflw xmm1, xmm7, 01010101b ;k1
    pshuflw xmm2, xmm7, 10101010b ;k2
    pshuflw xmm3, xmm7, 11111111b ;k3
    psrldq xmm7, 8
    pshuflw xmm4, xmm7, 0b ;k4
    pshuflw xmm5, xmm7, 01010101b ;k5
    pshuflw xmm6, xmm7, 10101010b ;k6
    pshuflw xmm7, xmm7, 11111111b ;k7

    punpcklqdq xmm0, xmm1
    punpcklqdq xmm2, xmm3
    punpcklqdq xmm5, xmm4
    punpcklqdq xmm6, xmm7

    movdqa k0k1, xmm0
    movdqa k2k3, xmm2
    movdqa k5k4, xmm5
    movdqa k6k7, xmm6

    movq xmm6, rcx
    pshufd xmm6, xmm6, 0
    movdqa krd, xmm6

    pxor xmm7, xmm7
    movdqa zero, xmm7
%endm

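; APPLY_FILTER_4 expects the eight 4-pixel inputs in xmm0..xmm7 (rows or
; shifted columns 0..7). It interleaves them pairwise, widens to words,
; multiplies by the paired coefficients, accumulates with signed saturation,
; adds the rounding term, shifts right by 7 and packs back to bytes. The
; macro argument selects a plain store (0) or averaging with the bytes
; already at [rdi] via pavgb (1), which is what the *_avg variants use.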
%macro APPLY_FILTER_4 1
    punpckldq xmm0, xmm1 ;two rows in one register
    punpckldq xmm6, xmm7
    punpckldq xmm2, xmm3
    punpckldq xmm5, xmm4

    punpcklbw xmm0, zero ;unpack to word
    punpcklbw xmm6, zero
    punpcklbw xmm2, zero
    punpcklbw xmm5, zero

    pmullw xmm0, k0k1 ;multiply the filter factors
    pmullw xmm6, k6k7
    pmullw xmm2, k2k3
    pmullw xmm5, k5k4

    paddsw xmm0, xmm6 ;sum
    movdqa xmm1, xmm0
    psrldq xmm1, 8
    paddsw xmm0, xmm1
    paddsw xmm0, xmm2
    psrldq xmm2, 8
    paddsw xmm0, xmm5
    psrldq xmm5, 8
    paddsw xmm0, xmm2
    paddsw xmm0, xmm5

    paddsw xmm0, krd ;rounding
    psraw xmm0, 7 ;shift
    packuswb xmm0, xmm0 ;pack to byte

%if %1
    movd xmm1, [rdi]
    pavgb xmm0, xmm1
%endif
    movd [rdi], xmm0
%endm

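; GET_FILTERS is the 8/16-pixel-wide counterpart of GET_FILTERS_4: it
; broadcasts each of the eight 16-bit taps into a full xmm register and
; spills them to the stack as k0..k7, then sets up the shared rounding
; constant (krd) and a zero register used for byte-to-word unpacking.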
%macro GET_FILTERS 0
    mov rdx, arg(5) ;filter ptr
    mov rsi, arg(0) ;src_ptr
    mov rdi, arg(2) ;output_ptr
    mov rcx, 0x0400040

    movdqa xmm7, [rdx] ;load filters
    pshuflw xmm0, xmm7, 0b ;k0
    pshuflw xmm1, xmm7, 01010101b ;k1
    pshuflw xmm2, xmm7, 10101010b ;k2
    pshuflw xmm3, xmm7, 11111111b ;k3
    pshufhw xmm4, xmm7, 0b ;k4
    pshufhw xmm5, xmm7, 01010101b ;k5
    pshufhw xmm6, xmm7, 10101010b ;k6
    pshufhw xmm7, xmm7, 11111111b ;k7

    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    punpcklwd xmm2, xmm2
    punpcklwd xmm3, xmm3
    punpckhwd xmm4, xmm4
    punpckhwd xmm5, xmm5
    punpckhwd xmm6, xmm6
    punpckhwd xmm7, xmm7

    movdqa k0, xmm0 ;store filter factors on stack
    movdqa k1, xmm1
    movdqa k2, xmm2
    movdqa k3, xmm3
    movdqa k4, xmm4
    movdqa k5, xmm5
    movdqa k6, xmm6
    movdqa k7, xmm7

    movq xmm6, rcx
    pshufd xmm6, xmm6, 0
    movdqa krd, xmm6 ;rounding

    pxor xmm7, xmm7
    movdqa zero, xmm7
%endm

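; LOAD_VERT_8 loads eight 8-byte rows at column offset %1. On entry
; rax = src_pitch and rdx = 3 * src_pitch; rsi is advanced by one row in
; the middle so that rows 0..7 can all be reached with the available
; addressing modes (rax, rax*2, rax*4, rdx, rdx*2). The pointer is left
; advanced by one row, which is exactly the step to the next output line.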
%macro LOAD_VERT_8 1
    movq xmm0, [rsi + %1] ;0
    movq xmm1, [rsi + rax + %1] ;1
    movq xmm6, [rsi + rdx * 2 + %1] ;6
    lea rsi, [rsi + rax]
    movq xmm7, [rsi + rdx * 2 + %1] ;7
    movq xmm2, [rsi + rax + %1] ;2
    movq xmm3, [rsi + rax * 2 + %1] ;3
    movq xmm4, [rsi + rdx + %1] ;4
    movq xmm5, [rsi + rax * 4 + %1] ;5
%endm

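; APPLY_FILTER_8 widens the eight rows loaded by LOAD_VERT_8 (or the eight
; shifted copies built by the horizontal loops) to words, multiplies each
; by its broadcast coefficient k0..k7, sums with signed saturation, rounds,
; shifts and packs, then writes 8 bytes to [rdi + %2]. The first argument
; selects averaging with the existing destination bytes (pavgb) for the
; *_avg variants.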
%macro APPLY_FILTER_8 2
    punpcklbw xmm0, zero
    punpcklbw xmm1, zero
    punpcklbw xmm6, zero
    punpcklbw xmm7, zero
    punpcklbw xmm2, zero
    punpcklbw xmm5, zero
    punpcklbw xmm3, zero
    punpcklbw xmm4, zero

    pmullw xmm0, k0
    pmullw xmm1, k1
    pmullw xmm6, k6
    pmullw xmm7, k7
    pmullw xmm2, k2
    pmullw xmm5, k5
    pmullw xmm3, k3
    pmullw xmm4, k4

    paddsw xmm0, xmm1
    paddsw xmm0, xmm6
    paddsw xmm0, xmm7
    paddsw xmm0, xmm2
    paddsw xmm0, xmm5
    paddsw xmm0, xmm3
    paddsw xmm0, xmm4

    paddsw xmm0, krd ;rounding
    psraw xmm0, 7 ;shift
    packuswb xmm0, xmm0 ;pack back to byte
%if %1
    movq xmm1, [rdi + %2]
    pavgb xmm0, xmm1
%endif
    movq [rdi + %2], xmm0
%endm

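; Vertical filters. Each routine keeps src_pitch in rax, out_pitch in rbx,
; 3 * src_pitch in rdx and the remaining row count in rcx, and loops once
; per output row. The filter taps, rounding constant and zero register
; live in the aligned stack area reserved below.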
;void vp9_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov rsi, arg(0) ;src_ptr
    mov rdi, arg(2) ;output_ptr

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movd xmm0, [rsi] ;load src: row 0
    movd xmm1, [rsi + rax] ;1
    movd xmm6, [rsi + rdx * 2] ;6
    lea rsi, [rsi + rax]
    movd xmm7, [rsi + rdx * 2] ;7
    movd xmm2, [rsi + rax] ;2
    movd xmm3, [rsi + rax * 2] ;3
    movd xmm4, [rsi + rdx] ;4
    movd xmm5, [rsi + rax * 4] ;5

    APPLY_FILTER_4 0

    lea rdi, [rdi + rbx]
    dec rcx
    jnz .loop

    add rsp, 16 * 6
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

;void vp9_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0

    lea rdi, [rdi + rbx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

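; The 16-wide vertical filter runs the 8-wide pipeline twice per output
; row: once for columns 0..7 and once for columns 8..15, undoing the row
; advance done inside LOAD_VERT_8 (sub rsi, rax) between the two halves.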
;void vp9_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8
    add rdi, rbx

    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

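; The *_avg vertical variants below are identical to the routines above
; except that APPLY_FILTER_4/APPLY_FILTER_8 are invoked with 1, so the
; filtered result is averaged with the pixels already in the destination.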
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov rsi, arg(0) ;src_ptr
    mov rdi, arg(2) ;output_ptr

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movd xmm0, [rsi] ;load src: row 0
    movd xmm1, [rsi + rax] ;1
    movd xmm6, [rsi + rdx * 2] ;6
    lea rsi, [rsi + rax]
    movd xmm7, [rsi + rdx * 2] ;7
    movd xmm2, [rsi + rax] ;2
    movd xmm3, [rsi + rax * 2] ;3
    movd xmm4, [rsi + rdx] ;4
    movd xmm5, [rsi + rax * 4] ;5

    APPLY_FILTER_4 1

    lea rdi, [rdi + rbx]
    dec rcx
    jnz .loop

    add rsp, 16 * 6
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0

    lea rdi, [rdi + rbx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    push rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rbx, DWORD PTR arg(3) ;out_pitch
    lea rdx, [rax + rax * 2]
    movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8
    add rdi, rbx

    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

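; Horizontal filters. Each iteration loads 16 unaligned source bytes
; starting 3 pixels to the left of the output position and builds the
; eight tap inputs by byte-shifting copies of that register (psrldq 1..7),
; then reuses APPLY_FILTER_4/APPLY_FILTER_8. The 16-wide routines repeat
; the sequence for the upper 8 pixels with a second load at [rsi + 5],
; i.e. 8 - 3 bytes further on.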
;void vp9_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov rsi, arg(0) ;src_ptr
    mov rdi, arg(2) ;output_ptr

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    movdqa xmm5, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm3, 3
    psrldq xmm5, 5
    psrldq xmm4, 4

    APPLY_FILTER_4 0

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 6
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

;void vp9_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 0, 0

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

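; For 16-wide output the first 16-byte load only covers source bytes
; [x-3, x+12], enough for output columns 0..7; columns 8..15 need bytes up
; to x+19, so the loop reloads at [rsi + 5] and repeats the shift/filter
; sequence for the upper half.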
;void vp9_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 0, 0

    movdqu xmm0, [rsi + 5] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 0, 8

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

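; As with the vertical filters, the *_avg horizontal variants differ only
; in passing 1 to APPLY_FILTER_4/APPLY_FILTER_8 so the result is averaged
; with the existing destination pixels.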
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov rsi, arg(0) ;src_ptr
    mov rdi, arg(2) ;output_ptr

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    movdqa xmm5, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm3, 3
    psrldq xmm5, 5
    psrldq xmm4, 4

    APPLY_FILTER_4 1

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 6
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 1, 0

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd rax, DWORD PTR arg(1) ;pixels_per_line
    movsxd rdx, DWORD PTR arg(3) ;out_pitch
    movsxd rcx, DWORD PTR arg(4) ;output_height

.loop:
    movdqu xmm0, [rsi - 3] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 1, 0

    movdqu xmm0, [rsi + 5] ;load src

    movdqa xmm1, xmm0
    movdqa xmm6, xmm0
    movdqa xmm7, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm0
    movdqa xmm3, xmm0
    movdqa xmm4, xmm0

    psrldq xmm1, 1
    psrldq xmm6, 6
    psrldq xmm7, 7
    psrldq xmm2, 2
    psrldq xmm5, 5
    psrldq xmm3, 3
    psrldq xmm4, 4

    APPLY_FILTER_8 1, 8

    lea rsi, [rsi + rax]
    lea rdi, [rdi + rdx]
    dec rcx
    jnz .loop

    add rsp, 16 * 10
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
