media/libvpx/vp8/common/x86/subpixel_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13 extern sym(vp8_bilinear_filters_x86_8)
michael@0 14
michael@0 15 %define BLOCK_HEIGHT_WIDTH 4
michael@0 16 %define VP8_FILTER_WEIGHT 128
michael@0 17 %define VP8_FILTER_SHIFT 7
michael@0 18
michael@0 19
michael@0 20 ;/************************************************************************************
michael@0 21 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
michael@0 22 ; input pixel array has output_height rows. This routine assumes that output_height is an
michael@0 23 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
michael@0 24 ; row each iteration to take advantage of the 128 bits operations.
michael@0 25 ;*************************************************************************************/
michael@0 26 ;void vp8_filter_block1d8_h6_sse2
michael@0 27 ;(
michael@0 28 ; unsigned char *src_ptr, - source row; bytes src_ptr-2 .. src_ptr+13 are read each row
michael@0 29 ; unsigned short *output_ptr, - receives 8 16-bit results per row
michael@0 30 ; unsigned int src_pixels_per_line, - added to the source pointer after each row
michael@0 31 ; unsigned int pixel_step, - not read by this routine
michael@0 32 ; unsigned int output_height, - number of rows to produce
michael@0 33 ; unsigned int output_width, - added to output_ptr as a byte offset after each row
michael@0 34 ; short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 35 ;)
michael@0 36 global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
michael@0 37 sym(vp8_filter_block1d8_h6_sse2):
michael@0 38 push rbp
michael@0 39 mov rbp, rsp
michael@0 40 SHADOW_ARGS_TO_STACK 7
michael@0 41 SAVE_XMM 7
michael@0 42 GET_GOT rbx
michael@0 43 push rsi
michael@0 44 push rdi
michael@0 45 ; end prolog
michael@0 46
michael@0 47 mov rdx, arg(6) ;vp8_filter
michael@0 48 mov rsi, arg(0) ;src_ptr
michael@0 49
michael@0 50 mov rdi, arg(1) ;output_ptr
michael@0 51
michael@0 52 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
michael@0 53 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
michael@0 54 %if ABI_IS_32BIT=0
michael@0 55 movsxd r8, dword ptr arg(5) ;output_width
michael@0 56 %endif
michael@0 57 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 58
michael@0 59 .filter_block1d8_h6_rowloop:
michael@0 60 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
michael@0 61 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
michael@0 62
michael@0 63 prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row
michael@0 64
michael@0 65 pslldq xmm1, 8 ; move bytes 6..13 into the upper half
michael@0 66 por xmm1, xmm3 ; xmm1 = 16 consecutive source bytes, -2..13
michael@0 67
michael@0 68 movdqa xmm4, xmm1 ; copies of the 16 bytes, one per shifted tap
michael@0 69 movdqa xmm5, xmm1
michael@0 70
michael@0 71 movdqa xmm6, xmm1
michael@0 72 movdqa xmm7, xmm1
michael@0 73
michael@0 74 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 75 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 76
michael@0 77 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 78 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 79
michael@0 80 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 81 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 82
michael@0 83
michael@0 84 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 85 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 86
michael@0 87 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 88
michael@0 89 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 90 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 91
michael@0 92 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 93
michael@0 94 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 95 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 96
michael@0 97
michael@0 98 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 99
michael@0 100 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 101 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 102
michael@0 103
michael@0 104 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 105 paddsw xmm4, xmm5
michael@0 106
michael@0 107 paddsw xmm4, xmm3
michael@0 108 paddsw xmm4, xmm6
michael@0 109
michael@0 110 paddsw xmm4, xmm1
michael@0 111 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 112
michael@0 113 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 114
michael@0 115 packuswb xmm4, xmm0 ; saturate each word to 0..255
michael@0 116 punpcklbw xmm4, xmm0 ; zero-extend the 8 bytes back to 16-bit words
michael@0 117
michael@0 118 movdqa XMMWORD Ptr [rdi], xmm4 ; write 8 words; movdqa faults unless rdi is 16-byte aligned
michael@0 119 lea rsi, [rsi + rax] ; next source row
michael@0 120
michael@0 121 %if ABI_IS_32BIT
michael@0 122 add rdi, DWORD Ptr arg(5) ;[output_width], used as a byte stride
michael@0 123 %else
michael@0 124 add rdi, r8 ; output_width, used as a byte stride
michael@0 125 %endif
michael@0 126 dec rcx ; row count is tested only here, so output_height == 0 is not handled
michael@0 127
michael@0 128 jnz .filter_block1d8_h6_rowloop ; next row
michael@0 129
michael@0 130 ; begin epilog
michael@0 131 pop rdi
michael@0 132 pop rsi
michael@0 133 RESTORE_GOT
michael@0 134 RESTORE_XMM
michael@0 135 UNSHADOW_ARGS
michael@0 136 pop rbp
michael@0 137 ret
michael@0 138
michael@0 139
michael@0 140 ;void vp8_filter_block1d16_h6_sse2
michael@0 141 ;(
michael@0 142 ; unsigned char *src_ptr, - source row; bytes src_ptr-2 .. src_ptr+21 are read each row
michael@0 143 ; unsigned short *output_ptr, - receives 16 16-bit results per row
michael@0 144 ; unsigned int src_pixels_per_line, - added to the source pointer after each row
michael@0 145 ; unsigned int pixel_step, - not read by this routine
michael@0 146 ; unsigned int output_height, - number of rows to produce
michael@0 147 ; unsigned int output_width, - added to output_ptr as a byte offset after each row
michael@0 148 ; short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 149 ;)
michael@0 150 ;/************************************************************************************
michael@0 151 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
michael@0 152 ; input pixel array has output_height rows. This routine assumes that output_height is an
michael@0 153 ; even number. This function handles 16 pixels in horizontal direction (as two groups of 8),
michael@0 154 ; calculating ONE row each iteration to take advantage of the 128 bits operations.
michael@0 155 ;*************************************************************************************/
michael@0 156 global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
michael@0 157 sym(vp8_filter_block1d16_h6_sse2):
michael@0 158 push rbp
michael@0 159 mov rbp, rsp
michael@0 160 SHADOW_ARGS_TO_STACK 7
michael@0 161 SAVE_XMM 7
michael@0 162 GET_GOT rbx
michael@0 163 push rsi
michael@0 164 push rdi
michael@0 165 ; end prolog
michael@0 166
michael@0 167 mov rdx, arg(6) ;vp8_filter
michael@0 168 mov rsi, arg(0) ;src_ptr
michael@0 169
michael@0 170 mov rdi, arg(1) ;output_ptr
michael@0 171
michael@0 172 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
michael@0 173 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
michael@0 174 %if ABI_IS_32BIT=0
michael@0 175 movsxd r8, dword ptr arg(5) ;output_width
michael@0 176 %endif
michael@0 177
michael@0 178 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 179
michael@0 180 .filter_block1d16_h6_sse2_rowloop:
michael@0 181 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
michael@0 182 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
michael@0 183
michael@0 184 movq xmm2, MMWORD PTR [rsi +14] ; source bytes 14..21
michael@0 185 pslldq xmm2, 8
michael@0 186
michael@0 187 por xmm2, xmm1 ; xmm2 = source bytes 6..21, input for the second group of 8 outputs
michael@0 188 prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row
michael@0 189
michael@0 190 pslldq xmm1, 8
michael@0 191 por xmm1, xmm3 ; xmm1 = source bytes -2..13, input for the first group of 8 outputs
michael@0 192
michael@0 193 movdqa xmm4, xmm1 ; copies of the 16 bytes, one per shifted tap
michael@0 194 movdqa xmm5, xmm1
michael@0 195
michael@0 196 movdqa xmm6, xmm1
michael@0 197 movdqa xmm7, xmm1
michael@0 198
michael@0 199 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 200 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 201
michael@0 202 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 203 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 204
michael@0 205 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 206 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 207
michael@0 208
michael@0 209 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 210 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 211
michael@0 212 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 213
michael@0 214 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 215 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 216
michael@0 217 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 218
michael@0 219 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 220 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 221
michael@0 222
michael@0 223 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 224
michael@0 225 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 226 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 227
michael@0 228 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 229 paddsw xmm4, xmm5
michael@0 230
michael@0 231 paddsw xmm4, xmm3
michael@0 232 paddsw xmm4, xmm6
michael@0 233
michael@0 234 paddsw xmm4, xmm1
michael@0 235 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 236
michael@0 237 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 238
michael@0 239 packuswb xmm4, xmm0 ; saturate each word to 0..255
michael@0 240 punpcklbw xmm4, xmm0 ; zero-extend the 8 bytes back to 16-bit words
michael@0 241
michael@0 242 movdqa XMMWORD Ptr [rdi], xmm4 ; first 8 words; movdqa faults unless rdi is 16-byte aligned
michael@0 243
michael@0 244 movdqa xmm3, xmm2 ; second group of 8 outputs: lane numbers in the comments below are relative to pixel 8
michael@0 245 movdqa xmm4, xmm2
michael@0 246
michael@0 247 movdqa xmm5, xmm2
michael@0 248 movdqa xmm6, xmm2
michael@0 249
michael@0 250 movdqa xmm7, xmm2
michael@0 251
michael@0 252 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 253 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 254
michael@0 255 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 256 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 257
michael@0 258 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 259 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 260
michael@0 261
michael@0 262 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 263 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 264
michael@0 265 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 266
michael@0 267 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 268 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 269
michael@0 270 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 271
michael@0 272 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 273 psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 274
michael@0 275 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 276
michael@0 277 punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 278 pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 279
michael@0 280
michael@0 281 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 282 paddsw xmm4, xmm5
michael@0 283
michael@0 284 paddsw xmm4, xmm3
michael@0 285 paddsw xmm4, xmm6
michael@0 286
michael@0 287 paddsw xmm4, xmm2
michael@0 288 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 289
michael@0 290 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 291
michael@0 292 packuswb xmm4, xmm0 ; saturate each word to 0..255
michael@0 293 punpcklbw xmm4, xmm0 ; zero-extend the 8 bytes back to 16-bit words
michael@0 294
michael@0 295 movdqa XMMWORD Ptr [rdi+16], xmm4 ; second 8 words, 16 bytes after the first group
michael@0 296
michael@0 297 lea rsi, [rsi + rax] ; next source row
michael@0 298 %if ABI_IS_32BIT
michael@0 299 add rdi, DWORD Ptr arg(5) ;[output_width], used as a byte stride
michael@0 300 %else
michael@0 301 add rdi, r8 ; output_width, used as a byte stride
michael@0 302 %endif
michael@0 303
michael@0 304 dec rcx ; row count is tested only here, so output_height == 0 is not handled
michael@0 305 jnz .filter_block1d16_h6_sse2_rowloop ; next row
michael@0 306
michael@0 307 ; begin epilog
michael@0 308 pop rdi
michael@0 309 pop rsi
michael@0 310 RESTORE_GOT
michael@0 311 RESTORE_XMM
michael@0 312 UNSHADOW_ARGS
michael@0 313 pop rbp
michael@0 314 ret
michael@0 315
michael@0 316
michael@0 317 ;void vp8_filter_block1d8_v6_sse2
michael@0 318 ;(
michael@0 319 ; short *src_ptr, - 16-bit input; filtering starts two rows above this pointer
michael@0 320 ; unsigned char *output_ptr, - receives 8 bytes per row
michael@0 321 ; int dst_pitch, - added to output_ptr after each row
michael@0 322 ; unsigned int pixels_per_line, - distance between input rows, used as a byte offset
michael@0 323 ; unsigned int pixel_step, - not read by this routine
michael@0 324 ; unsigned int output_height, - number of rows to produce
michael@0 325 ; unsigned int output_width, - not read by this routine
michael@0 326 ; short * vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 327 ;)
michael@0 328 ;/************************************************************************************
michael@0 329 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
michael@0 330 ; input pixel array has output_height rows.
michael@0 331 ;*************************************************************************************/
michael@0 332 global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
michael@0 333 sym(vp8_filter_block1d8_v6_sse2):
michael@0 334 push rbp
michael@0 335 mov rbp, rsp
michael@0 336 SHADOW_ARGS_TO_STACK 8
michael@0 337 SAVE_XMM 7
michael@0 338 GET_GOT rbx
michael@0 339 push rsi
michael@0 340 push rdi
michael@0 341 ; end prolog
michael@0 342
michael@0 343 mov rax, arg(7) ;vp8_filter
michael@0 344 movsxd rdx, dword ptr arg(3) ;pixels_per_line
michael@0 345
michael@0 346 mov rdi, arg(1) ;output_ptr
michael@0 347 mov rsi, arg(0) ;src_ptr
michael@0 348
michael@0 349 sub rsi, rdx ; step back two rows so [rsi] is the row for tap 1
michael@0 350 sub rsi, rdx
michael@0 351
michael@0 352 movsxd rcx, DWORD PTR arg(5) ;[output_height]
michael@0 353 pxor xmm0, xmm0 ; clear xmm0
michael@0 354
michael@0 355 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 356 %if ABI_IS_32BIT=0
michael@0 357 movsxd r8, dword ptr arg(2) ; dst_pitch
michael@0 358 %endif
michael@0 359
michael@0 360 .vp8_filter_block1d8_v6_sse2_loop:
michael@0 361 movdqa xmm1, XMMWORD PTR [rsi] ; row 0 (8 words); movdqa faults unless the address is 16-byte aligned
michael@0 362 pmullw xmm1, [rax] ; * tap 1
michael@0 363
michael@0 364 movdqa xmm2, XMMWORD PTR [rsi + rdx] ; row 1
michael@0 365 pmullw xmm2, [rax + 16] ; * tap 2
michael@0 366
michael@0 367 movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; row 2
michael@0 368 pmullw xmm3, [rax + 32] ; * tap 3
michael@0 369
michael@0 370 movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; row 4
michael@0 371 pmullw xmm5, [rax + 64] ; * tap 5
michael@0 372
michael@0 373 add rsi, rdx ; advances to the next row for the following iteration and makes rows 3 and 5 addressable as *2 and *4
michael@0 374 movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; row 3
michael@0 375
michael@0 376 pmullw xmm4, [rax + 48] ; * tap 4
michael@0 377 movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; row 5
michael@0 378
michael@0 379 pmullw xmm6, [rax + 80] ; * tap 6
michael@0 380
michael@0 381 paddsw xmm2, xmm5 ; accumulate the six tap products (signed saturating adds)
michael@0 382 paddsw xmm2, xmm3
michael@0 383
michael@0 384 paddsw xmm2, xmm1
michael@0 385 paddsw xmm2, xmm4
michael@0 386
michael@0 387 paddsw xmm2, xmm6
michael@0 388 paddsw xmm2, xmm7 ; add the rd vector loaded before the loop
michael@0 389
michael@0 390 psraw xmm2, 7 ; arithmetic shift right by 7
michael@0 391 packuswb xmm2, xmm0 ; pack and saturate
michael@0 392
michael@0 393 movq QWORD PTR [rdi], xmm2 ; store the results in the destination
michael@0 394 %if ABI_IS_32BIT
michael@0 395 add rdi, DWORD PTR arg(2) ;[dst_pitch]
michael@0 396 %else
michael@0 397 add rdi, r8 ; dst_pitch
michael@0 398 %endif
michael@0 399 dec rcx ; decrement count; tested only here, so output_height == 0 is not handled
michael@0 400 jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
michael@0 401
michael@0 402 ; begin epilog
michael@0 403 pop rdi
michael@0 404 pop rsi
michael@0 405 RESTORE_GOT
michael@0 406 RESTORE_XMM
michael@0 407 UNSHADOW_ARGS
michael@0 408 pop rbp
michael@0 409 ret
michael@0 410
michael@0 411
michael@0 412 ;void vp8_filter_block1d16_v6_sse2
michael@0 413 ;(
michael@0 414 ; unsigned short *src_ptr, - 16-bit input; filtering starts two rows above this pointer
michael@0 415 ; unsigned char *output_ptr, - receives 16 bytes per row
michael@0 416 ; int dst_pitch, - added to output_ptr after each row
michael@0 417 ; unsigned int pixels_per_line, - distance between input rows, used as a byte offset
michael@0 418 ; unsigned int pixel_step, - not read by this routine
michael@0 419 ; unsigned int output_height, - number of rows to produce
michael@0 420 ; unsigned int output_width, - not read by this routine
michael@0 421 ; const short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 422 ;)
michael@0 423 ;/************************************************************************************
michael@0 424 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
michael@0 425 ; input pixel array has output_height rows.
michael@0 426 ;*************************************************************************************/
michael@0 427 global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
michael@0 428 sym(vp8_filter_block1d16_v6_sse2):
michael@0 429 push rbp
michael@0 430 mov rbp, rsp
michael@0 431 SHADOW_ARGS_TO_STACK 8
michael@0 432 SAVE_XMM 7
michael@0 433 GET_GOT rbx
michael@0 434 push rsi
michael@0 435 push rdi
michael@0 436 ; end prolog
michael@0 437
michael@0 438 mov rax, arg(7) ;vp8_filter
michael@0 439 movsxd rdx, dword ptr arg(3) ;pixels_per_line
michael@0 440
michael@0 441 mov rdi, arg(1) ;output_ptr
michael@0 442 mov rsi, arg(0) ;src_ptr
michael@0 443
michael@0 444 sub rsi, rdx ; step back two rows so [rsi] is the row for tap 1
michael@0 445 sub rsi, rdx
michael@0 446
michael@0 447 movsxd rcx, DWORD PTR arg(5) ;[output_height]
michael@0 448 %if ABI_IS_32BIT=0
michael@0 449 movsxd r8, dword ptr arg(2) ; dst_pitch
michael@0 450 %endif
michael@0 451
michael@0 452 .vp8_filter_block1d16_v6_sse2_loop:
michael@0 453 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
michael@0 454 movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
michael@0 455 movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] ; line 2, words 8..15
michael@0 456 pmullw xmm1, [rax + 16]
michael@0 457 pmullw xmm2, [rax + 16]
michael@0 458
michael@0 459 movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
michael@0 460 movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
michael@0 461 pmullw xmm3, [rax + 64]
michael@0 462 pmullw xmm4, [rax + 64]
michael@0 463
michael@0 464 movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
michael@0 465 movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
michael@0 466 pmullw xmm5, [rax + 32]
michael@0 467 pmullw xmm6, [rax + 32]
michael@0 468
michael@0 469 movdqa xmm7, XMMWORD PTR [rsi] ; line 1
michael@0 470 movdqa xmm0, XMMWORD PTR [rsi + 16]
michael@0 471 pmullw xmm7, [rax]
michael@0 472 pmullw xmm0, [rax]
michael@0 473
michael@0 474 paddsw xmm1, xmm3 ; xmm1 accumulates words 0..7, xmm2 accumulates words 8..15
michael@0 475 paddsw xmm2, xmm4
michael@0 476 paddsw xmm1, xmm5
michael@0 477 paddsw xmm2, xmm6
michael@0 478 paddsw xmm1, xmm7
michael@0 479 paddsw xmm2, xmm0
michael@0 480
michael@0 481 add rsi, rdx ; advances to the next row for the following iteration and makes lines 4 and 6 addressable as *2 and *4
michael@0 482
michael@0 483 movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
michael@0 484 movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
michael@0 485 pmullw xmm3, [rax + 48]
michael@0 486 pmullw xmm4, [rax + 48]
michael@0 487
michael@0 488 movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
michael@0 489 movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
michael@0 490 pmullw xmm5, [rax + 80]
michael@0 491 pmullw xmm6, [rax + 80]
michael@0 492
michael@0 493 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 494 pxor xmm0, xmm0 ; clear xmm0 - NOTE(review): xmm0 is not read again before it is reloaded at the top of the loop; this clear looks unnecessary, confirm
michael@0 495
michael@0 496 paddsw xmm1, xmm3
michael@0 497 paddsw xmm2, xmm4
michael@0 498 paddsw xmm1, xmm5
michael@0 499 paddsw xmm2, xmm6
michael@0 500
michael@0 501 paddsw xmm1, xmm7
michael@0 502 paddsw xmm2, xmm7
michael@0 503
michael@0 504 psraw xmm1, 7 ; arithmetic shift right by 7
michael@0 505 psraw xmm2, 7
michael@0 506
michael@0 507 packuswb xmm1, xmm2 ; pack and saturate
michael@0 508 movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination (aligned 16-byte store)
michael@0 509 %if ABI_IS_32BIT
michael@0 510 add rdi, DWORD PTR arg(2) ;[dst_pitch]
michael@0 511 %else
michael@0 512 add rdi, r8 ; dst_pitch
michael@0 513 %endif
michael@0 514 dec rcx ; decrement count; tested only here, so output_height == 0 is not handled
michael@0 515 jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
michael@0 516
michael@0 517 ; begin epilog
michael@0 518 pop rdi
michael@0 519 pop rsi
michael@0 520 RESTORE_GOT
michael@0 521 RESTORE_XMM
michael@0 522 UNSHADOW_ARGS
michael@0 523 pop rbp
michael@0 524 ret
michael@0 525
michael@0 526
michael@0 527 ;void vp8_filter_block1d8_h6_only_sse2
michael@0 528 ;(
michael@0 529 ; unsigned char *src_ptr, - source row; bytes src_ptr-2 .. src_ptr+13 are read each row
michael@0 530 ; unsigned int src_pixels_per_line, - added to the source pointer after each row
michael@0 531 ; unsigned char *output_ptr, - receives 8 bytes per row
michael@0 532 ; int dst_pitch, - added to output_ptr after each row
michael@0 533 ; unsigned int output_height, - number of rows to produce
michael@0 534 ; const short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 535 ;)
michael@0 536 ; First-pass filter only when yoffset==0
michael@0 537 global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
michael@0 538 sym(vp8_filter_block1d8_h6_only_sse2):
michael@0 539 push rbp
michael@0 540 mov rbp, rsp
michael@0 541 SHADOW_ARGS_TO_STACK 6
michael@0 542 SAVE_XMM 7
michael@0 543 GET_GOT rbx
michael@0 544 push rsi
michael@0 545 push rdi
michael@0 546 ; end prolog
michael@0 547
michael@0 548 mov rdx, arg(5) ;vp8_filter
michael@0 549 mov rsi, arg(0) ;src_ptr
michael@0 550
michael@0 551 mov rdi, arg(2) ;output_ptr
michael@0 552
michael@0 553 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
michael@0 554 movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
michael@0 555 %if ABI_IS_32BIT=0
michael@0 556 movsxd r8, dword ptr arg(3) ;dst_pitch
michael@0 557 %endif
michael@0 558 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 559
michael@0 560 .filter_block1d8_h6_only_rowloop:
michael@0 561 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
michael@0 562 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
michael@0 563
michael@0 564 prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row
michael@0 565
michael@0 566 pslldq xmm1, 8 ; move bytes 6..13 into the upper half
michael@0 567 por xmm1, xmm3 ; xmm1 = 16 consecutive source bytes, -2..13
michael@0 568
michael@0 569 movdqa xmm4, xmm1 ; copies of the 16 bytes, one per shifted tap
michael@0 570 movdqa xmm5, xmm1
michael@0 571
michael@0 572 movdqa xmm6, xmm1
michael@0 573 movdqa xmm7, xmm1
michael@0 574
michael@0 575 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 576 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 577
michael@0 578 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 579 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 580
michael@0 581 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 582 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 583
michael@0 584
michael@0 585 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 586 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 587
michael@0 588 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 589
michael@0 590 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 591 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 592
michael@0 593 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 594
michael@0 595 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 596 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 597
michael@0 598
michael@0 599 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 600
michael@0 601 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 602 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 603
michael@0 604
michael@0 605 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 606 paddsw xmm4, xmm5
michael@0 607
michael@0 608 paddsw xmm4, xmm3
michael@0 609 paddsw xmm4, xmm6
michael@0 610
michael@0 611 paddsw xmm4, xmm1
michael@0 612 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 613
michael@0 614 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 615
michael@0 616 packuswb xmm4, xmm0 ; saturate each word to 0..255; results are the low 8 bytes
michael@0 617
michael@0 618 movq QWORD PTR [rdi], xmm4 ; store the results in the destination
michael@0 619 lea rsi, [rsi + rax] ; next source row
michael@0 620
michael@0 621 %if ABI_IS_32BIT
michael@0 622 add rdi, DWORD Ptr arg(3) ;dst_pitch
michael@0 623 %else
michael@0 624 add rdi, r8 ; dst_pitch
michael@0 625 %endif
michael@0 626 dec rcx ; row count is tested only here, so output_height == 0 is not handled
michael@0 627
michael@0 628 jnz .filter_block1d8_h6_only_rowloop ; next row
michael@0 629
michael@0 630 ; begin epilog
michael@0 631 pop rdi
michael@0 632 pop rsi
michael@0 633 RESTORE_GOT
michael@0 634 RESTORE_XMM
michael@0 635 UNSHADOW_ARGS
michael@0 636 pop rbp
michael@0 637 ret
michael@0 638
michael@0 639
michael@0 640 ;void vp8_filter_block1d16_h6_only_sse2
michael@0 641 ;(
michael@0 642 ; unsigned char *src_ptr, - source row; bytes src_ptr-2 .. src_ptr+21 are read each row
michael@0 643 ; unsigned int src_pixels_per_line, - added to the source pointer after each row
michael@0 644 ; unsigned char *output_ptr, - receives 16 bytes per row, written as two 8-byte stores
michael@0 645 ; int dst_pitch, - added to output_ptr after each row
michael@0 646 ; unsigned int output_height, - number of rows to produce
michael@0 647 ; const short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 648 ;)
michael@0 649 ; First-pass filter only when yoffset==0
michael@0 650 global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
michael@0 651 sym(vp8_filter_block1d16_h6_only_sse2):
michael@0 652 push rbp
michael@0 653 mov rbp, rsp
michael@0 654 SHADOW_ARGS_TO_STACK 6
michael@0 655 SAVE_XMM 7
michael@0 656 GET_GOT rbx
michael@0 657 push rsi
michael@0 658 push rdi
michael@0 659 ; end prolog
michael@0 660
michael@0 661 mov rdx, arg(5) ;vp8_filter
michael@0 662 mov rsi, arg(0) ;src_ptr
michael@0 663
michael@0 664 mov rdi, arg(2) ;output_ptr
michael@0 665
michael@0 666 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
michael@0 667 movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
michael@0 668 %if ABI_IS_32BIT=0
michael@0 669 movsxd r8, dword ptr arg(3) ;dst_pitch
michael@0 670 %endif
michael@0 671
michael@0 672 pxor xmm0, xmm0 ; clear xmm0 for unpack
michael@0 673
michael@0 674 .filter_block1d16_h6_only_sse2_rowloop:
michael@0 675 movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2..5
michael@0 676 movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6..13
michael@0 677
michael@0 678 movq xmm2, MMWORD PTR [rsi +14] ; source bytes 14..21
michael@0 679 pslldq xmm2, 8
michael@0 680
michael@0 681 por xmm2, xmm1 ; xmm2 = source bytes 6..21, input for the second group of 8 outputs
michael@0 682 prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row
michael@0 683
michael@0 684 pslldq xmm1, 8
michael@0 685 por xmm1, xmm3 ; xmm1 = source bytes -2..13, input for the first group of 8 outputs
michael@0 686
michael@0 687 movdqa xmm4, xmm1 ; copies of the 16 bytes, one per shifted tap
michael@0 688 movdqa xmm5, xmm1
michael@0 689
michael@0 690 movdqa xmm6, xmm1
michael@0 691 movdqa xmm7, xmm1
michael@0 692
michael@0 693 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 694 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 695
michael@0 696 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 697 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 698
michael@0 699 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 700 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 701
michael@0 702 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 703 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 704
michael@0 705 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 706
michael@0 707 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 708 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 709
michael@0 710 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 711
michael@0 712 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 713 psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 714
michael@0 715 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 716
michael@0 717 punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 718 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 719
michael@0 720 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 721 paddsw xmm4, xmm5
michael@0 722
michael@0 723 paddsw xmm4, xmm3
michael@0 724 paddsw xmm4, xmm6
michael@0 725
michael@0 726 paddsw xmm4, xmm1
michael@0 727 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 728
michael@0 729 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 730
michael@0 731 packuswb xmm4, xmm0 ; lower 8 bytes
michael@0 732
michael@0 733 movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
michael@0 734
michael@0 735 movdqa xmm3, xmm2 ; second group of 8 outputs: lane numbers in the comments below are relative to pixel 8
michael@0 736 movdqa xmm4, xmm2
michael@0 737
michael@0 738 movdqa xmm5, xmm2
michael@0 739 movdqa xmm6, xmm2
michael@0 740
michael@0 741 movdqa xmm7, xmm2
michael@0 742
michael@0 743 punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
michael@0 744 psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
michael@0 745
michael@0 746 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
michael@0 747 punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
michael@0 748
michael@0 749 psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
michael@0 750 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
michael@0 751
michael@0 752 punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
michael@0 753 psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
michael@0 754
michael@0 755 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
michael@0 756
michael@0 757 punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
michael@0 758 psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
michael@0 759
michael@0 760 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
michael@0 761
michael@0 762 punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
michael@0 763 psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
michael@0 764
michael@0 765 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
michael@0 766
michael@0 767 punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
michael@0 768 pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
michael@0 769
michael@0 770 paddsw xmm4, xmm7 ; accumulate the six tap products (signed saturating adds)
michael@0 771 paddsw xmm4, xmm5
michael@0 772
michael@0 773 paddsw xmm4, xmm3
michael@0 774 paddsw xmm4, xmm6
michael@0 775
michael@0 776 paddsw xmm4, xmm2
michael@0 777 paddsw xmm4, [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 778
michael@0 779 psraw xmm4, 7 ; arithmetic shift right by 7
michael@0 780
michael@0 781 packuswb xmm4, xmm0 ; higher 8 bytes
michael@0 782
michael@0 783 movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
michael@0 784
michael@0 785 lea rsi, [rsi + rax] ; next source row
michael@0 786 %if ABI_IS_32BIT
michael@0 787 add rdi, DWORD Ptr arg(3) ;dst_pitch
michael@0 788 %else
michael@0 789 add rdi, r8 ; dst_pitch
michael@0 790 %endif
michael@0 791
michael@0 792 dec rcx ; row count is tested only here, so output_height == 0 is not handled
michael@0 793 jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
michael@0 794
michael@0 795 ; begin epilog
michael@0 796 pop rdi
michael@0 797 pop rsi
michael@0 798 RESTORE_GOT
michael@0 799 RESTORE_XMM
michael@0 800 UNSHADOW_ARGS
michael@0 801 pop rbp
michael@0 802 ret
michael@0 803
michael@0 804
michael@0 805 ;void vp8_filter_block1d8_v6_only_sse2
michael@0 806 ;(
michael@0 807 ; unsigned char *src_ptr, - 8-bit input; used directly as the row for tap 1 (no two-row step back is applied here)
michael@0 808 ; unsigned int src_pixels_per_line, - distance between input rows, used as a byte offset
michael@0 809 ; unsigned char *output_ptr, - receives 8 bytes per row
michael@0 810 ; int dst_pitch, - added to output_ptr after each row
michael@0 811 ; unsigned int output_height, - number of rows to produce
michael@0 812 ; const short *vp8_filter - six 16-byte tap vectors at byte offsets 0, 16, ..., 80
michael@0 813 ;)
michael@0 814 ; Second-pass filter only when xoffset==0
michael@0 815 global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
michael@0 816 sym(vp8_filter_block1d8_v6_only_sse2):
michael@0 817 push rbp
michael@0 818 mov rbp, rsp
michael@0 819 SHADOW_ARGS_TO_STACK 6
michael@0 820 SAVE_XMM 7
michael@0 821 GET_GOT rbx
michael@0 822 push rsi
michael@0 823 push rdi
michael@0 824 ; end prolog
michael@0 825
michael@0 826 mov rsi, arg(0) ;src_ptr
michael@0 827 mov rdi, arg(2) ;output_ptr
michael@0 828
michael@0 829 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
michael@0 830 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
michael@0 831
michael@0 832 mov rax, arg(5) ;vp8_filter
michael@0 833
michael@0 834 pxor xmm0, xmm0 ; clear xmm0
michael@0 835
michael@0 836 movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rd is defined outside this excerpt; presumably the rounding term - TODO confirm
michael@0 837 %if ABI_IS_32BIT=0
michael@0 838 movsxd r8, dword ptr arg(3) ; dst_pitch
michael@0 839 %endif
michael@0 840
michael@0 841 .vp8_filter_block1d8_v6_only_sse2_loop:
michael@0 842 movq xmm1, MMWORD PTR [rsi] ; row 0 (8 bytes)
michael@0 843 movq xmm2, MMWORD PTR [rsi + rdx] ; row 1
michael@0 844 movq xmm3, MMWORD PTR [rsi + rdx * 2] ; row 2
michael@0 845 movq xmm5, MMWORD PTR [rsi + rdx * 4] ; row 4
michael@0 846 add rsi, rdx ; advances to the next row for the following iteration and makes rows 3 and 5 addressable as *2 and *4
michael@0 847 movq xmm4, MMWORD PTR [rsi + rdx * 2] ; row 3
michael@0 848 movq xmm6, MMWORD PTR [rsi + rdx * 4] ; row 5
michael@0 849
michael@0 850 punpcklbw xmm1, xmm0 ; zero-extend bytes to 16-bit words before multiplying
michael@0 851 pmullw xmm1, [rax] ; * tap 1
michael@0 852
michael@0 853 punpcklbw xmm2, xmm0
michael@0 854 pmullw xmm2, [rax + 16] ; * tap 2
michael@0 855
michael@0 856 punpcklbw xmm3, xmm0
michael@0 857 pmullw xmm3, [rax + 32] ; * tap 3
michael@0 858
michael@0 859 punpcklbw xmm5, xmm0
michael@0 860 pmullw xmm5, [rax + 64] ; * tap 5
michael@0 861
michael@0 862 punpcklbw xmm4, xmm0
michael@0 863 pmullw xmm4, [rax + 48] ; * tap 4
michael@0 864
michael@0 865 punpcklbw xmm6, xmm0
michael@0 866 pmullw xmm6, [rax + 80] ; * tap 6
michael@0 867
michael@0 868 paddsw xmm2, xmm5 ; accumulate the six tap products (signed saturating adds)
michael@0 869 paddsw xmm2, xmm3
michael@0 870
michael@0 871 paddsw xmm2, xmm1
michael@0 872 paddsw xmm2, xmm4
michael@0 873
michael@0 874 paddsw xmm2, xmm6
michael@0 875 paddsw xmm2, xmm7 ; add the rd vector loaded before the loop
michael@0 876
michael@0 877 psraw xmm2, 7 ; arithmetic shift right by 7
michael@0 878 packuswb xmm2, xmm0 ; pack and saturate
michael@0 879
michael@0 880 movq QWORD PTR [rdi], xmm2 ; store the results in the destination
michael@0 881 %if ABI_IS_32BIT
michael@0 882 add rdi, DWORD PTR arg(3) ;[dst_pitch]
michael@0 883 %else
michael@0 884 add rdi, r8 ; dst_pitch
michael@0 885 %endif
michael@0 886 dec rcx ; decrement count; tested only here, so output_height == 0 is not handled
michael@0 887 jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
michael@0 888
michael@0 889 ; begin epilog
michael@0 890 pop rdi
michael@0 891 pop rsi
michael@0 892 RESTORE_GOT
michael@0 893 RESTORE_XMM
michael@0 894 UNSHADOW_ARGS
michael@0 895 pop rbp
michael@0 896 ret
michael@0 897
michael@0 898
;-----------------------------------------------------------------------------
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Copies a 16-pixel-wide block while widening every pixel from a byte to a
; zero-extended 16-bit word (no filtering is applied).
;
; Per row: 16 bytes are read from src_ptr and 32 bytes (16 words) are written
; to output_ptr with aligned stores, so output_ptr and the row step must keep
; the destination 16-byte aligned. src_ptr advances by src_pixels_per_line;
; output_ptr advances by output_width, which is added to the pointer as a
; byte count. output_height must be non-zero (decrement-then-test loop).
;-----------------------------------------------------------------------------
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(1)                     ; rdi = output_ptr
    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rax, dword ptr arg(2)           ; rax = src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(3)           ; rcx = output_height (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)            ; r8  = output_width (destination step in bytes)
%endif

    pxor        xmm0, xmm0                      ; zero, interleaved as the high byte of each word

.unpack_next_row:
    movq        xmm1, MMWORD PTR [rsi]          ; source pixels 0..7
    movq        xmm3, MMWORD PTR [rsi+8]        ; source pixels 8..15

    punpcklbw   xmm1, xmm0                      ; pixels 0..7  -> 8 words
    punpcklbw   xmm3, xmm0                      ; pixels 8..15 -> 8 words

    movdqa      XMMWORD PTR [rdi], xmm1
    movdqa      XMMWORD PTR [rdi + 16], xmm3

    lea         rsi, [rsi + rax]                ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(4)           ; next destination row (+output_width)
%else
    add         rdi, r8                         ; next destination row (+output_width)
%endif
    dec         rcx
    jnz         .unpack_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 954
michael@0 955
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
; 16x16 bilinear (2-tap) sub-pixel prediction.
;
;   HFilter = vp8_bilinear_filters_x86_8[xoffset]
;   VFilter = vp8_bilinear_filters_x86_8[yoffset]
;
; Each table entry is 32 bytes: two 16-byte vectors (tap 0 at +0, tap 1 at
; +16), which is why the offsets are scaled with "shl 5".
;
; Three code paths:
;   * xoffset != 0 and yoffset != 0 : horizontal pass followed by vertical
;     pass. The previous horizontally filtered row is carried in xmm7.
;   * xoffset == 0                  : vertical pass only (.sp_only).
;   * xoffset != 0 and yoffset == 0 : horizontal pass only (.fp_only).
;
; Every pass computes (a * tap0 + b * tap1 + rd) >> VP8_FILTER_SHIFT on words.
; Rows are written with movdqa, so dst_ptr / dst_pitch must keep every
; destination row 16-byte aligned. The horizontal pass reads 16 bytes at
; [src] and at [src + 1]; the vertical pass consumes one source row more than
; it produces.
;-----------------------------------------------------------------------------

; Horizontal 2-tap multiply-accumulate for the 16 pixels starting at [rsi]:
;   xmm3 = pixels 0..7  * xmm1 + pixels 1..8  * xmm2
;   xmm4 = pixels 8..15 * xmm1 + pixels 9..16 * xmm2
; Requires xmm0 == 0. Clobbers xmm5 and xmm6. The result is not yet rounded
; or shifted.
%macro BILINEAR16_HFILTER_ROW 0
    movdqu      xmm3, [rsi]                     ; pixels 0..15
    movdqu      xmm5, [rsi+1]                   ; pixels 1..16
    movdqa      xmm4, xmm3
    movdqa      xmm6, xmm5

    punpcklbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0
    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm2
    pmullw      xmm6, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6
%endmacro

; %1 = (%1 + rd) >> VP8_FILTER_SHIFT for each of the 8 words in %1.
%macro BILINEAR16_ROUND_SHIFT 1
    paddw       %1, [GLOBAL(rd)]
    psraw       %1, VP8_FILTER_SHIFT
%endmacro

extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    movsxd      rax, dword ptr arg(2)           ; xoffset

    test        rax, rax                        ; xoffset == 0: no horizontal pass
    jz          .sp_only

    shl         rax, 5
    add         rax, rcx                        ; rax = HFilter

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(4)                     ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal tap 0
    movdqa      xmm2, [rax+16]                  ; xmm2 = horizontal tap 1

    movsxd      rax, dword ptr arg(3)           ; yoffset

    test        rax, rax                        ; yoffset == 0: no vertical pass
    jz          .fp_only

    ; ---------------- horizontal + vertical pass ----------------
    shl         rax, 5
    add         rax, rcx                        ; rax = VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx, dword ptr arg(1)           ; rdx is reused: src_pixels_per_line

    pxor        xmm0, xmm0

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; r8 = dst_pitch
%endif

    ; Prime the pipeline: horizontally filter the first source row and keep
    ; it, packed to bytes, in xmm7.
    BILINEAR16_HFILTER_ROW
    BILINEAR16_ROUND_SHIFT xmm3
    BILINEAR16_ROUND_SHIFT xmm4

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4

    add         rsi, rdx                        ; next source row

.two_pass_row:
    BILINEAR16_HFILTER_ROW                      ; xmm3:xmm4 = current row, unrounded

    ; previous filtered row * vertical tap 0
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm7

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0

    pmullw      xmm5, [rax]
    pmullw      xmm6, [rax]

    ; finish the horizontal result of the current row
    BILINEAR16_ROUND_SHIFT xmm3
    BILINEAR16_ROUND_SHIFT xmm4

    ; remember it as the "previous row" of the next iteration
    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4

    ; current filtered row * vertical tap 1, then combine
    pmullw      xmm3, [rax+16]
    pmullw      xmm4, [rax+16]

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    BILINEAR16_ROUND_SHIFT xmm3
    BILINEAR16_ROUND_SHIFT xmm4

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store 16 predicted pixels

    add         rsi, rdx                        ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; next destination row
%else
    add         rdi, r8                         ; next destination row
%endif

    cmp         rdi, rcx
    jne         .two_pass_row

    jmp         .predict16x16_done

    ; ---------------- vertical pass only (xoffset == 0) ----------------
.sp_only:
    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 5
    add         rax, rcx                        ; rax = VFilter

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(4)                     ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = vertical tap 0
    movdqa      xmm2, [rax+16]                  ; xmm2 = vertical tap 1

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)           ; rax is reused: src_pixels_per_line

    pxor        xmm0, xmm0

    movdqu      xmm7, [rsi]                     ; xmm7 = first source row ("previous row")

    add         rsi, rax                        ; next source row

.sp_only_row:
    movdqu      xmm3, [rsi]                     ; current source row

    movdqa      xmm5, xmm7                      ; previous row, low/high halves
    movdqa      xmm6, xmm7

    movdqa      xmm4, xmm3                      ; current row, high half
    movdqa      xmm7, xmm3                      ; becomes the previous row next time

    punpcklbw   xmm5, xmm0
    punpckhbw   xmm6, xmm0
    pmullw      xmm5, xmm1
    pmullw      xmm6, xmm1

    punpcklbw   xmm3, xmm0
    punpckhbw   xmm4, xmm0
    pmullw      xmm3, xmm2
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm5
    paddw       xmm4, xmm6

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store 16 predicted pixels

    add         rsi, rax                        ; next source row
    add         rdi, rdx                        ; next destination row
    cmp         rdi, rcx
    jne         .sp_only_row

    jmp         .predict16x16_done

    ; ---------------- horizontal pass only (yoffset == 0) ----------------
    ; Entered with rsi = src_ptr, rdi = dst_ptr, rdx = dst_pitch and
    ; xmm1 / xmm2 = horizontal taps already loaded.
.fp_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    pxor        xmm0, xmm0

.fp_only_row:
    BILINEAR16_HFILTER_ROW
    BILINEAR16_ROUND_SHIFT xmm3
    BILINEAR16_ROUND_SHIFT xmm4

    packuswb    xmm3, xmm4
    movdqa      [rdi], xmm3                     ; store 16 predicted pixels

    add         rsi, rax                        ; next source row
    add         rdi, rdx                        ; next destination row
    cmp         rdi, rcx
    jne         .fp_only_row

.predict16x16_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 1223
michael@0 1224
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
; 8x8 bilinear (2-tap) sub-pixel prediction. Both passes always run; there is
; no shortcut for xoffset == 0 or yoffset == 0.
;
;   HFilter = vp8_bilinear_filters_x86_8[xoffset]   (32 bytes per entry)
;   VFilter = vp8_bilinear_filters_x86_8[yoffset]
;
; Nine source rows are first copied (16 unaligned bytes each) into a 16-byte
; aligned 144-byte scratch area on the stack, so the filter loop can use
; aligned loads. Note that 16 bytes are loaded per source row although only
; the first 9 feed the filter.
;
; The scratch rows are walked with rsi (free once the source rows have been
; staged). rsp stays fixed for the whole function body, so the stack frame
; is always well-formed; the scratch area is released explicitly before the
; saved stack pointer is popped.
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 9 * 16                     ; scratch: 9 rows x 16 bytes = 144 bytes

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(1)           ; rdx = src_pixels_per_line

    ; Read the 9 unaligned source rows in one batch and park them on the
    ; stack. This gives a big performance boost.
    movdqu      xmm0, [rsi]                     ; row 0
    lea         rax, [rdx + rdx*2]              ; rax = 3 * src_pixels_per_line
    movdqu      xmm1, [rsi+rdx]                 ; row 1
    movdqu      xmm2, [rsi+rdx*2]               ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                     ; row 3
    movdqu      xmm4, [rsi+rdx]                 ; row 4
    movdqu      xmm5, [rsi+rdx*2]               ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                     ; row 6
    movdqu      xmm7, [rsi+rdx]                 ; row 7

    movdqa      XMMWORD PTR [rsp], xmm0         ; free xmm0 for the ninth row

    movdqu      xmm0, [rsi+rdx*2]               ; row 8

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)           ; xoffset
    shl         rax, 5
    add         rax, rcx                        ; rax = HFilter

    mov         rdi, arg(4)                     ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx is reused: dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal tap 0
    movdqa      xmm2, [rax+16]                  ; xmm2 = horizontal tap 1

    movsxd      rax, dword ptr arg(3)           ; yoffset
    shl         rax, 5
    add         rax, rcx                        ; rax = VFilter

    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    movdqa      xmm5, [rax]                     ; xmm5 = vertical tap 0
    movdqa      xmm6, [rax+16]                  ; xmm6 = vertical tap 1

    pxor        xmm0, xmm0

    mov         rsi, rsp                        ; rsi = cursor over the staged rows

    ; Horizontally filter the first staged row; it becomes the "previous row"
    ; (kept as words in xmm7) for the first vertical step.
    movdqa      xmm3, XMMWORD PTR [rsi]
    movdqa      xmm4, xmm3
    psrldq      xmm4, 1                         ; shift by one pixel: 01 02 03 ...

    punpcklbw   xmm3, xmm0                      ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                      ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4

    paddw       xmm3, [GLOBAL(rd)]              ; round
    psraw       xmm3, VP8_FILTER_SHIFT          ; /= 128

    movdqa      xmm7, xmm3
    add         rsi, 16                         ; next staged row

.next_row8x8:
    ; horizontal pass on the current staged row
    movdqa      xmm3, XMMWORD PTR [rsi]
    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    punpcklbw   xmm3, xmm0                      ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4, xmm0                      ; 01 02 03 04 05 06 07 08

    pmullw      xmm3, xmm1
    pmullw      xmm4, xmm2

    paddw       xmm3, xmm4
    pmullw      xmm7, xmm5                      ; previous row * vertical tap 0

    paddw       xmm3, [GLOBAL(rd)]              ; round
    psraw       xmm3, VP8_FILTER_SHIFT          ; /= 128

    movdqa      xmm4, xmm3                      ; keep the filtered row for the next iteration

    ; vertical pass: previous * tap 0 + current * tap 1
    pmullw      xmm3, xmm6
    paddw       xmm3, xmm7

    movdqa      xmm7, xmm4                      ; current row becomes the previous row

    paddw       xmm3, [GLOBAL(rd)]              ; round
    psraw       xmm3, VP8_FILTER_SHIFT          ; /= 128

    packuswb    xmm3, xmm0
    movq        [rdi], xmm3                     ; store 8 predicted pixels

    add         rsi, 16                         ; next staged row
    add         rdi, rdx                        ; next destination row

    cmp         rdi, rcx
    jne         .next_row8x8

    add         rsp, 9 * 16                     ; release the scratch area
    pop         rsp                             ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
michael@0 1367
michael@0 1368
SECTION_RODATA
align 16
; rd: rounding term added before the ">> VP8_FILTER_SHIFT" scale-down, i.e.
; half of the filter weight (0x40), replicated into all 8 word lanes.
rd:
    times 8 dw VP8_FILTER_WEIGHT / 2

mercurial