media/libvpx/vp8/common/x86/subpixel_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 %define BLOCK_HEIGHT_WIDTH 4
michael@0 15 %define VP8_FILTER_WEIGHT 128
michael@0 16 %define VP8_FILTER_SHIFT 7
michael@0 17
michael@0 18
michael@0 19 ;/************************************************************************************
michael@0 20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
michael@0 21 ; input pixel array has output_height rows. This routine assumes that output_height is an
michael@0 22 ; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
michael@0 23 ; row each iteration to take advantage of the 128-bit operations.
michael@0 24 ;
michael@0 25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
michael@0 26 ;
michael@0 27 ;*************************************************************************************/
michael@0 28 ;void vp8_filter_block1d8_h6_ssse3
michael@0 29 ;(
michael@0 30 ; unsigned char *src_ptr,
michael@0 31 ; unsigned int src_pixels_per_line,
michael@0 32 ; unsigned char *output_ptr,
michael@0 33 ; unsigned int output_pitch,
michael@0 34 ; unsigned int output_height,
michael@0 35 ; unsigned int vp8_filter_index
michael@0 36 ;)
;
; Horizontal sub-pixel filter producing 8 output bytes per row, one row per
; loop iteration.
;  - Each row reads 8 bytes at [src_ptr - 2] and 8 bytes at [src_ptr + 3].
;  - vp8_filter_index * 16 is the byte offset of the coefficient entry inside
;    k0_k5; the matching k1_k3 / k2_k4 entries are read 128 and 256 bytes
;    further on.
;  - When the first dword of the k0_k5 entry is zero the routine branches to
;    a 4-tap path that skips the k0/k5 multiply.
;  - Both row loops are do-while loops: output_height == 0 is not handled.
; Register roles (6-tap path): xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3,
; xmm7 = rd, rsi = source row, rdi = output row, rax = source stride,
; rdx = output pitch, rcx = rows remaining.
; NOTE(review): rd and the shuf*from1 tables are defined outside this view;
; rd is presumably the rounding term added before the >> 7 -- confirm.
michael@0 37 global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
michael@0 38 sym(vp8_filter_block1d8_h6_ssse3):
michael@0 39 push rbp
michael@0 40 mov rbp, rsp
michael@0 41 SHADOW_ARGS_TO_STACK 6
michael@0 42 SAVE_XMM 7
michael@0 43 GET_GOT rbx
michael@0 44 push rsi
michael@0 45 push rdi
michael@0 46 ; end prolog
michael@0 47
michael@0 48 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 49 xor rsi, rsi ; esi = 0, compared against the first coefficient dword below
michael@0 50 shl rdx, 4 ; 16 bytes per table entry
michael@0 51
michael@0 52 movdqa xmm7, [GLOBAL(rd)] ; kept in xmm7 for both the 6-tap and 4-tap loops
michael@0 53
michael@0 54 lea rax, [GLOBAL(k0_k5)]
michael@0 55 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 56 mov rdi, arg(2) ;output_ptr
michael@0 57
michael@0 58 cmp esi, DWORD PTR [rax] ; first four k0_k5 bytes all zero?
; The 4-tap entry is a local label, matching the other routines in this file.
michael@0 59 je .vp8_filter_block1d8_h4_ssse3
michael@0 60
michael@0 61 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
michael@0 62 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
michael@0 63 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
michael@0 64
michael@0 65 mov rsi, arg(0) ;src_ptr
michael@0 66 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
michael@0 67 movsxd rcx, dword ptr arg(4) ;output_height
michael@0 68
michael@0 69 movsxd rdx, dword ptr arg(3) ;output_pitch
michael@0 70
michael@0 71 sub rdi, rdx ; pre-decrement: the loop advances rdi before it stores
michael@0 72 ;xmm3 free
michael@0 73 .filter_block1d8_h6_rowloop_ssse3:
michael@0 74 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
michael@0 75
michael@0 76 movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
michael@0 77
michael@0 78 punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
michael@0 79
michael@0 80 movdqa xmm1, xmm0
michael@0 81 pmaddubsw xmm0, xmm4 ; byte pairs (x-2, x+3) times the k0_k5 entry, summed to words
michael@0 82
michael@0 83 movdqa xmm2, xmm1
michael@0 84 pshufb xmm1, [GLOBAL(shuf2bfrom1)] ; re-pair the interleaved bytes for the k2_k4 multiply
michael@0 85
michael@0 86 pshufb xmm2, [GLOBAL(shuf3bfrom1)] ; re-pair the interleaved bytes for the k1_k3 multiply
michael@0 87 pmaddubsw xmm1, xmm5
michael@0 88
michael@0 89 lea rdi, [rdi + rdx] ; advance to this row's output (lea leaves flags alone)
michael@0 90 pmaddubsw xmm2, xmm6
michael@0 91
michael@0 92 lea rsi, [rsi + rax]
michael@0 93 dec rcx ; ZF is consumed by the jnz below; the SSE ops in between do not write EFLAGS
michael@0 94
michael@0 95 paddsw xmm0, xmm1
michael@0 96 paddsw xmm2, xmm7
michael@0 97
michael@0 98 paddsw xmm0, xmm2
michael@0 99
michael@0 100 psraw xmm0, 7
michael@0 101
michael@0 102 packuswb xmm0, xmm0 ; saturate the 8 words to unsigned bytes
michael@0 103
michael@0 104 movq MMWORD Ptr [rdi], xmm0 ; store 8 output bytes
michael@0 105 jnz .filter_block1d8_h6_rowloop_ssse3
michael@0 106
michael@0 107 ; begin epilog
michael@0 108 pop rdi
michael@0 109 pop rsi
michael@0 110 RESTORE_GOT
michael@0 111 RESTORE_XMM
michael@0 112 UNSHADOW_ARGS
michael@0 113 pop rbp
michael@0 114 ret
michael@0 115
; 4-tap path: entered with rax = coefficient entry, rdi = output_ptr and
; xmm7 = rd already set up above. xmm3 / xmm4 cache the two shuffle masks.
michael@0 116 .vp8_filter_block1d8_h4_ssse3:
michael@0 117 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
michael@0 118 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
michael@0 119
michael@0 120 movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
michael@0 121 movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
michael@0 122
michael@0 123 mov rsi, arg(0) ;src_ptr
michael@0 124
michael@0 125 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
michael@0 126 movsxd rcx, dword ptr arg(4) ;output_height
michael@0 127
michael@0 128 movsxd rdx, dword ptr arg(3) ;output_pitch
michael@0 129
michael@0 130 sub rdi, rdx ; pre-decrement: the loop advances rdi before it stores
michael@0 131
michael@0 132 .filter_block1d8_h4_rowloop_ssse3:
michael@0 133 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
michael@0 134
michael@0 135 movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
michael@0 136
michael@0 137 punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
michael@0 138
michael@0 139 movdqa xmm2, xmm0
michael@0 140 pshufb xmm0, xmm3 ; shuf2bfrom1
michael@0 141
michael@0 142 pshufb xmm2, xmm4 ; shuf3bfrom1
michael@0 143 pmaddubsw xmm0, xmm5
michael@0 144
michael@0 145 lea rdi, [rdi + rdx]
michael@0 146 pmaddubsw xmm2, xmm6
michael@0 147
michael@0 148 lea rsi, [rsi + rax]
michael@0 149 dec rcx ; ZF survives to the jnz below (no flag-writing instruction in between)
michael@0 150
michael@0 151 paddsw xmm0, xmm7
michael@0 152
michael@0 153 paddsw xmm0, xmm2
michael@0 154
michael@0 155 psraw xmm0, 7
michael@0 156
michael@0 157 packuswb xmm0, xmm0
michael@0 158
michael@0 159 movq MMWORD Ptr [rdi], xmm0
michael@0 160
michael@0 161 jnz .filter_block1d8_h4_rowloop_ssse3
michael@0 162
michael@0 163 ; begin epilog
michael@0 164 pop rdi
michael@0 165 pop rsi
michael@0 166 RESTORE_GOT
michael@0 167 RESTORE_XMM
michael@0 168 UNSHADOW_ARGS
michael@0 169 pop rbp
michael@0 170 ret
michael@0 171 ;void vp8_filter_block1d16_h6_ssse3
michael@0 172 ;(
michael@0 173 ; unsigned char *src_ptr,
michael@0 174 ; unsigned int src_pixels_per_line,
michael@0 175 ; unsigned char *output_ptr,
michael@0 176 ; unsigned int output_pitch,
michael@0 177 ; unsigned int output_height,
michael@0 178 ; unsigned int vp8_filter_index
michael@0 179 ;)
;
; Horizontal 6-tap filter producing 16 output bytes per row, computed as two
; 8-pixel halves and stored with a single movdqa.
;  - Each row reads 8-byte groups at [src_ptr - 2], [src_ptr + 3],
;    [src_ptr + 6] and [src_ptr + 11].
;  - Unlike the 8- and 4-wide horizontal routines there is no 4-tap shortcut;
;    all three coefficient entries are always applied.
;  - The movdqa store requires every output row address to be 16-byte aligned.
;  - xmm7 is used as scratch, so rd is added from memory twice per row.
;  - The row loop is a do-while loop: output_height == 0 is not handled.
; Register roles: xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3, rsi = source row,
; rdi = output row, rax = source stride, rdx = output pitch, rcx = rows left.
; NOTE(review): rd and the shuf*from1 tables are defined outside this view.
michael@0 180 global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
michael@0 181 sym(vp8_filter_block1d16_h6_ssse3):
michael@0 182 push rbp
michael@0 183 mov rbp, rsp
michael@0 184 SHADOW_ARGS_TO_STACK 6
michael@0 185 SAVE_XMM 7
michael@0 186 GET_GOT rbx
michael@0 187 push rsi
michael@0 188 push rdi
michael@0 189 ; end prolog
michael@0 190
michael@0 191 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 193 shl rdx, 4 ; 16 bytes per table entry
michael@0 194
michael@0 195 lea rax, [GLOBAL(k0_k5)]
michael@0 196 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 197
michael@0 198 mov rdi, arg(2) ;output_ptr
michael@0 199
michael@0 200 mov rsi, arg(0) ;src_ptr
michael@0 201
michael@0 202 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
michael@0 203 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
michael@0 204 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
michael@0 205
michael@0 206 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
michael@0 207 movsxd rcx, dword ptr arg(4) ;output_height
michael@0 208 movsxd rdx, dword ptr arg(3) ;output_pitch
michael@0 209
michael@0 210 .filter_block1d16_h6_rowloop_ssse3:
; ---- left half: outputs 0..7 accumulate in xmm0 ----
michael@0 211 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
michael@0 212
michael@0 213 movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
michael@0 214
michael@0 215 punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
michael@0 216
michael@0 217 movdqa xmm1, xmm0
michael@0 218 pmaddubsw xmm0, xmm4 ; byte pairs (x-2, x+3) times the k0_k5 entry
michael@0 219
michael@0 220 movdqa xmm2, xmm1
michael@0 221 pshufb xmm1, [GLOBAL(shuf2bfrom1)] ; re-pair bytes for the k2_k4 multiply
michael@0 222
michael@0 223 pshufb xmm2, [GLOBAL(shuf3bfrom1)] ; re-pair bytes for the k1_k3 multiply
; ---- right half (outputs 8..15, in xmm3) is interleaved from here on ----
michael@0 224 movq xmm3, MMWORD PTR [rsi + 6] ; 6 7 8 9 10 11 12 13
michael@0 225
michael@0 226 pmaddubsw xmm1, xmm5
michael@0 227 movq xmm7, MMWORD PTR [rsi + 11] ; 11 12 13 14 15 16 17 18
michael@0 228
michael@0 229 pmaddubsw xmm2, xmm6
michael@0 230 punpcklbw xmm3, xmm7 ; 6 11 7 12 8 13 9 14 10 15 11 16 12 17 13 18
michael@0 231
michael@0 232 paddsw xmm0, xmm1
michael@0 233 movdqa xmm1, xmm3
michael@0 234
michael@0 235 pmaddubsw xmm3, xmm4
michael@0 236 paddsw xmm0, xmm2
michael@0 237
michael@0 238 movdqa xmm2, xmm1
michael@0 239 paddsw xmm0, [GLOBAL(rd)]
michael@0 240
michael@0 241 pshufb xmm1, [GLOBAL(shuf2bfrom1)]
michael@0 242 pshufb xmm2, [GLOBAL(shuf3bfrom1)]
michael@0 243
michael@0 244 psraw xmm0, 7
michael@0 245 pmaddubsw xmm1, xmm5
michael@0 246
michael@0 247 pmaddubsw xmm2, xmm6
michael@0 248 packuswb xmm0, xmm0 ; left half as 8 unsigned bytes in the low qword
michael@0 249
michael@0 250 lea rsi, [rsi + rax]
michael@0 251 paddsw xmm3, xmm1
michael@0 252
michael@0 253 paddsw xmm3, xmm2
michael@0 254
michael@0 255 paddsw xmm3, [GLOBAL(rd)]
michael@0 256
michael@0 257 psraw xmm3, 7
michael@0 258
michael@0 259 packuswb xmm3, xmm3 ; right half as 8 unsigned bytes in the low qword
michael@0 260
michael@0 261 punpcklqdq xmm0, xmm3 ; xmm0 = left half : right half
michael@0 262
michael@0 263 movdqa XMMWORD Ptr [rdi], xmm0 ; aligned 16-byte store
michael@0 264
michael@0 265 lea rdi, [rdi + rdx]
michael@0 266 dec rcx
michael@0 267 jnz .filter_block1d16_h6_rowloop_ssse3
michael@0 268
michael@0 269 ; begin epilog
michael@0 270 pop rdi
michael@0 271 pop rsi
michael@0 272 RESTORE_GOT
michael@0 273 RESTORE_XMM
michael@0 274 UNSHADOW_ARGS
michael@0 275 pop rbp
michael@0 276 ret
michael@0 277
michael@0 278 ;void vp8_filter_block1d4_h6_ssse3
michael@0 279 ;(
michael@0 280 ; unsigned char *src_ptr,
michael@0 281 ; unsigned int src_pixels_per_line,
michael@0 282 ; unsigned char *output_ptr,
michael@0 283 ; unsigned int output_pitch,
michael@0 284 ; unsigned int output_height,
michael@0 285 ; unsigned int vp8_filter_index
michael@0 286 ;)
;
; Horizontal sub-pixel filter storing 4 output bytes per row (movd), one row
; per loop iteration.
;  - Each row is fetched with one unaligned 16-byte load from [src_ptr - 2],
;    so 16 source bytes must be readable there even though 4 are produced.
;  - vp8_filter_index * 16 is the byte offset of the coefficient entry inside
;    k0_k5; the k1_k3 / k2_k4 entries are read 128 and 256 bytes further on.
;  - When the first dword of the k0_k5 entry is zero the routine takes the
;    4-tap path, which skips the k0/k5 multiply.
;  - Both row loops are do-while loops: output_height == 0 is not handled.
; Register roles (6-tap path): xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3,
; xmm7 = rd, rsi = source row, rdi = output row, rax = source stride,
; rdx = output pitch, rcx = rows remaining.
; NOTE(review): rd and the shuf1b/shuf2b/shuf3b tables are defined outside
; this view; each shuffle presumably builds the byte pairs expected by the
; coefficient entry it is multiplied with -- confirm against the tables.
michael@0 287 global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
michael@0 288 sym(vp8_filter_block1d4_h6_ssse3):
michael@0 289 push rbp
michael@0 290 mov rbp, rsp
michael@0 291 SHADOW_ARGS_TO_STACK 6
michael@0 292 SAVE_XMM 7
michael@0 293 GET_GOT rbx
michael@0 294 push rsi
michael@0 295 push rdi
michael@0 296 ; end prolog
michael@0 297
michael@0 298 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 299 xor rsi, rsi ; esi = 0, compared against the first coefficient dword below
michael@0 300 shl rdx, 4 ; 16 bytes per table entry
michael@0 301
michael@0 302 lea rax, [GLOBAL(k0_k5)]
michael@0 303 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 304 movdqa xmm7, [GLOBAL(rd)] ; kept in xmm7 for both the 6-tap and 4-tap loops
michael@0 305
michael@0 306 cmp esi, DWORD PTR [rax] ; first four k0_k5 bytes all zero?
michael@0 307 je .vp8_filter_block1d4_h4_ssse3
michael@0 308
michael@0 309 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
michael@0 310 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
michael@0 311 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
michael@0 312
michael@0 313 mov rsi, arg(0) ;src_ptr
michael@0 314 mov rdi, arg(2) ;output_ptr
michael@0 315 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
michael@0 316 movsxd rcx, dword ptr arg(4) ;output_height
michael@0 317
michael@0 318 movsxd rdx, dword ptr arg(3) ;output_pitch
michael@0 319
michael@0 320 ;xmm3 free
michael@0 321 .filter_block1d4_h6_rowloop_ssse3:
michael@0 322 movdqu xmm0, XMMWORD PTR [rsi - 2] ; 16 source bytes starting at x-2
michael@0 323
michael@0 324 movdqa xmm1, xmm0
michael@0 325 pshufb xmm0, [GLOBAL(shuf1b)] ; operand for the k0_k5 multiply
michael@0 326
michael@0 327 movdqa xmm2, xmm1
michael@0 328 pshufb xmm1, [GLOBAL(shuf2b)] ; operand for the k2_k4 multiply
michael@0 329 pmaddubsw xmm0, xmm4
michael@0 330 pshufb xmm2, [GLOBAL(shuf3b)] ; operand for the k1_k3 multiply
michael@0 331 pmaddubsw xmm1, xmm5
michael@0 332
michael@0 333 ;--
michael@0 334 pmaddubsw xmm2, xmm6
michael@0 335
michael@0 336 lea rsi, [rsi + rax]
michael@0 337 ;--
michael@0 338 paddsw xmm0, xmm1
michael@0 339 paddsw xmm0, xmm7
; xmm1 is not cleared here: it is reloaded from xmm0 at the top of the next
; iteration and is never read after the add above.
michael@0 341 paddsw xmm0, xmm2
michael@0 342 psraw xmm0, 7
michael@0 343 packuswb xmm0, xmm0
michael@0 344
michael@0 345 movd DWORD PTR [rdi], xmm0 ; store the low 4 result bytes
michael@0 346
michael@0 347 add rdi, rdx
michael@0 348 dec rcx
michael@0 349 jnz .filter_block1d4_h6_rowloop_ssse3
michael@0 350
michael@0 351 ; begin epilog
michael@0 352 pop rdi
michael@0 353 pop rsi
michael@0 354 RESTORE_GOT
michael@0 355 RESTORE_XMM
michael@0 356 UNSHADOW_ARGS
michael@0 357 pop rbp
michael@0 358 ret
michael@0 359
; 4-tap path: entered with rax = coefficient entry and xmm7 = rd already set
; up above. xmm0 / xmm3 cache the two shuffle masks for the whole loop.
michael@0 360 .vp8_filter_block1d4_h4_ssse3:
michael@0 361 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
michael@0 362 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
michael@0 363 movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
michael@0 364 movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
michael@0 365
michael@0 366 mov rsi, arg(0) ;src_ptr
michael@0 367 mov rdi, arg(2) ;output_ptr
michael@0 368 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
michael@0 369 movsxd rcx, dword ptr arg(4) ;output_height
michael@0 370
michael@0 371 movsxd rdx, dword ptr arg(3) ;output_pitch
michael@0 372
michael@0 373 .filter_block1d4_h4_rowloop_ssse3:
michael@0 374 movdqu xmm1, XMMWORD PTR [rsi - 2] ; 16 source bytes starting at x-2
michael@0 375
michael@0 376 movdqa xmm2, xmm1
michael@0 377 pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
michael@0 378 pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
michael@0 379 pmaddubsw xmm1, xmm5
michael@0 380
michael@0 381 ;--
michael@0 382 pmaddubsw xmm2, xmm6
michael@0 383
michael@0 384 lea rsi, [rsi + rax]
michael@0 385 ;--
michael@0 386 paddsw xmm1, xmm7
michael@0 387 paddsw xmm1, xmm2
michael@0 388 psraw xmm1, 7
michael@0 389 packuswb xmm1, xmm1
michael@0 390
michael@0 391 movd DWORD PTR [rdi], xmm1 ; store the low 4 result bytes
michael@0 392
michael@0 393 add rdi, rdx
michael@0 394 dec rcx
michael@0 395 jnz .filter_block1d4_h4_rowloop_ssse3
michael@0 396
michael@0 397 ; begin epilog
michael@0 398 pop rdi
michael@0 399 pop rsi
michael@0 400 RESTORE_GOT
michael@0 401 RESTORE_XMM
michael@0 402 UNSHADOW_ARGS
michael@0 403 pop rbp
michael@0 404 ret
michael@0 405
michael@0 406
michael@0 407
michael@0 408 ;void vp8_filter_block1d16_v6_ssse3
michael@0 409 ;(
michael@0 410 ; unsigned char *src_ptr,
michael@0 411 ; unsigned int src_pitch,
michael@0 412 ; unsigned char *output_ptr,
michael@0 413 ; unsigned int out_pitch,
michael@0 414 ; unsigned int output_height,
michael@0 415 ; unsigned int vp8_filter_index
michael@0 416 ;)
;
; Vertical sub-pixel filter producing 16 output bytes per row, one row per
; loop iteration, processed as two 8-byte column groups.
;  - Row naming used in the comments: A = [src_ptr], and B..F are the next
;    five rows at src_pitch spacing. rax = rsi + pitch, so D and F (rows 3
;    and 5) are addressed as [rax + rdx*2] and [rax + rdx*4].
;  - Rows are interleaved bytewise as (B,D), (C,E), (A,F) and multiplied with
;    the k1_k3, k2_k4 and k0_k5 entries respectively.
;  - When the first dword of the k0_k5 entry is zero the routine takes the
;    4-tap path, which uses rows B..E only.
;  - The 6-tap path stores each row with two movq; the 4-tap path stores it
;    with one movdqa, which requires a 16-byte-aligned output row address.
;  - On 64-bit builds out_pitch is cached in r8; on 32-bit builds it is added
;    from arg(3) on every row.
;  - Both row loops are do-while loops: output_height == 0 is not handled.
; NOTE(review): rd is defined outside this view; it is presumably the
; rounding term added before the >> 7 -- confirm.
michael@0 417 global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
michael@0 418 sym(vp8_filter_block1d16_v6_ssse3):
michael@0 419 push rbp
michael@0 420 mov rbp, rsp
michael@0 421 SHADOW_ARGS_TO_STACK 6
michael@0 422 SAVE_XMM 7
michael@0 423 GET_GOT rbx
michael@0 424 push rsi
michael@0 425 push rdi
michael@0 426 ; end prolog
michael@0 427
michael@0 428 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 429 xor rsi, rsi ; esi = 0, compared against the first coefficient dword below
michael@0 430 shl rdx, 4 ; 16 bytes per table entry
michael@0 431
michael@0 432 lea rax, [GLOBAL(k0_k5)]
michael@0 433 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 434
michael@0 435 cmp esi, DWORD PTR [rax] ; first four k0_k5 bytes all zero?
michael@0 436 je .vp8_filter_block1d16_v4_ssse3
michael@0 437
michael@0 438 movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
michael@0 439 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
michael@0 440 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
michael@0 441
michael@0 442 mov rsi, arg(0) ;src_ptr
michael@0 443 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
michael@0 444 mov rdi, arg(2) ;output_ptr
michael@0 445
michael@0 446 %if ABI_IS_32BIT=0
michael@0 447 movsxd r8, DWORD PTR arg(3) ;out_pitch
michael@0 448 %endif
michael@0 449 mov rax, rsi
michael@0 450 movsxd rcx, DWORD PTR arg(4) ;output_height
michael@0 451 add rax, rdx ; rax = rsi + pitch, used to reach rows D and F
michael@0 452
michael@0 453
michael@0 454 .vp8_filter_block1d16_v6_ssse3_loop:
; ---- columns 0..7 ----
michael@0 455 movq xmm1, MMWORD PTR [rsi] ;A
michael@0 456 movq xmm2, MMWORD PTR [rsi + rdx] ;B
michael@0 457 movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
michael@0 458 movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
michael@0 459 movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
michael@0 460
michael@0 461 punpcklbw xmm2, xmm4 ;B D
michael@0 462 punpcklbw xmm3, xmm0 ;C E
michael@0 463
michael@0 464 movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
michael@0 465
michael@0 466 pmaddubsw xmm3, xmm6 ; (C,E) * k2_k4
michael@0 467 punpcklbw xmm1, xmm0 ;A F
michael@0 468 pmaddubsw xmm2, xmm7 ; (B,D) * k1_k3
michael@0 469 pmaddubsw xmm1, xmm5 ; (A,F) * k0_k5
michael@0 470
michael@0 471 paddsw xmm2, xmm3
michael@0 472 paddsw xmm2, xmm1
michael@0 473 paddsw xmm2, [GLOBAL(rd)]
michael@0 474 psraw xmm2, 7
michael@0 475 packuswb xmm2, xmm2 ; saturate the 8 words to unsigned bytes
michael@0 476
michael@0 477 movq MMWORD PTR [rdi], xmm2 ;store the results
michael@0 478
; ---- columns 8..15 ----
michael@0 479 movq xmm1, MMWORD PTR [rsi + 8] ;A
michael@0 480 movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
michael@0 481 movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
michael@0 482 movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
michael@0 483 movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
michael@0 484
michael@0 485 punpcklbw xmm2, xmm4 ;B D
michael@0 486 punpcklbw xmm3, xmm0 ;C E
michael@0 487
michael@0 488 movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
michael@0 489 pmaddubsw xmm3, xmm6
michael@0 490 punpcklbw xmm1, xmm0 ;A F
michael@0 491 pmaddubsw xmm2, xmm7
michael@0 492 pmaddubsw xmm1, xmm5
michael@0 493
michael@0 494 add rsi, rdx ; step both row pointers down one source row
michael@0 495 add rax, rdx
michael@0 496 ;--
michael@0 497 ;--
michael@0 498 paddsw xmm2, xmm3
michael@0 499 paddsw xmm2, xmm1
michael@0 500 paddsw xmm2, [GLOBAL(rd)]
michael@0 501 psraw xmm2, 7
michael@0 502 packuswb xmm2, xmm2
michael@0 503
michael@0 504 movq MMWORD PTR [rdi+8], xmm2 ; second 8 output bytes of the row
michael@0 505
michael@0 506 %if ABI_IS_32BIT
michael@0 507 add rdi, DWORD PTR arg(3) ;out_pitch
michael@0 508 %else
michael@0 509 add rdi, r8
michael@0 510 %endif
michael@0 511 dec rcx
michael@0 512 jnz .vp8_filter_block1d16_v6_ssse3_loop
michael@0 513
michael@0 514 ; begin epilog
michael@0 515 pop rdi
michael@0 516 pop rsi
michael@0 517 RESTORE_GOT
michael@0 518 RESTORE_XMM
michael@0 519 UNSHADOW_ARGS
michael@0 520 pop rbp
michael@0 521 ret
michael@0 522
; 4-tap path: entered with rax = coefficient entry. Row A and row F are not
; read; the k0_k5 entry is not loaded.
michael@0 523 .vp8_filter_block1d16_v4_ssse3:
michael@0 524 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
michael@0 525 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
michael@0 526
michael@0 527 mov rsi, arg(0) ;src_ptr
michael@0 528 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
michael@0 529 mov rdi, arg(2) ;output_ptr
michael@0 530
michael@0 531 %if ABI_IS_32BIT=0
michael@0 532 movsxd r8, DWORD PTR arg(3) ;out_pitch
michael@0 533 %endif
michael@0 534 mov rax, rsi
michael@0 535 movsxd rcx, DWORD PTR arg(4) ;output_height
michael@0 536 add rax, rdx ; rax = rsi + pitch, used to reach row D
michael@0 537
michael@0 538 .vp8_filter_block1d16_v4_ssse3_loop:
; columns 0..7 accumulate in xmm2, columns 8..15 in xmm5
michael@0 539 movq xmm2, MMWORD PTR [rsi + rdx] ;B
michael@0 540 movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
michael@0 541 movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
michael@0 542 movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
michael@0 543
michael@0 544 punpcklbw xmm2, xmm4 ;B D
michael@0 545 punpcklbw xmm3, xmm0 ;C E
michael@0 546
michael@0 547 pmaddubsw xmm3, xmm6 ; (C,E) * k2_k4
michael@0 548 pmaddubsw xmm2, xmm7 ; (B,D) * k1_k3
michael@0 549 movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
michael@0 550 movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
michael@0 551 movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
michael@0 552 movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
michael@0 553
michael@0 554 paddsw xmm2, [GLOBAL(rd)]
michael@0 555 paddsw xmm2, xmm3
michael@0 556 psraw xmm2, 7
michael@0 557 packuswb xmm2, xmm2
michael@0 558
michael@0 559 punpcklbw xmm5, xmm4 ;B D
michael@0 560 punpcklbw xmm1, xmm0 ;C E
michael@0 561
michael@0 562 pmaddubsw xmm1, xmm6
michael@0 563 pmaddubsw xmm5, xmm7
michael@0 564
michael@0 565 movdqa xmm4, [GLOBAL(rd)]
michael@0 566 add rsi, rdx ; step both row pointers down one source row
michael@0 567 add rax, rdx
michael@0 568 ;--
michael@0 569 ;--
michael@0 570 paddsw xmm5, xmm1
michael@0 571 paddsw xmm5, xmm4
michael@0 572 psraw xmm5, 7
michael@0 573 packuswb xmm5, xmm5
michael@0 574
michael@0 575 punpcklqdq xmm2, xmm5 ; xmm2 = columns 0..7 : columns 8..15
michael@0 576
michael@0 577 movdqa XMMWORD PTR [rdi], xmm2 ; aligned 16-byte store (rdi must be 16-byte aligned)
michael@0 578
michael@0 579 %if ABI_IS_32BIT
michael@0 580 add rdi, DWORD PTR arg(3) ;out_pitch
michael@0 581 %else
michael@0 582 add rdi, r8
michael@0 583 %endif
michael@0 584 dec rcx
michael@0 585 jnz .vp8_filter_block1d16_v4_ssse3_loop
michael@0 586
michael@0 587 ; begin epilog
michael@0 588 pop rdi
michael@0 589 pop rsi
michael@0 590 RESTORE_GOT
michael@0 591 RESTORE_XMM
michael@0 592 UNSHADOW_ARGS
michael@0 593 pop rbp
michael@0 594 ret
michael@0 595
michael@0 596 ;void vp8_filter_block1d8_v6_ssse3
michael@0 597 ;(
michael@0 598 ; unsigned char *src_ptr,
michael@0 599 ; unsigned int src_pitch,
michael@0 600 ; unsigned char *output_ptr,
michael@0 601 ; unsigned int out_pitch,
michael@0 602 ; unsigned int output_height,
michael@0 603 ; unsigned int vp8_filter_index
michael@0 604 ;)
;
; Vertical sub-pixel filter producing 8 output bytes per row (movq store),
; one row per loop iteration.
;  - Row naming used in the comments: A = [src_ptr], and B..F are the next
;    five rows at src_pitch spacing. rax = rsi + pitch, so D and F (rows 3
;    and 5) are addressed as [rax + rdx*2] and [rax + rdx*4].
;  - Source stride, output pointer, out_pitch (64-bit builds) and the row
;    count are loaded before the 4-tap test, so both paths share them.
;  - When the first dword of the k0_k5 entry is zero the routine takes the
;    4-tap path, which uses rows B..E only.
;  - The 6-tap loop reloads rd into xmm4 every row (xmm4 doubles as the row-D
;    temporary); the 4-tap loop keeps rd in xmm5.
;  - Both row loops are do-while loops: output_height == 0 is not handled.
; NOTE(review): rd is defined outside this view; it is presumably the
; rounding term added before the >> 7 -- confirm.
michael@0 605 global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
michael@0 606 sym(vp8_filter_block1d8_v6_ssse3):
michael@0 607 push rbp
michael@0 608 mov rbp, rsp
michael@0 609 SHADOW_ARGS_TO_STACK 6
michael@0 610 SAVE_XMM 7
michael@0 611 GET_GOT rbx
michael@0 612 push rsi
michael@0 613 push rdi
michael@0 614 ; end prolog
michael@0 615
michael@0 616 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 617 xor rsi, rsi ; esi = 0, compared against the first coefficient dword below
michael@0 618 shl rdx, 4 ; 16 bytes per table entry
michael@0 619
michael@0 620 lea rax, [GLOBAL(k0_k5)]
michael@0 621 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 622
michael@0 623 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
michael@0 624 mov rdi, arg(2) ;output_ptr
michael@0 625 %if ABI_IS_32BIT=0
michael@0 626 movsxd r8, DWORD PTR arg(3) ; out_pitch
michael@0 627 %endif
michael@0 628 movsxd rcx, DWORD PTR arg(4) ;[output_height]
michael@0 629
michael@0 630 cmp esi, DWORD PTR [rax] ; first four k0_k5 bytes all zero?
michael@0 631 je .vp8_filter_block1d8_v4_ssse3
michael@0 632
michael@0 633 movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
michael@0 634 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
michael@0 635 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
michael@0 636
michael@0 637 mov rsi, arg(0) ;src_ptr
michael@0 638
michael@0 639 mov rax, rsi
michael@0 640 add rax, rdx ; rax = rsi + pitch, used to reach rows D and F
michael@0 641
michael@0 642 .vp8_filter_block1d8_v6_ssse3_loop:
michael@0 643 movq xmm1, MMWORD PTR [rsi] ;A
michael@0 644 movq xmm2, MMWORD PTR [rsi + rdx] ;B
michael@0 645 movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
michael@0 646 movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
michael@0 647 movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
michael@0 648
michael@0 649 punpcklbw xmm2, xmm4 ;B D
michael@0 650 punpcklbw xmm3, xmm0 ;C E
michael@0 651
michael@0 652 movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
michael@0 653 movdqa xmm4, [GLOBAL(rd)] ; xmm4 is free again once row D has been interleaved
michael@0 654
michael@0 655 pmaddubsw xmm3, xmm6 ; (C,E) * k2_k4
michael@0 656 punpcklbw xmm1, xmm0 ;A F
michael@0 657 pmaddubsw xmm2, xmm7 ; (B,D) * k1_k3
michael@0 658 pmaddubsw xmm1, xmm5 ; (A,F) * k0_k5
michael@0 659 add rsi, rdx ; step both row pointers down one source row
michael@0 660 add rax, rdx
michael@0 661 ;--
michael@0 662 ;--
michael@0 663 paddsw xmm2, xmm3
michael@0 664 paddsw xmm2, xmm1
michael@0 665 paddsw xmm2, xmm4
michael@0 666 psraw xmm2, 7
michael@0 667 packuswb xmm2, xmm2 ; saturate the 8 words to unsigned bytes
michael@0 668
michael@0 669 movq MMWORD PTR [rdi], xmm2 ; store 8 output bytes
michael@0 670
michael@0 671 %if ABI_IS_32BIT
michael@0 672 add rdi, DWORD PTR arg(3) ;[out_pitch]
michael@0 673 %else
michael@0 674 add rdi, r8
michael@0 675 %endif
michael@0 676 dec rcx
michael@0 677 jnz .vp8_filter_block1d8_v6_ssse3_loop
michael@0 678
michael@0 679 ; begin epilog
michael@0 680 pop rdi
michael@0 681 pop rsi
michael@0 682 RESTORE_GOT
michael@0 683 RESTORE_XMM
michael@0 684 UNSHADOW_ARGS
michael@0 685 pop rbp
michael@0 686 ret
michael@0 687
; 4-tap path: entered with rax = coefficient entry, rdx = source stride,
; rdi = output_ptr, rcx = row count (and r8 = out_pitch on 64-bit builds)
; already loaded above.
michael@0 688 .vp8_filter_block1d8_v4_ssse3:
michael@0 689 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
michael@0 690 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
michael@0 691 movdqa xmm5, [GLOBAL(rd)] ; kept in xmm5 for the whole loop
michael@0 692
michael@0 693 mov rsi, arg(0) ;src_ptr
michael@0 694
michael@0 695 mov rax, rsi
michael@0 696 add rax, rdx ; rax = rsi + pitch, used to reach row D
michael@0 697
michael@0 698 .vp8_filter_block1d8_v4_ssse3_loop:
michael@0 699 movq xmm2, MMWORD PTR [rsi + rdx] ;B
michael@0 700 movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
michael@0 701 movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
michael@0 702 movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
michael@0 703
michael@0 704 punpcklbw xmm2, xmm4 ;B D
michael@0 705 punpcklbw xmm3, xmm0 ;C E
michael@0 706
michael@0 707 pmaddubsw xmm3, xmm6 ; (C,E) * k2_k4
michael@0 708 pmaddubsw xmm2, xmm7 ; (B,D) * k1_k3
michael@0 709 add rsi, rdx ; step both row pointers down one source row
michael@0 710 add rax, rdx
michael@0 711 ;--
michael@0 712 ;--
michael@0 713 paddsw xmm2, xmm3
michael@0 714 paddsw xmm2, xmm5
michael@0 715 psraw xmm2, 7
michael@0 716 packuswb xmm2, xmm2
michael@0 717
michael@0 718 movq MMWORD PTR [rdi], xmm2 ; store 8 output bytes
michael@0 719
michael@0 720 %if ABI_IS_32BIT
michael@0 721 add rdi, DWORD PTR arg(3) ;[out_pitch]
michael@0 722 %else
michael@0 723 add rdi, r8
michael@0 724 %endif
michael@0 725 dec rcx
michael@0 726 jnz .vp8_filter_block1d8_v4_ssse3_loop
michael@0 727
michael@0 728 ; begin epilog
michael@0 729 pop rdi
michael@0 730 pop rsi
michael@0 731 RESTORE_GOT
michael@0 732 RESTORE_XMM
michael@0 733 UNSHADOW_ARGS
michael@0 734 pop rbp
michael@0 735 ret
michael@0 736 ;void vp8_filter_block1d4_v6_ssse3
michael@0 737 ;(
michael@0 738 ; unsigned char *src_ptr,
michael@0 739 ; unsigned int src_pitch,
michael@0 740 ; unsigned char *output_ptr,
michael@0 741 ; unsigned int out_pitch,
michael@0 742 ; unsigned int output_height,
michael@0 743 ; unsigned int vp8_filter_index
michael@0 744 ;)
;
; Vertical sub-pixel filter producing 4 output bytes per row (movd store),
; one row per loop iteration. This variant works in the 64-bit MMX registers
; (mm0-mm7) rather than xmm registers, so the prolog/epilog have no
; SAVE_XMM / RESTORE_XMM pair and only the low 8 bytes of each coefficient
; entry and of rd are loaded.
;  - Row naming used in the comments: A = [src_ptr], and B..F are the next
;    five rows at src_pitch spacing. rax = rsi + pitch, so D and F (rows 3
;    and 5) are addressed as [rax + rdx*2] and [rax + rdx*4].
;  - When the first dword of the k0_k5 entry is zero the routine takes the
;    4-tap path, which uses rows B..E only.
;  - Both row loops are do-while loops: output_height == 0 is not handled.
; NOTE(review): no emms is executed on either return path, so the MMX state
; is left live on return; presumably callers clear it elsewhere -- confirm.
; NOTE(review): rd is defined outside this view; it is presumably the
; rounding term added before the >> 7 -- confirm.
michael@0 745 global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
michael@0 746 sym(vp8_filter_block1d4_v6_ssse3):
michael@0 747 push rbp
michael@0 748 mov rbp, rsp
michael@0 749 SHADOW_ARGS_TO_STACK 6
michael@0 750 GET_GOT rbx
michael@0 751 push rsi
michael@0 752 push rdi
michael@0 753 ; end prolog
michael@0 754
michael@0 755 movsxd rdx, DWORD PTR arg(5) ;table index
michael@0 756 xor rsi, rsi ; esi = 0, compared against the first coefficient dword below
michael@0 757 shl rdx, 4 ; 16 bytes per table entry
michael@0 758
michael@0 759 lea rax, [GLOBAL(k0_k5)]
michael@0 760 add rax, rdx ; rax = &k0_k5 entry for this filter index
michael@0 761
michael@0 762 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
michael@0 763 mov rdi, arg(2) ;output_ptr
michael@0 764 %if ABI_IS_32BIT=0
michael@0 765 movsxd r8, DWORD PTR arg(3) ; out_pitch
michael@0 766 %endif
michael@0 767 movsxd rcx, DWORD PTR arg(4) ;[output_height]
michael@0 768
michael@0 769 cmp esi, DWORD PTR [rax] ; first four k0_k5 bytes all zero?
michael@0 770 je .vp8_filter_block1d4_v4_ssse3
michael@0 771
michael@0 772 movq mm5, MMWORD PTR [rax] ;k0_k5
michael@0 773 movq mm6, MMWORD PTR [rax+256] ;k2_k4
michael@0 774 movq mm7, MMWORD PTR [rax+128] ;k1_k3
michael@0 775
michael@0 776 mov rsi, arg(0) ;src_ptr
michael@0 777
michael@0 778 mov rax, rsi
michael@0 779 add rax, rdx ; rax = rsi + pitch, used to reach rows D and F
michael@0 780
michael@0 781 .vp8_filter_block1d4_v6_ssse3_loop:
michael@0 782 movd mm1, DWORD PTR [rsi] ;A
michael@0 783 movd mm2, DWORD PTR [rsi + rdx] ;B
michael@0 784 movd mm3, DWORD PTR [rsi + rdx * 2] ;C
michael@0 785 movd mm4, DWORD PTR [rax + rdx * 2] ;D
michael@0 786 movd mm0, DWORD PTR [rsi + rdx * 4] ;E
michael@0 787
michael@0 788 punpcklbw mm2, mm4 ;B D
michael@0 789 punpcklbw mm3, mm0 ;C E
michael@0 790
michael@0 791 movd mm0, DWORD PTR [rax + rdx * 4] ;F
michael@0 792
michael@0 793 movq mm4, [GLOBAL(rd)] ; reloaded every row: mm4 doubles as the row-D temporary
michael@0 794
michael@0 795 pmaddubsw mm3, mm6 ; (C,E) * k2_k4
michael@0 796 punpcklbw mm1, mm0 ;A F
michael@0 797 pmaddubsw mm2, mm7 ; (B,D) * k1_k3
michael@0 798 pmaddubsw mm1, mm5 ; (A,F) * k0_k5
michael@0 799 add rsi, rdx ; step both row pointers down one source row
michael@0 800 add rax, rdx
michael@0 801 ;--
michael@0 802 ;--
michael@0 803 paddsw mm2, mm3
michael@0 804 paddsw mm2, mm1
michael@0 805 paddsw mm2, mm4
michael@0 806 psraw mm2, 7
michael@0 807 packuswb mm2, mm2 ; saturate the 4 words to unsigned bytes
michael@0 808
michael@0 809 movd DWORD PTR [rdi], mm2 ; store 4 output bytes
michael@0 810
michael@0 811 %if ABI_IS_32BIT
michael@0 812 add rdi, DWORD PTR arg(3) ;[out_pitch]
michael@0 813 %else
michael@0 814 add rdi, r8
michael@0 815 %endif
michael@0 816 dec rcx
michael@0 817 jnz .vp8_filter_block1d4_v6_ssse3_loop
michael@0 818
michael@0 819 ; begin epilog
michael@0 820 pop rdi
michael@0 821 pop rsi
michael@0 822 RESTORE_GOT
michael@0 823 UNSHADOW_ARGS
michael@0 824 pop rbp
michael@0 825 ret
michael@0 826
; 4-tap path: entered with rax = coefficient entry, rdx = source stride,
; rdi = output_ptr, rcx = row count (and r8 = out_pitch on 64-bit builds)
; already loaded above.
michael@0 827 .vp8_filter_block1d4_v4_ssse3:
michael@0 828 movq mm6, MMWORD PTR [rax+256] ;k2_k4
michael@0 829 movq mm7, MMWORD PTR [rax+128] ;k1_k3
michael@0 830 movq mm5, MMWORD PTR [GLOBAL(rd)] ; kept in mm5 for the whole loop
michael@0 831
michael@0 832 mov rsi, arg(0) ;src_ptr
michael@0 833
michael@0 834 mov rax, rsi
michael@0 835 add rax, rdx ; rax = rsi + pitch, used to reach row D
michael@0 836
michael@0 837 .vp8_filter_block1d4_v4_ssse3_loop:
michael@0 838 movd mm2, DWORD PTR [rsi + rdx] ;B
michael@0 839 movd mm3, DWORD PTR [rsi + rdx * 2] ;C
michael@0 840 movd mm4, DWORD PTR [rax + rdx * 2] ;D
michael@0 841 movd mm0, DWORD PTR [rsi + rdx * 4] ;E
michael@0 842
michael@0 843 punpcklbw mm2, mm4 ;B D
michael@0 844 punpcklbw mm3, mm0 ;C E
michael@0 845
michael@0 846 pmaddubsw mm3, mm6 ; (C,E) * k2_k4
michael@0 847 pmaddubsw mm2, mm7 ; (B,D) * k1_k3
michael@0 848 add rsi, rdx ; step both row pointers down one source row
michael@0 849 add rax, rdx
michael@0 850 ;--
michael@0 851 ;--
michael@0 852 paddsw mm2, mm3
michael@0 853 paddsw mm2, mm5
michael@0 854 psraw mm2, 7
michael@0 855 packuswb mm2, mm2
michael@0 856
michael@0 857 movd DWORD PTR [rdi], mm2 ; store 4 output bytes
michael@0 858
michael@0 859 %if ABI_IS_32BIT
michael@0 860 add rdi, DWORD PTR arg(3) ;[out_pitch]
michael@0 861 %else
michael@0 862 add rdi, r8
michael@0 863 %endif
michael@0 864 dec rcx
michael@0 865 jnz .vp8_filter_block1d4_v4_ssse3_loop
michael@0 866
michael@0 867 ; begin epilog
michael@0 868 pop rdi
michael@0 869 pop rsi
michael@0 870 RESTORE_GOT
michael@0 871 UNSHADOW_ARGS
michael@0 872 pop rbp
michael@0 873 ret
michael@0 874
michael@0 875 ;void vp8_bilinear_predict16x16_ssse3
michael@0 876 ;(
michael@0 877 ; unsigned char *src_ptr,
michael@0 878 ; int src_pixels_per_line,
michael@0 879 ; int xoffset,
michael@0 880 ; int yoffset,
michael@0 881 ; unsigned char *dst_ptr,
michael@0 882 ; int dst_pitch
michael@0 883 ;)
michael@0 884 global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
michael@0 885 sym(vp8_bilinear_predict16x16_ssse3):
michael@0 886 push rbp
michael@0 887 mov rbp, rsp
michael@0 888 SHADOW_ARGS_TO_STACK 6
michael@0 889 SAVE_XMM 7
michael@0 890 GET_GOT rbx
michael@0 891 push rsi
michael@0 892 push rdi
michael@0 893 ; end prolog
michael@0 894
michael@0 895 lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
michael@0 896 movsxd rax, dword ptr arg(2) ; xoffset
michael@0 897
michael@0 898 cmp rax, 0 ; skip first_pass filter if xoffset=0
michael@0 899 je .b16x16_sp_only
michael@0 900
michael@0 901 shl rax, 4
michael@0 902 lea rax, [rax + rcx] ; HFilter
michael@0 903
michael@0 904 mov rdi, arg(4) ; dst_ptr
michael@0 905 mov rsi, arg(0) ; src_ptr
michael@0 906 movsxd rdx, dword ptr arg(5) ; dst_pitch
michael@0 907
michael@0 908 movdqa xmm1, [rax]
michael@0 909
michael@0 910 movsxd rax, dword ptr arg(3) ; yoffset
michael@0 911
michael@0 912 cmp rax, 0 ; skip second_pass filter if yoffset=0
michael@0 913 je .b16x16_fp_only
michael@0 914
michael@0 915 shl rax, 4
michael@0 916 lea rax, [rax + rcx] ; VFilter
michael@0 917
michael@0 918 lea rcx, [rdi+rdx*8]
michael@0 919 lea rcx, [rcx+rdx*8]
michael@0 920 movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
michael@0 921
michael@0 922 movdqa xmm2, [rax]
michael@0 923
michael@0 924 %if ABI_IS_32BIT=0
michael@0 925 movsxd r8, dword ptr arg(5) ; dst_pitch
michael@0 926 %endif
michael@0 927 movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
michael@0 928 movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
michael@0 929
michael@0 930 punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
michael@0 931 movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
michael@0 932
michael@0 933 movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
michael@0 934
michael@0 935 lea rsi, [rsi + rdx] ; next line
michael@0 936
michael@0 937 pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
michael@0 938
michael@0 939 punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
michael@0 940 pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
michael@0 941
michael@0 942 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
michael@0 943 psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
michael@0 944
michael@0 945 paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
michael@0 946 psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
michael@0 947
michael@0 948 movdqa xmm7, xmm3
michael@0 949 packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
michael@0 950
michael@0 951 .next_row:
michael@0 952 movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
michael@0 953 movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
michael@0 954
michael@0 955 punpcklbw xmm6, xmm5
michael@0 956 movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
michael@0 957
michael@0 958 movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
michael@0 959 lea rsi, [rsi + rdx] ; next line
michael@0 960
michael@0 961 pmaddubsw xmm6, xmm1
michael@0 962
michael@0 963 punpcklbw xmm4, xmm5
michael@0 964 pmaddubsw xmm4, xmm1
michael@0 965
michael@0 966 paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
michael@0 967 psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
michael@0 968
michael@0 969 paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
michael@0 970 psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
michael@0 971
michael@0 972 packuswb xmm6, xmm4
michael@0 973 movdqa xmm5, xmm7
michael@0 974
michael@0 975 punpcklbw xmm5, xmm6
michael@0 976 pmaddubsw xmm5, xmm2
michael@0 977
michael@0 978 punpckhbw xmm7, xmm6
michael@0 979 pmaddubsw xmm7, xmm2
michael@0 980
michael@0 981 paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
michael@0 982 psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
michael@0 983
michael@0 984 paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
michael@0 985 psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
michael@0 986
michael@0 987 packuswb xmm5, xmm7
michael@0 988 movdqa xmm7, xmm6
michael@0 989
michael@0 990 movdqa [rdi], xmm5 ; store the results in the destination
michael@0 991 %if ABI_IS_32BIT
michael@0 992 add rdi, DWORD PTR arg(5) ; dst_pitch
michael@0 993 %else
michael@0 994 add rdi, r8
michael@0 995 %endif
michael@0 996
michael@0 997 cmp rdi, rcx
michael@0 998 jne .next_row
michael@0 999
michael@0 1000 jmp .done
michael@0 1001
michael@0 1002 .b16x16_sp_only:
michael@0 1003 movsxd rax, dword ptr arg(3) ; yoffset
michael@0 1004 shl rax, 4
michael@0 1005 lea rax, [rax + rcx] ; VFilter
michael@0 1006
michael@0 1007 mov rdi, arg(4) ; dst_ptr
michael@0 1008 mov rsi, arg(0) ; src_ptr
michael@0 1009 movsxd rdx, dword ptr arg(5) ; dst_pitch
michael@0 1010
michael@0 1011 movdqa xmm1, [rax] ; VFilter
michael@0 1012
michael@0 1013 lea rcx, [rdi+rdx*8]
michael@0 1014 lea rcx, [rcx+rdx*8]
michael@0 1015 movsxd rax, dword ptr arg(1) ; src_pixels_per_line
michael@0 1016
michael@0 1017 ; get the first horizontal line done
michael@0 1018 movq xmm4, [rsi] ; load row 0
michael@0 1019 movq xmm2, [rsi + 8] ; load row 0
michael@0 1020
michael@0 1021 lea rsi, [rsi + rax] ; next line
michael@0 1022 .next_row_sp:
michael@0 1023 movq xmm3, [rsi] ; load row + 1
michael@0 1024 movq xmm5, [rsi + 8] ; load row + 1
michael@0 1025
michael@0 1026 punpcklbw xmm4, xmm3
michael@0 1027 punpcklbw xmm2, xmm5
michael@0 1028
michael@0 1029 pmaddubsw xmm4, xmm1
michael@0 1030 movq xmm7, [rsi + rax] ; load row + 2
michael@0 1031
michael@0 1032 pmaddubsw xmm2, xmm1
michael@0 1033 movq xmm6, [rsi + rax + 8] ; load row + 2
michael@0 1034
michael@0 1035 punpcklbw xmm3, xmm7
michael@0 1036 punpcklbw xmm5, xmm6
michael@0 1037
michael@0 1038 pmaddubsw xmm3, xmm1
michael@0 1039 paddw xmm4, [GLOBAL(rd)]
michael@0 1040
michael@0 1041 pmaddubsw xmm5, xmm1
michael@0 1042 paddw xmm2, [GLOBAL(rd)]
michael@0 1043
michael@0 1044 psraw xmm4, VP8_FILTER_SHIFT
michael@0 1045 psraw xmm2, VP8_FILTER_SHIFT
michael@0 1046
michael@0 1047 packuswb xmm4, xmm2
michael@0 1048 paddw xmm3, [GLOBAL(rd)]
michael@0 1049
michael@0 1050 movdqa [rdi], xmm4 ; store row 0
michael@0 1051 paddw xmm5, [GLOBAL(rd)]
michael@0 1052
michael@0 1053 psraw xmm3, VP8_FILTER_SHIFT
michael@0 1054 psraw xmm5, VP8_FILTER_SHIFT
michael@0 1055
michael@0 1056 packuswb xmm3, xmm5
michael@0 1057 movdqa xmm4, xmm7
michael@0 1058
michael@0 1059 movdqa [rdi + rdx],xmm3 ; store row 1
michael@0 1060 lea rsi, [rsi + 2*rax]
michael@0 1061
michael@0 1062 movdqa xmm2, xmm6
michael@0 1063 lea rdi, [rdi + 2*rdx]
michael@0 1064
michael@0 1065 cmp rdi, rcx
michael@0 1066 jne .next_row_sp
michael@0 1067
michael@0 1068 jmp .done
michael@0 1069
michael@0 1070 .b16x16_fp_only:
michael@0 1071 lea rcx, [rdi+rdx*8]
michael@0 1072 lea rcx, [rcx+rdx*8]
michael@0 1073 movsxd rax, dword ptr arg(1) ; src_pixels_per_line
michael@0 1074
michael@0 1075 .next_row_fp:
michael@0 1076 movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
michael@0 1077 movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
michael@0 1078
michael@0 1079 punpcklbw xmm2, xmm4
michael@0 1080 movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
michael@0 1081
michael@0 1082 pmaddubsw xmm2, xmm1
michael@0 1083 movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
michael@0 1084
michael@0 1085 lea rsi, [rsi + rax] ; next line
michael@0 1086 punpcklbw xmm3, xmm4
michael@0 1087
michael@0 1088 pmaddubsw xmm3, xmm1
michael@0 1089 movq xmm5, [rsi]
michael@0 1090
michael@0 1091 paddw xmm2, [GLOBAL(rd)]
michael@0 1092 movq xmm7, [rsi+1]
michael@0 1093
michael@0 1094 movq xmm6, [rsi+8]
michael@0 1095 psraw xmm2, VP8_FILTER_SHIFT
michael@0 1096
michael@0 1097 punpcklbw xmm5, xmm7
michael@0 1098 movq xmm7, [rsi+9]
michael@0 1099
michael@0 1100 paddw xmm3, [GLOBAL(rd)]
michael@0 1101 pmaddubsw xmm5, xmm1
michael@0 1102
michael@0 1103 psraw xmm3, VP8_FILTER_SHIFT
michael@0 1104 punpcklbw xmm6, xmm7
michael@0 1105
michael@0 1106 packuswb xmm2, xmm3
michael@0 1107 pmaddubsw xmm6, xmm1
michael@0 1108
michael@0 1109 movdqa [rdi], xmm2 ; store the results in the destination
michael@0 1110 paddw xmm5, [GLOBAL(rd)]
michael@0 1111
michael@0 1112 lea rdi, [rdi + rdx] ; dst_pitch
michael@0 1113 psraw xmm5, VP8_FILTER_SHIFT
michael@0 1114
michael@0 1115 paddw xmm6, [GLOBAL(rd)]
michael@0 1116 psraw xmm6, VP8_FILTER_SHIFT
michael@0 1117
michael@0 1118 packuswb xmm5, xmm6
michael@0 1119 lea rsi, [rsi + rax] ; next line
michael@0 1120
michael@0 1121 movdqa [rdi], xmm5 ; store the results in the destination
michael@0 1122 lea rdi, [rdi + rdx] ; dst_pitch
michael@0 1123
michael@0 1124 cmp rdi, rcx
michael@0 1125
michael@0 1126 jne .next_row_fp
michael@0 1127
michael@0 1128 .done:
michael@0 1129 ; begin epilog
michael@0 1130 pop rdi
michael@0 1131 pop rsi
michael@0 1132 RESTORE_GOT
michael@0 1133 RESTORE_XMM
michael@0 1134 UNSHADOW_ARGS
michael@0 1135 pop rbp
michael@0 1136 ret
michael@0 1137 ; 8x8 bilinear predictor: copies 9 source rows to an aligned stack buffer, then runs a horizontal pass (xoffset) and/or a vertical pass (yoffset) and stores 8 rows of 8 bytes at dst_ptr.
michael@0 1138 ;void vp8_bilinear_predict8x8_ssse3
michael@0 1139 ;(
michael@0 1140 ; unsigned char *src_ptr, arg(0): first source row; 9 rows of 16 bytes each are loaded from here
michael@0 1141 ; int src_pixels_per_line, arg(1): byte distance between consecutive source rows
michael@0 1142 ; int xoffset, arg(2): row index into vp8_bilinear_filters_ssse3 for the horizontal pass; 0 selects the vertical-only path
michael@0 1143 ; int yoffset, arg(3): row index into vp8_bilinear_filters_ssse3 for the vertical pass; 0 with a non-zero xoffset selects the horizontal-only path
michael@0 1144 ; unsigned char *dst_ptr, arg(4): first destination row
michael@0 1145 ; int dst_pitch arg(5): byte distance between consecutive destination rows
michael@0 1146 ;) NOTE(review): the filter is the signed operand of pmaddubsw, so table row 0 (128, 0) multiplies by -128; xoffset == yoffset == 0 presumably never reaches this routine - confirm against callers.
michael@0 1147 global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
michael@0 1148 sym(vp8_bilinear_predict8x8_ssse3):
michael@0 1149 push rbp
michael@0 1150 mov rbp, rsp
michael@0 1151 SHADOW_ARGS_TO_STACK 6 ; macro defined outside this excerpt; presumably makes the 6 arguments readable through arg(n)
michael@0 1152 SAVE_XMM 7 ; macro defined outside this excerpt; presumably preserves xmm registers up to xmm7 where the ABI requires it
michael@0 1153 GET_GOT rbx ; macro defined outside this excerpt; presumably sets up the base used by GLOBAL() references
michael@0 1154 push rsi
michael@0 1155 push rdi
michael@0 1156 ; end prolog
michael@0 1157 ; scratch buffer: 9 rows x 16 bytes = 144 bytes on the stack, later read back with aligned movdqa
michael@0 1158 ALIGN_STACK 16, rax ; macro defined outside this excerpt; presumably aligns rsp to 16 and leaves the old rsp on the stack for the pop rsp at .done8x8 - confirm
michael@0 1159 sub rsp, 144 ; reserve 144 bytes
michael@0 1160
michael@0 1161 lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] ; rcx = filter table base (reused as the dst end pointer further down)
michael@0 1162
michael@0 1163 mov rsi, arg(0) ;src_ptr
michael@0 1164 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
michael@0 1165 ; NOTE(review): each movdqu below loads 16 bytes per row, while the filters only consume bytes 0..8 - confirm the source buffer tolerates the extra read
michael@0 1166 ;Read 9-line unaligned data in and put them on stack. This gives a big
michael@0 1167 ;performance boost.
michael@0 1168 movdqu xmm0, [rsi] ; row 0
michael@0 1169 lea rax, [rdx + rdx*2] ; rax = 3 * source stride
michael@0 1170 movdqu xmm1, [rsi+rdx] ; row 1
michael@0 1171 movdqu xmm2, [rsi+rdx*2] ; row 2
michael@0 1172 add rsi, rax ; rsi -> row 3
michael@0 1173 movdqu xmm3, [rsi] ; row 3
michael@0 1174 movdqu xmm4, [rsi+rdx] ; row 4
michael@0 1175 movdqu xmm5, [rsi+rdx*2] ; row 5
michael@0 1176 add rsi, rax ; rsi -> row 6
michael@0 1177 movdqu xmm6, [rsi] ; row 6
michael@0 1178 movdqu xmm7, [rsi+rdx] ; row 7
michael@0 1179
michael@0 1180 movdqa XMMWORD PTR [rsp], xmm0 ; spill row 0 first so xmm0 is free to receive row 8
michael@0 1181
michael@0 1182 movdqu xmm0, [rsi+rdx*2] ; row 8
michael@0 1183
michael@0 1184 movdqa XMMWORD PTR [rsp+16], xmm1 ; row 1
michael@0 1185 movdqa XMMWORD PTR [rsp+32], xmm2 ; row 2
michael@0 1186 movdqa XMMWORD PTR [rsp+48], xmm3 ; row 3
michael@0 1187 movdqa XMMWORD PTR [rsp+64], xmm4 ; row 4
michael@0 1188 movdqa XMMWORD PTR [rsp+80], xmm5 ; row 5
michael@0 1189 movdqa XMMWORD PTR [rsp+96], xmm6 ; row 6
michael@0 1190 movdqa XMMWORD PTR [rsp+112], xmm7 ; row 7
michael@0 1191 movdqa XMMWORD PTR [rsp+128], xmm0 ; row 8
michael@0 1192
michael@0 1193 movsxd rax, dword ptr arg(2) ; xoffset
michael@0 1194 cmp rax, 0 ; skip first_pass filter if xoffset=0
michael@0 1195 je .b8x8_sp_only
michael@0 1196
michael@0 1197 shl rax, 4 ; each filter table row is 16 bytes
michael@0 1198 add rax, rcx ; HFilter
michael@0 1199
michael@0 1200 mov rdi, arg(4) ; dst_ptr
michael@0 1201 movsxd rdx, dword ptr arg(5) ; dst_pitch
michael@0 1202
michael@0 1203 movdqa xmm0, [rax] ; xmm0 = horizontal filter pair repeated 8 times
michael@0 1204
michael@0 1205 movsxd rax, dword ptr arg(3) ; yoffset
michael@0 1206 cmp rax, 0 ; skip second_pass filter if yoffset=0
michael@0 1207 je .b8x8_fp_only
michael@0 1208 ; ---- both passes: xmm0 = HFilter, xmm1 = VFilter, xmm7 = previous horizontally filtered row ----
michael@0 1209 shl rax, 4 ; each filter table row is 16 bytes
michael@0 1210 lea rax, [rax + rcx] ; VFilter
michael@0 1211
michael@0 1212 lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch, the loop end marker (table base is no longer needed)
michael@0 1213
michael@0 1214 movdqa xmm1, [rax] ; xmm1 = vertical filter pair repeated 8 times
michael@0 1215
michael@0 1216 ; get the first horizontal line done
michael@0 1217 movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
michael@0 1218 movdqa xmm5, xmm3 ; copy of row 0; becomes 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx after the shift below
michael@0 1219
michael@0 1220 psrldq xmm5, 1 ; shift right by one byte so byte n holds pixel n+1
michael@0 1221 lea rsp, [rsp + 16] ; next line (rsp itself is the read cursor through the scratch buffer)
michael@0 1222
michael@0 1223 punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
michael@0 1224 pmaddubsw xmm3, xmm0 ; 8 words: filter applied to pixel pairs (n, n+1), n = 0..7
michael@0 1225
michael@0 1226 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
michael@0 1227 psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
michael@0 1228
michael@0 1229 movdqa xmm7, xmm3
michael@0 1230 packuswb xmm7, xmm7 ; 8 filtered bytes of row 0, duplicated in both 8-byte halves
michael@0 1231
michael@0 1232 .next_row: ; one output row per iteration; runs until rdi reaches rcx (8 rows for a non-zero dst_pitch)
michael@0 1233 movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
michael@0 1234 lea rsp, [rsp + 16] ; next line
michael@0 1235
michael@0 1236 movdqa xmm5, xmm6
michael@0 1237
michael@0 1238 psrldq xmm5, 1 ; byte n now holds pixel n+1
michael@0 1239
michael@0 1240 punpcklbw xmm6, xmm5 ; interleave pixel n with pixel n+1
michael@0 1241 pmaddubsw xmm6, xmm0 ; horizontal pass for the current row
michael@0 1242
michael@0 1243 paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
michael@0 1244 psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
michael@0 1245
michael@0 1246 packuswb xmm6, xmm6 ; current row as 8 bytes (duplicated in both halves)
michael@0 1247
michael@0 1248 punpcklbw xmm7, xmm6 ; interleave previous-row bytes with current-row bytes
michael@0 1249 pmaddubsw xmm7, xmm1 ; vertical pass across the two rows
michael@0 1250
michael@0 1251 paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
michael@0 1252 psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
michael@0 1253
michael@0 1254 packuswb xmm7, xmm7
michael@0 1255
michael@0 1256 movq [rdi], xmm7 ; store the results in the destination
michael@0 1257 lea rdi, [rdi + rdx] ; advance dst by dst_pitch
michael@0 1258
michael@0 1259 movdqa xmm7, xmm6 ; current row becomes the previous row for the next iteration
michael@0 1260
michael@0 1261 cmp rdi, rcx
michael@0 1262 jne .next_row
michael@0 1263 ; rsp has advanced 16 (first row) + 8 * 16 (loop) = 144 bytes here, as .done8x8 requires
michael@0 1264 jmp .done8x8
michael@0 1265
michael@0 1266 .b8x8_sp_only: ; vertical pass only (xoffset == 0), fully unrolled; rsp stays fixed until the end of this path
michael@0 1267 movsxd rax, dword ptr arg(3) ; yoffset
michael@0 1268 shl rax, 4 ; each filter table row is 16 bytes
michael@0 1269 lea rax, [rax + rcx] ; VFilter
michael@0 1270
michael@0 1271 mov rdi, arg(4) ;dst_ptr
michael@0 1272 movsxd rdx, dword ptr arg(5) ; dst_pitch
michael@0 1273
michael@0 1274 movdqa xmm0, [rax] ; VFilter
michael@0 1275
michael@0 1276 movq xmm1, XMMWORD PTR [rsp] ; low 8 bytes of row 0
michael@0 1277 movq xmm2, XMMWORD PTR [rsp+16] ; low 8 bytes of row 1
michael@0 1278
michael@0 1279 movq xmm3, XMMWORD PTR [rsp+32] ; low 8 bytes of row 2
michael@0 1280 punpcklbw xmm1, xmm2 ; rows 0 and 1 interleaved
michael@0 1281
michael@0 1282 movq xmm4, XMMWORD PTR [rsp+48] ; low 8 bytes of row 3
michael@0 1283 punpcklbw xmm2, xmm3 ; rows 1 and 2 interleaved
michael@0 1284
michael@0 1285 movq xmm5, XMMWORD PTR [rsp+64] ; low 8 bytes of row 4
michael@0 1286 punpcklbw xmm3, xmm4 ; rows 2 and 3 interleaved
michael@0 1287
michael@0 1288 movq xmm6, XMMWORD PTR [rsp+80] ; low 8 bytes of row 5
michael@0 1289 punpcklbw xmm4, xmm5 ; rows 3 and 4 interleaved
michael@0 1290
michael@0 1291 movq xmm7, XMMWORD PTR [rsp+96] ; low 8 bytes of row 6
michael@0 1292 punpcklbw xmm5, xmm6 ; rows 4 and 5 interleaved
michael@0 1293
michael@0 1294 pmaddubsw xmm1, xmm0 ; output row 0
michael@0 1295 pmaddubsw xmm2, xmm0 ; output row 1
michael@0 1296
michael@0 1297 pmaddubsw xmm3, xmm0 ; output row 2
michael@0 1298 pmaddubsw xmm4, xmm0 ; output row 3
michael@0 1299
michael@0 1300 pmaddubsw xmm5, xmm0 ; output row 4
michael@0 1301 punpcklbw xmm6, xmm7 ; rows 5 and 6 interleaved
michael@0 1302
michael@0 1303 pmaddubsw xmm6, xmm0 ; output row 5
michael@0 1304 paddw xmm1, [GLOBAL(rd)] ; round, shift and pack are interleaved across rows from here on
michael@0 1305
michael@0 1306 paddw xmm2, [GLOBAL(rd)]
michael@0 1307 psraw xmm1, VP8_FILTER_SHIFT
michael@0 1308
michael@0 1309 paddw xmm3, [GLOBAL(rd)]
michael@0 1310 psraw xmm2, VP8_FILTER_SHIFT
michael@0 1311
michael@0 1312 paddw xmm4, [GLOBAL(rd)]
michael@0 1313 psraw xmm3, VP8_FILTER_SHIFT
michael@0 1314
michael@0 1315 paddw xmm5, [GLOBAL(rd)]
michael@0 1316 psraw xmm4, VP8_FILTER_SHIFT
michael@0 1317
michael@0 1318 paddw xmm6, [GLOBAL(rd)]
michael@0 1319 psraw xmm5, VP8_FILTER_SHIFT
michael@0 1320
michael@0 1321 psraw xmm6, VP8_FILTER_SHIFT
michael@0 1322 packuswb xmm1, xmm1
michael@0 1323
michael@0 1324 packuswb xmm2, xmm2
michael@0 1325 movq [rdi], xmm1 ; store output row 0
michael@0 1326
michael@0 1327 packuswb xmm3, xmm3
michael@0 1328 movq [rdi+rdx], xmm2 ; store output row 1
michael@0 1329
michael@0 1330 packuswb xmm4, xmm4
michael@0 1331 movq xmm1, XMMWORD PTR [rsp+112] ; low 8 bytes of row 7 (xmm1 is free again)
michael@0 1332
michael@0 1333 lea rdi, [rdi + 2*rdx]
michael@0 1334 movq xmm2, XMMWORD PTR [rsp+128] ; low 8 bytes of row 8 (xmm2 is free again)
michael@0 1335
michael@0 1336 packuswb xmm5, xmm5
michael@0 1337 movq [rdi], xmm3 ; store output row 2
michael@0 1338
michael@0 1339 packuswb xmm6, xmm6
michael@0 1340 movq [rdi+rdx], xmm4 ; store output row 3
michael@0 1341
michael@0 1342 lea rdi, [rdi + 2*rdx]
michael@0 1343 punpcklbw xmm7, xmm1 ; rows 6 and 7 interleaved
michael@0 1344
michael@0 1345 movq [rdi], xmm5 ; store output row 4
michael@0 1346 pmaddubsw xmm7, xmm0 ; output row 6
michael@0 1347
michael@0 1348 movq [rdi+rdx], xmm6 ; store output row 5
michael@0 1349 punpcklbw xmm1, xmm2 ; rows 7 and 8 interleaved
michael@0 1350
michael@0 1351 pmaddubsw xmm1, xmm0 ; output row 7
michael@0 1352 paddw xmm7, [GLOBAL(rd)]
michael@0 1353
michael@0 1354 psraw xmm7, VP8_FILTER_SHIFT
michael@0 1355 paddw xmm1, [GLOBAL(rd)]
michael@0 1356
michael@0 1357 psraw xmm1, VP8_FILTER_SHIFT
michael@0 1358 packuswb xmm7, xmm7
michael@0 1359
michael@0 1360 packuswb xmm1, xmm1
michael@0 1361 lea rdi, [rdi + 2*rdx]
michael@0 1362
michael@0 1363 movq [rdi], xmm7 ; store output row 6
michael@0 1364
michael@0 1365 movq [rdi+rdx], xmm1 ; store output row 7
michael@0 1366 lea rsp, [rsp + 144] ; release the whole scratch buffer in one step, as .done8x8 requires
michael@0 1367
michael@0 1368 jmp .done8x8
michael@0 1369
michael@0 1370 .b8x8_fp_only: ; horizontal pass only (yoffset == 0); xmm0 = HFilter, rdi = dst_ptr, rdx = dst_pitch were set before the branch
michael@0 1371 lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch, the loop end marker
michael@0 1372
michael@0 1373 .next_row_fp: ; four rows per iteration, so two iterations cover the 8 output rows
michael@0 1374 movdqa xmm1, XMMWORD PTR [rsp] ; row r
michael@0 1375 movdqa xmm3, XMMWORD PTR [rsp+16] ; row r+1
michael@0 1376
michael@0 1377 movdqa xmm2, xmm1
michael@0 1378 movdqa xmm5, XMMWORD PTR [rsp+32] ; row r+2
michael@0 1379
michael@0 1380 psrldq xmm2, 1 ; row r shifted so byte n holds pixel n+1
michael@0 1381 movdqa xmm7, XMMWORD PTR [rsp+48] ; row r+3
michael@0 1382
michael@0 1383 movdqa xmm4, xmm3
michael@0 1384 psrldq xmm4, 1 ; row r+1 shifted by one pixel
michael@0 1385
michael@0 1386 movdqa xmm6, xmm5
michael@0 1387 psrldq xmm6, 1 ; row r+2 shifted by one pixel
michael@0 1388
michael@0 1389 punpcklbw xmm1, xmm2 ; pixel pairs (n, n+1) of row r
michael@0 1390 pmaddubsw xmm1, xmm0
michael@0 1391
michael@0 1392 punpcklbw xmm3, xmm4 ; pixel pairs of row r+1
michael@0 1393 pmaddubsw xmm3, xmm0
michael@0 1394
michael@0 1395 punpcklbw xmm5, xmm6 ; pixel pairs of row r+2
michael@0 1396 pmaddubsw xmm5, xmm0
michael@0 1397
michael@0 1398 movdqa xmm2, xmm7
michael@0 1399 psrldq xmm2, 1 ; row r+3 shifted by one pixel
michael@0 1400
michael@0 1401 punpcklbw xmm7, xmm2 ; pixel pairs of row r+3
michael@0 1402 pmaddubsw xmm7, xmm0
michael@0 1403
michael@0 1404 paddw xmm1, [GLOBAL(rd)] ; add round value, then divide by 128
michael@0 1405 psraw xmm1, VP8_FILTER_SHIFT
michael@0 1406
michael@0 1407 paddw xmm3, [GLOBAL(rd)]
michael@0 1408 psraw xmm3, VP8_FILTER_SHIFT
michael@0 1409
michael@0 1410 paddw xmm5, [GLOBAL(rd)]
michael@0 1411 psraw xmm5, VP8_FILTER_SHIFT
michael@0 1412
michael@0 1413 paddw xmm7, [GLOBAL(rd)]
michael@0 1414 psraw xmm7, VP8_FILTER_SHIFT
michael@0 1415
michael@0 1416 packuswb xmm1, xmm1
michael@0 1417 packuswb xmm3, xmm3
michael@0 1418
michael@0 1419 packuswb xmm5, xmm5
michael@0 1420 movq [rdi], xmm1 ; store row r
michael@0 1421
michael@0 1422 packuswb xmm7, xmm7
michael@0 1423 movq [rdi+rdx], xmm3 ; store row r+1
michael@0 1424
michael@0 1425 lea rdi, [rdi + 2*rdx]
michael@0 1426 movq [rdi], xmm5 ; store row r+2
michael@0 1427
michael@0 1428 lea rsp, [rsp + 4*16] ; step the read cursor past the four rows just consumed
michael@0 1429 movq [rdi+rdx], xmm7 ; store row r+3
michael@0 1430
michael@0 1431 lea rdi, [rdi + 2*rdx]
michael@0 1432 cmp rdi, rcx
michael@0 1433
michael@0 1434 jne .next_row_fp
michael@0 1435
michael@0 1436 lea rsp, [rsp + 16] ; skip row 8, which this path never reads: 2 * 64 + 16 = 144
michael@0 1437
michael@0 1438 .done8x8: ; every path arrives with rsp advanced by exactly 144 bytes from the bottom of the scratch buffer
michael@0 1439 ;add rsp, 144
michael@0 1440 pop rsp ; reload rsp from the slot just above the scratch buffer (presumably the value saved by ALIGN_STACK - confirm)
michael@0 1441 ; begin epilog
michael@0 1442 pop rdi
michael@0 1443 pop rsi
michael@0 1444 RESTORE_GOT
michael@0 1445 RESTORE_XMM
michael@0 1446 UNSHADOW_ARGS
michael@0 1447 pop rbp
michael@0 1448 ret
michael@0 1449
michael@0 1450 SECTION_RODATA ; constant tables; SECTION_RODATA is a macro defined outside this excerpt
michael@0 1451 align 16
michael@0 1452 shuf1b: ; byte indices pairing n with n+5 (n = 0..7); presumably a pshufb mask feeding the k0_k5 taps - users are outside this excerpt
michael@0 1453 db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
michael@0 1454 shuf2b: ; byte indices pairing n+2 with n+4 (n = 0..5); presumably the mask feeding the k2_k4 taps
michael@0 1455 db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
michael@0 1456 shuf3b: ; byte indices pairing n+1 with n+3 (n = 0..7); presumably the mask feeding the k1_k3 taps
michael@0 1457 db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
michael@0 1458
michael@0 1459 align 16
michael@0 1460 shuf2bfrom1: ; judging by the name, the shuf2b arrangement derived from data already shuffled with shuf1b - TODO confirm at the use site
michael@0 1461 db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
michael@0 1462 align 16
michael@0 1463 shuf3bfrom1: ; judging by the name, the shuf3b arrangement derived from data already shuffled with shuf1b - TODO confirm at the use site
michael@0 1464 db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
michael@0 1465
michael@0 1466 align 16
michael@0 1467 rd: ; rounding term: 8 words of 0x40 (64, half of 128), added before each psraw by VP8_FILTER_SHIFT
michael@0 1468 times 8 dw 0x40
michael@0 1469
michael@0 1470 align 16
michael@0 1471 k0_k5: ; 8 rows of 16 bytes; each row repeats one byte pair 8 times - by the label, taps 0 and 5 of the six-tap filter (users are outside this excerpt)
michael@0 1472 times 8 db 0, 0 ;placeholder
michael@0 1473 times 8 db 0, 0
michael@0 1474 times 8 db 2, 1
michael@0 1475 times 8 db 0, 0
michael@0 1476 times 8 db 3, 3
michael@0 1477 times 8 db 0, 0
michael@0 1478 times 8 db 1, 2
michael@0 1479 times 8 db 0, 0
michael@0 1480 k1_k3: ; same layout as k0_k5 - by the label, taps 1 and 3
michael@0 1481 times 8 db 0, 0 ;placeholder
michael@0 1482 times 8 db -6, 12
michael@0 1483 times 8 db -11, 36
michael@0 1484 times 8 db -9, 50
michael@0 1485 times 8 db -16, 77
michael@0 1486 times 8 db -6, 93
michael@0 1487 times 8 db -8, 108
michael@0 1488 times 8 db -1, 123
michael@0 1489 k2_k4: ; same layout as k0_k5 - by the label, taps 2 and 4
michael@0 1490 times 8 db 128, 0 ;placeholder
michael@0 1491 times 8 db 123, -1
michael@0 1492 times 8 db 108, -8
michael@0 1493 times 8 db 93, -6
michael@0 1494 times 8 db 77, -16
michael@0 1495 times 8 db 50, -9
michael@0 1496 times 8 db 36, -11
michael@0 1497 times 8 db 12, -6
michael@0 1498 align 16
michael@0 1499 vp8_bilinear_filters_ssse3: ; 8 rows of 16 bytes, addressed as base + (offset << 4); row i repeats the pair (128 - 16*i, 16*i) 8 times
michael@0 1500 times 8 db 128, 0 ; row 0: byte 128 is -128 when read as the signed pmaddubsw operand
michael@0 1501 times 8 db 112, 16
michael@0 1502 times 8 db 96, 32
michael@0 1503 times 8 db 80, 48
michael@0 1504 times 8 db 64, 64
michael@0 1505 times 8 db 48, 80
michael@0 1506 times 8 db 32, 96
michael@0 1507 times 8 db 16, 112
michael@0 1508

mercurial