media/libvpx/vp8/common/x86/subpixel_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
; The helper macros used throughout this file (sym, arg, GLOBAL, GET_GOT,
; RESTORE_GOT, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, PRIVATE, HIDDEN_DATA,
; SECTION_RODATA, ABI_IS_32BIT) are not defined here; presumably they come
; from this include.
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
; Bilinear filter table, defined elsewhere. The bilinear routines below index
; it with (offset << 5), i.e. 32 bytes per entry, and read two 8-byte groups
; from each entry at +0 and +16.
michael@0 13 extern sym(vp8_bilinear_filters_x86_8)
michael@0 14
michael@0 15
; NOTE(review): BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced
; anywhere in the visible part of this file.
michael@0 16 %define BLOCK_HEIGHT_WIDTH 4
michael@0 17 %define vp8_filter_weight 128
; Right-shift applied after adding the rounding constant `rd` (0x40 per word).
michael@0 18 %define VP8_FILTER_SHIFT 7
michael@0 19
michael@0 20
; Horizontal 6-tap filter pass (MMX). Each loop iteration reads source bytes
; p-2..p6 relative to rsi, computes 4 filtered samples
;     out[i] = clamp8((sum_{k=0..5} tap[k] * p[i-2+k] + 64) >> 7),  i = 0..3
; and stores them as four 16-bit words at [rdi].
;
; Register roles inside the loop:
;   rsi = current source row        rdi = current output row
;   rdx = filter base (taps are read at a 16-byte stride: [rdx + 16*k])
;   rax = output stride in bytes    rcx = rows remaining
;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4; taps 0 and 5 are read from memory
;
; Notes:
;   * pixel_step (arg 3) is never read by this routine.
;   * output_width (arg 5) is added directly to the output pointer, so it acts
;     as the output row stride in bytes.
;   * The loop is do-while shaped (dec/jnz): it assumes output_height >= 1.
;   * Accumulation uses paddsw (signed saturating add), so the order of the
;     partial sums is significant.
michael@0 21 ;void vp8_filter_block1d_h6_mmx
michael@0 22 ;(
michael@0 23 ; unsigned char *src_ptr,
michael@0 24 ; unsigned short *output_ptr,
michael@0 25 ; unsigned int src_pixels_per_line,
michael@0 26 ; unsigned int pixel_step,
michael@0 27 ; unsigned int output_height,
michael@0 28 ; unsigned int output_width,
michael@0 29 ; short * vp8_filter
michael@0 30 ;)
michael@0 31 global sym(vp8_filter_block1d_h6_mmx) PRIVATE
michael@0 32 sym(vp8_filter_block1d_h6_mmx):
michael@0 33 push rbp
michael@0 34 mov rbp, rsp
michael@0 35 SHADOW_ARGS_TO_STACK 7
michael@0 36 GET_GOT rbx
michael@0 37 push rsi
michael@0 38 push rdi
michael@0 39 ; end prolog
michael@0 40
michael@0 41 mov rdx, arg(6) ;vp8_filter
michael@0 42
michael@0 43 movq mm1, [rdx + 16] ; mm1 = tap 1 (4 words)
michael@0 44 movq mm2, [rdx + 32] ; mm2 = tap 2
michael@0 45 movq mm6, [rdx + 48] ; mm6 = tap 3
michael@0 46 movq mm7, [rdx + 64] ; mm7 = tap 4
michael@0 47
michael@0 48 mov rdi, arg(1) ;output_ptr
michael@0 49 mov rsi, arg(0) ;src_ptr
michael@0 50 movsxd rcx, dword ptr arg(4) ;output_height = loop count
michael@0 51 movsxd rax, dword ptr arg(5) ;output_width, used as the output stride in bytes
michael@0 52 pxor mm0, mm0 ; mm0 = 00000000
michael@0 53
michael@0 54 .nextrow:
michael@0 55 movq mm3, [rsi-2] ; mm3 = p-2..p5 (bytes)
michael@0 56 movq mm4, mm3 ; mm4 = p-2..p5
michael@0 57 psrlq mm3, 8 ; mm3 = p-1..p5
michael@0 58 punpcklbw mm3, mm0 ; mm3 = p-1..p2 (words)
michael@0 59 pmullw mm3, mm1 ; mm3 *= tap 1
michael@0 60
michael@0 61 movq mm5, mm4 ; mm5 = p-2..p5
michael@0 62 punpckhbw mm4, mm0 ; mm4 = p2..p5 (words)
michael@0 63 pmullw mm4, mm7 ; mm4 *= tap 4
michael@0 64 paddsw mm3, mm4 ; mm3 += mm4
michael@0 65
michael@0 66 movq mm4, mm5 ; mm4 = p-2..p5;
michael@0 67 psrlq mm5, 16 ; mm5 = p0..p5;
michael@0 68 punpcklbw mm5, mm0 ; mm5 = p0..p3 (words)
michael@0 69 pmullw mm5, mm2 ; mm5 *= tap 2
michael@0 70 paddsw mm3, mm5 ; mm3 += mm5
michael@0 71
michael@0 72 movq mm5, mm4 ; mm5 = p-2..p5 (kept for tap 0 below)
michael@0 73 psrlq mm4, 24 ; mm4 = p1..p5
michael@0 74 punpcklbw mm4, mm0 ; mm4 = p1..p4 (words)
michael@0 75 pmullw mm4, mm6 ; mm4 *= tap 3
michael@0 76 paddsw mm3, mm4 ; mm3 += mm4
michael@0 77
michael@0 78 ; outer taps (5 and 0), read straight from the filter in memory;
; the movd below loads p3..p6 (bytes)
michael@0 79 movd mm4, [rsi+3]
michael@0 80 punpcklbw mm4, mm0 ; mm4 = p3..p6 (words)
michael@0 81 pmullw mm4, [rdx+80] ; mm4 *= tap 5
michael@0 82 paddsw mm3, mm4 ; mm3 += mm4
michael@0 83
michael@0 84 punpcklbw mm5, mm0 ; mm5 = p-2..p1 (words)
michael@0 85 pmullw mm5, [rdx] ; mm5 *= tap 0
michael@0 86 paddsw mm3, mm5 ; mm3 += mm5
michael@0 87
michael@0 88 paddsw mm3, [GLOBAL(rd)] ; mm3 += round value (0x40 per word)
michael@0 89 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
michael@0 90 packuswb mm3, mm0 ; pack and unpack to saturate to 0..255
michael@0 91 punpcklbw mm3, mm0 ; back to 4 words
michael@0 92
michael@0 93 movq [rdi], mm3 ; store 4 words in the destination
michael@0 94
; NOTE(review): on the 64-bit path src_pixels_per_line is re-read from the
; argument area on every iteration although it is loop-invariant.
michael@0 95 %if ABI_IS_32BIT
michael@0 96 add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
michael@0 97 add rdi, rax; next output row
michael@0 98 %else
michael@0 99 movsxd r8, dword ptr arg(2) ;src_pixels_per_line
michael@0 100 add rdi, rax; next output row
michael@0 101
michael@0 102 add rsi, r8 ; next line
michael@0 103 %endif
michael@0 104
michael@0 105 dec rcx ; decrement count
michael@0 106 jnz .nextrow ; next row
michael@0 107
michael@0 108 ; begin epilog
michael@0 109 pop rdi
michael@0 110 pop rsi
michael@0 111 RESTORE_GOT
michael@0 112 UNSHADOW_ARGS
michael@0 113 pop rbp
michael@0 114 ret
michael@0 115
michael@0 116
; Vertical 6-tap filter pass (MMX). The source is read as 16-bit samples
; (movq loads are fed straight into pmullw without unpacking). Each loop
; iteration combines 4 words from each of six consecutive source rows
; (rows -2..+3 relative to src_ptr) with taps 0..5, adds the rounding
; constant, shifts right by VP8_FILTER_SHIFT, saturates to unsigned bytes
; and stores 4 bytes at [rdi].
;
; Register roles inside the loop:
;   rsi = address of row -2 of the current window
;   rdx = pixels_per_line, added directly to rsi (i.e. used as a byte offset)
;   rbx = filter base (taps read at a 16-byte stride: [rbx + 16*k])
;   rdi = current output row        rax = output_pitch (bytes)
;   rcx = rows remaining            mm5 = rounding constant, mm0 = 0
;   mm1/mm2/mm6/mm7 = taps 1/2/3/4; taps 0 and 5 are read from memory
;
; Notes:
;   * pixel_step (arg 4) and output_width (arg 6) are never read.
;   * The loop is do-while shaped (dec/jnz): it assumes output_height >= 1.
;   * Accumulation uses paddsw (signed saturating add), so the order of the
;     partial sums is significant.
michael@0 117 ;void vp8_filter_block1dc_v6_mmx
michael@0 118 ;(
michael@0 119 ; short *src_ptr,
michael@0 120 ; unsigned char *output_ptr,
michael@0 121 ; int output_pitch,
michael@0 122 ; unsigned int pixels_per_line,
michael@0 123 ; unsigned int pixel_step,
michael@0 124 ; unsigned int output_height,
michael@0 125 ; unsigned int output_width,
michael@0 126 ; short * vp8_filter
michael@0 127 ;)
michael@0 128 global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
michael@0 129 sym(vp8_filter_block1dc_v6_mmx):
michael@0 130 push rbp
michael@0 131 mov rbp, rsp
michael@0 132 SHADOW_ARGS_TO_STACK 8
michael@0 133 GET_GOT rbx
michael@0 134 push rsi
michael@0 135 push rdi
michael@0 136 ; end prolog
michael@0 137
; The rounding constant is loaded before rbx is repurposed as the filter
; pointer -- presumably because GLOBAL() may address through rbx in PIC
; builds (TODO confirm against x86_abi_support.asm). rbx is saved here and
; restored after the loop.
michael@0 138 movq mm5, [GLOBAL(rd)]
michael@0 139 push rbx
michael@0 140 mov rbx, arg(7) ;vp8_filter
michael@0 141 movq mm1, [rbx + 16] ; mm1 = tap 1 (4 words)
michael@0 142 movq mm2, [rbx + 32] ; mm2 = tap 2
michael@0 143 movq mm6, [rbx + 48] ; mm6 = tap 3
michael@0 144 movq mm7, [rbx + 64] ; mm7 = tap 4
michael@0 145
michael@0 146 movsxd rdx, dword ptr arg(3) ;pixels_per_line
michael@0 147 mov rdi, arg(1) ;output_ptr
michael@0 148 mov rsi, arg(0) ;src_ptr
; back up two rows so that rsi addresses row -2
michael@0 149 sub rsi, rdx
michael@0 150 sub rsi, rdx
michael@0 151 movsxd rcx, DWORD PTR arg(5) ;output_height = loop count
michael@0 152 movsxd rax, DWORD PTR arg(2) ;output_pitch, byte stride added to rdi per row
michael@0 153 pxor mm0, mm0 ; mm0 = 00000000
michael@0 154
michael@0 155
michael@0 156 .nextrow_cv:
michael@0 157 movq mm3, [rsi+rdx] ; mm3 = p0..p3 (words) = row -1
michael@0 158 pmullw mm3, mm1 ; mm3 *= tap 1
michael@0 159
michael@0 160
michael@0 161 movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
michael@0 162 pmullw mm4, mm7 ; mm4 *= tap 4
michael@0 163 paddsw mm3, mm4 ; mm3 += mm4
michael@0 164
michael@0 165 movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
michael@0 166 pmullw mm4, mm2 ; mm4 *= tap 2
michael@0 167 paddsw mm3, mm4 ; mm3 += mm4
michael@0 168
michael@0 169 movq mm4, [rsi] ; mm4 = p0..p3 = row -2
michael@0 170 pmullw mm4, [rbx] ; mm4 *= tap 0
michael@0 171 paddsw mm3, mm4 ; mm3 += mm4
michael@0 172
michael@0 173
michael@0 174 add rsi, rdx ; move source forward 1 line to avoid 3 * pitch; this is also the per-iteration row advance
michael@0 175 movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
michael@0 176 pmullw mm4, mm6 ; mm4 *= tap 3
michael@0 177 paddsw mm3, mm4 ; mm3 += mm4
michael@0 178
michael@0 179 movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
michael@0 180 pmullw mm4, [rbx +80] ; mm4 *= tap 5
michael@0 181 paddsw mm3, mm4 ; mm3 += mm4
michael@0 182
michael@0 183
michael@0 184 paddsw mm3, mm5 ; mm3 += round value
michael@0 185 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
michael@0 186 packuswb mm3, mm0 ; pack and saturate to unsigned bytes
michael@0 187
michael@0 188 movd [rdi],mm3 ; store 4 bytes in the destination
michael@0 189 ; Each iteration re-reads five of the six source rows used by the previous
michael@0 190 ; one (the original note here said "3 out of 4"). The original author
michael@0 191 ; expected these reads to hit in cache and flagged them as avoidable.
michael@0 192 lea rdi, [rdi+rax] ; next output row
michael@0 193 dec rcx ; decrement count
michael@0 194 jnz .nextrow_cv ; next row
michael@0 195
michael@0 196 pop rbx
michael@0 197
michael@0 198 ; begin epilog
michael@0 199 pop rdi
michael@0 200 pop rsi
michael@0 201 RESTORE_GOT
michael@0 202 UNSHADOW_ARGS
michael@0 203 pop rbp
michael@0 204 ret
michael@0 205
michael@0 206
; Two-pass bilinear prediction of an 8-wide, 8-row block (MMX).
;
; Horizontal pass, per source row:
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass, per output row:
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
; packed to unsigned bytes with saturation.
;
; The first source row is filtered horizontally before the loop; each loop
; iteration filters one more source row and emits one output row. The loop
; ends when rdi reaches dst_ptr + 8 * dst_pitch, so 9 source rows of 9 bytes
; each are read.
;
; Register roles inside the loop:
;   rsi = current source row        rdx = src_pixels_per_line
;   rdi = current output row        rcx = end-of-output sentinel
;   rax = VFilter (taps at [rax] and [rax+16])
;   mm0 = 0, mm1 = HFilter tap 0, mm2 = HFilter tap 1
;   mm7 = previous row's horizontal result, packed to 8 bytes
michael@0 207 ;void vp8_bilinear_predict8x8_mmx
michael@0 208 ;(
michael@0 209 ; unsigned char *src_ptr,
michael@0 210 ; int src_pixels_per_line,
michael@0 211 ; int xoffset,
michael@0 212 ; int yoffset,
michael@0 213 ; unsigned char *dst_ptr,
michael@0 214 ; int dst_pitch
michael@0 215 ;)
michael@0 216 global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
michael@0 217 sym(vp8_bilinear_predict8x8_mmx):
michael@0 218 push rbp
michael@0 219 mov rbp, rsp
michael@0 220 SHADOW_ARGS_TO_STACK 6
michael@0 221 GET_GOT rbx
michael@0 222 push rsi
michael@0 223 push rdi
michael@0 224 ; end prolog
michael@0 225
michael@0 226 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
michael@0 227 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
michael@0 228
michael@0 229 movsxd rax, dword ptr arg(2) ;xoffset
michael@0 230 mov rdi, arg(4) ;dst_ptr
michael@0 231
michael@0 232 shl rax, 5 ; offset * 32 (bytes per table entry)
michael@0 233 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
michael@0 234
michael@0 235 add rax, rcx ; HFilter
michael@0 236 mov rsi, arg(0) ;src_ptr
michael@0 237
michael@0 238 movsxd rdx, dword ptr arg(5) ;dst_pitch (only needed for the sentinel below)
michael@0 239 movq mm1, [rax] ; mm1 = HFilter tap 0 (4 words)
michael@0 240
michael@0 241 movq mm2, [rax+16] ; mm2 = HFilter tap 1 (4 words)
michael@0 242 movsxd rax, dword ptr arg(3) ;yoffset
michael@0 243
michael@0 244 pxor mm0, mm0 ; mm0 = 0, used for byte->word unpacking
michael@0 245
michael@0 246 shl rax, 5 ; offset*32
michael@0 247 add rax, rcx ; VFilter
michael@0 248
michael@0 249 lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8*dst_pitch (loop end sentinel)
michael@0 250 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is reused)
michael@0 251
michael@0 252
michael@0 253
michael@0 254 ; get the first horizontal line done
michael@0 255 movq mm3, [rsi] ; mm3 = src[0..7] (bytes)
michael@0 256 movq mm4, mm3 ; make a copy of current line
michael@0 257
michael@0 258 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 259 punpckhbw mm4, mm0 ; mm4 = src[4..7] (words)
michael@0 260
michael@0 261 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 262 pmullw mm4, mm1 ;
michael@0 263
michael@0 264 movq mm5, [rsi+1] ; mm5 = src[1..8] (bytes)
michael@0 265 movq mm6, mm5 ;
michael@0 266
michael@0 267 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 268 punpckhbw mm6, mm0 ; mm6 = src[5..8] (words)
michael@0 269
michael@0 270 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 271 pmullw mm6, mm2 ;
michael@0 272
michael@0 273 paddw mm3, mm5 ; mm3/mm4 = horizontal sums
michael@0 274 paddw mm4, mm6 ;
michael@0 275
michael@0 276 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 277 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 278
michael@0 279 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 280 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 281
michael@0 282 movq mm7, mm3 ;
michael@0 283 packuswb mm7, mm4 ; mm7 = first row's horizontal result (8 bytes)
michael@0 284
michael@0 285 add rsi, rdx ; next line
michael@0 286 .next_row_8x8:
michael@0 287 movq mm3, [rsi] ; mm3 = src[0..7] (bytes)
michael@0 288 movq mm4, mm3 ; make a copy of current line
michael@0 289
michael@0 290 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 291 punpckhbw mm4, mm0 ; mm4 = src[4..7] (words)
michael@0 292
michael@0 293 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 294 pmullw mm4, mm1 ;
michael@0 295
michael@0 296 movq mm5, [rsi+1] ; mm5 = src[1..8] (bytes)
michael@0 297 movq mm6, mm5 ;
michael@0 298
michael@0 299 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 300 punpckhbw mm6, mm0 ; mm6 = src[5..8] (words)
michael@0 301
michael@0 302 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 303 pmullw mm6, mm2 ;
michael@0 304
michael@0 305 paddw mm3, mm5 ; mm3/mm4 = horizontal sums of the current row
michael@0 306 paddw mm4, mm6 ;
michael@0 307
michael@0 308 movq mm5, mm7 ; previous row's horizontal result (bytes)
michael@0 309 movq mm6, mm7 ;
michael@0 310
michael@0 311 punpcklbw mm5, mm0 ; mm5 = previous row, low 4 (words)
michael@0 312 punpckhbw mm6, mm0
michael@0 313
michael@0 314 pmullw mm5, [rax] ; previous row *= VFilter tap 0
michael@0 315 pmullw mm6, [rax] ;
michael@0 316
michael@0 317 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 318 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 319
michael@0 320 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 321 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 322
michael@0 323 movq mm7, mm3 ;
michael@0 324 packuswb mm7, mm4 ; mm7 = current row's horizontal result, kept for the next iteration
michael@0 325
michael@0 326
michael@0 327 pmullw mm3, [rax+16] ; current row *= VFilter tap 1
michael@0 328 pmullw mm4, [rax+16] ;
michael@0 329
michael@0 330 paddw mm3, mm5 ; vertical sums
michael@0 331 paddw mm4, mm6 ;
michael@0 332
michael@0 333
michael@0 334 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 335 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 336
michael@0 337 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 338 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 339
michael@0 340 packuswb mm3, mm4
michael@0 341
michael@0 342 movq [rdi], mm3 ; store 8 bytes in the destination
michael@0 343
; NOTE(review): dst_pitch is re-read from the argument area on every
; iteration (into r8 on the 64-bit path) although it is loop-invariant.
michael@0 344 %if ABI_IS_32BIT
michael@0 345 add rsi, rdx ; next line
michael@0 346 add rdi, dword ptr arg(5) ;dst_pitch
michael@0 347 %else
michael@0 348 movsxd r8, dword ptr arg(5) ;dst_pitch
michael@0 349 add rsi, rdx ; next line
michael@0 350 add rdi, r8 ;dst_pitch
michael@0 351 %endif
michael@0 352 cmp rdi, rcx ; reached dst_ptr + 8*dst_pitch?
michael@0 353 jne .next_row_8x8
michael@0 354
michael@0 355 ; begin epilog
michael@0 356 pop rdi
michael@0 357 pop rsi
michael@0 358 RESTORE_GOT
michael@0 359 UNSHADOW_ARGS
michael@0 360 pop rbp
michael@0 361 ret
michael@0 362
michael@0 363
; Two-pass bilinear prediction of an 8-wide, 4-row block (MMX).
;
; Horizontal pass, per source row:
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass, per output row:
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
; packed to unsigned bytes with saturation.
;
; The first source row is filtered horizontally before the loop; each loop
; iteration filters one more source row and emits one output row. The loop
; ends when rdi reaches dst_ptr + 4 * dst_pitch, so 5 source rows of 9 bytes
; each are read.
;
; Register roles inside the loop:
;   rsi = current source row        rdx = src_pixels_per_line
;   rdi = current output row        rcx = end-of-output sentinel
;   rax = VFilter (taps at [rax] and [rax+16])
;   mm0 = 0, mm1 = HFilter tap 0, mm2 = HFilter tap 1
;   mm7 = previous row's horizontal result, packed to 8 bytes
michael@0 364 ;void vp8_bilinear_predict8x4_mmx
michael@0 365 ;(
michael@0 366 ; unsigned char *src_ptr,
michael@0 367 ; int src_pixels_per_line,
michael@0 368 ; int xoffset,
michael@0 369 ; int yoffset,
michael@0 370 ; unsigned char *dst_ptr,
michael@0 371 ; int dst_pitch
michael@0 372 ;)
michael@0 373 global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
michael@0 374 sym(vp8_bilinear_predict8x4_mmx):
michael@0 375 push rbp
michael@0 376 mov rbp, rsp
michael@0 377 SHADOW_ARGS_TO_STACK 6
michael@0 378 GET_GOT rbx
michael@0 379 push rsi
michael@0 380 push rdi
michael@0 381 ; end prolog
michael@0 382
michael@0 383 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
michael@0 384 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
michael@0 385
michael@0 386 movsxd rax, dword ptr arg(2) ;xoffset
michael@0 387 mov rdi, arg(4) ;dst_ptr
michael@0 388
michael@0 389 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
; rax = xoffset * 32 (bytes per table entry)
michael@0 390 shl rax, 5
michael@0 391
michael@0 392 mov rsi, arg(0) ;src_ptr
; rax = HFilter
michael@0 393 add rax, rcx
michael@0 394
michael@0 395 movsxd rdx, dword ptr arg(5) ;dst_pitch (only needed for the sentinel below)
michael@0 396 movq mm1, [rax] ; mm1 = HFilter tap 0 (4 words)
michael@0 397
michael@0 398 movq mm2, [rax+16] ; mm2 = HFilter tap 1 (4 words)
michael@0 399 movsxd rax, dword ptr arg(3) ;yoffset
michael@0 400
michael@0 401 pxor mm0, mm0 ; mm0 = 0, used for byte->word unpacking
; rax = yoffset * 32, then VFilter
michael@0 402 shl rax, 5
michael@0 403
michael@0 404 add rax, rcx
michael@0 405 lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4*dst_pitch (loop end sentinel)
michael@0 406
michael@0 407 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is reused)
michael@0 408
michael@0 409 ; get the first horizontal line done
michael@0 410 movq mm3, [rsi] ; mm3 = src[0..7] (bytes)
michael@0 411 movq mm4, mm3 ; make a copy of current line
michael@0 412
michael@0 413 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 414 punpckhbw mm4, mm0 ; mm4 = src[4..7] (words)
michael@0 415
michael@0 416 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 417 pmullw mm4, mm1 ;
michael@0 418
michael@0 419 movq mm5, [rsi+1] ; mm5 = src[1..8] (bytes)
michael@0 420 movq mm6, mm5 ;
michael@0 421
michael@0 422 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 423 punpckhbw mm6, mm0 ; mm6 = src[5..8] (words)
michael@0 424
michael@0 425 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 426 pmullw mm6, mm2 ;
michael@0 427
michael@0 428 paddw mm3, mm5 ; mm3/mm4 = horizontal sums
michael@0 429 paddw mm4, mm6 ;
michael@0 430
michael@0 431 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 432 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 433
michael@0 434 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 435 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 436
michael@0 437 movq mm7, mm3 ;
michael@0 438 packuswb mm7, mm4 ; mm7 = first row's horizontal result (8 bytes)
michael@0 439
michael@0 440 add rsi, rdx ; next line
michael@0 441 .next_row_8x4:
michael@0 442 movq mm3, [rsi] ; mm3 = src[0..7] (bytes)
michael@0 443 movq mm4, mm3 ; make a copy of current line
michael@0 444
michael@0 445 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 446 punpckhbw mm4, mm0 ; mm4 = src[4..7] (words)
michael@0 447
michael@0 448 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 449 pmullw mm4, mm1 ;
michael@0 450
michael@0 451 movq mm5, [rsi+1] ; mm5 = src[1..8] (bytes)
michael@0 452 movq mm6, mm5 ;
michael@0 453
michael@0 454 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 455 punpckhbw mm6, mm0 ; mm6 = src[5..8] (words)
michael@0 456
michael@0 457 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 458 pmullw mm6, mm2 ;
michael@0 459
michael@0 460 paddw mm3, mm5 ; mm3/mm4 = horizontal sums of the current row
michael@0 461 paddw mm4, mm6 ;
michael@0 462
michael@0 463 movq mm5, mm7 ; previous row's horizontal result (bytes)
michael@0 464 movq mm6, mm7 ;
michael@0 465
michael@0 466 punpcklbw mm5, mm0 ; mm5 = previous row, low 4 (words)
michael@0 467 punpckhbw mm6, mm0
michael@0 468
michael@0 469 pmullw mm5, [rax] ; previous row *= VFilter tap 0
michael@0 470 pmullw mm6, [rax] ;
michael@0 471
michael@0 472 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 473 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 474
michael@0 475 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 476 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 477
michael@0 478 movq mm7, mm3 ;
michael@0 479 packuswb mm7, mm4 ; mm7 = current row's horizontal result, kept for the next iteration
michael@0 480
michael@0 481
michael@0 482 pmullw mm3, [rax+16] ; current row *= VFilter tap 1
michael@0 483 pmullw mm4, [rax+16] ;
michael@0 484
michael@0 485 paddw mm3, mm5 ; vertical sums
michael@0 486 paddw mm4, mm6 ;
michael@0 487
michael@0 488
michael@0 489 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 490 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 491
michael@0 492 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
michael@0 493 psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7
michael@0 494
michael@0 495 packuswb mm3, mm4
michael@0 496
michael@0 497 movq [rdi], mm3 ; store 8 bytes in the destination
michael@0 498
; NOTE(review): dst_pitch is re-read from the argument area on every
; iteration (into r8 on the 64-bit path) although it is loop-invariant.
michael@0 499 %if ABI_IS_32BIT
michael@0 500 add rsi, rdx ; next line
michael@0 501 add rdi, dword ptr arg(5) ;dst_pitch
michael@0 502 %else
michael@0 503 movsxd r8, dword ptr arg(5) ;dst_pitch
michael@0 504 add rsi, rdx ; next line
michael@0 505 add rdi, r8
michael@0 506 %endif
michael@0 507 cmp rdi, rcx ; reached dst_ptr + 4*dst_pitch?
michael@0 508 jne .next_row_8x4
michael@0 509
michael@0 510 ; begin epilog
michael@0 511 pop rdi
michael@0 512 pop rsi
michael@0 513 RESTORE_GOT
michael@0 514 UNSHADOW_ARGS
michael@0 515 pop rbp
michael@0 516 ret
michael@0 517
michael@0 518
; Two-pass bilinear prediction of a 4-wide, 4-row block (MMX).
;
; Horizontal pass, per source row:
;     h[x] = (src[x] * HFilter[0] + src[x+1] * HFilter[1] + 64) >> 7
; Vertical pass, per output row:
;     dst[x] = (h_prev[x] * VFilter[0] + h_cur[x] * VFilter[1] + 64) >> 7
; packed to unsigned bytes with saturation.
;
; The first source row is filtered horizontally before the loop; each loop
; iteration filters one more source row and emits one 4-byte output row. The
; loop ends when rdi reaches dst_ptr + 4 * dst_pitch, so 5 source rows of
; 5 bytes each are read.
;
; Register roles inside the loop:
;   rsi = current source row        rdx = src_pixels_per_line
;   rdi = current output row        rcx = end-of-output sentinel
;   rax = VFilter (taps at [rax] and [rax+16])
;   mm0 = 0, mm1 = HFilter tap 0, mm2 = HFilter tap 1
;   mm7 = previous row's horizontal result, packed to bytes (low 4 used)
michael@0 519 ;void vp8_bilinear_predict4x4_mmx
michael@0 520 ;(
michael@0 521 ; unsigned char *src_ptr,
michael@0 522 ; int src_pixels_per_line,
michael@0 523 ; int xoffset,
michael@0 524 ; int yoffset,
michael@0 525 ; unsigned char *dst_ptr,
michael@0 526 ; int dst_pitch
michael@0 527 ;)
michael@0 528 global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
michael@0 529 sym(vp8_bilinear_predict4x4_mmx):
michael@0 530 push rbp
michael@0 531 mov rbp, rsp
michael@0 532 SHADOW_ARGS_TO_STACK 6
michael@0 533 GET_GOT rbx
michael@0 534 push rsi
michael@0 535 push rdi
michael@0 536 ; end prolog
michael@0 537
michael@0 538 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
michael@0 539 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
michael@0 540
michael@0 541 movsxd rax, dword ptr arg(2) ;xoffset
michael@0 542 mov rdi, arg(4) ;dst_ptr
michael@0 543
michael@0 544 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
; rax = xoffset * 32 (bytes per table entry)
michael@0 545 shl rax, 5
michael@0 546
michael@0 547 add rax, rcx ; HFilter
michael@0 548 mov rsi, arg(0) ;src_ptr
michael@0 549
michael@0 550 movsxd rdx, dword ptr arg(5) ;dst_pitch (only needed for the sentinel below)
michael@0 551 movq mm1, [rax] ; mm1 = HFilter tap 0 (4 words)
michael@0 552
michael@0 553 movq mm2, [rax+16] ; mm2 = HFilter tap 1 (4 words)
michael@0 554 movsxd rax, dword ptr arg(3) ;yoffset
michael@0 555
michael@0 556 pxor mm0, mm0 ; mm0 = 0, used for byte->word unpacking
; rax = yoffset * 32, then VFilter
michael@0 557 shl rax, 5
michael@0 558
michael@0 559 add rax, rcx
michael@0 560 lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4*dst_pitch (loop end sentinel)
michael@0 561
michael@0 562 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is reused)
michael@0 563
michael@0 564 ; get the first horizontal line done
michael@0 565 movd mm3, [rsi] ; mm3 = src[0..3] (bytes)
michael@0 566 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 567
michael@0 568 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 569 movd mm5, [rsi+1] ; mm5 = src[1..4] (bytes)
michael@0 570
michael@0 571 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 572 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 573
michael@0 574 paddw mm3, mm5 ; horizontal sums
michael@0 575 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 576
michael@0 577 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 578
michael@0 579 movq mm7, mm3 ;
michael@0 580 packuswb mm7, mm0 ; mm7 = first row's horizontal result (bytes)
michael@0 581
michael@0 582 add rsi, rdx ; next line
michael@0 583 .next_row_4x4:
michael@0 584 movd mm3, [rsi] ; mm3 = src[0..3] (bytes)
michael@0 585 punpcklbw mm3, mm0 ; mm3 = src[0..3] (words)
michael@0 586
michael@0 587 pmullw mm3, mm1 ; *= HFilter tap 0
michael@0 588 movd mm5, [rsi+1] ; mm5 = src[1..4] (bytes)
michael@0 589
michael@0 590 punpcklbw mm5, mm0 ; mm5 = src[1..4] (words)
michael@0 591 pmullw mm5, mm2 ; *= HFilter tap 1
michael@0 592
michael@0 593 paddw mm3, mm5 ; horizontal sums of the current row
michael@0 594
michael@0 595 movq mm5, mm7 ; previous row's horizontal result (bytes)
michael@0 596 punpcklbw mm5, mm0 ; -> words
michael@0 597
michael@0 598 pmullw mm5, [rax] ; previous row *= VFilter tap 0
michael@0 599 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 600
michael@0 601 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 602 movq mm7, mm3 ;
michael@0 603
michael@0 604 packuswb mm7, mm0 ; mm7 = current row's horizontal result, kept for the next iteration
michael@0 605
michael@0 606 pmullw mm3, [rax+16] ; current row *= VFilter tap 1
michael@0 607 paddw mm3, mm5 ; vertical sums
michael@0 608
michael@0 609
michael@0 610 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
michael@0 611 psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
michael@0 612
michael@0 613 packuswb mm3, mm0
michael@0 614 movd [rdi], mm3 ; store 4 bytes in the destination
michael@0 615
; NOTE(review): dst_pitch is re-read from the argument area on every
; iteration (into r8 on the 64-bit path) although it is loop-invariant.
michael@0 616 %if ABI_IS_32BIT
michael@0 617 add rsi, rdx ; next line
michael@0 618 add rdi, dword ptr arg(5) ;dst_pitch
michael@0 619 %else
michael@0 620 movsxd r8, dword ptr arg(5) ;dst_pitch
michael@0 621 add rsi, rdx ; next line
michael@0 622 add rdi, r8
michael@0 623 %endif
michael@0 624
michael@0 625 cmp rdi, rcx ; reached dst_ptr + 4*dst_pitch?
michael@0 626 jne .next_row_4x4
michael@0 627
michael@0 628 ; begin epilog
michael@0 629 pop rdi
michael@0 630 pop rsi
michael@0 631 RESTORE_GOT
michael@0 632 UNSHADOW_ARGS
michael@0 633 pop rbp
michael@0 634 ret
michael@0 635
michael@0 636
michael@0 637
; Read-only data for the routines in this file.
michael@0 638 SECTION_RODATA
michael@0 639 align 16
; rd: rounding constant, four 16-bit words of 0x40 (64). It is added to each
; filter sum before the arithmetic right shift by VP8_FILTER_SHIFT (7), i.e.
; it is half of the 128 divisor.
michael@0 640 rd:
michael@0 641 times 4 dw 0x40
michael@0 642
; vp8_six_tap_mmx: exported table of eight 6-tap filters.
; Layout: 8 filters x 6 taps, each tap replicated as 8 words (16 bytes), so
; one filter occupies 96 bytes. The 16-byte tap spacing matches the
; [filter + 16*k] addressing used by the 6-tap routines in this file, which
; load the first 4 words of each tap.
; The six taps of every filter sum to 128 (= 1 << VP8_FILTER_SHIFT).
; NOTE(review): the table is not referenced within this file; presumably
; callers select an entry and pass it in as the vp8_filter argument.
michael@0 643 align 16
michael@0 644 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
michael@0 645 sym(vp8_six_tap_mmx):
; filter 0: pass-through (all weight on tap 2)
michael@0 646 times 8 dw 0
michael@0 647 times 8 dw 0
michael@0 648 times 8 dw 128
michael@0 649 times 8 dw 0
michael@0 650 times 8 dw 0
michael@0 651 times 8 dw 0
michael@0 652
; filter 1
michael@0 653 times 8 dw 0
michael@0 654 times 8 dw -6
michael@0 655 times 8 dw 123
michael@0 656 times 8 dw 12
michael@0 657 times 8 dw -1
michael@0 658 times 8 dw 0
michael@0 659
; filter 2
michael@0 660 times 8 dw 2
michael@0 661 times 8 dw -11
michael@0 662 times 8 dw 108
michael@0 663 times 8 dw 36
michael@0 664 times 8 dw -8
michael@0 665 times 8 dw 1
michael@0 666
; filter 3
michael@0 667 times 8 dw 0
michael@0 668 times 8 dw -9
michael@0 669 times 8 dw 93
michael@0 670 times 8 dw 50
michael@0 671 times 8 dw -6
michael@0 672 times 8 dw 0
michael@0 673
; filter 4: symmetric
michael@0 674 times 8 dw 3
michael@0 675 times 8 dw -16
michael@0 676 times 8 dw 77
michael@0 677 times 8 dw 77
michael@0 678 times 8 dw -16
michael@0 679 times 8 dw 3
michael@0 680
; filter 5: mirror image of filter 3
michael@0 681 times 8 dw 0
michael@0 682 times 8 dw -6
michael@0 683 times 8 dw 50
michael@0 684 times 8 dw 93
michael@0 685 times 8 dw -9
michael@0 686 times 8 dw 0
michael@0 687
; filter 6: mirror image of filter 2
michael@0 688 times 8 dw 1
michael@0 689 times 8 dw -8
michael@0 690 times 8 dw 36
michael@0 691 times 8 dw 108
michael@0 692 times 8 dw -11
michael@0 693 times 8 dw 2
michael@0 694
; filter 7: mirror image of filter 1
michael@0 695 times 8 dw 0
michael@0 696 times 8 dw -1
michael@0 697 times 8 dw 12
michael@0 698 times 8 dw 123
michael@0 699 times 8 dw -6
michael@0 700 times 8 dw 0
michael@0 701
michael@0 702

mercurial