;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: Intel operand order (NASM/yasm style).
; The helper macros used throughout (sym, arg, SHADOW_ARGS_TO_STACK,
; UNSHADOW_ARGS, GET_GOT, RESTORE_GOT, GLOBAL, PRIVATE, HIDDEN_DATA,
; SECTION_RODATA, ABI_IS_32BIT) are not defined in this file; they are
; presumably supplied by the include below.

%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7             ; results are scaled down by 1 << 7 = 128


;-----------------------------------------------------------------------
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter. Each loop iteration reads source bytes
; p-2..p6 of one row and writes four 16-bit results (each clamped to
; 0..255) to output_ptr.
;
; vp8_filter is read as six taps spaced 16 bytes apart (offsets 0, 16,
; 32, 48, 64, 80); only the first four words of each tap are used.
;
; Register roles inside the loop:
;   rsi = source row          rdi = output row
;   rcx = rows remaining      rax = arg(5), added to rdi after each row
;   rdx = vp8_filter          mm0 = zero
;   mm1, mm2, mm6, mm7 = taps 1, 2, 3, 4
; pixel_step (arg 3) is not read by this routine.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6)                  ; vp8_filter

        ; Taps 1..4 stay in registers for the whole loop; taps 0 and 5
        ; are multiplied straight from memory inside the loop.
        movq        mm1,    [rdx + 16]              ; tap 1
        movq        mm2,    [rdx + 32]              ; tap 2
        movq        mm6,    [rdx + 48]              ; tap 3
        movq        mm7,    [rdx + 64]              ; tap 4

        mov         rdi,    arg(1)                  ; output_ptr
        mov         rsi,    arg(0)                  ; src_ptr
        movsxd      rcx,    dword ptr arg(4)        ; output_height = row count
        movsxd      rax,    dword ptr arg(5)        ; output_width, used as the byte step between output rows
        pxor        mm0,    mm0                     ; mm0 = 0, used to widen bytes to words

.nextrow:
        movq        mm3,    [rsi-2]                 ; mm3 = bytes p-2..p5
        movq        mm4,    mm3                     ; mm4 = bytes p-2..p5
        psrlq       mm3,    8                       ; mm3 = bytes p-1..p5
        punpcklbw   mm3,    mm0                     ; mm3 = words p-1..p2
        pmullw      mm3,    mm1                     ; mm3 = p-1..p2 * tap 1

        movq        mm5,    mm4                     ; mm5 = bytes p-2..p5
        punpckhbw   mm4,    mm0                     ; mm4 = words p2..p5
        pmullw      mm4,    mm7                     ; mm4 = p2..p5 * tap 4
        paddsw      mm3,    mm4                     ; accumulate (signed saturating)

        movq        mm4,    mm5                     ; mm4 = bytes p-2..p5
        psrlq       mm5,    16                      ; mm5 = bytes p0..p5
        punpcklbw   mm5,    mm0                     ; mm5 = words p0..p3
        pmullw      mm5,    mm2                     ; mm5 = p0..p3 * tap 2
        paddsw      mm3,    mm5                     ; accumulate

        movq        mm5,    mm4                     ; mm5 = bytes p-2..p5 (kept for tap 0 below)
        psrlq       mm4,    24                      ; mm4 = bytes p1..p5
        punpcklbw   mm4,    mm0                     ; mm4 = words p1..p4
        pmullw      mm4,    mm6                     ; mm4 = p1..p4 * tap 3
        paddsw      mm3,    mm4                     ; accumulate

        ; outer taps (0 and 5)
        movd        mm4,    [rsi+3]                 ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0                     ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]                ; mm4 = p3..p6 * tap 5
        paddsw      mm3,    mm4                     ; accumulate

        punpcklbw   mm5,    mm0                     ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]                   ; mm5 = p-2..p1 * tap 0
        paddsw      mm3,    mm5                     ; accumulate

        paddsw      mm3,    [GLOBAL(rd)]            ; add rounding constant (64)
        psraw       mm3,    VP8_FILTER_SHIFT        ; divide by 128
        packuswb    mm3,    mm0                     ; clamp to 0..255 ...
        punpcklbw   mm3,    mm0                     ; ... and widen back to words

        movq        [rdi],  mm3                     ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2)        ; src_pixels_per_line: next source row
        add         rdi,    rax                     ; next output row
%else
        movsxd      r8,     dword ptr arg(2)        ; src_pixels_per_line
        add         rdi,    rax                     ; next output row

        add         rsi,    r8                      ; next source row
%endif

        dec         rcx                             ; one row done
        jnz         .nextrow

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit input. Each loop iteration combines
; four words from each of six input rows (rows -2..3 relative to the
; current row) and writes four clamped bytes to output_ptr.
;
; vp8_filter uses the same layout as in vp8_filter_block1d_h6_mmx: six
; taps, 16 bytes apart.
;
; Register roles inside the loop:
;   rsi = address of row -2   rdi = output row
;   rdx = pixels_per_line, added to rsi directly as a byte offset
;   rcx = rows remaining      rax = output_pitch
;   rbx = vp8_filter (caller's rbx is saved and restored around this use)
;   mm0 = zero                mm5 = rounding constant
;   mm1, mm2, mm6, mm7 = taps 1, 2, 3, 4
; pixel_step (arg 4) and output_width (arg 6) are not read.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; rd is fetched before rbx is repurposed as the filter pointer.
        ; NOTE(review): presumably GLOBAL() may address through the GOT
        ; register on some targets - confirm against the macro definition.
        movq        mm5,    [GLOBAL(rd)]            ; mm5 = rounding constant (64)
        push        rbx
        mov         rbx,    arg(7)                  ; vp8_filter
        movq        mm1,    [rbx + 16]              ; tap 1
        movq        mm2,    [rbx + 32]              ; tap 2
        movq        mm6,    [rbx + 48]              ; tap 3
        movq        mm7,    [rbx + 64]              ; tap 4

        movsxd      rdx,    dword ptr arg(3)        ; pixels_per_line
        mov         rdi,    arg(1)                  ; output_ptr
        mov         rsi,    arg(0)                  ; src_ptr
        sub         rsi,    rdx
        sub         rsi,    rdx                     ; rsi = src_ptr - 2 * rdx (row -2)
        movsxd      rcx,    DWORD PTR arg(5)        ; output_height = row count
        movsxd      rax,    DWORD PTR arg(2)        ; output_pitch
        pxor        mm0,    mm0                     ; mm0 = 0


.nextrow_cv:
        movq        mm3,    [rsi+rdx]               ; mm3 = p0..p3 of row -1
        pmullw      mm3,    mm1                     ; mm3 *= tap 1


        movq        mm4,    [rsi + 4*rdx]           ; mm4 = p0..p3 of row 2
        pmullw      mm4,    mm7                     ; mm4 *= tap 4
        paddsw      mm3,    mm4                     ; accumulate (signed saturating)

        movq        mm4,    [rsi + 2*rdx]           ; mm4 = p0..p3 of row 0
        pmullw      mm4,    mm2                     ; mm4 *= tap 2
        paddsw      mm3,    mm4                     ; accumulate

        movq        mm4,    [rsi]                   ; mm4 = p0..p3 of row -2
        pmullw      mm4,    [rbx]                   ; mm4 *= tap 0
        paddsw      mm3,    mm4                     ; accumulate


        add         rsi,    rdx                     ; step one row now; avoids needing a 3 * pitch offset
        movq        mm4,    [rsi + 2*rdx]           ; mm4 = p0..p3 of row 1
        pmullw      mm4,    mm6                     ; mm4 *= tap 3
        paddsw      mm3,    mm4                     ; accumulate

        movq        mm4,    [rsi + 4*rdx]           ; mm4 = p0..p3 of row 3
        pmullw      mm4,    [rbx +80]               ; mm4 *= tap 5
        paddsw      mm3,    mm4                     ; accumulate


        paddsw      mm3,    mm5                     ; add rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; divide by 128
        packuswb    mm3,    mm0                     ; pack to bytes, clamped to 0..255

        movd        [rdi],mm3                       ; store four result bytes
        ; Each iteration re-reads five of the six rows loaded by the
        ; previous one. The original authors noted this is avoidable but
        ; expected it to be cheap because the data should be in cache.
        lea         rdi,    [rdi+rax]               ; next output row
        dec         rcx                             ; one row done
        jnz         .nextrow_cv

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear filter producing 8 rows of 8 bytes.
;
; Filter selection:
;   HFilter = vp8_bilinear_filters_x86_8 + xoffset * 32
;   VFilter = vp8_bilinear_filters_x86_8 + yoffset * 32
; Each filter supplies two taps, at byte offsets 0 and 16.
;
; Each source row is filtered horizontally, rounded and packed to bytes
; (kept in mm7). Each output row is the vertical blend of the previous
; packed row and the current one. Nine source rows are read in total,
; and each row read touches bytes 0..8.
;
; Register roles inside the loop:
;   rsi = source row          rdx = src_pixels_per_line
;   rdi = destination row     rcx = dst_ptr + 8 * dst_pitch (loop end)
;   rax = VFilter             mm0 = zero
;   mm1, mm2 = HFilter taps 0, 1
;   mm7 = previous row after the horizontal pass (8 bytes)
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,    dword ptr arg(2)        ; xoffset
        mov         rdi,    arg(4)                  ; dst_ptr

        shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)
        lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax,    rcx                     ; rax = HFilter
        mov         rsi,    arg(0)                  ; src_ptr

        movsxd      rdx,    dword ptr arg(5)        ; dst_pitch
        movq        mm1,    [rax]                   ; HFilter tap 0

        movq        mm2,    [rax+16]                ; HFilter tap 1
        movsxd      rax,    dword ptr arg(3)        ; yoffset

        pxor        mm0,    mm0                     ; mm0 = 0

        shl         rax,    5                       ; yoffset * 32
        add         rax,    rcx                     ; rax = VFilter

        lea         rcx,    [rdi+rdx*8]             ; rcx = dst_ptr + 8 * dst_pitch (end marker)
        movsxd      rdx,    dword ptr arg(1)        ; src_pixels_per_line



        ; horizontal pass over the first source row
        movq        mm3,    [rsi]                   ; 8 source bytes: p0..p7
        movq        mm4,    mm3                     ; copy for the high half

        punpcklbw   mm3,    mm0                     ; words p0..p3
        punpckhbw   mm4,    mm0                     ; words p4..p7

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        pmullw      mm4,    mm1

        movq        mm5,    [rsi+1]                 ; 8 source bytes: p1..p8
        movq        mm6,    mm5

        punpcklbw   mm5,    mm0                     ; words p1..p4
        punpckhbw   mm6,    mm0                     ; words p5..p8

        pmullw      mm5,    mm2                     ; * HFilter tap 1
        pmullw      mm6,    mm2

        paddw       mm3,    mm5                     ; sum of both taps, low half
        paddw       mm4,    mm6                     ; sum of both taps, high half

        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        movq        mm7,    mm3
        packuswb    mm7,    mm4                     ; mm7 = first row, packed to 8 bytes

        add         rsi,    rdx                     ; next source row
.next_row_8x8:
        ; horizontal pass over the current source row
        movq        mm3,    [rsi]                   ; 8 source bytes: p0..p7
        movq        mm4,    mm3                     ; copy for the high half

        punpcklbw   mm3,    mm0                     ; words p0..p3
        punpckhbw   mm4,    mm0                     ; words p4..p7

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        pmullw      mm4,    mm1

        movq        mm5,    [rsi+1]                 ; 8 source bytes: p1..p8
        movq        mm6,    mm5

        punpcklbw   mm5,    mm0                     ; words p1..p4
        punpckhbw   mm6,    mm0                     ; words p5..p8

        pmullw      mm5,    mm2                     ; * HFilter tap 1
        pmullw      mm6,    mm2

        paddw       mm3,    mm5
        paddw       mm4,    mm6

        ; vertical pass, part 1: previous row * VFilter tap 0
        movq        mm5,    mm7
        movq        mm6,    mm7

        punpcklbw   mm5,    mm0                     ; previous row, low 4 as words
        punpckhbw   mm6,    mm0                     ; previous row, high 4 as words

        pmullw      mm5,    [rax]
        pmullw      mm6,    [rax]

        paddw       mm3,    [GLOBAL(rd)]            ; round the horizontal result
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        movq        mm7,    mm3
        packuswb    mm7,    mm4                     ; keep this row (packed) for the next iteration


        ; vertical pass, part 2: current row * VFilter tap 1
        pmullw      mm3,    [rax+16]
        pmullw      mm4,    [rax+16]

        paddw       mm3,    mm5
        paddw       mm4,    mm6


        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        packuswb    mm3,    mm4                     ; 8 result bytes, clamped to 0..255

        movq        [rdi],  mm3                     ; store one destination row

%if ABI_IS_32BIT
        add         rsi,    rdx                     ; next source row
        add         rdi,    dword ptr arg(5)        ; dst_pitch: next destination row
%else
        movsxd      r8,     dword ptr arg(5)        ; dst_pitch
        add         rsi,    rdx                     ; next source row
        add         rdi,    r8                      ; next destination row
%endif
        cmp         rdi,    rcx                     ; reached dst_ptr + 8 * dst_pitch?
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Same algorithm and register roles as vp8_bilinear_predict8x8_mmx, but
; the loop ends at dst_ptr + 4 * dst_pitch, so 4 rows of 8 bytes are
; written and five source rows are read.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,    dword ptr arg(2)        ; xoffset
        mov         rdi,    arg(4)                  ; dst_ptr

        lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)

        mov         rsi,    arg(0)                  ; src_ptr
        add         rax,    rcx                     ; rax = HFilter

        movsxd      rdx,    dword ptr arg(5)        ; dst_pitch
        movq        mm1,    [rax]                   ; HFilter tap 0

        movq        mm2,    [rax+16]                ; HFilter tap 1
        movsxd      rax,    dword ptr arg(3)        ; yoffset

        pxor        mm0,    mm0                     ; mm0 = 0
        shl         rax,    5                       ; yoffset * 32

        add         rax,    rcx                     ; rax = VFilter
        lea         rcx,    [rdi+rdx*4]             ; rcx = dst_ptr + 4 * dst_pitch (end marker)

        movsxd      rdx,    dword ptr arg(1)        ; src_pixels_per_line

        ; horizontal pass over the first source row
        movq        mm3,    [rsi]                   ; 8 source bytes: p0..p7
        movq        mm4,    mm3                     ; copy for the high half

        punpcklbw   mm3,    mm0                     ; words p0..p3
        punpckhbw   mm4,    mm0                     ; words p4..p7

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        pmullw      mm4,    mm1

        movq        mm5,    [rsi+1]                 ; 8 source bytes: p1..p8
        movq        mm6,    mm5

        punpcklbw   mm5,    mm0                     ; words p1..p4
        punpckhbw   mm6,    mm0                     ; words p5..p8

        pmullw      mm5,    mm2                     ; * HFilter tap 1
        pmullw      mm6,    mm2

        paddw       mm3,    mm5
        paddw       mm4,    mm6

        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        movq        mm7,    mm3
        packuswb    mm7,    mm4                     ; mm7 = first row, packed to 8 bytes

        add         rsi,    rdx                     ; next source row
.next_row_8x4:
        ; horizontal pass over the current source row
        movq        mm3,    [rsi]                   ; 8 source bytes: p0..p7
        movq        mm4,    mm3                     ; copy for the high half

        punpcklbw   mm3,    mm0                     ; words p0..p3
        punpckhbw   mm4,    mm0                     ; words p4..p7

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        pmullw      mm4,    mm1

        movq        mm5,    [rsi+1]                 ; 8 source bytes: p1..p8
        movq        mm6,    mm5

        punpcklbw   mm5,    mm0                     ; words p1..p4
        punpckhbw   mm6,    mm0                     ; words p5..p8

        pmullw      mm5,    mm2                     ; * HFilter tap 1
        pmullw      mm6,    mm2

        paddw       mm3,    mm5
        paddw       mm4,    mm6

        ; vertical pass, part 1: previous row * VFilter tap 0
        movq        mm5,    mm7
        movq        mm6,    mm7

        punpcklbw   mm5,    mm0                     ; previous row, low 4 as words
        punpckhbw   mm6,    mm0                     ; previous row, high 4 as words

        pmullw      mm5,    [rax]
        pmullw      mm6,    [rax]

        paddw       mm3,    [GLOBAL(rd)]            ; round the horizontal result
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        movq        mm7,    mm3
        packuswb    mm7,    mm4                     ; keep this row (packed) for the next iteration


        ; vertical pass, part 2: current row * VFilter tap 1
        pmullw      mm3,    [rax+16]
        pmullw      mm4,    [rax+16]

        paddw       mm3,    mm5
        paddw       mm4,    mm6


        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        paddw       mm4,    [GLOBAL(rd)]
        psraw       mm4,    VP8_FILTER_SHIFT

        packuswb    mm3,    mm4                     ; 8 result bytes, clamped to 0..255

        movq        [rdi],  mm3                     ; store one destination row

%if ABI_IS_32BIT
        add         rsi,    rdx                     ; next source row
        add         rdi,    dword ptr arg(5)        ; dst_pitch: next destination row
%else
        movsxd      r8,     dword ptr arg(5)        ; dst_pitch
        add         rsi,    rdx                     ; next source row
        add         rdi,    r8                      ; next destination row
%endif
        cmp         rdi,    rcx                     ; reached dst_ptr + 4 * dst_pitch?
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear filter producing 4 rows of 4 bytes. The structure
; matches the 8-wide routines, but only the low four pixels are
; processed (movd loads and stores), so one MMX register holds a whole
; row. Five source rows are read; each row read touches bytes 0..4.
;
; Register roles inside the loop:
;   rsi = source row          rdx = src_pixels_per_line
;   rdi = destination row     rcx = dst_ptr + 4 * dst_pitch (loop end)
;   rax = VFilter             mm0 = zero
;   mm1, mm2 = HFilter taps 0, 1
;   mm7 = previous row after the horizontal pass (4 bytes)
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,    dword ptr arg(2)        ; xoffset
        mov         rdi,    arg(4)                  ; dst_ptr

        lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)

        add         rax,    rcx                     ; rax = HFilter
        mov         rsi,    arg(0)                  ; src_ptr

        movsxd      rdx,    dword ptr arg(5)        ; dst_pitch
        movq        mm1,    [rax]                   ; HFilter tap 0

        movq        mm2,    [rax+16]                ; HFilter tap 1
        movsxd      rax,    dword ptr arg(3)        ; yoffset

        pxor        mm0,    mm0                     ; mm0 = 0
        shl         rax,    5                       ; yoffset * 32

        add         rax,    rcx                     ; rax = VFilter
        lea         rcx,    [rdi+rdx*4]             ; rcx = dst_ptr + 4 * dst_pitch (end marker)

        movsxd      rdx,    dword ptr arg(1)        ; src_pixels_per_line

        ; horizontal pass over the first source row
        movd        mm3,    [rsi]                   ; 4 source bytes: p0..p3
        punpcklbw   mm3,    mm0                     ; words p0..p3

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        movd        mm5,    [rsi+1]                 ; 4 source bytes: p1..p4

        punpcklbw   mm5,    mm0                     ; words p1..p4
        pmullw      mm5,    mm2                     ; * HFilter tap 1

        paddw       mm3,    mm5
        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant

        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        movq        mm7,    mm3
        packuswb    mm7,    mm0                     ; mm7 = first row, packed to 4 bytes

        add         rsi,    rdx                     ; next source row
.next_row_4x4:
        ; horizontal pass over the current source row
        movd        mm3,    [rsi]                   ; 4 source bytes: p0..p3
        punpcklbw   mm3,    mm0                     ; words p0..p3

        pmullw      mm3,    mm1                     ; * HFilter tap 0
        movd        mm5,    [rsi+1]                 ; 4 source bytes: p1..p4

        punpcklbw   mm5,    mm0                     ; words p1..p4
        pmullw      mm5,    mm2                     ; * HFilter tap 1

        paddw       mm3,    mm5

        ; vertical pass, part 1: previous row * VFilter tap 0
        movq        mm5,    mm7
        punpcklbw   mm5,    mm0                     ; previous row as words

        pmullw      mm5,    [rax]
        paddw       mm3,    [GLOBAL(rd)]            ; round the horizontal result

        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128
        movq        mm7,    mm3

        packuswb    mm7,    mm0                     ; keep this row (packed) for the next iteration

        ; vertical pass, part 2: current row * VFilter tap 1
        pmullw      mm3,    [rax+16]
        paddw       mm3,    mm5


        paddw       mm3,    [GLOBAL(rd)]            ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 /= 128

        packuswb    mm3,    mm0                     ; 4 result bytes, clamped to 0..255
        movd        [rdi],  mm3                     ; store one destination row

%if ABI_IS_32BIT
        add         rsi,    rdx                     ; next source row
        add         rdi,    dword ptr arg(5)        ; dst_pitch: next destination row
%else
        movsxd      r8,     dword ptr arg(5)        ; dst_pitch
        add         rsi,    rdx                     ; next source row
        add         rdi,    r8                      ; next destination row
%endif

        cmp         rdi,    rcx                     ; reached dst_ptr + 4 * dst_pitch?
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant: 0x40 = 64 = half of (1 << VP8_FILTER_SHIFT), in each
; of four words.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: eight filters, six taps each, with every tap
; value repeated across 8 words (16 bytes). Within a filter, tap k is
; therefore at byte offset 16 * k, matching the [filter + 16 * k]
; addressing used by the six-tap routines in this file.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    ; filter 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0
