;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7


;void vp8_filter_block1d_h6_mmx
;(
;   unsigned char *src_ptr,
;   unsigned short *output_ptr,
;   unsigned int src_pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx, arg(6)             ;vp8_filter

        movq        mm1, [rdx + 16]         ; do both the negative taps first!!!
        movq        mm2, [rdx + 32]         ;
        movq        mm6, [rdx + 48]         ;
        movq        mm7, [rdx + 64]         ;

        mov         rdi, arg(1)             ;output_ptr
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rcx, dword ptr arg(4)   ;output_height
        movsxd      rax, dword ptr arg(5)   ;output_width      ; destination pitch?
        pxor        mm0, mm0                ; mm0 = 00000000

.nextrow:
        movq        mm3, [rsi-2]            ; mm3 = p-2..p5
        movq        mm4, mm3                ; mm4 = p-2..p5
        psrlq       mm3, 8                  ; mm3 = p-1..p5
        punpcklbw   mm3, mm0                ; mm3 = p-1..p2
        pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.
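        ; note: kernel N lives at vp8_filter + 16*N (each tap is stored as 8
        ; replicated words). The remaining five taps are accumulated below,
        ; then the sum is biased by rd (64) and shifted by VP8_FILTER_SHIFT
        ; (7), i.e. out = (sum + 64) >> 7, before being packed back to bytes.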

        movq        mm5, mm4                ; mm5 = p-2..p5
        punpckhbw   mm4, mm0                ; mm4 = p2..p5
        pmullw      mm4, mm7                ; mm4 *= kernel 4 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, mm5                ; mm4 = p-2..p5
        psrlq       mm5, 16                 ; mm5 = p0..p5
        punpcklbw   mm5, mm0                ; mm5 = p0..p3
        pmullw      mm5, mm2                ; mm5 *= kernel 2 modifiers
        paddsw      mm3, mm5                ; mm3 += mm5

        movq        mm5, mm4                ; mm5 = p-2..p5
        psrlq       mm4, 24                 ; mm4 = p1..p5
        punpcklbw   mm4, mm0                ; mm4 = p1..p4
        pmullw      mm4, mm6                ; mm4 *= kernel 3 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        ; do outer positive taps
        movd        mm4, [rsi+3]
        punpcklbw   mm4, mm0                ; mm4 = p3..p6
        pmullw      mm4, [rdx+80]           ; mm4 *= kernel 5 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        punpcklbw   mm5, mm0                ; mm5 = p-2..p1
        pmullw      mm5, [rdx]              ; mm5 *= kernel 0 modifiers
        paddsw      mm3, mm5                ; mm3 += mm5

        paddsw      mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        packuswb    mm3, mm0                ; pack and unpack to saturate
        punpcklbw   mm3, mm0                ;

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(2)   ;src_pixels_per_line ; next line
        add         rdi, rax
%else
        movsxd      r8, dword ptr arg(2)    ;src_pixels_per_line
        add         rdi, rax

        add         rsi, r8                 ; next line
%endif

        dec         rcx                     ; decrement count
        jnz         .nextrow                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        movq        mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7)             ;vp8_filter
        movq        mm1, [rbx + 16]         ; do both the negative taps first!!!
        movq        mm2, [rbx + 32]         ;
        movq        mm6, [rbx + 48]         ;
        movq        mm7, [rbx + 64]         ;

        movsxd      rdx, dword ptr arg(3)   ;pixels_per_line
        mov         rdi, arg(1)             ;output_ptr
        mov         rsi, arg(0)             ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx
        movsxd      rcx, DWORD PTR arg(5)   ;output_height
        movsxd      rax, DWORD PTR arg(2)   ;output_pitch      ; destination pitch?
        pxor        mm0, mm0                ; mm0 = 00000000


.nextrow_cv:
        movq        mm3, [rsi+rdx]          ; mm3 = p0..p3 = row -1
        pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.


        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3 = row 2
        pmullw      mm4, mm7                ; mm4 *= kernel 4 modifiers.
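        ; note: rows -1 and 2 are multiplied above; rows -2, 0, 1 and 3 follow
        ; below. rsi is advanced by one line part-way through the loop so rows
        ; 1 and 3 remain addressable as [rsi + 2*rdx] and [rsi + 4*rdx] without
        ; needing a 3*pitch operand.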
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3 = row 0
        pmullw      mm4, mm2                ; mm4 *= kernel 2 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi]              ; mm4 = p0..p3 = row -2
        pmullw      mm4, [rbx]              ; mm4 *= kernel 0 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4


        add         rsi, rdx                ; move source forward 1 line to avoid 3 * pitch
        movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3 = row 1
        pmullw      mm4, mm6                ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3 = row 3
        pmullw      mm4, [rbx +80]          ; mm4 *= kernel 5 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4


        paddsw      mm3, mm5                ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        packuswb    mm3, mm0                ; pack and saturate

        movd        [rdi], mm3              ; store the results in the destination
        ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
        ; recon block should be in cache this shouldn't cost much. It's obviously
        ; avoidable!!!
        lea         rdi, [rdi+rax]
        dec         rcx                     ; decrement count
        jnz         .nextrow_cv             ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict8x8_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        shl         rax, 5                  ; offset * 32
        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax, rcx                ; HFilter
        mov         rsi, arg(0)             ;src_ptr

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;

        shl         rax, 5                  ; offset*32
        add         rax, rcx                ; VFilter

        lea         rcx, [rdi+rdx*8]        ;
        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line



        ; get the first horizontal line done
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
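        ; note: this is the two-tap horizontal pass, p[x]*HFilter[0] +
        ; p[x+1]*HFilter[1]; mm3 holds the low four sums and mm4 (accumulated
        ; next) the high four, before each is rounded with rd and shifted by
        ; VP8_FILTER_SHIFT.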
        paddw       mm4, mm6                ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;

        add         rsi, rdx                ; next line
.next_row_8x8:
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        movq        mm5, mm7                ;
        movq        mm6, mm7                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0

        pmullw      mm5, [rax]              ;
        pmullw      mm6, [rax]              ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;


        pmullw      mm3, [rax+16]           ;
        pmullw      mm4, [rax+16]           ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        packuswb    mm3, mm4

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8                 ;dst_pitch
%endif
        cmp         rdi, rcx                ;
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict8x4_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax, 5

        mov         rsi, arg(0)             ;src_ptr
        add         rax, rcx

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
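        ; note: mm1/mm2 now hold the two horizontal taps selected by xoffset;
        ; judging by the 32-byte stride used above, each entry of the external
        ; vp8_bilinear_filters_x86_8 table appears to be two taps of eight
        ; replicated words. The vertical taps are selected from yoffset below.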
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;
        shl         rax, 5

        add         rax, rcx
        lea         rcx, [rdi+rdx*4]        ;

        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

        ; get the first horizontal line done
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;

        add         rsi, rdx                ; next line
.next_row_8x4:
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        movq        mm5, mm7                ;
        movq        mm6, mm7                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0

        pmullw      mm5, [rax]              ;
        pmullw      mm6, [rax]              ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;


        pmullw      mm3, [rax+16]           ;
        pmullw      mm4, [rax+16]           ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        packuswb    mm3, mm4

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8
%endif
        cmp         rdi, rcx                ;
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict4x4_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
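; note: unlike the 8-pixel-wide variants above, this routine works on 4 pixels
; per row, so it loads and stores with movd and only the low halves of the MMX
; registers carry data.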
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax, 5

        add         rax, rcx                ; HFilter
        mov         rsi, arg(0)             ;src_ptr

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;
        shl         rax, 5

        add         rax, rcx
        lea         rcx, [rdi+rdx*4]        ;

        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

        ; get the first horizontal line done
        movd        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06

        pmullw      mm3, mm1                ;
        movd        mm5, [rsi+1]            ;

        punpcklbw   mm5, mm0                ;
        pmullw      mm5, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value

        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        movq        mm7, mm3                ;
        packuswb    mm7, mm0                ;

        add         rsi, rdx                ; next line
.next_row_4x4:
        movd        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06

        pmullw      mm3, mm1                ;
        movd        mm5, [rsi+1]            ;

        punpcklbw   mm5, mm0                ;
        pmullw      mm5, mm2                ;

        paddw       mm3, mm5                ;

        movq        mm5, mm7                ;
        punpcklbw   mm5, mm0                ;

        pmullw      mm5, [rax]              ;
        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value

        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        movq        mm7, mm3                ;

        packuswb    mm7, mm0                ;

        pmullw      mm3, [rax+16]           ;
        paddw       mm3, mm5                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        packuswb    mm3, mm0
        movd        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8
%endif

        cmp         rdi, rcx                ;
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret



SECTION_RODATA
align 16
rd:
    times 4 dw 0x40

align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0
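    ; note: each group of six "times 8 dw" rows is one sub-pixel filter phase:
    ; six taps, each stored as 8 identical words (16 bytes), so [ptr + 16*N]
    ; addresses tap N. The phase above is the identity filter (only the centre
    ; tap, 128, is non-zero); every phase's taps sum to 128, matching
    ; vp8_filter_weight.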

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0