;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; convolve_fn <variant>
;
; Expands to one block-copy kernel:
;   convolve_copy: dst[r][c] = src[r][c]
;   convolve_avg:  dst[r][c] = rounded average of src[r][c] and dst[r][c]
;                  (selected via the %ifidn %1, avg blocks, using pavgb)
;
; C-style signature (via cglobal named args):
;   void convolve_%1(const uint8_t *src, ptrdiff_t src_stride,
;                    uint8_t *dst, ptrdiff_t dst_stride,
;                    const int16_t *fx, int fxs,
;                    const int16_t *fy, int fys, int w, int h);
; The filter arguments (fx/fxs/fy/fys) are declared but never read —
; this kernel ignores them (copy/avg needs no filtering).
;
; Width is dispatched to a dedicated loop for w = 4, 8, 16, 32; any other
; width falls through to the 64-wide loop.  Loads use movu (unaligned);
; stores for w >= 8 use mova, so dst is assumed 16-byte aligned for the
; XMM paths — NOTE(review): confirm dst alignment guarantee at call sites.
; The 32-wide loop steps 2 rows and the 16/8/4 loops step 4 rows per
; iteration, so h is presumably a multiple of 2 (w=32) or 4 (w<=16) —
; TODO confirm against callers.
;
; Register use: r4d = width for dispatch, then row counter; r5q/r6q hold
; 3*src_stride / 3*dst_stride so four consecutive rows can be addressed
; without extra pointer updates.
;------------------------------------------------------------------------------
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  ; Dispatch on width; fall through to the 64-wide loop by default.
  mov r4d, dword wm
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

  ; w = 64: one row per iteration, four 16-byte XMM transfers.
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  ; avg variant: blend with existing dst bytes (rounded average).
  pavgb m0, [dstq]
  pavgb m1, [dstq+16]
  pavgb m2, [dstq+32]
  pavgb m3, [dstq+48]
%endif
  mova [dstq   ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop64
  RET

  ; w = 32: two rows per iteration (2 x 16 bytes each).
.w32:
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq            +16]
  pavgb m2, [dstq+dst_strideq]
  pavgb m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET

  ; w = 16: four rows per iteration; r5q/r6q = 3*stride for row 3.
.w16:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  mova [dstq               ], m0
  mova [dstq+dst_strideq   ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET

  ; w = 8: switch to 8-byte MMX registers (SSE for pavgb), four rows
  ; per iteration.
INIT_MMX sse
.w8:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  mova [dstq               ], m0
  mova [dstq+dst_strideq   ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

  ; w = 4: four rows per iteration, 4-byte transfers via movh
  ; (low half of an MMX register).
.w4:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  movh [dstq               ], m0
  movh [dstq+dst_strideq   ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endmacro

; Instantiate both variants: plain copy and destination-averaging copy.
convolve_fn copy
convolve_fn avg