;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constants: added before the arithmetic right shift so the
; DC average is rounded to nearest instead of truncated.
pw_4:  times 8 dw 4
pw_8:  times 8 dw 8
pw_16: times 8 dw 16
pw_32: times 8 dw 32

SECTION .text

;------------------------------------------------------------------------------
; void dc_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
;                       const uint8_t *above, const uint8_t *left)
; DC intra prediction: fill the 4x4 block with the rounded average of the
; 4 above pixels and the 4 left pixels.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movd        m0, [aboveq]
  punpckldq   m0, [leftq]              ; m0 = above[0..3] | left[0..3]
  psadbw      m0, m1                   ; sum of the 8 boundary bytes
  paddw       m0, [GLOBAL(pw_4)]       ; + 4 for round-to-nearest
  psraw       m0, 3                    ; / 8
  pshufw      m0, m0, 0x0              ; broadcast DC value to all words
  packuswb    m0, m0                   ; words -> bytes (clamped)
  movd      [dstq          ], m0
  movd      [dstq+strideq  ], m0
  lea         dstq, [dstq+strideq*2]
  movd      [dstq          ], m0
  movd      [dstq+strideq  ], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; void dc_predictor_8x8(uint8_t *dst, ptrdiff_t stride,
;                       const uint8_t *above, const uint8_t *left)
; DC prediction for an 8x8 block: average of 8 above + 8 left pixels.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [aboveq]
  movq        m2, [leftq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1                   ; sum of above row
  psadbw      m2, m1                   ; sum of left column
  paddw       m0, m2
  paddw       m0, [GLOBAL(pw_8)]       ; + 8 for rounding
  psraw       m0, 4                    ; / 16
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movq      [dstq          ], m0
  movq      [dstq+strideq  ], m0
  movq      [dstq+strideq*2], m0
  movq      [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq      [dstq          ], m0
  movq      [dstq+strideq  ], m0
  movq      [dstq+strideq*2], m0
  movq      [dstq+stride3q ], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; void dc_predictor_16x16(uint8_t *dst, ptrdiff_t stride,
;                         const uint8_t *above, const uint8_t *left)
; DC prediction for a 16x16 block; rows are written 4 at a time in a loop.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4
  psadbw      m0, m1                   ; per-qword byte sums of above
  psadbw      m2, m1                   ; per-qword byte sums of left
  paddw       m0, m2
  movhlps     m2, m0
  paddw       m0, m2                   ; fold high qword into low: total sum
  paddw       m0, [GLOBAL(pw_16)]      ; + 16 for rounding
  psraw       m0, 5                    ; / 32
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0                   ; broadcast DC word across the xmm reg
  packuswb    m0, m0
.loop:
  mova      [dstq          ], m0
  mova      [dstq+strideq  ], m0
  mova      [dstq+strideq*2], m0
  mova      [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; void dc_predictor_32x32(uint8_t *dst, ptrdiff_t stride,
;                         const uint8_t *above, const uint8_t *left)
; DC prediction for a 32x32 block; each row needs two 16-byte stores.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [aboveq+16]
  mova        m3, [leftq]
  mova        m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8
  psadbw      m0, m1
  psadbw      m2, m1
  psadbw      m3, m1
  psadbw      m4, m1
  paddw       m0, m2
  paddw       m0, m3
  paddw       m0, m4
  movhlps     m2, m0
  paddw       m0, m2                   ; total sum of 64 boundary pixels
  paddw       m0, [GLOBAL(pw_32)]      ; + 32 for rounding
  psraw       m0, 6                    ; / 64
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova      [dstq             ], m0
  mova      [dstq          +16], m0
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq  +16], m0
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m0
  mova      [dstq+stride3q    ], m0
  mova      [dstq+stride3q +16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; void v_predictor_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above)
; Vertical prediction: replicate the above row into every row of the block.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd        m0, [aboveq]
  movd      [dstq          ], m0
  movd      [dstq+strideq  ], m0
  lea         dstq, [dstq+strideq*2]
  movd      [dstq          ], m0
  movd      [dstq+strideq  ], m0
  RET

;------------------------------------------------------------------------------
; void v_predictor_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above)
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movq      [dstq          ], m0
  movq      [dstq+strideq  ], m0
  movq      [dstq+strideq*2], m0
  movq      [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq      [dstq          ], m0
  movq      [dstq+strideq  ], m0
  movq      [dstq+strideq*2], m0
  movq      [dstq+stride3q ], m0
  RET

;------------------------------------------------------------------------------
; void v_predictor_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 4
.loop:
  mova      [dstq          ], m0
  mova      [dstq+strideq  ], m0
  mova      [dstq+strideq*2], m0
  mova      [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; void v_predictor_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova        m0, [aboveq]
  mova        m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 8
.loop:
  mova      [dstq             ], m0
  mova      [dstq          +16], m1
  mova      [dstq+strideq     ], m0
  mova      [dstq+strideq  +16], m1
  mova      [dstq+strideq*2   ], m0
  mova      [dstq+strideq*2+16], m1
  mova      [dstq+stride3q    ], m0
  mova      [dstq+stride3q +16], m1
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; void tm_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
;                       const uint8_t *above, const uint8_t *left)
; TrueMotion prediction: dst[r][c] = clip(left[r] + above[c] - topleft),
; where topleft = above[-1]. Two rows are produced per loop iteration.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]           ; topleft pixel
  movd        m0, [aboveq]
  punpcklbw   m2, m1                   ; widen to words (headroom for subtract)
  punpcklbw   m0, m1
  pshufw      m2, m2, 0x0              ; broadcast topleft
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -2                ; count rows in pairs, up to zero
  add         leftq, 4                 ; index left[] from the end via lineq
  psubw       m0, m2                   ; m0 = above[c] - topleft
.loop:
  movd        m2, [leftq+lineq*2]      ; left pixel for even row
  movd        m3, [leftq+lineq*2+1]    ; left pixel for odd row
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshufw      m2, m2, 0x0
  pshufw      m3, m3, 0x0
  paddw       m2, m0                   ; left[r] + (above[c] - topleft)
  paddw       m3, m0
  packuswb    m2, m2                   ; clamp to [0, 255]
  packuswb    m3, m3
  movd      [dstq        ], m2
  movd      [dstq+strideq], m3
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; void tm_predictor_8x8(uint8_t *dst, ptrdiff_t stride,
;                       const uint8_t *above, const uint8_t *left)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]
  movq        m0, [aboveq]
  punpcklbw   m2, m1
  punpcklbw   m0, m1
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -4
  punpcklqdq  m2, m2                   ; broadcast topleft to all 8 words
  add         leftq, 8
  psubw       m0, m2                   ; above[c] - topleft
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m3, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m3, m3, 0x0
  punpcklqdq  m2, m2
  punpcklqdq  m3, m3
  paddw       m2, m0
  paddw       m3, m0
  packuswb    m2, m3                   ; rows packed low/high in one register
  movq      [dstq        ], m2
  movhps    [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; void tm_predictor_16x16(uint8_t *dst, ptrdiff_t stride,
;                         const uint8_t *above, const uint8_t *left)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]
  mova        m0, [aboveq]
  punpcklbw   m2, m1
  punpckhbw   m4, m0, m1               ; above[8..15] as words
  punpcklbw   m0, m1                   ; above[0..7] as words
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -8
  punpcklqdq  m2, m2
  add         leftq, 16
  psubw       m0, m2                   ; low half:  above[c] - topleft
  psubw       m4, m2                   ; high half: above[c] - topleft
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m3, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m3, m3, 0x0
  punpcklqdq  m2, m2
  punpcklqdq  m3, m3
  paddw       m5, m2, m0
  paddw       m6, m3, m0
  paddw       m2, m4
  paddw       m3, m4
  packuswb    m5, m2                   ; full 16-pixel even row
  packuswb    m6, m3                   ; full 16-pixel odd row
  mova      [dstq        ], m5
  mova      [dstq+strideq], m6
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

%if ARCH_X86_64
;------------------------------------------------------------------------------
; void tm_predictor_32x32(uint8_t *dst, ptrdiff_t stride,
;                         const uint8_t *above, const uint8_t *left)
; x86-64 only: needs xmm8/xmm9, which do not exist in 32-bit mode.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]
  mova        m0, [aboveq]
  mova        m4, [aboveq+16]
  punpcklbw   m2, m1
  punpckhbw   m3, m0, m1
  punpckhbw   m5, m4, m1
  punpcklbw   m0, m1
  punpcklbw   m4, m1
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -16
  punpcklqdq  m2, m2
  add         leftq, 32
  psubw       m0, m2                   ; four 8-word lanes of above - topleft
  psubw       m3, m2
  psubw       m4, m2
  psubw       m5, m2
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m6, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m6, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m6, m6, 0x0
  punpcklqdq  m2, m2
  punpcklqdq  m6, m6
  paddw       m7, m2, m0               ; even row, pixels 0..15
  paddw       m8, m2, m3
  paddw       m9, m2, m4               ; even row, pixels 16..31
  paddw       m2, m5
  packuswb    m7, m8
  packuswb    m9, m2
  paddw       m2, m6, m0               ; odd row, pixels 0..15
  paddw       m8, m6, m3
  mova      [dstq           ], m7      ; stores interleaved with arithmetic
  paddw       m7, m6, m4               ; odd row, pixels 16..31
  paddw       m6, m5
  mova      [dstq        +16], m9
  packuswb    m2, m8
  packuswb    m7, m6
  mova      [dstq+strideq   ], m2
  mova      [dstq+strideq+16], m7
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET
%endif