;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[r][c] = src[r][c] - pred[r][c] for a rows x cols block of
; 8-bit pixels, widening to int16.  cols is one of {4, 8, 16, 32, 64};
; anything that is not 4/8/16/32 falls through to the 64-wide loop.
;
; Register roles (after cglobal argument loading):
;   rowsd   - remaining row count (loop counter)
;   colsq   - reused as pred_stride once dispatch is done (see pred_str)
;   diffq   - output pointer (int16), diff_strideq is in elements
;   srcq    - source pixels, src_strideq in bytes
;   predq   - prediction pixels, pred stride reloaded from stack (pred_stridemp)
;   m7      - dedicated zero register for byte->word unpacking
; Clobbers: xmm0-xmm7 (and mm0-mm3/mm7 on the 4-wide path), flags.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; cols is fully decoded by the dispatch below, so its register can be
; recycled to hold pred_stride (which did not get a register of its own).
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16 %1-%2: byte offsets of the two 16-pixel src loads
;        %3-%4: byte offsets of the two 16-pixel pred loads
;        %5-%6: byte offsets of the two 32-byte diff stores
; Subtracts 2x16 pixels, widening u8 - u8 -> i16.  Clobbers m0-m5.
; Relies on m7 == 0 for the unpacks.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src pixels -> words
  punpckhbw             m3, m1, m7     ; high 8 pred pixels -> words
  punpcklbw             m0, m7         ; low 8 src pixels -> words
  punpcklbw             m1, m7         ; low 8 pred pixels -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; fall-through: cols == 64, one row per iteration
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2] ; *2: stride is in int16 units
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:                              ; one 32-pixel row per iteration
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:                              ; two 16-pixel rows per iteration
  mov             pred_str, pred_stridemp
.loop_16:
  ; second "offset" of each pair is one full stride, i.e. the next row
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts two rows of mmsize/2 pixels (8 for XMM, 4 for MMX).
; Clobbers m0-m3.  Relies on m7 == 0 for the current register size.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:                               ; two 8-pixel rows per iteration
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX
.case_4:                               ; two 4-pixel rows per iteration (MMX)
  mov             pred_str, pred_stridemp
  ; After INIT_MMX, m7 is mm7, which the entry pxor (xmm7) did NOT zero.
  ; The old code happened to work only because the same garbage high bytes
  ; were packed into both psubw operands and cancelled; zero it explicitly.
  pxor                  m7, m7
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  ; MMX use leaves the x87 tag word dirty; the ABI requires clean x87 state
  ; at function exit, and the encoder's FP code would misbehave without this.
  emms
  RET