;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant for the >>4 bilinear shift (8 replicated words).
pw_8:                times  8 dw  8
; Bilinear filter coefficients, indexed by (subpel offset << filter_idx_shift).
; SSE2 layout: two 8-word rows per offset — row a then row b, with a + b == 16
; (pmullw path).  The middle entry (offset 8) is a single 16-word row of 8s.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15

; SSSE3 layout: the same coefficient pairs interleaved as bytes (a, b) so a
; single pmaddubsw applies both taps; one 16-byte row per offset.
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 15,  1
                      times  8 db 14,  2
                      times  8 db 13,  3
                      times  8 db 12,  4
                      times  8 db 11,  5
                      times  8 db 10,  6
                      times  8 db  9,  7
                      times 16 db  8
                      times  8 db  7,  9
                      times  8 db  6, 10
                      times  8 db  5, 11
                      times  8 db  4, 12
                      times  8 db  3, 13
                      times  8 db  2, 14
                      times  8 db  1, 15

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t
;                                          src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

; Accumulate sum and sum-of-squares of two pairs of 8-word rows:
;   %5 (sum, words)  += (%1 - %2) + (%3 - %4)
;   %6 (sse, dwords) += (%1 - %2)^2 + (%3 - %4)^2
; pmaddwd squares each word difference and pairwise-adds into dwords.
; NOTE: %1 and %3 are clobbered (left holding the squared differences).
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro

; Horizontally reduce the accumulators (m6 = sum as signed words, m7 = sse
; as dwords), store the 32-bit SSE through the sse pointer, and return the
; sign-extended sum in rax.  m5 is used as scratch for the sign mask.
%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%else ; mmsize == 8
  pshufw               m4, m6, 0xe
  pshufw               m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshufw               m4, m6, 0xe
  paddd                m6, m4
  movd                rax, m6           ; store sum as return value
%endif
  RET
%endmacro

; Advance srcq by one source line.  On x86-32 PIC builds the stride only
; lives in a stack slot (src_stridemp); elsewhere it is in a register.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add
srcq, src_stridemp michael@0: %else michael@0: add srcq, src_strideq michael@0: %endif michael@0: %endmacro michael@0: michael@0: %macro SUBPEL_VARIANCE 1-2 0 ; W michael@0: %if cpuflag(ssse3) michael@0: %define bilin_filter_m bilin_filter_m_ssse3 michael@0: %define filter_idx_shift 4 michael@0: %else michael@0: %define bilin_filter_m bilin_filter_m_sse2 michael@0: %define filter_idx_shift 5 michael@0: %endif michael@0: ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses michael@0: ; 11, not 13, if the registers are ordered correctly. May make a minor speed michael@0: ; difference on Win64 michael@0: michael@0: %ifdef PIC ; 64bit PIC michael@0: %if %2 == 1 ; avg michael@0: cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ michael@0: x_offset, y_offset, \ michael@0: dst, dst_stride, \ michael@0: sec, sec_stride, height, sse michael@0: %define sec_str sec_strideq michael@0: %else michael@0: cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ michael@0: y_offset, dst, dst_stride, height, sse michael@0: %endif michael@0: %define h heightd michael@0: %define bilin_filter sseq michael@0: %else michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: %if %2 == 1 ; avg michael@0: cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ michael@0: x_offset, y_offset, \ michael@0: dst, dst_stride, \ michael@0: sec, sec_stride, \ michael@0: height, sse, g_bilin_filter, g_pw_8 michael@0: %define h dword heightm michael@0: %define sec_str sec_stridemp michael@0: michael@0: ;Store bilin_filter and pw_8 location in stack michael@0: GET_GOT eax michael@0: add esp, 4 ; restore esp michael@0: michael@0: lea ecx, [GLOBAL(bilin_filter_m)] michael@0: mov g_bilin_filterm, ecx michael@0: michael@0: lea ecx, [GLOBAL(pw_8)] michael@0: mov g_pw_8m, ecx michael@0: michael@0: LOAD_IF_USED 0, 1 ; load eax, ecx back michael@0: %else michael@0: cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ michael@0: 
y_offset, dst, dst_stride, height, sse, \ michael@0: g_bilin_filter, g_pw_8 michael@0: %define h heightd michael@0: michael@0: ;Store bilin_filter and pw_8 location in stack michael@0: GET_GOT eax michael@0: add esp, 4 ; restore esp michael@0: michael@0: lea ecx, [GLOBAL(bilin_filter_m)] michael@0: mov g_bilin_filterm, ecx michael@0: michael@0: lea ecx, [GLOBAL(pw_8)] michael@0: mov g_pw_8m, ecx michael@0: michael@0: LOAD_IF_USED 0, 1 ; load eax, ecx back michael@0: %endif michael@0: %else michael@0: %if %2 == 1 ; avg michael@0: cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ michael@0: 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ michael@0: x_offset, y_offset, \ michael@0: dst, dst_stride, \ michael@0: sec, sec_stride, \ michael@0: height, sse michael@0: %if ARCH_X86_64 michael@0: %define h heightd michael@0: %define sec_str sec_strideq michael@0: %else michael@0: %define h dword heightm michael@0: %define sec_str sec_stridemp michael@0: %endif michael@0: %else michael@0: cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ michael@0: y_offset, dst, dst_stride, height, sse michael@0: %define h heightd michael@0: %endif michael@0: michael@0: %define bilin_filter bilin_filter_m michael@0: %endif michael@0: %endif michael@0: michael@0: ASSERT %1 <= 16 ; m6 overflows if w > 16 michael@0: pxor m6, m6 ; sum michael@0: pxor m7, m7 ; sse michael@0: ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we michael@0: ; could perhaps use it for something more productive then michael@0: pxor m5, m5 ; dedicated zero register michael@0: %if %1 < 16 michael@0: sar h, 1 michael@0: %if %2 == 1 ; avg michael@0: shl sec_str, 1 michael@0: %endif michael@0: %endif michael@0: michael@0: ; FIXME(rbultje) replace by jumptable? 
michael@0: test x_offsetd, x_offsetd michael@0: jnz .x_nonzero michael@0: ; x_offset == 0 michael@0: test y_offsetd, y_offsetd michael@0: jnz .x_zero_y_nonzero michael@0: michael@0: ; x_offset == 0 && y_offset == 0 michael@0: .x_zero_y_zero_loop: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: mova m1, [dstq] michael@0: %if %2 == 1 ; avg michael@0: pavgb m0, [secq] michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %if %2 == 0 ; !avg michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: %if %2 == 1 ; avg michael@0: %if mmsize == 16 michael@0: movhps m0, [srcq+src_strideq] michael@0: %else ; mmsize == 8 michael@0: punpckldq m0, [srcq+src_strideq] michael@0: %endif michael@0: %else ; !avg michael@0: movh m2, [srcq+src_strideq] michael@0: %endif michael@0: movh m1, [dstq] michael@0: movh m3, [dstq+dst_strideq] michael@0: %if %2 == 1 ; avg michael@0: pavgb m0, [secq] michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %else ; !avg michael@0: punpcklbw m0, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_zero_y_zero_loop michael@0: STORE_AND_RET michael@0: michael@0: .x_zero_y_nonzero: michael@0: cmp y_offsetd, 8 michael@0: jne .x_zero_y_nonhalf michael@0: michael@0: ; x_offset == 0 && y_offset == 0.5 michael@0: .x_zero_y_half_loop: michael@0: %if %1 == 16 michael@0: 
movu m0, [srcq] michael@0: movu m4, [srcq+src_strideq] michael@0: mova m1, [dstq] michael@0: pavgb m0, m4 michael@0: punpckhbw m3, m1, m5 michael@0: %if %2 == 1 ; avg michael@0: pavgb m0, [secq] michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m2, [srcq+src_strideq] michael@0: %if %2 == 1 ; avg michael@0: %if mmsize == 16 michael@0: movhps m2, [srcq+src_strideq*2] michael@0: %else ; mmsize == 8 michael@0: %if %1 == 4 michael@0: movh m1, [srcq+src_strideq*2] michael@0: punpckldq m2, m1 michael@0: %else michael@0: punpckldq m2, [srcq+src_strideq*2] michael@0: %endif michael@0: %endif michael@0: movh m1, [dstq] michael@0: %if mmsize == 16 michael@0: movlhps m0, m2 michael@0: %else ; mmsize == 8 michael@0: punpckldq m0, m2 michael@0: %endif michael@0: movh m3, [dstq+dst_strideq] michael@0: pavgb m0, m2 michael@0: punpcklbw m1, m5 michael@0: pavgb m0, [secq] michael@0: punpcklbw m3, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %else ; !avg michael@0: movh m4, [srcq+src_strideq*2] michael@0: movh m1, [dstq] michael@0: pavgb m0, m2 michael@0: movh m3, [dstq+dst_strideq] michael@0: pavgb m2, m4 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_zero_y_half_loop michael@0: STORE_AND_RET michael@0: michael@0: .x_zero_y_nonhalf: michael@0: ; x_offset == 0 && y_offset == bilin interpolation michael@0: %ifdef PIC michael@0: lea bilin_filter, [bilin_filter_m] michael@0: 
%endif michael@0: shl y_offsetd, filter_idx_shift michael@0: %if ARCH_X86_64 && mmsize == 16 michael@0: mova m8, [bilin_filter+y_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m9, [bilin_filter+y_offsetq+16] michael@0: %endif michael@0: mova m10, [pw_8] michael@0: %define filter_y_a m8 michael@0: %define filter_y_b m9 michael@0: %define filter_rnd m10 michael@0: %else ; x86-32 or mmx michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: ; x_offset == 0, reuse x_offset reg michael@0: %define tempq x_offsetq michael@0: add y_offsetq, g_bilin_filterm michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: mov tempq, g_pw_8m michael@0: %define filter_rnd [tempq] michael@0: %else michael@0: add y_offsetq, bilin_filter michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: %define filter_rnd [pw_8] michael@0: %endif michael@0: %endif michael@0: michael@0: .x_zero_y_other_loop: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m4, [srcq+src_strideq] michael@0: mova m1, [dstq] michael@0: %if cpuflag(ssse3) michael@0: punpckhbw m2, m0, m4 michael@0: punpcklbw m0, m4 michael@0: pmaddubsw m2, filter_y_a michael@0: pmaddubsw m0, filter_y_a michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpckhbw m3, m4, m5 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m4, m5 michael@0: ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can michael@0: ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of michael@0: ; instructions is the same (5), but it is 1 mul instead of 2, so might be michael@0: ; slightly faster because of pmullw latency. It would also cut our rodata michael@0: ; tables in half for this function, and save 1-2 registers on x86-64. 
michael@0: pmullw m2, filter_y_a michael@0: pmullw m3, filter_y_b michael@0: paddw m2, filter_rnd michael@0: pmullw m0, filter_y_a michael@0: pmullw m4, filter_y_b michael@0: paddw m0, filter_rnd michael@0: paddw m2, m3 michael@0: paddw m0, m4 michael@0: %endif michael@0: psraw m2, 4 michael@0: psraw m0, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m2, [srcq+src_strideq] michael@0: movh m4, [srcq+src_strideq*2] michael@0: movh m3, [dstq+dst_strideq] michael@0: %if cpuflag(ssse3) michael@0: movh m1, [dstq] michael@0: punpcklbw m0, m2 michael@0: punpcklbw m2, m4 michael@0: pmaddubsw m0, filter_y_a michael@0: pmaddubsw m2, filter_y_a michael@0: punpcklbw m3, m5 michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpcklbw m0, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m4, m5 michael@0: pmullw m0, filter_y_a michael@0: pmullw m1, m2, filter_y_b michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_y_a michael@0: pmullw m4, filter_y_b michael@0: paddw m0, m1 michael@0: paddw m2, filter_rnd michael@0: movh m1, [dstq] michael@0: paddw m2, m4 michael@0: %endif michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: 
%if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_zero_y_other_loop michael@0: %undef filter_y_a michael@0: %undef filter_y_b michael@0: %undef filter_rnd michael@0: STORE_AND_RET michael@0: michael@0: .x_nonzero: michael@0: cmp x_offsetd, 8 michael@0: jne .x_nonhalf michael@0: ; x_offset == 0.5 michael@0: test y_offsetd, y_offsetd michael@0: jnz .x_half_y_nonzero michael@0: michael@0: ; x_offset == 0.5 && y_offset == 0 michael@0: .x_half_y_zero_loop: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m4, [srcq+1] michael@0: mova m1, [dstq] michael@0: pavgb m0, m4 michael@0: punpckhbw m3, m1, m5 michael@0: %if %2 == 1 ; avg michael@0: pavgb m0, [secq] michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m4, [srcq+1] michael@0: %if %2 == 1 ; avg michael@0: %if mmsize == 16 michael@0: movhps m0, [srcq+src_strideq] michael@0: movhps m4, [srcq+src_strideq+1] michael@0: %else ; mmsize == 8 michael@0: punpckldq m0, [srcq+src_strideq] michael@0: punpckldq m4, [srcq+src_strideq+1] michael@0: %endif michael@0: movh m1, [dstq] michael@0: movh m3, [dstq+dst_strideq] michael@0: pavgb m0, m4 michael@0: punpcklbw m3, m5 michael@0: pavgb m0, [secq] michael@0: punpcklbw m1, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %else ; !avg michael@0: movh m2, [srcq+src_strideq] michael@0: movh m1, [dstq] michael@0: pavgb m0, m4 michael@0: movh m4, [srcq+src_strideq+1] michael@0: movh m3, [dstq+dst_strideq] michael@0: pavgb m2, m4 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] 
michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_half_y_zero_loop michael@0: STORE_AND_RET michael@0: michael@0: .x_half_y_nonzero: michael@0: cmp y_offsetd, 8 michael@0: jne .x_half_y_nonhalf michael@0: michael@0: ; x_offset == 0.5 && y_offset == 0.5 michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m3, [srcq+1] michael@0: add srcq, src_strideq michael@0: pavgb m0, m3 michael@0: .x_half_y_half_loop: michael@0: movu m4, [srcq] michael@0: movu m3, [srcq+1] michael@0: mova m1, [dstq] michael@0: pavgb m4, m3 michael@0: punpckhbw m3, m1, m5 michael@0: pavgb m0, m4 michael@0: %if %2 == 1 ; avg michael@0: punpcklbw m1, m5 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m3, [srcq+1] michael@0: add srcq, src_strideq michael@0: pavgb m0, m3 michael@0: .x_half_y_half_loop: michael@0: movh m2, [srcq] michael@0: movh m3, [srcq+1] michael@0: %if %2 == 1 ; avg michael@0: %if mmsize == 16 michael@0: movhps m2, [srcq+src_strideq] michael@0: movhps m3, [srcq+src_strideq+1] michael@0: %else michael@0: %if %1 == 4 michael@0: movh m1, [srcq+src_strideq] michael@0: punpckldq m2, m1 michael@0: movh m1, [srcq+src_strideq+1] michael@0: punpckldq m3, m1 michael@0: %else michael@0: punpckldq m2, [srcq+src_strideq] michael@0: punpckldq m3, [srcq+src_strideq+1] michael@0: %endif michael@0: %endif michael@0: pavgb m2, m3 michael@0: %if mmsize == 16 michael@0: movlhps m0, m2 michael@0: movhlps m4, m2 michael@0: %else ; mmsize == 8 michael@0: punpckldq m0, m2 michael@0: pshufw m4, m2, 0xe 
michael@0: %endif michael@0: movh m1, [dstq] michael@0: pavgb m0, m2 michael@0: movh m3, [dstq+dst_strideq] michael@0: pavgb m0, [secq] michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %else ; !avg michael@0: movh m4, [srcq+src_strideq] michael@0: movh m1, [srcq+src_strideq+1] michael@0: pavgb m2, m3 michael@0: pavgb m4, m1 michael@0: pavgb m0, m2 michael@0: pavgb m2, m4 michael@0: movh m1, [dstq] michael@0: movh m3, [dstq+dst_strideq] michael@0: punpcklbw m0, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_half_y_half_loop michael@0: STORE_AND_RET michael@0: michael@0: .x_half_y_nonhalf: michael@0: ; x_offset == 0.5 && y_offset == bilin interpolation michael@0: %ifdef PIC michael@0: lea bilin_filter, [bilin_filter_m] michael@0: %endif michael@0: shl y_offsetd, filter_idx_shift michael@0: %if ARCH_X86_64 && mmsize == 16 michael@0: mova m8, [bilin_filter+y_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m9, [bilin_filter+y_offsetq+16] michael@0: %endif michael@0: mova m10, [pw_8] michael@0: %define filter_y_a m8 michael@0: %define filter_y_b m9 michael@0: %define filter_rnd m10 michael@0: %else ;x86_32 michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: ; x_offset == 0.5. 
We can reuse x_offset reg michael@0: %define tempq x_offsetq michael@0: add y_offsetq, g_bilin_filterm michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: mov tempq, g_pw_8m michael@0: %define filter_rnd [tempq] michael@0: %else michael@0: add y_offsetq, bilin_filter michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: %define filter_rnd [pw_8] michael@0: %endif michael@0: %endif michael@0: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m3, [srcq+1] michael@0: add srcq, src_strideq michael@0: pavgb m0, m3 michael@0: .x_half_y_other_loop: michael@0: movu m4, [srcq] michael@0: movu m2, [srcq+1] michael@0: mova m1, [dstq] michael@0: pavgb m4, m2 michael@0: %if cpuflag(ssse3) michael@0: punpckhbw m2, m0, m4 michael@0: punpcklbw m0, m4 michael@0: pmaddubsw m2, filter_y_a michael@0: pmaddubsw m0, filter_y_a michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: psraw m2, 4 michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpckhbw m3, m4, m5 michael@0: pmullw m2, filter_y_a michael@0: pmullw m3, filter_y_b michael@0: paddw m2, filter_rnd michael@0: punpcklbw m0, m5 michael@0: paddw m2, m3 michael@0: punpcklbw m3, m4, m5 michael@0: pmullw m0, filter_y_a michael@0: pmullw m3, filter_y_b michael@0: paddw m0, filter_rnd michael@0: psraw m2, 4 michael@0: paddw m0, m3 michael@0: %endif michael@0: punpckhbw m3, m1, m5 michael@0: psraw m0, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m3, [srcq+1] michael@0: add srcq, src_strideq michael@0: pavgb m0, 
m3 michael@0: %if notcpuflag(ssse3) michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: .x_half_y_other_loop: michael@0: movh m2, [srcq] michael@0: movh m1, [srcq+1] michael@0: movh m4, [srcq+src_strideq] michael@0: movh m3, [srcq+src_strideq+1] michael@0: pavgb m2, m1 michael@0: pavgb m4, m3 michael@0: movh m3, [dstq+dst_strideq] michael@0: %if cpuflag(ssse3) michael@0: movh m1, [dstq] michael@0: punpcklbw m0, m2 michael@0: punpcklbw m2, m4 michael@0: pmaddubsw m0, filter_y_a michael@0: pmaddubsw m2, filter_y_a michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: paddw m2, filter_rnd michael@0: %else michael@0: punpcklbw m2, m5 michael@0: punpcklbw m4, m5 michael@0: pmullw m0, filter_y_a michael@0: pmullw m1, m2, filter_y_b michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_y_a michael@0: paddw m0, m1 michael@0: pmullw m1, m4, filter_y_b michael@0: paddw m2, filter_rnd michael@0: paddw m2, m1 michael@0: movh m1, [dstq] michael@0: %endif michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_half_y_other_loop michael@0: %undef filter_y_a michael@0: %undef filter_y_b michael@0: %undef filter_rnd michael@0: STORE_AND_RET michael@0: michael@0: .x_nonhalf: michael@0: test y_offsetd, y_offsetd michael@0: jnz .x_nonhalf_y_nonzero michael@0: michael@0: ; x_offset == bilin interpolation && y_offset == 0 michael@0: %ifdef PIC michael@0: lea bilin_filter, [bilin_filter_m] michael@0: %endif michael@0: shl x_offsetd, 
filter_idx_shift michael@0: %if ARCH_X86_64 && mmsize == 16 michael@0: mova m8, [bilin_filter+x_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m9, [bilin_filter+x_offsetq+16] michael@0: %endif michael@0: mova m10, [pw_8] michael@0: %define filter_x_a m8 michael@0: %define filter_x_b m9 michael@0: %define filter_rnd m10 michael@0: %else ; x86-32 michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: ;y_offset == 0. We can reuse y_offset reg. michael@0: %define tempq y_offsetq michael@0: add x_offsetq, g_bilin_filterm michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: mov tempq, g_pw_8m michael@0: %define filter_rnd [tempq] michael@0: %else michael@0: add x_offsetq, bilin_filter michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: %define filter_rnd [pw_8] michael@0: %endif michael@0: %endif michael@0: michael@0: .x_other_y_zero_loop: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m4, [srcq+1] michael@0: mova m1, [dstq] michael@0: %if cpuflag(ssse3) michael@0: punpckhbw m2, m0, m4 michael@0: punpcklbw m0, m4 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m0, filter_x_a michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpckhbw m3, m4, m5 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m4, m5 michael@0: pmullw m2, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m2, filter_rnd michael@0: pmullw m0, filter_x_a michael@0: pmullw m4, filter_x_b michael@0: paddw m0, filter_rnd michael@0: paddw m2, m3 michael@0: paddw m0, m4 michael@0: %endif michael@0: psraw m2, 4 michael@0: psraw m0, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpckhbw m3, 
m1, m5 michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m1, [srcq+1] michael@0: movh m2, [srcq+src_strideq] michael@0: movh m4, [srcq+src_strideq+1] michael@0: movh m3, [dstq+dst_strideq] michael@0: %if cpuflag(ssse3) michael@0: punpcklbw m0, m1 michael@0: movh m1, [dstq] michael@0: punpcklbw m2, m4 michael@0: pmaddubsw m0, filter_x_a michael@0: pmaddubsw m2, filter_x_a michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: paddw m2, filter_rnd michael@0: %else michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: punpcklbw m2, m5 michael@0: punpcklbw m4, m5 michael@0: pmullw m0, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_x_a michael@0: pmullw m4, filter_x_b michael@0: paddw m0, m1 michael@0: paddw m2, filter_rnd michael@0: movh m1, [dstq] michael@0: paddw m2, m4 michael@0: %endif michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_other_y_zero_loop michael@0: %undef filter_x_a michael@0: %undef filter_x_b michael@0: %undef filter_rnd michael@0: STORE_AND_RET michael@0: michael@0: .x_nonhalf_y_nonzero: michael@0: cmp y_offsetd, 8 michael@0: jne .x_nonhalf_y_nonhalf michael@0: michael@0: ; x_offset == bilin interpolation && y_offset == 0.5 michael@0: %ifdef PIC michael@0: lea bilin_filter, [bilin_filter_m] 
michael@0: %endif michael@0: shl x_offsetd, filter_idx_shift michael@0: %if ARCH_X86_64 && mmsize == 16 michael@0: mova m8, [bilin_filter+x_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m9, [bilin_filter+x_offsetq+16] michael@0: %endif michael@0: mova m10, [pw_8] michael@0: %define filter_x_a m8 michael@0: %define filter_x_b m9 michael@0: %define filter_rnd m10 michael@0: %else ; x86-32 michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: ; y_offset == 0.5. We can reuse y_offset reg. michael@0: %define tempq y_offsetq michael@0: add x_offsetq, g_bilin_filterm michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: mov tempq, g_pw_8m michael@0: %define filter_rnd [tempq] michael@0: %else michael@0: add x_offsetq, bilin_filter michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: %define filter_rnd [pw_8] michael@0: %endif michael@0: %endif michael@0: michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m1, [srcq+1] michael@0: %if cpuflag(ssse3) michael@0: punpckhbw m2, m0, m1 michael@0: punpcklbw m0, m1 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m0, filter_x_a michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: pmullw m0, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m2, filter_rnd michael@0: paddw m0, m1 michael@0: paddw m2, m3 michael@0: %endif michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: add srcq, src_strideq michael@0: packuswb m0, m2 michael@0: .x_other_y_half_loop: michael@0: movu m4, [srcq] michael@0: movu m3, [srcq+1] michael@0: %if cpuflag(ssse3) michael@0: mova m1, [dstq] michael@0: punpckhbw m2, m4, m3 
michael@0: punpcklbw m4, m3 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m4, filter_x_a michael@0: paddw m2, filter_rnd michael@0: paddw m4, filter_rnd michael@0: psraw m2, 4 michael@0: psraw m4, 4 michael@0: packuswb m4, m2 michael@0: pavgb m0, m4 michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m1, m5 michael@0: %else michael@0: punpckhbw m2, m4, m5 michael@0: punpckhbw m1, m3, m5 michael@0: punpcklbw m4, m5 michael@0: punpcklbw m3, m5 michael@0: pmullw m4, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m4, filter_rnd michael@0: pmullw m2, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m2, filter_rnd michael@0: paddw m4, m3 michael@0: paddw m2, m1 michael@0: mova m1, [dstq] michael@0: psraw m4, 4 michael@0: psraw m2, 4 michael@0: punpckhbw m3, m1, m5 michael@0: ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we michael@0: ; have a 1-register shortage to be able to store the backup of the bilin michael@0: ; filtered second line as words as cache for the next line. Packing into michael@0: ; a byte costs 1 pack and 2 unpacks, but saves a register. 
michael@0: packuswb m4, m2 michael@0: punpcklbw m1, m5 michael@0: pavgb m0, m4 michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: pavgb m0, [secq] michael@0: %endif michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: add srcq, src_strideq michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m1, [srcq+1] michael@0: %if cpuflag(ssse3) michael@0: punpcklbw m0, m1 michael@0: pmaddubsw m0, filter_x_a michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: pmullw m0, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m0, filter_rnd michael@0: paddw m0, m1 michael@0: %endif michael@0: add srcq, src_strideq michael@0: psraw m0, 4 michael@0: .x_other_y_half_loop: michael@0: movh m2, [srcq] michael@0: movh m1, [srcq+1] michael@0: movh m4, [srcq+src_strideq] michael@0: movh m3, [srcq+src_strideq+1] michael@0: %if cpuflag(ssse3) michael@0: punpcklbw m2, m1 michael@0: punpcklbw m4, m3 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m4, filter_x_a michael@0: movh m1, [dstq] michael@0: movh m3, [dstq+dst_strideq] michael@0: paddw m2, filter_rnd michael@0: paddw m4, filter_rnd michael@0: %else michael@0: punpcklbw m2, m5 michael@0: punpcklbw m1, m5 michael@0: punpcklbw m4, m5 michael@0: punpcklbw m3, m5 michael@0: pmullw m2, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m2, filter_rnd michael@0: pmullw m4, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m4, filter_rnd michael@0: paddw m2, m1 michael@0: movh m1, [dstq] michael@0: paddw m4, m3 michael@0: movh m3, [dstq+dst_strideq] michael@0: %endif michael@0: psraw m2, 4 michael@0: psraw m4, 4 michael@0: pavgw m0, m2 michael@0: pavgw m2, m4 michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline - also consider going to bytes here 
michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: lea srcq, [srcq+src_strideq*2] michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_other_y_half_loop michael@0: %undef filter_x_a michael@0: %undef filter_x_b michael@0: %undef filter_rnd michael@0: STORE_AND_RET michael@0: michael@0: .x_nonhalf_y_nonhalf: michael@0: %ifdef PIC michael@0: lea bilin_filter, [bilin_filter_m] michael@0: %endif michael@0: shl x_offsetd, filter_idx_shift michael@0: shl y_offsetd, filter_idx_shift michael@0: %if ARCH_X86_64 && mmsize == 16 michael@0: mova m8, [bilin_filter+x_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m9, [bilin_filter+x_offsetq+16] michael@0: %endif michael@0: mova m10, [bilin_filter+y_offsetq] michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 michael@0: mova m11, [bilin_filter+y_offsetq+16] michael@0: %endif michael@0: mova m12, [pw_8] michael@0: %define filter_x_a m8 michael@0: %define filter_x_b m9 michael@0: %define filter_y_a m10 michael@0: %define filter_y_b m11 michael@0: %define filter_rnd m12 michael@0: %else ; x86-32 michael@0: %if ARCH_X86=1 && CONFIG_PIC=1 michael@0: ; In this case, there is NO unused register. Used src_stride register. Later, michael@0: ; src_stride has to be loaded from stack when it is needed. 
michael@0: %define tempq src_strideq michael@0: mov tempq, g_bilin_filterm michael@0: add x_offsetq, tempq michael@0: add y_offsetq, tempq michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: michael@0: mov tempq, g_pw_8m michael@0: %define filter_rnd [tempq] michael@0: %else michael@0: add x_offsetq, bilin_filter michael@0: add y_offsetq, bilin_filter michael@0: %define filter_x_a [x_offsetq] michael@0: %define filter_x_b [x_offsetq+16] michael@0: %define filter_y_a [y_offsetq] michael@0: %define filter_y_b [y_offsetq+16] michael@0: %define filter_rnd [pw_8] michael@0: %endif michael@0: %endif michael@0: michael@0: ; x_offset == bilin interpolation && y_offset == bilin interpolation michael@0: %if %1 == 16 michael@0: movu m0, [srcq] michael@0: movu m1, [srcq+1] michael@0: %if cpuflag(ssse3) michael@0: punpckhbw m2, m0, m1 michael@0: punpcklbw m0, m1 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m0, filter_x_a michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpckhbw m2, m0, m5 michael@0: punpckhbw m3, m1, m5 michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: pmullw m0, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m2, filter_rnd michael@0: paddw m0, m1 michael@0: paddw m2, m3 michael@0: %endif michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: michael@0: INC_SRC_BY_SRC_STRIDE michael@0: michael@0: packuswb m0, m2 michael@0: .x_other_y_other_loop: michael@0: %if cpuflag(ssse3) michael@0: movu m4, [srcq] michael@0: movu m3, [srcq+1] michael@0: mova m1, [dstq] michael@0: punpckhbw m2, m4, m3 michael@0: punpcklbw m4, m3 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m4, filter_x_a michael@0: punpckhbw m3, m1, m5 michael@0: paddw m2, 
filter_rnd michael@0: paddw m4, filter_rnd michael@0: psraw m2, 4 michael@0: psraw m4, 4 michael@0: packuswb m4, m2 michael@0: punpckhbw m2, m0, m4 michael@0: punpcklbw m0, m4 michael@0: pmaddubsw m2, filter_y_a michael@0: pmaddubsw m0, filter_y_a michael@0: punpcklbw m1, m5 michael@0: paddw m2, filter_rnd michael@0: paddw m0, filter_rnd michael@0: psraw m2, 4 michael@0: psraw m0, 4 michael@0: %else michael@0: movu m3, [srcq] michael@0: movu m4, [srcq+1] michael@0: punpckhbw m1, m3, m5 michael@0: punpckhbw m2, m4, m5 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m4, m5 michael@0: pmullw m3, filter_x_a michael@0: pmullw m4, filter_x_b michael@0: paddw m3, filter_rnd michael@0: pmullw m1, filter_x_a michael@0: pmullw m2, filter_x_b michael@0: paddw m1, filter_rnd michael@0: paddw m3, m4 michael@0: paddw m1, m2 michael@0: psraw m3, 4 michael@0: psraw m1, 4 michael@0: packuswb m4, m3, m1 michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: pmullw m2, filter_y_a michael@0: pmullw m1, filter_y_b michael@0: paddw m2, filter_rnd michael@0: pmullw m0, filter_y_a michael@0: pmullw m3, filter_y_b michael@0: paddw m2, m1 michael@0: mova m1, [dstq] michael@0: paddw m0, filter_rnd michael@0: psraw m2, 4 michael@0: paddw m0, m3 michael@0: punpckhbw m3, m1, m5 michael@0: psraw m0, 4 michael@0: punpcklbw m1, m5 michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: INC_SRC_BY_SRC_STRIDE michael@0: add dstq, dst_strideq michael@0: %else ; %1 < 16 michael@0: movh m0, [srcq] michael@0: movh m1, [srcq+1] michael@0: %if cpuflag(ssse3) michael@0: punpcklbw m0, m1 michael@0: pmaddubsw m0, filter_x_a michael@0: paddw m0, filter_rnd michael@0: %else michael@0: punpcklbw m0, m5 michael@0: punpcklbw m1, m5 michael@0: pmullw m0, 
filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m0, filter_rnd michael@0: paddw m0, m1 michael@0: %endif michael@0: psraw m0, 4 michael@0: %if cpuflag(ssse3) michael@0: packuswb m0, m0 michael@0: %endif michael@0: michael@0: INC_SRC_BY_SRC_STRIDE michael@0: michael@0: .x_other_y_other_loop: michael@0: movh m2, [srcq] michael@0: movh m1, [srcq+1] michael@0: michael@0: INC_SRC_BY_SRC_STRIDE michael@0: movh m4, [srcq] michael@0: movh m3, [srcq+1] michael@0: michael@0: %if cpuflag(ssse3) michael@0: punpcklbw m2, m1 michael@0: punpcklbw m4, m3 michael@0: pmaddubsw m2, filter_x_a michael@0: pmaddubsw m4, filter_x_a michael@0: movh m3, [dstq+dst_strideq] michael@0: movh m1, [dstq] michael@0: paddw m2, filter_rnd michael@0: paddw m4, filter_rnd michael@0: psraw m2, 4 michael@0: psraw m4, 4 michael@0: packuswb m2, m2 michael@0: packuswb m4, m4 michael@0: punpcklbw m0, m2 michael@0: punpcklbw m2, m4 michael@0: pmaddubsw m0, filter_y_a michael@0: pmaddubsw m2, filter_y_a michael@0: punpcklbw m3, m5 michael@0: paddw m0, filter_rnd michael@0: paddw m2, filter_rnd michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: punpcklbw m1, m5 michael@0: %else michael@0: punpcklbw m2, m5 michael@0: punpcklbw m1, m5 michael@0: punpcklbw m4, m5 michael@0: punpcklbw m3, m5 michael@0: pmullw m2, filter_x_a michael@0: pmullw m1, filter_x_b michael@0: paddw m2, filter_rnd michael@0: pmullw m4, filter_x_a michael@0: pmullw m3, filter_x_b michael@0: paddw m4, filter_rnd michael@0: paddw m2, m1 michael@0: paddw m4, m3 michael@0: psraw m2, 4 michael@0: psraw m4, 4 michael@0: pmullw m0, filter_y_a michael@0: pmullw m3, m2, filter_y_b michael@0: paddw m0, filter_rnd michael@0: pmullw m2, filter_y_a michael@0: pmullw m1, m4, filter_y_b michael@0: paddw m2, filter_rnd michael@0: paddw m0, m3 michael@0: movh m3, [dstq+dst_strideq] michael@0: paddw m2, m1 michael@0: movh m1, [dstq] michael@0: psraw m0, 4 michael@0: psraw m2, 4 michael@0: punpcklbw m3, m5 michael@0: punpcklbw m1, m5 
michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: ; FIXME(rbultje) pipeline michael@0: packuswb m0, m2 michael@0: pavgb m0, [secq] michael@0: punpckhbw m2, m0, m5 michael@0: punpcklbw m0, m5 michael@0: %endif michael@0: SUM_SSE m0, m1, m2, m3, m6, m7 michael@0: mova m0, m4 michael@0: michael@0: INC_SRC_BY_SRC_STRIDE michael@0: lea dstq, [dstq+dst_strideq*2] michael@0: %endif michael@0: %if %2 == 1 ; avg michael@0: add secq, sec_str michael@0: %endif michael@0: dec h michael@0: jg .x_other_y_other_loop michael@0: %undef filter_x_a michael@0: %undef filter_x_b michael@0: %undef filter_y_a michael@0: %undef filter_y_b michael@0: %undef filter_rnd michael@0: STORE_AND_RET michael@0: %endmacro michael@0: michael@0: ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical michael@0: ; between the ssse3 and non-ssse3 version. It may make sense to merge their michael@0: ; code in the sense that the ssse3 version would jump to the appropriate michael@0: ; location in the sse/2 version, rather than duplicating that code in the michael@0: ; binary. michael@0: michael@0: INIT_MMX sse michael@0: SUBPEL_VARIANCE 4 michael@0: INIT_XMM sse2 michael@0: SUBPEL_VARIANCE 8 michael@0: SUBPEL_VARIANCE 16 michael@0: michael@0: INIT_MMX ssse3 michael@0: SUBPEL_VARIANCE 4 michael@0: INIT_XMM ssse3 michael@0: SUBPEL_VARIANCE 8 michael@0: SUBPEL_VARIANCE 16 michael@0: michael@0: INIT_MMX sse michael@0: SUBPEL_VARIANCE 4, 1 michael@0: INIT_XMM sse2 michael@0: SUBPEL_VARIANCE 8, 1 michael@0: SUBPEL_VARIANCE 16, 1 michael@0: michael@0: INIT_MMX ssse3 michael@0: SUBPEL_VARIANCE 4, 1 michael@0: INIT_XMM ssse3 michael@0: SUBPEL_VARIANCE 8, 1 michael@0: SUBPEL_VARIANCE 16, 1