michael@0: ;
michael@0: ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: %include "third_party/x86inc/x86inc.asm"
michael@0: 
michael@0: SECTION_RODATA
michael@0: pw_8: times  8 dw  8
michael@0: bilin_filter_m_sse2: times  8 dw 16
michael@0:                      times  8 dw  0
michael@0:                      times  8 dw 15
michael@0:                      times  8 dw  1
michael@0:                      times  8 dw 14
michael@0:                      times  8 dw  2
michael@0:                      times  8 dw 13
michael@0:                      times  8 dw  3
michael@0:                      times  8 dw 12
michael@0:                      times  8 dw  4
michael@0:                      times  8 dw 11
michael@0:                      times  8 dw  5
michael@0:                      times  8 dw 10
michael@0:                      times  8 dw  6
michael@0:                      times  8 dw  9
michael@0:                      times  8 dw  7
michael@0:                      times 16 dw  8
michael@0:                      times  8 dw  7
michael@0:                      times  8 dw  9
michael@0:                      times  8 dw  6
michael@0:                      times  8 dw 10
michael@0:                      times  8 dw  5
michael@0:                      times  8 dw 11
michael@0:                      times  8 dw  4
michael@0:                      times  8 dw 12
michael@0:                      times  8 dw  3
michael@0:                      times  8 dw 13
michael@0:                      times  8 dw  2
michael@0:                      times  8 dw 14
michael@0:                      times  8 dw  1
michael@0:                      times  8 dw 15
michael@0: 
michael@0: bilin_filter_m_ssse3: times  8 db 16,  0
michael@0:                       times  8 db 15,  1
michael@0:                       times  8 db 14,  2
michael@0:                       times  8 db 13,  3
michael@0:                       times  8 db 12,  4
michael@0:                       times  8 db 11,  5
michael@0:                       times  8 db 10,  6
michael@0:                       times  8 db  9,  7
michael@0:                       times 16 db  8
michael@0:                       times  8 db  7,  9
michael@0:                       times  8 db  6, 10
michael@0:                       times  8 db  5, 11
michael@0:                       times  8 db  4, 12
michael@0:                       times  8 db  3, 13
michael@0:                       times  8 db  2, 14
michael@0:                       times  8 db  1, 15
michael@0: 
michael@0: SECTION .text
michael@0: 
michael@0: ; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
michael@0: ;                               int x_offset, int y_offset,
michael@0: ;                               const uint8_t *dst, ptrdiff_t dst_stride,
michael@0: ;                               int height, unsigned int *sse);
michael@0: ;
michael@0: ; This function returns the SE and stores SSE in the given pointer.
michael@0: 
michael@0: %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
michael@0:   psubw                %3, %4
michael@0:   psubw                %1, %2
michael@0:   paddw                %5, %3
michael@0:   pmaddwd              %3, %3
michael@0:   paddw                %5, %1
michael@0:   pmaddwd              %1, %1
michael@0:   paddd                %6, %3
michael@0:   paddd                %6, %1
michael@0: %endmacro
michael@0: 
michael@0: %macro STORE_AND_RET 0
michael@0: %if mmsize == 16
michael@0:   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
michael@0:   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
michael@0:   ; We have to sign-extend it before adding the words within the register
michael@0:   ; and outputing to a dword.
michael@0:   pcmpgtw              m5, m6           ; mask for 0 > x
michael@0:   movhlps              m3, m7
michael@0:   punpcklwd            m4, m6, m5
michael@0:   punpckhwd            m6, m5           ; sign-extend m6 word->dword
michael@0:   paddd                m7, m3
michael@0:   paddd                m6, m4
michael@0:   pshufd               m3, m7, 0x1
michael@0:   movhlps              m4, m6
michael@0:   paddd                m7, m3
michael@0:   paddd                m6, m4
michael@0:   mov                  r1, ssem         ; r1 = unsigned int *sse
michael@0:   pshufd               m4, m6, 0x1
michael@0:   movd               [r1], m7           ; store sse
michael@0:   paddd                m6, m4
michael@0:   movd                rax, m6           ; store sum as return value
michael@0: %else ; mmsize == 8
michael@0:   pshufw               m4, m6, 0xe
michael@0:   pshufw               m3, m7, 0xe
michael@0:   paddw                m6, m4
michael@0:   paddd                m7, m3
michael@0:   pcmpgtw              m5, m6           ; mask for 0 > x
michael@0:   mov                  r1, ssem         ; r1 = unsigned int *sse
michael@0:   punpcklwd            m6, m5           ; sign-extend m6 word->dword
michael@0:   movd               [r1], m7           ; store sse
michael@0:   pshufw               m4, m6, 0xe
michael@0:   paddd                m6, m4
michael@0:   movd                rax, m6           ; store sum as return value
michael@0: %endif
michael@0:   RET
michael@0: %endmacro
michael@0: 
michael@0: %macro INC_SRC_BY_SRC_STRIDE  0
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0:   add                srcq, src_stridemp
michael@0: %else
michael@0:   add                srcq, src_strideq
michael@0: %endif
michael@0: %endmacro
michael@0: 
michael@0: %macro SUBPEL_VARIANCE 1-2 0 ; W
michael@0: %if cpuflag(ssse3)
michael@0: %define bilin_filter_m bilin_filter_m_ssse3
michael@0: %define filter_idx_shift 4
michael@0: %else
michael@0: %define bilin_filter_m bilin_filter_m_sse2
michael@0: %define filter_idx_shift 5
michael@0: %endif
michael@0: ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
michael@0: ; 11, not 13, if the registers are ordered correctly. May make a minor speed
michael@0: ; difference on Win64
michael@0: 
michael@0: %ifdef PIC    ; 64bit PIC
michael@0:   %if %2 == 1 ; avg
michael@0:     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
michael@0:                                       x_offset, y_offset, \
michael@0:                                       dst, dst_stride, \
michael@0:                                       sec, sec_stride, height, sse
michael@0:     %define sec_str sec_strideq
michael@0:   %else
michael@0:     cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
michael@0:                                   y_offset, dst, dst_stride, height, sse
michael@0:   %endif
michael@0:   %define h heightd
michael@0:   %define bilin_filter sseq
michael@0: %else
michael@0:   %if ARCH_X86=1 && CONFIG_PIC=1
michael@0:     %if %2 == 1 ; avg
michael@0:       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
michael@0:                                   x_offset, y_offset, \
michael@0:                                   dst, dst_stride, \
michael@0:                                   sec, sec_stride, \
michael@0:                                   height, sse, g_bilin_filter, g_pw_8
michael@0:       %define h dword heightm
michael@0:       %define sec_str sec_stridemp
michael@0: 
michael@0:       ;Store bilin_filter and pw_8 location in stack
michael@0:       GET_GOT eax
michael@0:       add esp, 4                ; restore esp
michael@0: 
michael@0:       lea ecx, [GLOBAL(bilin_filter_m)]
michael@0:       mov g_bilin_filterm, ecx
michael@0: 
michael@0:       lea ecx, [GLOBAL(pw_8)]
michael@0:       mov g_pw_8m, ecx
michael@0: 
michael@0:       LOAD_IF_USED 0, 1         ; load eax, ecx back
michael@0:     %else
michael@0:       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
michael@0:                                 y_offset, dst, dst_stride, height, sse, \
michael@0:                                 g_bilin_filter, g_pw_8
michael@0:       %define h heightd
michael@0: 
michael@0:       ;Store bilin_filter and pw_8 location in stack
michael@0:       GET_GOT eax
michael@0:       add esp, 4                ; restore esp
michael@0: 
michael@0:       lea ecx, [GLOBAL(bilin_filter_m)]
michael@0:       mov g_bilin_filterm, ecx
michael@0: 
michael@0:       lea ecx, [GLOBAL(pw_8)]
michael@0:       mov g_pw_8m, ecx
michael@0: 
michael@0:       LOAD_IF_USED 0, 1         ; load eax, ecx back
michael@0:     %endif
michael@0:   %else
michael@0:     %if %2 == 1 ; avg
michael@0:       cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
michael@0:                         7 + 2 * ARCH_X86_64, 13, src, src_stride, \
michael@0:                                              x_offset, y_offset, \
michael@0:                                              dst, dst_stride, \
michael@0:                                              sec, sec_stride, \
michael@0:                                              height, sse
michael@0:       %if ARCH_X86_64
michael@0:       %define h heightd
michael@0:       %define sec_str sec_strideq
michael@0:       %else
michael@0:       %define h dword heightm
michael@0:       %define sec_str sec_stridemp
michael@0:       %endif
michael@0:     %else
michael@0:       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
michael@0:                               y_offset, dst, dst_stride, height, sse
michael@0:       %define h heightd
michael@0:     %endif
michael@0: 
michael@0:     %define bilin_filter bilin_filter_m
michael@0:   %endif
michael@0: %endif
michael@0: 
michael@0:   ASSERT               %1 <= 16         ; m6 overflows if w > 16
michael@0:   pxor                 m6, m6           ; sum
michael@0:   pxor                 m7, m7           ; sse
michael@0:   ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
michael@0:   ; could perhaps use it for something more productive then
michael@0:   pxor                 m5, m5           ; dedicated zero register
michael@0: %if %1 < 16
michael@0:   sar                   h, 1
michael@0: %if %2 == 1 ; avg
michael@0:   shl             sec_str, 1
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0:   ; FIXME(rbultje) replace by jumptable?
michael@0:   test          x_offsetd, x_offsetd
michael@0:   jnz .x_nonzero
michael@0:   ; x_offset == 0
michael@0:   test          y_offsetd, y_offsetd
michael@0:   jnz .x_zero_y_nonzero
michael@0: 
michael@0:   ; x_offset == 0 && y_offset == 0
michael@0: .x_zero_y_zero_loop:
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   mova                 m1, [dstq]
michael@0: %if %2 == 1 ; avg
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %if %2 == 0 ; !avg
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0: %if %2 == 1 ; avg
michael@0: %if mmsize == 16
michael@0:   movhps               m0, [srcq+src_strideq]
michael@0: %else ; mmsize == 8
michael@0:   punpckldq            m0, [srcq+src_strideq]
michael@0: %endif
michael@0: %else ; !avg
michael@0:   movh                 m2, [srcq+src_strideq]
michael@0: %endif
michael@0:   movh                 m1, [dstq]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0: %if %2 == 1 ; avg
michael@0:   pavgb                m0, [secq]
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %else ; !avg
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_zero_y_zero_loop
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_zero_y_nonzero:
michael@0:   cmp           y_offsetd, 8
michael@0:   jne .x_zero_y_nonhalf
michael@0: 
michael@0:   ; x_offset == 0 && y_offset == 0.5
michael@0: .x_zero_y_half_loop:
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m4, [srcq+src_strideq]
michael@0:   mova                 m1, [dstq]
michael@0:   pavgb                m0, m4
michael@0:   punpckhbw            m3, m1, m5
michael@0: %if %2 == 1 ; avg
michael@0:   pavgb                m0, [secq]
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m2, [srcq+src_strideq]
michael@0: %if %2 == 1 ; avg
michael@0: %if mmsize == 16
michael@0:   movhps               m2, [srcq+src_strideq*2]
michael@0: %else ; mmsize == 8
michael@0: %if %1 == 4
michael@0:   movh                 m1, [srcq+src_strideq*2]
michael@0:   punpckldq            m2, m1
michael@0: %else
michael@0:   punpckldq            m2, [srcq+src_strideq*2]
michael@0: %endif
michael@0: %endif
michael@0:   movh                 m1, [dstq]
michael@0: %if mmsize == 16
michael@0:   movlhps              m0, m2
michael@0: %else ; mmsize == 8
michael@0:   punpckldq            m0, m2
michael@0: %endif
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   pavgb                m0, m2
michael@0:   punpcklbw            m1, m5
michael@0:   pavgb                m0, [secq]
michael@0:   punpcklbw            m3, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %else ; !avg
michael@0:   movh                 m4, [srcq+src_strideq*2]
michael@0:   movh                 m1, [dstq]
michael@0:   pavgb                m0, m2
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   pavgb                m2, m4
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_zero_y_half_loop
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_zero_y_nonhalf:
michael@0:   ; x_offset == 0 && y_offset == bilin interpolation
michael@0: %ifdef PIC
michael@0:   lea        bilin_filter, [bilin_filter_m]
michael@0: %endif
michael@0:   shl           y_offsetd, filter_idx_shift
michael@0: %if ARCH_X86_64 && mmsize == 16
michael@0:   mova                 m8, [bilin_filter+y_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                 m9, [bilin_filter+y_offsetq+16]
michael@0: %endif
michael@0:   mova                m10, [pw_8]
michael@0: %define filter_y_a m8
michael@0: %define filter_y_b m9
michael@0: %define filter_rnd m10
michael@0: %else ; x86-32 or mmx
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0: ; x_offset == 0, reuse x_offset reg
michael@0: %define tempq x_offsetq
michael@0:   add y_offsetq, g_bilin_filterm
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0:   mov tempq, g_pw_8m
michael@0: %define filter_rnd [tempq]
michael@0: %else
michael@0:   add           y_offsetq, bilin_filter
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0: %define filter_rnd [pw_8]
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0: .x_zero_y_other_loop:
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m4, [srcq+src_strideq]
michael@0:   mova                 m1, [dstq]
michael@0: %if cpuflag(ssse3)
michael@0:   punpckhbw            m2, m0, m4
michael@0:   punpcklbw            m0, m4
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpckhbw            m3, m4, m5
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m4, m5
michael@0:   ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
michael@0:   ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
michael@0:   ; instructions is the same (5), but it is 1 mul instead of 2, so might be
michael@0:   ; slightly faster because of pmullw latency. It would also cut our rodata
michael@0:   ; tables in half for this function, and save 1-2 registers on x86-64.
michael@0:   pmullw               m2, filter_y_a
michael@0:   pmullw               m3, filter_y_b
michael@0:   paddw                m2, filter_rnd
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m4, filter_y_b
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m2, m3
michael@0:   paddw                m0, m4
michael@0: %endif
michael@0:   psraw                m2, 4
michael@0:   psraw                m0, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m2, [srcq+src_strideq]
michael@0:   movh                 m4, [srcq+src_strideq*2]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0: %if cpuflag(ssse3)
michael@0:   movh                 m1, [dstq]
michael@0:   punpcklbw            m0, m2
michael@0:   punpcklbw            m2, m4
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m4, m5
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m1, m2, filter_y_b
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_y_a
michael@0:   pmullw               m4, filter_y_b
michael@0:   paddw                m0, m1
michael@0:   paddw                m2, filter_rnd
michael@0:   movh                 m1, [dstq]
michael@0:   paddw                m2, m4
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_zero_y_other_loop
michael@0: %undef filter_y_a
michael@0: %undef filter_y_b
michael@0: %undef filter_rnd
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_nonzero:
michael@0:   cmp           x_offsetd, 8
michael@0:   jne .x_nonhalf
michael@0:   ; x_offset == 0.5
michael@0:   test          y_offsetd, y_offsetd
michael@0:   jnz .x_half_y_nonzero
michael@0: 
michael@0:   ; x_offset == 0.5 && y_offset == 0
michael@0: .x_half_y_zero_loop:
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m4, [srcq+1]
michael@0:   mova                 m1, [dstq]
michael@0:   pavgb                m0, m4
michael@0:   punpckhbw            m3, m1, m5
michael@0: %if %2 == 1 ; avg
michael@0:   pavgb                m0, [secq]
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m4, [srcq+1]
michael@0: %if %2 == 1 ; avg
michael@0: %if mmsize == 16
michael@0:   movhps               m0, [srcq+src_strideq]
michael@0:   movhps               m4, [srcq+src_strideq+1]
michael@0: %else ; mmsize == 8
michael@0:   punpckldq            m0, [srcq+src_strideq]
michael@0:   punpckldq            m4, [srcq+src_strideq+1]
michael@0: %endif
michael@0:   movh                 m1, [dstq]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   pavgb                m0, m4
michael@0:   punpcklbw            m3, m5
michael@0:   pavgb                m0, [secq]
michael@0:   punpcklbw            m1, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %else ; !avg
michael@0:   movh                 m2, [srcq+src_strideq]
michael@0:   movh                 m1, [dstq]
michael@0:   pavgb                m0, m4
michael@0:   movh                 m4, [srcq+src_strideq+1]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   pavgb                m2, m4
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_half_y_zero_loop
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_half_y_nonzero:
michael@0:   cmp           y_offsetd, 8
michael@0:   jne .x_half_y_nonhalf
michael@0: 
michael@0:   ; x_offset == 0.5 && y_offset == 0.5
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m3, [srcq+1]
michael@0:   add                srcq, src_strideq
michael@0:   pavgb                m0, m3
michael@0: .x_half_y_half_loop:
michael@0:   movu                 m4, [srcq]
michael@0:   movu                 m3, [srcq+1]
michael@0:   mova                 m1, [dstq]
michael@0:   pavgb                m4, m3
michael@0:   punpckhbw            m3, m1, m5
michael@0:   pavgb                m0, m4
michael@0: %if %2 == 1 ; avg
michael@0:   punpcklbw            m1, m5
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m3, [srcq+1]
michael@0:   add                srcq, src_strideq
michael@0:   pavgb                m0, m3
michael@0: .x_half_y_half_loop:
michael@0:   movh                 m2, [srcq]
michael@0:   movh                 m3, [srcq+1]
michael@0: %if %2 == 1 ; avg
michael@0: %if mmsize == 16
michael@0:   movhps               m2, [srcq+src_strideq]
michael@0:   movhps               m3, [srcq+src_strideq+1]
michael@0: %else
michael@0: %if %1 == 4
michael@0:   movh                 m1, [srcq+src_strideq]
michael@0:   punpckldq            m2, m1
michael@0:   movh                 m1, [srcq+src_strideq+1]
michael@0:   punpckldq            m3, m1
michael@0: %else
michael@0:   punpckldq            m2, [srcq+src_strideq]
michael@0:   punpckldq            m3, [srcq+src_strideq+1]
michael@0: %endif
michael@0: %endif
michael@0:   pavgb                m2, m3
michael@0: %if mmsize == 16
michael@0:   movlhps              m0, m2
michael@0:   movhlps              m4, m2
michael@0: %else ; mmsize == 8
michael@0:   punpckldq            m0, m2
michael@0:   pshufw               m4, m2, 0xe
michael@0: %endif
michael@0:   movh                 m1, [dstq]
michael@0:   pavgb                m0, m2
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   pavgb                m0, [secq]
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %else ; !avg
michael@0:   movh                 m4, [srcq+src_strideq]
michael@0:   movh                 m1, [srcq+src_strideq+1]
michael@0:   pavgb                m2, m3
michael@0:   pavgb                m4, m1
michael@0:   pavgb                m0, m2
michael@0:   pavgb                m2, m4
michael@0:   movh                 m1, [dstq]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_half_y_half_loop
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_half_y_nonhalf:
michael@0:   ; x_offset == 0.5 && y_offset == bilin interpolation
michael@0: %ifdef PIC
michael@0:   lea        bilin_filter, [bilin_filter_m]
michael@0: %endif
michael@0:   shl           y_offsetd, filter_idx_shift
michael@0: %if ARCH_X86_64 && mmsize == 16
michael@0:   mova                 m8, [bilin_filter+y_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                 m9, [bilin_filter+y_offsetq+16]
michael@0: %endif
michael@0:   mova                m10, [pw_8]
michael@0: %define filter_y_a m8
michael@0: %define filter_y_b m9
michael@0: %define filter_rnd m10
michael@0: %else  ;x86_32
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0: ; x_offset == 0.5. We can reuse x_offset reg
michael@0: %define tempq x_offsetq
michael@0:   add y_offsetq, g_bilin_filterm
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0:   mov tempq, g_pw_8m
michael@0: %define filter_rnd [tempq]
michael@0: %else
michael@0:   add           y_offsetq, bilin_filter
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0: %define filter_rnd [pw_8]
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m3, [srcq+1]
michael@0:   add                srcq, src_strideq
michael@0:   pavgb                m0, m3
michael@0: .x_half_y_other_loop:
michael@0:   movu                 m4, [srcq]
michael@0:   movu                 m2, [srcq+1]
michael@0:   mova                 m1, [dstq]
michael@0:   pavgb                m4, m2
michael@0: %if cpuflag(ssse3)
michael@0:   punpckhbw            m2, m0, m4
michael@0:   punpcklbw            m0, m4
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0:   psraw                m2, 4
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpckhbw            m3, m4, m5
michael@0:   pmullw               m2, filter_y_a
michael@0:   pmullw               m3, filter_y_b
michael@0:   paddw                m2, filter_rnd
michael@0:   punpcklbw            m0, m5
michael@0:   paddw                m2, m3
michael@0:   punpcklbw            m3, m4, m5
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m3, filter_y_b
michael@0:   paddw                m0, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   paddw                m0, m3
michael@0: %endif
michael@0:   punpckhbw            m3, m1, m5
michael@0:   psraw                m0, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m3, [srcq+1]
michael@0:   add                srcq, src_strideq
michael@0:   pavgb                m0, m3
michael@0: %if notcpuflag(ssse3)
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0: .x_half_y_other_loop:
michael@0:   movh                 m2, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0:   movh                 m4, [srcq+src_strideq]
michael@0:   movh                 m3, [srcq+src_strideq+1]
michael@0:   pavgb                m2, m1
michael@0:   pavgb                m4, m3
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0: %if cpuflag(ssse3)
michael@0:   movh                 m1, [dstq]
michael@0:   punpcklbw            m0, m2
michael@0:   punpcklbw            m2, m4
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m2, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m4, m5
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m1, m2, filter_y_b
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_y_a
michael@0:   paddw                m0, m1
michael@0:   pmullw               m1, m4, filter_y_b
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m2, m1
michael@0:   movh                 m1, [dstq]
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_half_y_other_loop
michael@0: %undef filter_y_a
michael@0: %undef filter_y_b
michael@0: %undef filter_rnd
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_nonhalf:
michael@0:   test          y_offsetd, y_offsetd
michael@0:   jnz .x_nonhalf_y_nonzero
michael@0: 
michael@0:   ; x_offset == bilin interpolation && y_offset == 0
michael@0: %ifdef PIC
michael@0:   lea        bilin_filter, [bilin_filter_m]
michael@0: %endif
michael@0:   shl           x_offsetd, filter_idx_shift
michael@0: %if ARCH_X86_64 && mmsize == 16
michael@0:   mova                 m8, [bilin_filter+x_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                 m9, [bilin_filter+x_offsetq+16]
michael@0: %endif
michael@0:   mova                m10, [pw_8]
michael@0: %define filter_x_a m8
michael@0: %define filter_x_b m9
michael@0: %define filter_rnd m10
michael@0: %else    ; x86-32
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0: ;y_offset == 0. We can reuse y_offset reg.
michael@0: %define tempq y_offsetq
michael@0:   add x_offsetq, g_bilin_filterm
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0:   mov tempq, g_pw_8m
michael@0: %define filter_rnd [tempq]
michael@0: %else
michael@0:   add           x_offsetq, bilin_filter
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0: %define filter_rnd [pw_8]
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0: .x_other_y_zero_loop:
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m4, [srcq+1]
michael@0:   mova                 m1, [dstq]
michael@0: %if cpuflag(ssse3)
michael@0:   punpckhbw            m2, m0, m4
michael@0:   punpcklbw            m0, m4
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpckhbw            m3, m4, m5
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m4, m5
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m4, filter_x_b
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m2, m3
michael@0:   paddw                m0, m4
michael@0: %endif
michael@0:   psraw                m2, 4
michael@0:   psraw                m0, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0:   movh                 m2, [srcq+src_strideq]
michael@0:   movh                 m4, [srcq+src_strideq+1]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0: %if cpuflag(ssse3)
michael@0:   punpcklbw            m0, m1
michael@0:   movh                 m1, [dstq]
michael@0:   punpcklbw            m2, m4
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m2, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m4, m5
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m4, filter_x_b
michael@0:   paddw                m0, m1
michael@0:   paddw                m2, filter_rnd
michael@0:   movh                 m1, [dstq]
michael@0:   paddw                m2, m4
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_other_y_zero_loop
michael@0: %undef filter_x_a
michael@0: %undef filter_x_b
michael@0: %undef filter_rnd
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_nonhalf_y_nonzero:
michael@0:   cmp           y_offsetd, 8
michael@0:   jne .x_nonhalf_y_nonhalf
michael@0: 
michael@0:   ; x_offset == bilin interpolation && y_offset == 0.5
michael@0: %ifdef PIC
michael@0:   lea        bilin_filter, [bilin_filter_m]
michael@0: %endif
michael@0:   shl           x_offsetd, filter_idx_shift
michael@0: %if ARCH_X86_64 && mmsize == 16
michael@0:   mova                 m8, [bilin_filter+x_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                 m9, [bilin_filter+x_offsetq+16]
michael@0: %endif
michael@0:   mova                m10, [pw_8]
michael@0: %define filter_x_a m8
michael@0: %define filter_x_b m9
michael@0: %define filter_rnd m10
michael@0: %else    ; x86-32
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0: ; y_offset == 0.5. We can reuse y_offset reg.
michael@0: %define tempq y_offsetq
michael@0:   add x_offsetq, g_bilin_filterm
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0:   mov tempq, g_pw_8m
michael@0: %define filter_rnd [tempq]
michael@0: %else
michael@0:   add           x_offsetq, bilin_filter
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0: %define filter_rnd [pw_8]
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m1, [srcq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   punpckhbw            m2, m0, m1
michael@0:   punpcklbw            m0, m1
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, m1
michael@0:   paddw                m2, m3
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0:   add                srcq, src_strideq
michael@0:   packuswb             m0, m2
michael@0: .x_other_y_half_loop:
michael@0:   movu                 m4, [srcq]
michael@0:   movu                 m3, [srcq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   mova                 m1, [dstq]
michael@0:   punpckhbw            m2, m4, m3
michael@0:   punpcklbw            m4, m3
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m4, filter_x_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m4, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   psraw                m4, 4
michael@0:   packuswb             m4, m2
michael@0:   pavgb                m0, m4
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m1, m5
michael@0: %else
michael@0:   punpckhbw            m2, m4, m5
michael@0:   punpckhbw            m1, m3, m5
michael@0:   punpcklbw            m4, m5
michael@0:   punpcklbw            m3, m5
michael@0:   pmullw               m4, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m4, filter_rnd
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m4, m3
michael@0:   paddw                m2, m1
michael@0:   mova                 m1, [dstq]
michael@0:   psraw                m4, 4
michael@0:   psraw                m2, 4
michael@0:   punpckhbw            m3, m1, m5
michael@0:   ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
michael@0:   ; have a 1-register shortage to be able to store the backup of the bilin
michael@0:   ; filtered second line as words as cache for the next line. Packing into
michael@0:   ; a byte costs 1 pack and 2 unpacks, but saves a register.
michael@0:   packuswb             m4, m2
michael@0:   punpcklbw            m1, m5
michael@0:   pavgb                m0, m4
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   pavgb                m0, [secq]
michael@0: %endif
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   add                srcq, src_strideq
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   punpcklbw            m0, m1
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m0, m1
michael@0: %endif
michael@0:   add                srcq, src_strideq
michael@0:   psraw                m0, 4
michael@0: .x_other_y_half_loop:
michael@0:   movh                 m2, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0:   movh                 m4, [srcq+src_strideq]
michael@0:   movh                 m3, [srcq+src_strideq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   punpcklbw            m2, m1
michael@0:   punpcklbw            m4, m3
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m4, filter_x_a
michael@0:   movh                 m1, [dstq]
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m4, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m1, m5
michael@0:   punpcklbw            m4, m5
michael@0:   punpcklbw            m3, m5
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   pmullw               m4, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m4, filter_rnd
michael@0:   paddw                m2, m1
michael@0:   movh                 m1, [dstq]
michael@0:   paddw                m4, m3
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0: %endif
michael@0:   psraw                m2, 4
michael@0:   psraw                m4, 4
michael@0:   pavgw                m0, m2
michael@0:   pavgw                m2, m4
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline - also consider going to bytes here
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   lea                srcq, [srcq+src_strideq*2]
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_other_y_half_loop
michael@0: %undef filter_x_a
michael@0: %undef filter_x_b
michael@0: %undef filter_rnd
michael@0:   STORE_AND_RET
michael@0: 
michael@0: .x_nonhalf_y_nonhalf:
michael@0: %ifdef PIC
michael@0:   lea        bilin_filter, [bilin_filter_m]
michael@0: %endif
michael@0:   shl           x_offsetd, filter_idx_shift
michael@0:   shl           y_offsetd, filter_idx_shift
michael@0: %if ARCH_X86_64 && mmsize == 16
michael@0:   mova                 m8, [bilin_filter+x_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                 m9, [bilin_filter+x_offsetq+16]
michael@0: %endif
michael@0:   mova                m10, [bilin_filter+y_offsetq]
michael@0: %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
michael@0:   mova                m11, [bilin_filter+y_offsetq+16]
michael@0: %endif
michael@0:   mova                m12, [pw_8]
michael@0: %define filter_x_a m8
michael@0: %define filter_x_b m9
michael@0: %define filter_y_a m10
michael@0: %define filter_y_b m11
michael@0: %define filter_rnd m12
michael@0: %else   ; x86-32
michael@0: %if ARCH_X86=1 && CONFIG_PIC=1
michael@0: ; In this case, there is NO unused register. Used src_stride register. Later,
michael@0: ; src_stride has to be loaded from stack when it is needed.
michael@0: %define tempq src_strideq
michael@0:   mov tempq, g_bilin_filterm
michael@0:   add           x_offsetq, tempq
michael@0:   add           y_offsetq, tempq
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0: 
michael@0:   mov tempq, g_pw_8m
michael@0: %define filter_rnd [tempq]
michael@0: %else
michael@0:   add           x_offsetq, bilin_filter
michael@0:   add           y_offsetq, bilin_filter
michael@0: %define filter_x_a [x_offsetq]
michael@0: %define filter_x_b [x_offsetq+16]
michael@0: %define filter_y_a [y_offsetq]
michael@0: %define filter_y_b [y_offsetq+16]
michael@0: %define filter_rnd [pw_8]
michael@0: %endif
michael@0: %endif
michael@0: 
michael@0:   ; x_offset == bilin interpolation && y_offset == bilin interpolation
michael@0: %if %1 == 16
michael@0:   movu                 m0, [srcq]
michael@0:   movu                 m1, [srcq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   punpckhbw            m2, m0, m1
michael@0:   punpcklbw            m0, m1
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpckhbw            m3, m1, m5
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, m1
michael@0:   paddw                m2, m3
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0: 
michael@0:   INC_SRC_BY_SRC_STRIDE
michael@0: 
michael@0:   packuswb             m0, m2
michael@0: .x_other_y_other_loop:
michael@0: %if cpuflag(ssse3)
michael@0:   movu                 m4, [srcq]
michael@0:   movu                 m3, [srcq+1]
michael@0:   mova                 m1, [dstq]
michael@0:   punpckhbw            m2, m4, m3
michael@0:   punpcklbw            m4, m3
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m4, filter_x_a
michael@0:   punpckhbw            m3, m1, m5
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m4, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   psraw                m4, 4
michael@0:   packuswb             m4, m2
michael@0:   punpckhbw            m2, m0, m4
michael@0:   punpcklbw            m0, m4
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   punpcklbw            m1, m5
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   psraw                m0, 4
michael@0: %else
michael@0:   movu                 m3, [srcq]
michael@0:   movu                 m4, [srcq+1]
michael@0:   punpckhbw            m1, m3, m5
michael@0:   punpckhbw            m2, m4, m5
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m4, m5
michael@0:   pmullw               m3, filter_x_a
michael@0:   pmullw               m4, filter_x_b
michael@0:   paddw                m3, filter_rnd
michael@0:   pmullw               m1, filter_x_a
michael@0:   pmullw               m2, filter_x_b
michael@0:   paddw                m1, filter_rnd
michael@0:   paddw                m3, m4
michael@0:   paddw                m1, m2
michael@0:   psraw                m3, 4
michael@0:   psraw                m1, 4
michael@0:   packuswb             m4, m3, m1
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0:   pmullw               m2, filter_y_a
michael@0:   pmullw               m1, filter_y_b
michael@0:   paddw                m2, filter_rnd
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m3, filter_y_b
michael@0:   paddw                m2, m1
michael@0:   mova                 m1, [dstq]
michael@0:   paddw                m0, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   paddw                m0, m3
michael@0:   punpckhbw            m3, m1, m5
michael@0:   psraw                m0, 4
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   INC_SRC_BY_SRC_STRIDE
michael@0:   add                dstq, dst_strideq
michael@0: %else ; %1 < 16
michael@0:   movh                 m0, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0: %if cpuflag(ssse3)
michael@0:   punpcklbw            m0, m1
michael@0:   pmaddubsw            m0, filter_x_a
michael@0:   paddw                m0, filter_rnd
michael@0: %else
michael@0:   punpcklbw            m0, m5
michael@0:   punpcklbw            m1, m5
michael@0:   pmullw               m0, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m0, m1
michael@0: %endif
michael@0:   psraw                m0, 4
michael@0: %if cpuflag(ssse3)
michael@0:   packuswb             m0, m0
michael@0: %endif
michael@0: 
michael@0:   INC_SRC_BY_SRC_STRIDE
michael@0: 
michael@0: .x_other_y_other_loop:
michael@0:   movh                 m2, [srcq]
michael@0:   movh                 m1, [srcq+1]
michael@0: 
michael@0:   INC_SRC_BY_SRC_STRIDE
michael@0:   movh                 m4, [srcq]
michael@0:   movh                 m3, [srcq+1]
michael@0: 
michael@0: %if cpuflag(ssse3)
michael@0:   punpcklbw            m2, m1
michael@0:   punpcklbw            m4, m3
michael@0:   pmaddubsw            m2, filter_x_a
michael@0:   pmaddubsw            m4, filter_x_a
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   movh                 m1, [dstq]
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m4, filter_rnd
michael@0:   psraw                m2, 4
michael@0:   psraw                m4, 4
michael@0:   packuswb             m2, m2
michael@0:   packuswb             m4, m4
michael@0:   punpcklbw            m0, m2
michael@0:   punpcklbw            m2, m4
michael@0:   pmaddubsw            m0, filter_y_a
michael@0:   pmaddubsw            m2, filter_y_a
michael@0:   punpcklbw            m3, m5
michael@0:   paddw                m0, filter_rnd
michael@0:   paddw                m2, filter_rnd
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0:   punpcklbw            m1, m5
michael@0: %else
michael@0:   punpcklbw            m2, m5
michael@0:   punpcklbw            m1, m5
michael@0:   punpcklbw            m4, m5
michael@0:   punpcklbw            m3, m5
michael@0:   pmullw               m2, filter_x_a
michael@0:   pmullw               m1, filter_x_b
michael@0:   paddw                m2, filter_rnd
michael@0:   pmullw               m4, filter_x_a
michael@0:   pmullw               m3, filter_x_b
michael@0:   paddw                m4, filter_rnd
michael@0:   paddw                m2, m1
michael@0:   paddw                m4, m3
michael@0:   psraw                m2, 4
michael@0:   psraw                m4, 4
michael@0:   pmullw               m0, filter_y_a
michael@0:   pmullw               m3, m2, filter_y_b
michael@0:   paddw                m0, filter_rnd
michael@0:   pmullw               m2, filter_y_a
michael@0:   pmullw               m1, m4, filter_y_b
michael@0:   paddw                m2, filter_rnd
michael@0:   paddw                m0, m3
michael@0:   movh                 m3, [dstq+dst_strideq]
michael@0:   paddw                m2, m1
michael@0:   movh                 m1, [dstq]
michael@0:   psraw                m0, 4
michael@0:   psraw                m2, 4
michael@0:   punpcklbw            m3, m5
michael@0:   punpcklbw            m1, m5
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   ; FIXME(rbultje) pipeline
michael@0:   packuswb             m0, m2
michael@0:   pavgb                m0, [secq]
michael@0:   punpckhbw            m2, m0, m5
michael@0:   punpcklbw            m0, m5
michael@0: %endif
michael@0:   SUM_SSE              m0, m1, m2, m3, m6, m7
michael@0:   mova                 m0, m4
michael@0: 
michael@0:   INC_SRC_BY_SRC_STRIDE
michael@0:   lea                dstq, [dstq+dst_strideq*2]
michael@0: %endif
michael@0: %if %2 == 1 ; avg
michael@0:   add                secq, sec_str
michael@0: %endif
michael@0:   dec                   h
michael@0:   jg .x_other_y_other_loop
michael@0: %undef filter_x_a
michael@0: %undef filter_x_b
michael@0: %undef filter_y_a
michael@0: %undef filter_y_b
michael@0: %undef filter_rnd
michael@0:   STORE_AND_RET
michael@0: %endmacro
michael@0: 
michael@0: ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
michael@0: ; between the ssse3 and non-ssse3 version. It may make sense to merge their
michael@0: ; code in the sense that the ssse3 version would jump to the appropriate
michael@0: ; location in the sse/2 version, rather than duplicating that code in the
michael@0: ; binary.
michael@0: 
michael@0: INIT_MMX sse
michael@0: SUBPEL_VARIANCE  4
michael@0: INIT_XMM sse2
michael@0: SUBPEL_VARIANCE  8
michael@0: SUBPEL_VARIANCE 16
michael@0: 
michael@0: INIT_MMX ssse3
michael@0: SUBPEL_VARIANCE  4
michael@0: INIT_XMM ssse3
michael@0: SUBPEL_VARIANCE  8
michael@0: SUBPEL_VARIANCE 16
michael@0: 
michael@0: INIT_MMX sse
michael@0: SUBPEL_VARIANCE  4, 1
michael@0: INIT_XMM sse2
michael@0: SUBPEL_VARIANCE  8, 1
michael@0: SUBPEL_VARIANCE 16, 1
michael@0: 
michael@0: INIT_MMX ssse3
michael@0: SUBPEL_VARIANCE  4, 1
michael@0: INIT_XMM ssse3
michael@0: SUBPEL_VARIANCE  8, 1
michael@0: SUBPEL_VARIANCE 16, 1