;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels.
; The input pixel array has output_height rows. This routine assumes that
; output_height is an even number. This function handles 8 pixels in the horizontal
; direction, calculating one row per iteration to take advantage of 128-bit
; operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx
;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm2, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm2                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1, xmm5

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    dec         rcx

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7
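    ; Note: at this point xmm0 holds k0*p[-2]+k5*p[3] plus k2*p[0]+k4*p[2], and
    ; xmm2 holds k1*p[-1]+k3*p[1] plus the rounding constant rd (eight words of
    ; 64), so the add / arithmetic shift right by VP8_FILTER_SHIFT (7) /
    ; packuswb below compute (6-tap sum + 64) >> 7, saturated to [0, 255].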
    paddsw      xmm0, xmm2

    psraw       xmm0, 7

    packuswb    xmm0, xmm0

    movq        MMWORD PTR [rdi], xmm0
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm1, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm1                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3

    pshufb      xmm2, xmm4
    pmaddubsw   xmm0, xmm5

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    dec         rcx

    paddsw      xmm0, xmm7

    paddsw      xmm0, xmm2

    psraw       xmm0, 7

    packuswb    xmm0, xmm0

    movq        MMWORD PTR [rdi], xmm0

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    mov         rdi, arg(2)             ;output_ptr

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm3, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm3                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4
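    ; Note: xmm1 keeps a copy of the interleaved source bytes; the pshufb masks
    ; below (shuf2bfrom1, shuf3bfrom1) regroup them into (p[0], p[+2]) and
    ; (p[-1], p[+1]) byte pairs so pmaddubsw against k2_k4 and k1_k3 yields
    ; k2*p[0]+k4*p[+2] and k1*p[-1]+k3*p[+1] for each of the eight pixels.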
    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]

    pmaddubsw   xmm1, xmm5
    movq        xmm7, MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2, xmm6
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, 7
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6
    packuswb    xmm0, xmm0

    lea         rsi, [rsi + rax]
    paddsw      xmm3, xmm1

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]

    psraw       xmm3, 7

    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3

    movdqa      XMMWORD PTR [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

    ;--
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    ;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    pxor        xmm1, xmm1
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0
    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

    ;--
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    ;--
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    psraw       xmm1, 7
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1

    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]
    je          .vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx

.vp8_filter_block1d16_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

    punpcklbw   xmm2, xmm4                          ;B D
    punpcklbw   xmm3, xmm0                          ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F
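    ; Note: rows A..F are six consecutive source rows. They are interleaved as
    ; (B,D), (C,E) and (A,F) to match the coefficient byte pairing, so each
    ; pmaddubsw against k1_k3, k2_k4 and k0_k5 applies two vertical taps at
    ; once; the sums are then rounded with rd (64), shifted right by 7 and
    ; saturated to bytes.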
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                          ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2              ;store the results

    movq        xmm1, MMWORD PTR [rsi + 8]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    punpcklbw   xmm2, xmm4                          ;B D
    punpcklbw   xmm3, xmm0                          ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                          ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx

.vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

    punpcklbw   xmm2, xmm4                          ;B D
    punpcklbw   xmm3, xmm0                          ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                          ;B D
    punpcklbw   xmm1, xmm0                          ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, 7
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5

    movdqa      XMMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]
    je          .vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E

    punpcklbw   xmm2, xmm4                          ;B D
    punpcklbw   xmm3, xmm0                          ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                          ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E
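    ; Note: this is the 4-tap vertical path, taken when the selected filter's
    ; outer taps are zero (the cmp/je on the first dword of the k0_k5 row), so
    ; rows A and F drop out and only k1*B + k3*D and k2*C + k4*E contribute.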
    punpcklbw   xmm2, xmm4                          ;B D
    punpcklbw   xmm3, xmm0                          ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]
    je          .vp8_filter_block1d4_v4_ssse3

    movq        mm5, MMWORD PTR [rax]         ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                ;A
    movd        mm2, DWORD PTR [rsi + rdx]          ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E

    punpcklbw   mm2, mm4                            ;B D
    punpcklbw   mm3, mm0                            ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]      ;F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                            ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]          ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E

    punpcklbw   mm2, mm4                            ;B D
    punpcklbw   mm3, mm0                            ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi, rdx
    add         rax, rdx
    ;--
    ;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)   ; xoffset

    cmp         rax, 0                  ; skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; HFilter

    mov         rdi, arg(4)             ; dst_ptr
    mov         rsi, arg(0)             ; src_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm1, [rax]

    movsxd      rax, dword ptr arg(3)   ; yoffset

    cmp         rax, 0                  ; skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]
    movsxd      rdx, dword ptr arg(1)   ; src_pixels_per_line

    movdqa      xmm2, [rax]

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)    ; dst_pitch
%endif
    movq        xmm3, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5              ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]           ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]           ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]        ; next line

    pmaddubsw   xmm3, xmm1              ; 00 02 04 06 08 10 12 14

    punpcklbw   xmm4, xmm5              ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1              ; 01 03 05 07 09 11 13 15

    paddw       xmm3, [GLOBAL(rd)]      ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT  ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]      ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT  ; xmm4 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4              ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    movq        xmm6, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]           ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]           ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]        ; next line

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]      ; xmm6 += round value
    psraw       xmm6, VP8_FILTER_SHIFT  ; xmm6 /= 128

    paddw       xmm4, [GLOBAL(rd)]      ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT  ; xmm4 /= 128

    packuswb    xmm6, xmm4
    movdqa      xmm5, xmm7

    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]      ; xmm5 += round value
    psraw       xmm5, VP8_FILTER_SHIFT  ; xmm5 /= 128

    paddw       xmm7, [GLOBAL(rd)]      ; xmm7 += round value
    psraw       xmm7, VP8_FILTER_SHIFT  ; xmm7 /= 128

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6

    movdqa      [rdi], xmm5             ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)   ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done

.b16x16_sp_only:
    movsxd      rax, dword ptr arg(3)   ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    mov         rdi, arg(4)             ; dst_ptr
    mov         rsi, arg(0)             ; src_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm1, [rax]             ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]
    movsxd      rax, dword ptr arg(1)   ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4, [rsi]             ; load row 0
    movq        xmm2, [rsi + 8]         ; load row 0

    lea         rsi, [rsi + rax]        ; next line
.next_row_sp:
    movq        xmm3, [rsi]             ; load row + 1
    movq        xmm5, [rsi + 8]         ; load row + 1

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]       ; load row + 2

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]   ; load row + 2

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4             ; store row 0
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7

    movdqa      [rdi + rdx], xmm3       ; store row 1
    lea         rsi, [rsi + 2*rax]
    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .next_row_sp

    jmp         .done

.b16x16_fp_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]
    movsxd      rax, dword ptr arg(1)   ; src_pixels_per_line

.next_row_fp:
    movq        xmm2, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]           ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]           ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]        ; next line
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2             ; store the results in the destination
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]        ; dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]        ; next line

    movdqa      [rdi], xmm5             ; store the results in the destination
    lea         rdi, [rdi + rdx]        ; dst_pitch

    cmp         rdi, rcx

    jne         .next_row_fp

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                ; reserve 144 bytes

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
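    ;Note: an 8x8 bilinear prediction needs 9 source rows (8 output rows plus
    ;one extra row for the vertical second pass). The nine movdqu loads below
    ;read them once from unaligned memory and park them in the 144-byte
    ;(9 * 16) aligned buffer reserved on the stack above, so the filter loops
    ;can reread them with aligned movdqa accesses.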
    movdqu      xmm0, [rsi]
    lea         rax, [rdx + rdx*2]
    movdqu      xmm1, [rsi+rdx]
    movdqu      xmm2, [rsi+rdx*2]
    add         rsi, rax
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rdx]
    movdqu      xmm5, [rsi+rdx*2]
    add         rsi, rax
    movdqu      xmm6, [rsi]
    movdqu      xmm7, [rsi+rdx]

    movdqa      XMMWORD PTR [rsp], xmm0

    movdqu      xmm0, [rsi+rdx*2]

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)   ; xoffset
    cmp         rax, 0                  ; skip first_pass filter if xoffset=0
    je          .b8x8_sp_only

    shl         rax, 4
    add         rax, rcx                ; HFilter

    mov         rdi, arg(4)             ; dst_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm0, [rax]

    movsxd      rax, dword ptr arg(3)   ; yoffset
    cmp         rax, 0                  ; skip second_pass filter if yoffset=0
    je          .b8x8_fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    lea         rcx, [rdi+rdx*8]

    movdqa      xmm1, [rax]

    ; get the first horizontal line done
    movdqa      xmm3, [rsp]             ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5, xmm3              ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

    psrldq      xmm5, 1
    lea         rsp, [rsp + 16]         ; next line

    punpcklbw   xmm3, xmm5              ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3, xmm0              ; 00 02 04 06 08 10 12 14

    paddw       xmm3, [GLOBAL(rd)]      ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT  ; xmm3 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm7              ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    movdqa      xmm6, [rsp]             ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp, [rsp + 16]         ; next line

    movdqa      xmm5, xmm6

    psrldq      xmm5, 1

    punpcklbw   xmm6, xmm5
    pmaddubsw   xmm6, xmm0

    paddw       xmm6, [GLOBAL(rd)]      ; xmm6 += round value
    psraw       xmm6, VP8_FILTER_SHIFT  ; xmm6 /= 128

    packuswb    xmm6, xmm6

    punpcklbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm1

    paddw       xmm7, [GLOBAL(rd)]      ; xmm7 += round value
    psraw       xmm7, VP8_FILTER_SHIFT  ; xmm7 /= 128

    packuswb    xmm7, xmm7

    movq        [rdi], xmm7             ; store the results in the destination
    lea         rdi, [rdi + rdx]

    movdqa      xmm7, xmm6

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done8x8

.b8x8_sp_only:
    movsxd      rax, dword ptr arg(3)   ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    mov         rdi, arg(4)             ;dst_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm0, [rax]             ; VFilter

    movq        xmm1, XMMWORD PTR [rsp]
    movq        xmm2, XMMWORD PTR [rsp+16]
    movq        xmm3, XMMWORD PTR [rsp+32]
    punpcklbw   xmm1, xmm2

    movq        xmm4, XMMWORD PTR [rsp+48]
    punpcklbw   xmm2, xmm3

    movq        xmm5, XMMWORD PTR [rsp+64]
    punpcklbw   xmm3, xmm4

    movq        xmm6, XMMWORD PTR [rsp+80]
    punpcklbw   xmm4, xmm5

    movq        xmm7, XMMWORD PTR [rsp+96]
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm1, xmm0
    pmaddubsw   xmm2, xmm0

    pmaddubsw   xmm3, xmm0
    pmaddubsw   xmm4, xmm0

    pmaddubsw   xmm5, xmm0
    punpcklbw   xmm6, xmm7

    pmaddubsw   xmm6, xmm0
    paddw       xmm1, [GLOBAL(rd)]

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    psraw       xmm6, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    packuswb    xmm2, xmm2
    movq        [rdi], xmm1

    packuswb    xmm3, xmm3
    movq        [rdi+rdx], xmm2

    packuswb    xmm4, xmm4
    movq        xmm1, XMMWORD PTR [rsp+112]

    lea         rdi, [rdi + 2*rdx]
    movq        xmm2, XMMWORD PTR [rsp+128]

    packuswb    xmm5, xmm5
    movq        [rdi], xmm3

    packuswb    xmm6, xmm6
    movq        [rdi+rdx], xmm4

    lea         rdi, [rdi + 2*rdx]
    punpcklbw   xmm7, xmm1

    movq        [rdi], xmm5
    pmaddubsw   xmm7, xmm0

    movq        [rdi+rdx], xmm6
    punpcklbw   xmm1, xmm2

    pmaddubsw   xmm1, xmm0
    paddw       xmm7, [GLOBAL(rd)]

    psraw       xmm7, VP8_FILTER_SHIFT
    paddw       xmm1, [GLOBAL(rd)]

    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm7, xmm7

    packuswb    xmm1, xmm1
    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm7

    movq        [rdi+rdx], xmm1
    lea         rsp, [rsp + 144]

    jmp         .done8x8

.b8x8_fp_only:
    lea         rcx, [rdi+rdx*8]

.next_row_fp:
    movdqa      xmm1, XMMWORD PTR [rsp]
    movdqa      xmm3, XMMWORD PTR [rsp+16]

    movdqa      xmm2, xmm1
    movdqa      xmm5, XMMWORD PTR [rsp+32]

    psrldq      xmm2, 1
    movdqa      xmm7, XMMWORD PTR [rsp+48]

    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    movdqa      xmm6, xmm5
    psrldq      xmm6, 1

    punpcklbw   xmm1, xmm2
    pmaddubsw   xmm1, xmm0

    punpcklbw   xmm3, xmm4
    pmaddubsw   xmm3, xmm0

    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm0

    movdqa      xmm2, xmm7
    psrldq      xmm2, 1

    punpcklbw   xmm7, xmm2
    pmaddubsw   xmm7, xmm0

    paddw       xmm1, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm1
    packuswb    xmm3, xmm3

    packuswb    xmm5, xmm5
    movq        [rdi], xmm1

    packuswb    xmm7, xmm7
    movq        [rdi+rdx], xmm3

    lea         rdi, [rdi + 2*rdx]
    movq        [rdi], xmm5

    lea         rsp, [rsp + 4*16]
    movq        [rdi+rdx], xmm7

    lea         rdi, [rdi + 2*rdx]
    cmp         rdi, rcx

    jne         .next_row_fp

    lea         rsp, [rsp + 16]

.done8x8:
    ;add rsp, 144
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

align 16
shuf2bfrom1:
    db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7, 11, 9, 13
align 16
shuf3bfrom1:
    db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7, 11

align 16
rd:
    times 8 dw 0x40

align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db   0,   0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,   0         ;placeholder
    times 8 db 123,  -1
    times 8 db 108,  -8
    times 8 db  93,  -6
    times 8 db  77, -16
    times 8 db  50,  -9
    times 8 db  36, -11
    times 8 db  12,  -6
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128,   0
    times 8 db 112,  16
    times 8 db  96,  32
    times 8 db  80,  48
    times 8 db  64,  64
    times 8 db  48,  80
    times 8 db  32,  96
    times 8 db  16, 112
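; Note on the tables above: k0_k5, k1_k3 and k2_k4 store the VP8 six-tap subpel
; filter coefficients as interleaved signed byte pairs -- (k0,k5), (k1,k3) and
; (k2,k4) -- repeated eight times per row so a single pmaddubsw applies two
; taps at once; the first row of each table is the full-pel case (marked
; ;placeholder), which the functions above divert to their 4-tap paths.
; vp8_bilinear_filters_ssse3 row i holds the weight pair (128 - 16*i, 16*i),
; giving out = ((128 - w)*p0 + w*p1 + 64) >> 7 with VP8_FILTER_SHIFT = 7.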