;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7


;void vp8_filter_block1d_h6_mmx
;(
;   unsigned char *src_ptr,
;   unsigned short *output_ptr,
;   unsigned int src_pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx, arg(6)             ;vp8_filter

        movq        mm1, [rdx + 16]         ; do both the negative taps first!!!
        movq        mm2, [rdx + 32]         ;
        movq        mm6, [rdx + 48]         ;
        movq        mm7, [rdx + 64]         ;

        mov         rdi, arg(1)             ;output_ptr
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rcx, dword ptr arg(4)   ;output_height
        movsxd      rax, dword ptr arg(5)   ;output_width      ; destination pitch?
        pxor        mm0, mm0                ; mm0 = 00000000

.nextrow:
        movq        mm3, [rsi-2]            ; mm3 = p-2..p5
        movq        mm4, mm3                ; mm4 = p-2..p5
        psrlq       mm3, 8                  ; mm3 = p-1..p5
        punpcklbw   mm3, mm0                ; mm3 = p-1..p2
        pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.
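        ; note: kernel N lives at vp8_filter + 16*N (each tap is stored as 8
        ; replicated words). The remaining five taps are accumulated below,
        ; then the sum is biased by rd (64) and shifted by VP8_FILTER_SHIFT
        ; (7), i.e. out = (sum + 64) >> 7, before being packed back to bytes.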

        movq        mm5, mm4                ; mm5 = p-2..p5
        punpckhbw   mm4, mm0                ; mm4 = p2..p5
        pmullw      mm4, mm7                ; mm4 *= kernel 4 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, mm5                ; mm4 = p-2..p5
        psrlq       mm5, 16                 ; mm5 = p0..p5
        punpcklbw   mm5, mm0                ; mm5 = p0..p3
        pmullw      mm5, mm2                ; mm5 *= kernel 2 modifiers
        paddsw      mm3, mm5                ; mm3 += mm5

        movq        mm5, mm4                ; mm5 = p-2..p5
        psrlq       mm4, 24                 ; mm4 = p1..p5
        punpcklbw   mm4, mm0                ; mm4 = p1..p4
        pmullw      mm4, mm6                ; mm4 *= kernel 3 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        ; do outer positive taps
        movd        mm4, [rsi+3]
        punpcklbw   mm4, mm0                ; mm4 = p3..p6
        pmullw      mm4, [rdx+80]           ; mm4 *= kernel 5 modifiers
        paddsw      mm3, mm4                ; mm3 += mm4

        punpcklbw   mm5, mm0                ; mm5 = p-2..p1
        pmullw      mm5, [rdx]              ; mm5 *= kernel 0 modifiers
        paddsw      mm3, mm5                ; mm3 += mm5

        paddsw      mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        packuswb    mm3, mm0                ; pack and unpack to saturate
        punpcklbw   mm3, mm0                ;

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(2)   ;src_pixels_per_line ; next line
        add         rdi, rax
%else
        movsxd      r8, dword ptr arg(2)    ;src_pixels_per_line
        add         rdi, rax

        add         rsi, r8                 ; next line
%endif

        dec         rcx                     ; decrement count
        jnz         .nextrow                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        movq        mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7)             ;vp8_filter
        movq        mm1, [rbx + 16]         ; do both the negative taps first!!!
        movq        mm2, [rbx + 32]         ;
        movq        mm6, [rbx + 48]         ;
        movq        mm7, [rbx + 64]         ;

        movsxd      rdx, dword ptr arg(3)   ;pixels_per_line
        mov         rdi, arg(1)             ;output_ptr
        mov         rsi, arg(0)             ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx
        movsxd      rcx, DWORD PTR arg(5)   ;output_height
        movsxd      rax, DWORD PTR arg(2)   ;output_pitch      ; destination pitch?
        pxor        mm0, mm0                ; mm0 = 00000000


.nextrow_cv:
        movq        mm3, [rsi+rdx]          ; mm3 = p0..p3 = row -1
        pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.


        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3 = row 2
        pmullw      mm4, mm7                ; mm4 *= kernel 4 modifiers.
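        ; note: rows -1 and 2 are multiplied above; rows -2, 0, 1 and 3 follow
        ; below. rsi is advanced by one line part-way through the loop so rows
        ; 1 and 3 remain addressable as [rsi + 2*rdx] and [rsi + 4*rdx] without
        ; needing a 3*pitch operand.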
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3 = row 0
        pmullw      mm4, mm2                ; mm4 *= kernel 2 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi]              ; mm4 = p0..p3 = row -2
        pmullw      mm4, [rbx]              ; mm4 *= kernel 0 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4


        add         rsi, rdx                ; move source forward 1 line to avoid 3 * pitch
        movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3 = row 1
        pmullw      mm4, mm6                ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3 = row 3
        pmullw      mm4, [rbx +80]          ; mm4 *= kernel 5 modifiers.
        paddsw      mm3, mm4                ; mm3 += mm4


        paddsw      mm3, mm5                ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        packuswb    mm3, mm0                ; pack and saturate

        movd        [rdi], mm3              ; store the results in the destination
        ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
        ; recon block should be in cache this shouldn't cost much. It's obviously
        ; avoidable!!!
        lea         rdi, [rdi+rax]
        dec         rcx                     ; decrement count
        jnz         .nextrow_cv             ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict8x8_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        shl         rax, 5                  ; offset * 32
        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax, rcx                ; HFilter
        mov         rsi, arg(0)             ;src_ptr

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;

        shl         rax, 5                  ; offset*32
        add         rax, rcx                ; VFilter

        lea         rcx, [rdi+rdx*8]        ;
        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line



        ; get the first horizontal line done
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
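        ; note: this is the two-tap horizontal pass, p[x]*HFilter[0] +
        ; p[x+1]*HFilter[1]; mm3 holds the low four sums and mm4 (accumulated
        ; next) the high four, before each is rounded with rd and shifted by
        ; VP8_FILTER_SHIFT.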
        paddw       mm4, mm6                ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;

        add         rsi, rdx                ; next line
.next_row_8x8:
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        movq        mm5, mm7                ;
        movq        mm6, mm7                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0

        pmullw      mm5, [rax]              ;
        pmullw      mm6, [rax]              ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;


        pmullw      mm3, [rax+16]           ;
        pmullw      mm4, [rax+16]           ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        packuswb    mm3, mm4

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8                 ;dst_pitch
%endif
        cmp         rdi, rcx                ;
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict8x4_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax, 5

        mov         rsi, arg(0)             ;src_ptr
        add         rax, rcx

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
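        ; note: mm1/mm2 now hold the two horizontal taps selected by xoffset;
        ; judging by the 32-byte stride used above, each entry of the external
        ; vp8_bilinear_filters_x86_8 table appears to be two taps of eight
        ; replicated words. The vertical taps are selected from yoffset below.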
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;
        shl         rax, 5

        add         rax, rcx
        lea         rcx, [rdi+rdx*4]        ;

        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

        ; get the first horizontal line done
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;

        add         rsi, rdx                ; next line
.next_row_8x4:
        movq        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4, mm3                ; make a copy of current line

        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4, mm0                ;

        pmullw      mm3, mm1                ;
        pmullw      mm4, mm1                ;

        movq        mm5, [rsi+1]            ;
        movq        mm6, mm5                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0                ;

        pmullw      mm5, mm2                ;
        pmullw      mm6, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;

        movq        mm5, mm7                ;
        movq        mm6, mm7                ;

        punpcklbw   mm5, mm0                ;
        punpckhbw   mm6, mm0

        pmullw      mm5, [rax]              ;
        pmullw      mm6, [rax]              ;

        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        movq        mm7, mm3                ;
        packuswb    mm7, mm4                ;


        pmullw      mm3, [rax+16]           ;
        pmullw      mm4, [rax+16]           ;

        paddw       mm3, mm5                ;
        paddw       mm4, mm6                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        paddw       mm4, [GLOBAL(rd)]       ;
        psraw       mm4, VP8_FILTER_SHIFT   ;

        packuswb    mm3, mm4

        movq        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8
%endif
        cmp         rdi, rcx                ;
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_bilinear_predict4x4_mmx
;(
;   unsigned char *src_ptr,
;   int src_pixels_per_line,
;   int xoffset,
;   int yoffset,
;   unsigned char *dst_ptr,
;   int dst_pitch
;)
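; note: unlike the 8-pixel-wide variants above, this routine works on 4 pixels
; per row, so it loads and stores with movd and only the low halves of the MMX
; registers carry data.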
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
        ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax, dword ptr arg(2)   ;xoffset
        mov         rdi, arg(4)             ;dst_ptr

        lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax, 5

        add         rax, rcx                ; HFilter
        mov         rsi, arg(0)             ;src_ptr

        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
        movq        mm1, [rax]              ;

        movq        mm2, [rax+16]           ;
        movsxd      rax, dword ptr arg(3)   ;yoffset

        pxor        mm0, mm0                ;
        shl         rax, 5

        add         rax, rcx
        lea         rcx, [rdi+rdx*4]        ;

        movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

        ; get the first horizontal line done
        movd        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06

        pmullw      mm3, mm1                ;
        movd        mm5, [rsi+1]            ;

        punpcklbw   mm5, mm0                ;
        pmullw      mm5, mm2                ;

        paddw       mm3, mm5                ;
        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value

        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        movq        mm7, mm3                ;
        packuswb    mm7, mm0                ;

        add         rsi, rdx                ; next line
.next_row_4x4:
        movd        mm3, [rsi]              ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        punpcklbw   mm3, mm0                ; xx 00 01 02 03 04 05 06

        pmullw      mm3, mm1                ;
        movd        mm5, [rsi+1]            ;

        punpcklbw   mm5, mm0                ;
        pmullw      mm5, mm2                ;

        paddw       mm3, mm5                ;

        movq        mm5, mm7                ;
        punpcklbw   mm5, mm0                ;

        pmullw      mm5, [rax]              ;
        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value

        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128
        movq        mm7, mm3                ;

        packuswb    mm7, mm0                ;

        pmullw      mm3, [rax+16]           ;
        paddw       mm3, mm5                ;


        paddw       mm3, [GLOBAL(rd)]       ; mm3 += round value
        psraw       mm3, VP8_FILTER_SHIFT   ; mm3 /= 128

        packuswb    mm3, mm0
        movd        [rdi], mm3              ; store the results in the destination

%if ABI_IS_32BIT
        add         rsi, rdx                ; next line
        add         rdi, dword ptr arg(5)   ;dst_pitch
%else
        movsxd      r8, dword ptr arg(5)    ;dst_pitch
        add         rsi, rdx                ; next line
        add         rdi, r8
%endif

        cmp         rdi, rcx                ;
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret



SECTION_RODATA
align 16
rd:
    times 4 dw 0x40

align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0
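    ; note: each group of six "times 8 dw" rows is one sub-pixel filter phase:
    ; six taps, each stored as 8 identical words (16 bytes), so [ptr + 16*N]
    ; addresses tap N. The phase above is the identity filter (only the centre
    ; tap, 128, is non-zero); every phase's taps sum to 128, matching
    ; vp8_filter_weight.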

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0