michael@0: ;
michael@0: ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: 
michael@0: %include "vpx_ports/x86_abi_support.asm"
michael@0: 
michael@0: ; FIRST_2_ROWS: first half of the deblock kernel shared by the functions below.
michael@0: ;   in : xmm0 = centre pixels, xmm1 / xmm3 = two neighbouring pixel vectors,
michael@0: ;        flimit = 16 byte thresholds (memory operand %define'd by the user)
michael@0: ;   out: xmm5 = pavgb(xmm1, xmm3)
michael@0: ;        xmm7 = 0xff in every byte where |xmm0-xmm1| >= flimit
michael@0: ;               or |xmm0-xmm3| >= flimit, 0x00 elsewhere
michael@0: ;   xmm0 is preserved; xmm1-xmm4 and xmm6 are clobbered.
michael@0: %macro FIRST_2_ROWS 0
michael@0:         ;average the two neighbours while xmm1/xmm3 are still intact
michael@0:         movdqa      xmm5,       xmm1
michael@0:         pavgb       xmm5,       xmm3
michael@0: 
michael@0:         ;xmm4 = |xmm0 - xmm1| (two saturating differences, one is always 0)
michael@0:         movdqa      xmm4,       xmm0
michael@0:         psubusb     xmm4,       xmm1
michael@0:         psubusb     xmm1,       xmm0
michael@0:         paddusb     xmm4,       xmm1
michael@0: 
michael@0:         ;xmm6 = |xmm0 - xmm3|
michael@0:         movdqa      xmm6,       xmm0
michael@0:         psubusb     xmm6,       xmm3
michael@0:         psubusb     xmm3,       xmm0
michael@0:         paddusb     xmm6,       xmm3
michael@0: 
michael@0:         ;two copies of the threshold plus a zero vector to compare against
michael@0:         movdqa      xmm2,       flimit
michael@0:         movdqa      xmm7,       xmm2
michael@0:         pxor        xmm1,       xmm1
michael@0: 
michael@0:         ;(flimit -sat- diff) == 0  <=>  diff >= flimit
michael@0:         psubusb     xmm2,       xmm4
michael@0:         pcmpeqb     xmm2,       xmm1
michael@0:         psubusb     xmm7,       xmm6
michael@0:         pcmpeqb     xmm7,       xmm1
michael@0:         por         xmm7,       xmm2
michael@0: %endmacro
michael@0: 
michael@0: ; SECOND_2_ROWS: second half of the deblock kernel; must follow FIRST_2_ROWS.
michael@0: ;   in : xmm0 = centre pixels, xmm1 / xmm3 = the other two neighbours,
michael@0: ;        xmm5 / xmm7 = average and mask left by FIRST_2_ROWS, flimit as before
michael@0: ;   out: xmm0 = filtered pixel where all four differences were below flimit,
michael@0: ;               original pixel elsewhere
michael@0: ;   xmm1-xmm7 are clobbered.
michael@0: %macro SECOND_2_ROWS 0
michael@0:         ;keep a copy of xmm1, then fold both new neighbours and the centre
michael@0:         ;into the running average held in xmm5
michael@0:         movdqa      xmm2,       xmm1
michael@0:         pavgb       xmm1,       xmm3
michael@0:         pavgb       xmm5,       xmm1
michael@0:         pavgb       xmm5,       xmm0
michael@0: 
michael@0:         ;xmm6 = |xmm0 - first neighbour|
michael@0:         movdqa      xmm6,       xmm0
michael@0:         psubusb     xmm6,       xmm2
michael@0:         psubusb     xmm2,       xmm0
michael@0:         paddusb     xmm6,       xmm2
michael@0: 
michael@0:         ;xmm4 = |xmm0 - second neighbour|
michael@0:         movdqa      xmm4,       xmm0
michael@0:         psubusb     xmm4,       xmm3
michael@0:         psubusb     xmm3,       xmm0
michael@0:         paddusb     xmm4,       xmm3
michael@0: 
michael@0:         ;two copies of the threshold plus a zero vector to compare against
michael@0:         movdqa      xmm2,       flimit
michael@0:         movdqa      xmm3,       xmm2
michael@0:         pxor        xmm1,       xmm1
michael@0: 
michael@0:         ;accumulate "difference >= flimit" into the mask from FIRST_2_ROWS
michael@0:         psubusb     xmm2,       xmm6
michael@0:         pcmpeqb     xmm2,       xmm1
michael@0:         por         xmm7,       xmm2
michael@0:         psubusb     xmm3,       xmm4
michael@0:         pcmpeqb     xmm3,       xmm1
michael@0:         por         xmm7,       xmm3
michael@0: 
michael@0:         ;select: original where the mask is set, averaged value elsewhere
michael@0:         pand        xmm0,       xmm7
michael@0:         pandn       xmm7,       xmm5
michael@0:         paddusb     xmm0,       xmm7
michael@0: %endmacro
michael@0: 
michael@0: ; UPDATE_FLIMIT: copy the 16 threshold bytes at [rbx] to [rsp] and step rbx
michael@0: ; forward by 16. Both accesses are aligned (movdqa); xmm2 is clobbered.
michael@0: %macro UPDATE_FLIMIT 0
michael@0:         add         rbx,        16
michael@0:         movdqa      xmm2,       XMMWORD PTR [rbx - 16]
michael@0:         movdqa      [rsp],      xmm2
michael@0: %endmacro
michael@0: 
michael@0: ;void vp8_post_proc_down_and_across_mb_row_sse2
michael@0: ;(
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    unsigned char *dst_ptr,
michael@0: ;    int src_pixels_per_line,
michael@0: ;    int dst_pixels_per_line,
michael@0: ;    int cols,
michael@0: ;    int *flimits,
michael@0: ;    int size
michael@0: ;)
michael@0: ;
michael@0: ; For each of `size` rows:
michael@0: ;   1. "down" pass: 16 columns at a time, filter the src row against the two
michael@0: ;      rows above and the two rows below it and store the result to dst.
michael@0: ;   2. "across" pass: filter the dst row in place against the pixels at
michael@0: ;      offsets -2..+2; stores are delayed by 16 bytes through mm0/mm1 so
michael@0: ;      that unfiltered neighbours are still available to the next group.
michael@0: ; flimits is consumed 16 bytes per 16 columns with aligned loads and is
michael@0: ; rewound to its start before each pass.
michael@0: ; Side effects visible in the code: 8 bytes left of each dst row and 8 bytes
michael@0: ; starting at dst + cols are overwritten with replicated edge pixels; MMX
michael@0: ; registers are used and no emms is executed here.
michael@0: ; Register roles: rsi = src row, rdi = dst row, rax = src stride,
michael@0: ;                 rbx = flimits cursor, rcx = rows left, rdx = column.
michael@0: global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
michael@0: sym(vp8_post_proc_down_and_across_mb_row_sse2):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 7
michael@0:     SAVE_XMM 7
michael@0:     push        rbx
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0:     ALIGN_STACK 16, rax
michael@0:     sub         rsp, 16
michael@0: 
michael@0:         ; put flimit on stack
michael@0:         mov         rbx,        arg(5)           ;flimits ptr
michael@0:         UPDATE_FLIMIT
michael@0: 
michael@0: %define flimit [rsp]
michael@0: 
michael@0:         mov         rsi,        arg(0)           ;src_ptr
michael@0:         mov         rdi,        arg(1)           ;dst_ptr
michael@0: 
michael@0:         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
michael@0:         movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
michael@0: .nextrow:
michael@0:         xor         rdx,        rdx              ;col
michael@0: .nextcol:
michael@0:         ;load current and next 2 rows
michael@0:         movdqu      xmm0,       XMMWORD PTR [rsi]
michael@0:         movdqu      xmm1,       XMMWORD PTR [rsi + rax]
michael@0:         movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
michael@0: 
michael@0:         FIRST_2_ROWS
michael@0: 
michael@0:         ;load above 2 rows
michael@0:         neg         rax
michael@0:         movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
michael@0:         movdqu      xmm3,       XMMWORD PTR [rsi + rax]
michael@0: 
michael@0:         SECOND_2_ROWS
michael@0: 
michael@0:         movdqu      XMMWORD PTR [rdi], xmm0
michael@0: 
michael@0:         neg         rax                          ; positive stride
michael@0:         add         rsi,        16
michael@0:         add         rdi,        16
michael@0: 
michael@0:         add         rdx,        16
michael@0:         cmp         edx,        dword arg(4)     ;cols
michael@0:         jge         .downdone
michael@0:         UPDATE_FLIMIT
michael@0:         jmp         .nextcol
michael@0: 
michael@0: .downdone:
michael@0:         ; done with the all cols, start the across filtering in place
michael@0:         sub         rsi,        rdx              ; back to the start of the row
michael@0:         sub         rdi,        rdx
michael@0: 
michael@0:         mov         rbx,        arg(5) ; flimits
michael@0:         UPDATE_FLIMIT
michael@0: 
michael@0:         ; dup the first byte into the left border 8 times
michael@0:         movq        mm1,   [rdi]
michael@0:         punpcklbw   mm1,   mm1
michael@0:         punpcklwd   mm1,   mm1
michael@0:         punpckldq   mm1,   mm1
michael@0:         mov         rdx,    -8
michael@0:         movq        [rdi+rdx], mm1
michael@0: 
michael@0:         ; dup the last byte into the right border
michael@0:         movsxd      rdx,    dword arg(4)
michael@0:         movq        mm1,   [rdi + rdx + -1]
michael@0:         punpcklbw   mm1,   mm1
michael@0:         punpcklwd   mm1,   mm1
michael@0:         punpckldq   mm1,   mm1
michael@0:         movq        [rdi+rdx], mm1
michael@0: 
michael@0:         ; prime the delayed-store registers with the bytes already in place,
michael@0:         ; so the first iteration's stores at -16/-8 are no-ops
michael@0:         xor         rdx,        rdx
michael@0:         movq        mm0,        QWORD PTR [rdi-16];
michael@0:         movq        mm1,        QWORD PTR [rdi-8];
michael@0: 
michael@0: .acrossnextcol:
michael@0:         movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
michael@0:         movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
michael@0:         movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
michael@0: 
michael@0:         FIRST_2_ROWS
michael@0: 
michael@0:         movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
michael@0:         movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
michael@0: 
michael@0:         SECOND_2_ROWS
michael@0: 
michael@0:         movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
michael@0:         movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
michael@0:         movdq2q     mm0,        xmm0             ; hold back the new low 8 bytes
michael@0:         psrldq      xmm0,       8
michael@0:         movdq2q     mm1,        xmm0             ; hold back the new high 8 bytes
michael@0: 
michael@0:         add         rdx,        16
michael@0:         cmp         edx,        dword arg(4)     ;cols
michael@0:         jge         .acrossdone
michael@0:         UPDATE_FLIMIT
michael@0:         jmp         .acrossnextcol
michael@0: 
michael@0: .acrossdone:
michael@0:         ; last 16 pixels
michael@0:         movq        QWORD PTR [rdi+rdx-16], mm0
michael@0: 
michael@0:         ; the upper 8 bytes are only written when cols is a multiple of 16
michael@0:         cmp         edx,        dword arg(4)
michael@0:         jne         .throw_last_8
michael@0:         movq        QWORD PTR [rdi+rdx-8], mm1
michael@0: .throw_last_8:
michael@0:         ; done with this row
michael@0:         ; strides are reloaded sign-extended, matching the initial load of
michael@0:         ; src_pixels_per_line, so that rax is a valid 64-bit offset
michael@0:         add         rsi,rax                      ;next src line
michael@0:         movsxd      rax,        DWORD PTR arg(3) ;dst_pixels_per_line
michael@0:         add         rdi,rax                      ;next destination
michael@0:         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
michael@0: 
michael@0:         mov         rbx,        arg(5)           ;flimits
michael@0:         UPDATE_FLIMIT
michael@0: 
michael@0:         dec         rcx                          ;decrement count
michael@0:         jnz         .nextrow                     ;next row
michael@0: 
michael@0:     add rsp, 16
michael@0:     pop rsp
michael@0:     ; begin epilog
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     pop rbx
michael@0:     RESTORE_XMM
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: %undef flimit
michael@0: 
michael@0: ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
michael@0: ;                            int pitch, int rows, int cols,int flimit)
michael@0: ;
michael@0: ; Processes the image in strips of 8 columns. For every strip:
michael@0: ;   - the last row is copied into the 8 rows below it and the first row into
michael@0: ;     the 8 rows above it (these border rows are written by this function);
michael@0: ;   - xmm5 keeps a running sum and xmm6/xmm7 a running sum of squares over a
michael@0: ;     15-row window, updated by adding the entering row and subtracting the
michael@0: ;     leaving one;
michael@0: ;   - where 15*sumsq - sum*sum < flimit the pixel is replaced by
michael@0: ;     (sum + pixel + vp8_rv[...]) >> 4, otherwise it is kept;
michael@0: ;   - results go through the 16-entry ring buffer d[] on the stack and are
michael@0: ;     written back 8 rows late, so the window always reads unfiltered data.
michael@0: ; MMX register mm0 is used and no emms is executed here.
michael@0: extern sym(vp8_rv)
michael@0: global sym(vp8_mbpost_proc_down_xmm) PRIVATE
michael@0: sym(vp8_mbpost_proc_down_xmm):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     SAVE_XMM 7
michael@0:     GET_GOT     rbx
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0: 
michael@0:     ALIGN_STACK 16, rax
michael@0:     sub         rsp, 128+16
michael@0: 
michael@0:     ; unsigned char d[16][8] at [rsp]
michael@0:     ; create flimit4 (flimit replicated into 4 dwords) at [rsp+128]
michael@0:     mov         eax, dword ptr arg(4) ;flimit
michael@0:     mov         [rsp+128], eax
michael@0:     mov         [rsp+128+4], eax
michael@0:     mov         [rsp+128+8], eax
michael@0:     mov         [rsp+128+12], eax
michael@0: %define flimit4 [rsp+128]
michael@0: 
michael@0: %if ABI_IS_32BIT=0
michael@0:     lea         r8,       [GLOBAL(sym(vp8_rv))]
michael@0: %endif
michael@0: 
michael@0:     ;rows +=8;
michael@0:     add         dword arg(2), 8
michael@0: 
michael@0:     ;for(c=0; c<cols; c+=8)
michael@0: .loop_col:
michael@0:             mov         rsi,        arg(0) ; s
michael@0:             pxor        xmm0,       xmm0        ;
michael@0: 
michael@0:             movsxd      rax,        dword ptr arg(1) ;pitch       ;
michael@0: 
michael@0:             ; this copies the last row down into the border 8 rows
michael@0:             ; rows is an int: load it as a dword so that the multiply below
michael@0:             ; never sees whatever sits in the upper half of the argument slot
michael@0:             mov         rdi,        rsi
michael@0:             movsxd      rdx,        dword ptr arg(2)
michael@0:             sub         rdx,        9                           ; (rows + 8) - 9 = last row index
michael@0:             imul        rdx,        rax
michael@0:             lea         rdi,        [rdi+rdx]
michael@0:             movq        xmm1,       QWORD ptr[rdi]              ; last row
michael@0:             mov         rcx,        8
michael@0: .init_borderd:                                                   ; initialize borders
michael@0:             lea         rdi,        [rdi + rax]
michael@0:             movq        [rdi],      xmm1
michael@0: 
michael@0:             dec         rcx
michael@0:             jne         .init_borderd
michael@0: 
michael@0:             neg         rax                                     ; rax = -pitch
michael@0: 
michael@0:             ; this copies the first row up into the border 8 rows
michael@0:             mov         rdi,        rsi
michael@0:             movq        xmm1,       QWORD ptr[rdi]              ; first row
michael@0:             mov         rcx,        8
michael@0: .init_border:                                                   ; initialize borders
michael@0:             lea         rdi,        [rdi + rax]
michael@0:             movq        [rdi],      xmm1
michael@0: 
michael@0:             dec         rcx
michael@0:             jne         .init_border
michael@0: 
michael@0: 
michael@0: 
michael@0:             lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
michael@0:             neg         rax
michael@0: 
michael@0:             pxor        xmm5,       xmm5                        ; sum   (8 words)
michael@0:             pxor        xmm6,       xmm6        ;               ; sumsq (low 4 dwords)
michael@0: 
michael@0:             pxor        xmm7,       xmm7        ;               ; sumsq (high 4 dwords)
michael@0:             mov         rdi,        rsi
michael@0: 
michael@0:             mov         rcx,        15          ;
michael@0: 
michael@0:             ; accumulate sum and sumsq over the first 15 rows of the window
michael@0: .loop_initvar:
michael@0:             movq        xmm1,       QWORD PTR [rdi];
michael@0:             punpcklbw   xmm1,       xmm0        ;
michael@0: 
michael@0:             paddw       xmm5,       xmm1        ;
michael@0:             pmullw      xmm1,       xmm1        ;
michael@0: 
michael@0:             movdqa      xmm2,       xmm1        ;
michael@0:             punpcklwd   xmm1,       xmm0        ;
michael@0: 
michael@0:             punpckhwd   xmm2,       xmm0        ;
michael@0:             paddd       xmm6,       xmm1        ;
michael@0: 
michael@0:             paddd       xmm7,       xmm2        ;
michael@0:             lea         rdi,        [rdi+rax]   ;
michael@0: 
michael@0:             dec         rcx
michael@0:             jne         .loop_initvar
michael@0:             ;save the var and sum
michael@0:             xor         rdx,        rdx                         ; row counter
michael@0: .loop_row:
michael@0:             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
michael@0:             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
michael@0: 
michael@0:             punpcklbw   xmm1,       xmm0
michael@0:             punpcklbw   xmm2,       xmm0
michael@0: 
michael@0:             ; slide the window: add the entering row, drop the leaving row
michael@0:             paddw       xmm5,       xmm2
michael@0:             psubw       xmm5,       xmm1
michael@0: 
michael@0:             pmullw      xmm2,       xmm2
michael@0:             movdqa      xmm4,       xmm2
michael@0: 
michael@0:             punpcklwd   xmm2,       xmm0
michael@0:             punpckhwd   xmm4,       xmm0
michael@0: 
michael@0:             paddd       xmm6,       xmm2
michael@0:             paddd       xmm7,       xmm4
michael@0: 
michael@0:             pmullw      xmm1,       xmm1
michael@0:             movdqa      xmm2,       xmm1
michael@0: 
michael@0:             punpcklwd   xmm1,       xmm0
michael@0:             psubd       xmm6,       xmm1
michael@0: 
michael@0:             punpckhwd   xmm2,       xmm0
michael@0:             psubd       xmm7,       xmm2
michael@0: 
michael@0: 
michael@0:             ; xmm3/xmm4 = 15*sumsq - sum*sum - flimit (low / high 4 columns)
michael@0:             movdqa      xmm3,       xmm6
michael@0:             pslld       xmm3,       4
michael@0: 
michael@0:             psubd       xmm3,       xmm6
michael@0:             movdqa      xmm1,       xmm5
michael@0: 
michael@0:             movdqa      xmm4,       xmm5
michael@0:             pmullw      xmm1,       xmm1
michael@0: 
michael@0:             pmulhw      xmm4,       xmm4
michael@0:             movdqa      xmm2,       xmm1
michael@0: 
michael@0:             punpcklwd   xmm1,       xmm4
michael@0:             punpckhwd   xmm2,       xmm4
michael@0: 
michael@0:             movdqa      xmm4,       xmm7
michael@0:             pslld       xmm4,       4
michael@0: 
michael@0:             psubd       xmm4,       xmm7
michael@0: 
michael@0:             psubd       xmm3,       xmm1
michael@0:             psubd       xmm4,       xmm2
michael@0: 
michael@0:             psubd       xmm3,       flimit4
michael@0:             psubd       xmm4,       flimit4
michael@0: 
michael@0:             ; sign bit -> per-column byte mask: all ones where the value is negative
michael@0:             psrad       xmm3,       31
michael@0:             psrad       xmm4,       31
michael@0: 
michael@0:             packssdw    xmm3,       xmm4
michael@0:             packsswb    xmm3,       xmm0
michael@0: 
michael@0:             movq        xmm1,       QWORD PTR [rsi+rax*8]       ; current row
michael@0: 
michael@0:             movq        xmm2,       xmm1                        ; keep the unfiltered pixels
michael@0:             punpcklbw   xmm1,       xmm0
michael@0: 
michael@0:             paddw       xmm1,       xmm5
michael@0:             mov         rcx,        rdx
michael@0: 
michael@0:             and         rcx,        127
michael@0: %if ABI_IS_32BIT=1 && CONFIG_PIC=1
michael@0:             push        rax
michael@0:             lea         rax,        [GLOBAL(sym(vp8_rv))]
michael@0:             movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
michael@0:             pop         rax
michael@0: %elif ABI_IS_32BIT=0
michael@0:             movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
michael@0: %else
michael@0:             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
michael@0: %endif
michael@0: 
michael@0:             paddw       xmm1,       xmm4
michael@0:             ;paddw     xmm1,       eight8s
michael@0:             psraw       xmm1,       4
michael@0: 
michael@0:             ; filtered value where the mask is set, original pixel elsewhere
michael@0:             packuswb    xmm1,       xmm0
michael@0:             pand        xmm1,       xmm3
michael@0: 
michael@0:             pandn       xmm3,       xmm2
michael@0:             por         xmm1,       xmm3
michael@0: 
michael@0:             and         rcx,        15
michael@0:             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
michael@0: 
michael@0:             ; write back the result computed 8 iterations ago
michael@0:             mov         rcx,        rdx
michael@0:             sub         rcx,        8
michael@0: 
michael@0:             and         rcx,        15
michael@0:             movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
michael@0: 
michael@0:             movq        [rsi],      mm0
michael@0:             lea         rsi,        [rsi+rax]
michael@0: 
michael@0:             lea         rdi,        [rdi+rax]
michael@0:             add         rdx,        1
michael@0: 
michael@0:             cmp         edx,        dword arg(2) ;rows
michael@0:             jl          .loop_row
michael@0: 
michael@0:         ; s += 8, done at full pointer width; rsi is dead here (it is
michael@0:         ; reloaded at .loop_col and restored from the stack in the epilog)
michael@0:         mov         rsi,        arg(0)
michael@0:         add         rsi,        8
michael@0:         mov         arg(0),     rsi
michael@0:         sub         dword arg(3), 8 ; cols -= 8
michael@0:         cmp         dword arg(3), 0
michael@0:         jg          .loop_col
michael@0: 
michael@0:     add         rsp, 128+16
michael@0:     pop         rsp
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     RESTORE_XMM
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: %undef flimit4
michael@0: 
michael@0: 
michael@0: ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
michael@0: ;                                int pitch, int rows, int cols,int flimit)
michael@0: ;
michael@0: ; Filters every row in place, left to right. For each row:
michael@0: ;   - the first pixel is replicated into the 8 bytes left of the row and the
michael@0: ;     last pixel into the 8 bytes starting at src + cols (both are written);
michael@0: ;   - sum and sumsq of s[-8..6] are computed with scalar code;
michael@0: ;   - .nextcol4 then handles 4 columns per iteration, sliding the 15-pixel
michael@0: ;     window, and replaces a pixel by (sum + pixel + 8) >> 4 where
michael@0: ;     15*sumsq - sum*sum < flimit;
michael@0: ;   - results are written 8 bytes late through mm0/mm1 so the window keeps
michael@0: ;     reading unfiltered pixels; the loop therefore runs to cols + 8.
michael@0: ; MMX registers are used and no emms is executed here.
michael@0: global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
michael@0: sym(vp8_mbpost_proc_across_ip_xmm):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     SAVE_XMM 7
michael@0:     GET_GOT     rbx
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0: 
michael@0:     ALIGN_STACK 16, rax
michael@0:     sub         rsp, 16
michael@0: 
michael@0:     ; create flimit4 at [rsp]
michael@0:     mov         eax, dword ptr arg(4) ;flimit
michael@0:     mov         [rsp], eax
michael@0:     mov         [rsp+4], eax
michael@0:     mov         [rsp+8], eax
michael@0:     mov         [rsp+12], eax
michael@0: %define flimit4 [rsp]
michael@0: 
michael@0: 
michael@0:     ;for(r=0;r<rows;r++)
michael@0: .ip_row_loop:
michael@0: 
michael@0:         xor         rdx,    rdx ;sumsq=0;
michael@0:         xor         rcx,    rcx ;sum=0;
michael@0:         mov         rsi,    arg(0); s
michael@0: 
michael@0: 
michael@0:         ; dup the first byte into the left border 8 times
michael@0:         movq        mm1,   [rsi]
michael@0:         punpcklbw   mm1,   mm1
michael@0:         punpcklwd   mm1,   mm1
michael@0:         punpckldq   mm1,   mm1
michael@0: 
michael@0:         mov         rdi,    -8
michael@0:         movq        [rsi+rdi], mm1
michael@0: 
michael@0:         ; dup the last byte into the right border
michael@0:         ; rax (not rdx) carries cols here: rdx is the sumsq accumulator and
michael@0:         ; must still be zero when .ip_var_loop starts
michael@0:         movsxd      rax,    dword arg(3)
michael@0:         movq        mm1,   [rsi + rax + -1]
michael@0:         punpcklbw   mm1,   mm1
michael@0:         punpcklwd   mm1,   mm1
michael@0:         punpckldq   mm1,   mm1
michael@0:         movq        [rsi+rax], mm1
michael@0: 
michael@0: .ip_var_loop:
michael@0:         ;for(i=-8;i<=6;i++)
michael@0:         ;{
michael@0:         ;    sumsq += s[i]*s[i];
michael@0:         ;    sum   += s[i];
michael@0:         ;}
michael@0:         movzx       eax, byte [rsi+rdi]
michael@0:         add         ecx, eax
michael@0:         mul         al                  ; ax = al*al, upper half of eax is already 0
michael@0:         add         edx, eax
michael@0:         add         rdi, 1
michael@0:         cmp         rdi, 6
michael@0:         jle         .ip_var_loop
michael@0: 
michael@0: 
michael@0:             ;mov         rax,    sumsq
michael@0:             ;movd        xmm7,   rax
michael@0:             movd        xmm7,   edx
michael@0: 
michael@0:             ;mov         rax,    sum
michael@0:             ;movd        xmm6,   rax
michael@0:             movd        xmm6,   ecx
michael@0: 
michael@0:             mov         rsi,    arg(0) ;s
michael@0:             xor         rcx,    rcx
michael@0: 
michael@0:             movsxd      rdx,    dword arg(3) ;cols
michael@0:             add         rdx,    8           ; 8 extra bytes flush the delayed stores
michael@0:             pxor        mm0,    mm0
michael@0:             pxor        mm1,    mm1
michael@0: 
michael@0:             pxor        xmm0,   xmm0
michael@0: .nextcol4:
michael@0: 
michael@0:             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
michael@0:             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
michael@0: 
michael@0:             punpcklbw   xmm1,   xmm0                    ; expanding
michael@0:             punpcklbw   xmm2,   xmm0                    ; expanding
michael@0: 
michael@0:             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
michael@0:             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
michael@0: 
michael@0:             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
michael@0:             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
michael@0: 
michael@0:             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
michael@0:             pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
michael@0: 
michael@0:             paddd       xmm6,   xmm2
michael@0:             paddd       xmm7,   xmm1
michael@0: 
michael@0:             pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
michael@0:             pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
michael@0: 
michael@0:             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
michael@0:             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
michael@0: 
michael@0:             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
michael@0:             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
michael@0: 
michael@0:             paddd       xmm6,   xmm4
michael@0:             paddd       xmm7,   xmm3
michael@0: 
michael@0:             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
michael@0:             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
michael@0: 
michael@0:             paddd       xmm7,   xmm3
michael@0:             paddd       xmm6,   xmm4
michael@0: 
michael@0:             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
michael@0:             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
michael@0: 
michael@0:             paddd       xmm7,   xmm3
michael@0:             paddd       xmm6,   xmm4
michael@0: 
michael@0:             ; xmm5 = 15*sumsq - sum*sum - flimit, reduced to a per-pixel mask
michael@0:             movdqa      xmm3,   xmm6
michael@0:             pmaddwd     xmm3,   xmm3
michael@0: 
michael@0:             movdqa      xmm5,   xmm7
michael@0:             pslld       xmm5,   4
michael@0: 
michael@0:             psubd       xmm5,   xmm7
michael@0:             psubd       xmm5,   xmm3
michael@0: 
michael@0:             psubd       xmm5,   flimit4
michael@0:             psrad       xmm5,   31
michael@0: 
michael@0:             packssdw    xmm5,   xmm0
michael@0:             packsswb    xmm5,   xmm0
michael@0: 
michael@0:             movd        xmm1,   DWORD PTR [rsi+rcx]
michael@0:             movq        xmm2,   xmm1                    ; keep the unfiltered pixels
michael@0: 
michael@0:             punpcklbw   xmm1,   xmm0
michael@0:             punpcklwd   xmm1,   xmm0
michael@0: 
michael@0:             paddd       xmm1,   xmm6
michael@0:             paddd       xmm1,   [GLOBAL(four8s)]        ; rounding before >> 4
michael@0: 
michael@0:             psrad       xmm1,   4
michael@0:             packssdw    xmm1,   xmm0
michael@0: 
michael@0:             ; filtered value where the mask is set, original pixel elsewhere
michael@0:             packuswb    xmm1,   xmm0
michael@0:             pand        xmm1,   xmm5
michael@0: 
michael@0:             pandn       xmm5,   xmm2
michael@0:             por         xmm5,   xmm1
michael@0: 
michael@0:             ; two-stage delay line: store the result from two iterations ago
michael@0:             movd        [rsi+rcx-8],  mm0
michael@0:             movq        mm0,    mm1
michael@0: 
michael@0:             movdq2q     mm1,    xmm5
michael@0:             psrldq      xmm7,   12                      ; carry the last lane's sumsq forward
michael@0: 
michael@0:             psrldq      xmm6,   12                      ; carry the last lane's sum forward
michael@0:             add         rcx,    4
michael@0: 
michael@0:             cmp         rcx,    rdx
michael@0:             jl          .nextcol4
michael@0: 
michael@0:         ;s+=pitch;
michael@0:         movsxd rax, dword arg(1)
michael@0:         add    arg(0), rax
michael@0: 
michael@0:         sub dword arg(2), 1 ;rows-=1
michael@0:         cmp dword arg(2), 0
michael@0:         jg .ip_row_loop
michael@0: 
michael@0:     add         rsp, 16
michael@0:     pop         rsp
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     RESTORE_XMM
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: %undef flimit4
michael@0: 
michael@0: 
michael@0: ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
michael@0: ;                            unsigned char blackclamp[16],
michael@0: ;                            unsigned char whiteclamp[16],
michael@0: ;                            unsigned char bothclamp[16],
michael@0: ;                            unsigned int Width, unsigned int Height, int Pitch)
michael@0: ;
michael@0: ; Row by row: pick a start offset of rand() & 0xff into the noise buffer,
michael@0: ; clamp each group of 16 pixels with the three clamp vectors, add the noise
michael@0: ; bytes (wrapping add) and store the result back in place.
michael@0: ; Only arg(2) is dereferenced for the clamps: whiteclamp and bothclamp are
michael@0: ; read at +16 and +32 from blackclamp, with aligned memory operands.
michael@0: extern sym(rand)
michael@0: global sym(vp8_plane_add_noise_wmt) PRIVATE
michael@0: sym(vp8_plane_add_noise_wmt):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 8
michael@0:     GET_GOT     rbx
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0: 
michael@0: .addnoise_loop:
michael@0:     call sym(rand) WRT_PLT
michael@0: 
michael@0:     ; every register below is (re)loaded after the call because rand() may
michael@0:     ; have trashed it
michael@0:     and     rax, 0xff
michael@0:     mov     rdi, arg(1) ;noise
michael@0:     add     rdi, rax                ; rdi = noise + (rand() & 0xff)
michael@0: 
michael@0:     mov     rsi, arg(0) ;Pos
michael@0: 
michael@0:     ; we rely on the fact that the clamping vectors are stored contiguously
michael@0:     ; in black/white/both order
michael@0:     mov     rdx, arg(2) ; blackclamp
michael@0: 
michael@0:     movsxd  rcx, dword arg(5) ;[Width]
michael@0:     xor     rax, rax                ; byte offset within the row
michael@0: 
michael@0: .addnoise_nextset:
michael@0:             movdqu      xmm2, [rdi+rax]        ; noise for these 16 pixels
michael@0:             movdqu      xmm1, [rsi+rax]        ; source pixels
michael@0: 
michael@0:             ; clamp both sides so we don't outrange adding noise
michael@0:             psubusb     xmm1, [rdx]            ;blackclamp
michael@0:             paddusb     xmm1, [rdx+32]         ;bothclamp
michael@0:             psubusb     xmm1, [rdx+16]         ;whiteclamp
michael@0: 
michael@0:             paddb       xmm1, xmm2             ; add the noise in
michael@0:             movdqu      [rsi+rax], xmm1        ; store the result
michael@0: 
michael@0:             add         rax, 16                ; next 16 pixels
michael@0:             cmp         rax, rcx
michael@0:             jl          .addnoise_nextset
michael@0: 
michael@0:     movsxd  rax, dword arg(7) ; Pitch
michael@0:     add     arg(0), rax ; Start += Pitch
michael@0:     sub     dword arg(6), 1   ; Height -= 1
michael@0:     jg      .addnoise_loop
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: SECTION_RODATA
michael@0: align 16
michael@0: ; four dwords of 8: the rounding term vp8_mbpost_proc_across_ip_xmm adds
michael@0: ; before its arithmetic shift right by 4
michael@0: four8s:
michael@0:     dd 8, 8, 8, 8