media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp9_post_proc_down_and_across_xmm
    15 ;(
    16 ;    unsigned char *src_ptr,
    17 ;    unsigned char *dst_ptr,
    18 ;    int src_pixels_per_line,
    19 ;    int dst_pixels_per_line,
    20 ;    int rows,
    21 ;    int cols,
    22 ;    int flimit
    23 ;)
       ;
       ; Two-pass 5-tap smoothing filter, applied one row at a time.
       ;
       ; Pass 1 (.nextcol, vertical, src -> dst): for each group of 8 pixels the
       ; rows -2, -1, 0, +1, +2 of src are combined as
       ;     blur = (4*p0 + r-2 + r-1 + r1 + r2 + 4) >> 3
       ; and the source pixel p0 is kept instead of blur wherever any
       ; |p0 - neighbour| is greater than flimit (signed 16-bit compare against
       ; the low word of flimit).
       ;
       ; Pass 2 (.acrossnextcol, horizontal, in place on the dst row): the same
       ; filter using p-2, p-1, p+1, p+2.  Each 8-byte result is parked in mm0 and
       ; stored one iteration later, so the left neighbours that the next group
       ; loads have not been overwritten yet.
       ;
       ; Register roles inside the row loop:
       ;   rsi = src row, rdi = dst row, rax = src pitch, rcx = rows remaining,
       ;   rdx = column offset, xmm0 = zero, xmm2 = flimit in all 8 words.
       ;
       ; Columns are processed in groups of 8 and each loop runs at least once,
       ; so cols is effectively rounded up to a multiple of 8.  Pass 1 reads two
       ; rows above and below the current src row; pass 2 reads and rewrites
       ; (unchanged) the 8 bytes before the dst row start and reads 2 bytes past
       ; the last group.
       ; NOTE(review): mm0 is used and no emms is executed in this function;
       ; presumably the caller clears the x87/MMX state - confirm.
    24 global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
    25 sym(vp9_post_proc_down_and_across_xmm):
    26     push        rbp
    27     mov         rbp, rsp
    28     SHADOW_ARGS_TO_STACK 7
    29     SAVE_XMM 7
    30     GET_GOT     rbx
    31     push        rsi
    32     push        rdi
    33     ; end prolog
    35 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    36     ALIGN_STACK 16, rax
    37     ; copy the rounding constant rd42 onto the stack, since we don't have
    38     ; enough registers to do PIC addressing inside the loops
    39     movdqa      xmm0, [GLOBAL(rd42)]
    40     sub         rsp, 16
    41     movdqa      [rsp], xmm0
    42 %define RD42 [rsp]
    43 %else
    44 %define RD42 [GLOBAL(rd42)]
    45 %endif
    48         movd        xmm2,       dword ptr arg(6) ;flimit
    49         punpcklwd   xmm2,       xmm2              ; broadcast the low word of flimit ...
    50         punpckldq   xmm2,       xmm2              ; ...
    51         punpcklqdq  xmm2,       xmm2              ; ... to all 8 words of xmm2
    53         mov         rsi,        arg(0) ;src_ptr
    54         mov         rdi,        arg(1) ;dst_ptr
    56         movsxd      rcx,        DWORD PTR arg(4) ;rows
    57         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch, sign-extended)
    58         pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words
    60 .nextrow:
    62         xor         rdx,        rdx       ; clear out rdx for use as loop counter
    63 .nextcol:
    64         movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
    65         punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
    66         movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, kept as the source value
    67         psllw       xmm3,       2                       ; xmm3 = 4 * r0
    69         movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
    70         punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
    71         paddusw     xmm3,       xmm5                    ; xmm3 += r1
    73         ; thresholding
    74         movdqa      xmm7,       xmm1                    ; xmm7 = r0
    75         psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
    76         psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
    77         paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
    78         pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask: abs > flimit
    80         movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
    81         punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
    82         paddusw     xmm3,       xmm5                    ; xmm3 += r2
    84         ; thresholding
    85         movdqa      xmm6,       xmm1                    ; xmm6 = r0
    86         psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
    87         psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
    88         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
    89         pcmpgtw     xmm6,       xmm2
    90         por         xmm7,       xmm6                    ; accumulate thresholds
    93         neg         rax                                 ; rax = -pitch, to address the rows above
    94         movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
    95         punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
    96         paddusw     xmm3,       xmm5                    ; xmm3 += r-2
    98         ; thresholding
    99         movdqa      xmm6,       xmm1                    ; xmm6 = r0
   100         psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
   101         psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
   102         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
   103         pcmpgtw     xmm6,       xmm2
   104         por         xmm7,       xmm6                    ; accumulate thresholds
   106         movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
   107         punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
   108         paddusw     xmm3,       xmm4                    ; xmm3 += r-1
   110         ; thresholding
   111         movdqa      xmm6,       xmm1                    ; xmm6 = r0
   112         psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
   113         psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
   114         paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
   115         pcmpgtw     xmm6,       xmm2
   116         por         xmm7,       xmm6                    ; accumulate thresholds
   119         paddusw     xmm3,       RD42                    ; xmm3 += 4 (rounding)
   120         psraw       xmm3,       3                       ; xmm3 /= 8
   122         pand        xmm1,       xmm7                    ; source pixels where a threshold was exceeded
   123         pandn       xmm7,       xmm3                    ; blurred pixels everywhere else
   124         paddusw     xmm1,       xmm7                    ; combination (the two selections are disjoint)
   126         packuswb    xmm1,       xmm0                    ; pack to bytes
   127         movq        QWORD PTR [rdi], xmm1             ; write 8 filtered pixels to dst
   129         neg         rax                   ; pitch is positive
   130         add         rsi,        8
   131         add         rdi,        8
   133         add         rdx,        8
   134         cmp         edx,        dword arg(5) ;cols
   136         jl          .nextcol
   138         ; done with the all cols, start the across filtering in place
   139         sub         rsi,        rdx       ; rewind src to the start of the row
   140         sub         rdi,        rdx       ; rewind dst to the start of the row
   142         xor         rdx,        rdx
   143         movq        mm0,        QWORD PTR [rdi-8]; prime the delayed store with the bytes already there
   145 .acrossnextcol:
   146         movq        xmm7,       QWORD PTR [rdi +rdx -2] ; p-2..p5
   147         movd        xmm4,       DWORD PTR [rdi +rdx +6] ; p6..p9
   149         pslldq      xmm4,       8
   150         por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)
   152         movdqa      xmm3,       xmm4
   153         psrldq      xmm3,       2
   154         punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
   155         movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the source value
   156         psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
   159         movdqa      xmm5,       xmm4
   160         psrldq      xmm5,       3
   161         punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
   162         paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
   164         ; thresholding
   165         movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
   166         psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
   167         psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
   168         paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
   169         pcmpgtw     xmm7,       xmm2              ; xmm7 = mask: abs > flimit
   171         movdqa      xmm5,       xmm4
   172         psrldq      xmm5,       4
   173         punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
   174         paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
   176         ; thresholding
   177         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   178         psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
   179         psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
   180         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
   181         pcmpgtw     xmm6,       xmm2
   182         por         xmm7,       xmm6              ; accumulate thresholds
   185         movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
   186         punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5
   187         paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
   189         ; thresholding
   190         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   191         psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
   192         psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
   193         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
   194         pcmpgtw     xmm6,       xmm2
   195         por         xmm7,       xmm6              ; accumulate thresholds
   197         psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
   198         punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6
   199         paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
   201         ; thresholding
   202         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   203         psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
   204         psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
   205         paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
   206         pcmpgtw     xmm6,       xmm2
   207         por         xmm7,       xmm6              ; accumulate thresholds
   209         paddusw     xmm3,       RD42              ; xmm3 += 4 (rounding)
   210         psraw       xmm3,       3                 ; xmm3 /= 8
   212         pand        xmm1,       xmm7              ; source pixels where a threshold was exceeded
   213         pandn       xmm7,       xmm3              ; blurred pixels everywhere else
   214         paddusw     xmm1,       xmm7              ; combination (the two selections are disjoint)
   216         packuswb    xmm1,       xmm0              ; pack to bytes
   217         movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group's eight bytes
   218         movdq2q     mm0,        xmm1              ; park this group's result until the next iteration
   220         add         rdx,        8
   221         cmp         edx,        dword arg(5) ;cols
   222         jl          .acrossnextcol;
   224         ; last 8 pixels
   225         movq        QWORD PTR [rdi+rdx-8],  mm0
   227         ; done with this row
   228         add         rsi,rax               ; next source line (rax = source pitch)
   229         movsxd      rax, dword arg(3) ;dst_pixels_per_line, sign-extended like the source pitch
   230         add         rdi,rax               ; next destination line
   231         movsxd      rax, dword arg(2) ;src_pixels_per_line, reloaded exactly as before .nextrow
   233         dec         rcx                   ; decrement count
   234         jnz         .nextrow              ; next row
   236 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
   237     add rsp,16
   238     pop rsp
   239 %endif
   240     ; begin epilog
   241     pop rdi
   242     pop rsi
   243     RESTORE_GOT
   244     RESTORE_XMM
   245     UNSHADOW_ARGS
   246     pop         rbp
   247     ret
   248 %undef RD42
   251 ;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
   252 ;                            int pitch, int rows, int cols,int flimit)
       ;
       ; Vertical smoothing, in place, over 8-pixel-wide column strips.
       ;
       ; For each strip a running sum (xmm5, 8 words) and a running sum of
       ; squares (xmm6 = pixels 0..3, xmm7 = pixels 4..7, dwords) are kept over a
       ; 15-row window centred on the current row r.  Where
       ;     15*sumsq - sum*sum < flimit          (signed 32-bit compare)
       ; the pixel is replaced by (pixel + sum + vp9_rv[(r & 127) + i]) >> 4,
       ; otherwise it is left unchanged.
       ;
       ; Results go into the 16-entry ring d[] on the stack and are written back
       ; 8 rows late (to row r-8), after that row has left the window, so the
       ; window only ever sees unfiltered pixels.  The row loop runs rows+8
       ; times so that the delayed writes for the last rows are flushed.
       ;
       ; Memory touched per strip: the window initialisation reads from row -8,
       ; and .loop_row reads up to row rows+14 and stores to rows -8..rows-1.
       ; NOTE(review): during the first 8 iterations of a strip the value stored
       ; to rows -8..-1 comes from d[] slots this strip has not written yet -
       ; presumably those rows are border area; confirm with the caller.
       ; NOTE(review): mm0 is used and no emms is executed in this function;
       ; presumably the caller clears the x87/MMX state - confirm.
   253 extern sym(vp9_rv)
   254 global sym(vp9_mbpost_proc_down_xmm) PRIVATE
   255 sym(vp9_mbpost_proc_down_xmm):
   256     push        rbp
   257     mov         rbp, rsp
   258     SHADOW_ARGS_TO_STACK 5
   259     SAVE_XMM 7
   260     GET_GOT     rbx
   261     push        rsi
   262     push        rdi
   263     ; end prolog
   265     ALIGN_STACK 16, rax
   266     sub         rsp, 128+16
   268     ; unsigned char d[16][8] at [rsp]
   269     ; create flimit4 (flimit in four dwords) at [rsp+128]
   270     mov         eax, dword ptr arg(4) ;flimit
   271     mov         [rsp+128], eax
   272     mov         [rsp+128+4], eax
   273     mov         [rsp+128+8], eax
   274     mov         [rsp+128+12], eax
   275 %define flimit4 [rsp+128]
   277 %if ABI_IS_32BIT=0
   278     lea         r8,       [GLOBAL(sym(vp9_rv))]
   279 %endif
   281     ;rows +=8;
   282     add         dword arg(2), 8
   284     ;for(c=0; c<cols; c+=8)
   285 .loop_col:
   286             mov         rsi,        arg(0) ; s
   287             pxor        xmm0,       xmm0        ; xmm0 = 0, used to widen bytes/words
   289             movsxd      rax,        dword ptr arg(1) ;pitch       ;
   290             neg         rax                                     ; rax = -pitch
   292             lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
   293             neg         rax                                     ; rax = pitch again
   296             pxor        xmm5,       xmm5        ; sum   (8 words)
   297             pxor        xmm6,       xmm6        ; sumsq (pixels 0..3, dwords)
   299             pxor        xmm7,       xmm7        ; sumsq (pixels 4..7, dwords)
   300             mov         rdi,        rsi
   302             mov         rcx,        15          ; accumulate the 15 rows -8..6
   304 .loop_initvar:
   305             movq        xmm1,       QWORD PTR [rdi];
   306             punpcklbw   xmm1,       xmm0        ; bytes -> words
   308             paddw       xmm5,       xmm1        ; sum += pixel
   309             pmullw      xmm1,       xmm1        ; pixel^2 (fits in an unsigned word)
   311             movdqa      xmm2,       xmm1        ;
   312             punpcklwd   xmm1,       xmm0        ; squares 0..3 as dwords
   314             punpckhwd   xmm2,       xmm0        ; squares 4..7 as dwords
   315             paddd       xmm6,       xmm1        ;
   317             paddd       xmm7,       xmm2        ;
   318             lea         rdi,        [rdi+rax]   ; next row; ends at s + pitch*7
   320             dec         rcx
   321             jne         .loop_initvar
   322             ;save the var and sum
   323             xor         rdx,        rdx         ; rdx = r, the row counter
   324 .loop_row:
   325             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8], row leaving the window
   326             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7], row entering the window
   328             punpcklbw   xmm1,       xmm0
   329             punpcklbw   xmm2,       xmm0
   331             paddw       xmm5,       xmm2                ; sum += entering
   332             psubw       xmm5,       xmm1                ; sum -= leaving
   334             pmullw      xmm2,       xmm2
   335             movdqa      xmm4,       xmm2
   337             punpcklwd   xmm2,       xmm0
   338             punpckhwd   xmm4,       xmm0
   340             paddd       xmm6,       xmm2                ; sumsq += entering^2
   341             paddd       xmm7,       xmm4
   343             pmullw      xmm1,       xmm1
   344             movdqa      xmm2,       xmm1
   346             punpcklwd   xmm1,       xmm0
   347             psubd       xmm6,       xmm1                ; sumsq -= leaving^2
   349             punpckhwd   xmm2,       xmm0
   350             psubd       xmm7,       xmm2
   353             movdqa      xmm3,       xmm6
   354             pslld       xmm3,       4
   356             psubd       xmm3,       xmm6                ; xmm3 = 15 * sumsq (pixels 0..3)
   357             movdqa      xmm1,       xmm5
   359             movdqa      xmm4,       xmm5
   360             pmullw      xmm1,       xmm1                ; low  words of sum*sum
   362             pmulhw      xmm4,       xmm4                ; high words of sum*sum
   363             movdqa      xmm2,       xmm1
   365             punpcklwd   xmm1,       xmm4                ; sum*sum as dwords, pixels 0..3
   366             punpckhwd   xmm2,       xmm4                ; sum*sum as dwords, pixels 4..7
   368             movdqa      xmm4,       xmm7
   369             pslld       xmm4,       4
   371             psubd       xmm4,       xmm7                ; xmm4 = 15 * sumsq (pixels 4..7)
   373             psubd       xmm3,       xmm1
   374             psubd       xmm4,       xmm2
   376             psubd       xmm3,       flimit4
   377             psubd       xmm4,       flimit4
   379             psrad       xmm3,       31                  ; all ones where 15*sumsq - sum*sum < flimit
   380             psrad       xmm4,       31
   382             packssdw    xmm3,       xmm4
   383             packsswb    xmm3,       xmm0                ; xmm3 = per-pixel byte mask
   385             movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row r
   387             movq        xmm2,       xmm1                ; keep the unfiltered bytes
   388             punpcklbw   xmm1,       xmm0
   390             paddw       xmm1,       xmm5                ; pixel + sum
   391             mov         rcx,        rdx
   393             and         rcx,        127                 ; rcx = r & 127
   394 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
   395             push        rax
   396             lea         rax,        [GLOBAL(sym(vp9_rv))]
   397             movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
   398             pop         rax
   399 %elif ABI_IS_32BIT=0
   400             movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
   401 %else
   402             movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
   403 %endif
   405             paddw       xmm1,       xmm4                ; + rounding/dither words from vp9_rv
   406             ;paddw     xmm1,       eight8s
   407             psraw       xmm1,       4
   409             packuswb    xmm1,       xmm0
   410             pand        xmm1,       xmm3                ; filtered value where the mask is set
   412             pandn       xmm3,       xmm2                ; original value elsewhere
   413             por         xmm1,       xmm3
   415             and         rcx,        15
   416             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[r & 15] = result for row r
   418             mov         rcx,        rdx
   419             sub         rcx,        8
   421             and         rcx,        15
   422             movq        mm0,        [rsp + rcx*8] ;d[(r - 8) & 15]
   424             movq        [rsi],      mm0                 ; delayed write-back to row r-8
   425             lea         rsi,        [rsi+rax]
   427             lea         rdi,        [rdi+rax]
   428             add         rdx,        1
   430             cmp         edx,        dword arg(2) ;rows
   431             jl          .loop_row
   432         mov         rax,        8   ; rax is free here: it is reloaded at .loop_col
   433         add         arg(0),     rax ; s += 8, carried through the full pointer width
   434         sub         dword arg(3), 8 ; cols -= 8
   435         cmp         dword arg(3), 0
   436         jg          .loop_col
   438     add         rsp, 128+16
   439     pop         rsp
   441     ; begin epilog
   442     pop rdi
   443     pop rsi
   444     RESTORE_GOT
   445     RESTORE_XMM
   446     UNSHADOW_ARGS
   447     pop         rbp
   448     ret
   449 %undef flimit4
   452 ;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
   453 ;                                int pitch, int rows, int cols,int flimit)
       ;
       ; Horizontal smoothing, in place, one row at a time.
       ;
       ; For each row a running sum and sum of squares over a 15-pixel window are
       ; seeded by the scalar loop .ip_var_loop (pixels s[-8]..s[6]) and then
       ; advanced four columns per iteration in .nextcol4.  Where
       ;     15*sumsq - sum*sum < flimit          (signed 32-bit compare)
       ; the pixel is replaced by (pixel + sum + 8) >> 4, otherwise it is kept.
       ;
       ; Each 4-byte result passes through mm1 and then mm0 before being stored,
       ; so it reaches memory two iterations (8 columns) after it was computed;
       ; the window therefore only ever reads unfiltered pixels.  The column loop
       ; runs to cols+8 so that the delayed stores are flushed.  Because mm0/mm1
       ; start out as zero, the first two stores of every row write zeros to
       ; s[-8..-1].
       ; NOTE(review): mm0/mm1 are used and no emms is executed in this function;
       ; presumably the caller clears the x87/MMX state - confirm.
   454 global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
   455 sym(vp9_mbpost_proc_across_ip_xmm):
   456     push        rbp
   457     mov         rbp, rsp
   458     SHADOW_ARGS_TO_STACK 5
   459     SAVE_XMM 7
   460     GET_GOT     rbx
   461     push        rsi
   462     push        rdi
   463     ; end prolog
   465     ALIGN_STACK 16, rax
   466     sub         rsp, 16
   468     ; create flimit4 at [rsp]
   469     mov         eax, dword ptr arg(4) ;flimit
   470     mov         [rsp], eax
   471     mov         [rsp+4], eax
   472     mov         [rsp+8], eax
   473     mov         [rsp+12], eax
   474 %define flimit4 [rsp]
   477     ;for(r=0;r<rows;r++)
   478 .ip_row_loop:
   480         xor         rdx,    rdx ;sumsq=0;
   481         xor         rcx,    rcx ;sum=0;
   482         mov         rsi,    arg(0); s
   483         mov         rdi,    -8
   484 .ip_var_loop:
   485         ;for(i=-8;i<=6;i++)
   486         ;{
   487         ;    sumsq += s[i]*s[i];
   488         ;    sum   += s[i];
   489         ;}
   490         movzx       eax, byte [rsi+rdi]
   491         add         ecx, eax
   492         mul         al          ; ax = s[i]*s[i]; the upper half of eax is already zero
   493         add         edx, eax
   494         add         rdi, 1
   495         cmp         rdi, 6
   496         jle         .ip_var_loop
   499             ;mov         rax,    sumsq
   500             ;movd        xmm7,   rax
   501             movd        xmm7,   edx             ; xmm7 lane 0 = sumsq, other lanes 0
   503             ;mov         rax,    sum
   504             ;movd        xmm6,   rax
   505             movd        xmm6,   ecx             ; xmm6 lane 0 = sum, other lanes 0
   507             mov         rsi,    arg(0) ;s
   508             xor         rcx,    rcx             ; rcx = column c
   510             movsxd      rdx,    dword arg(3) ;cols
   511             add         rdx,    8               ; run 8 extra columns to flush the delayed stores
   512             pxor        mm0,    mm0
   513             pxor        mm1,    mm1
   515             pxor        xmm0,   xmm0
   516 .nextcol4:
   518             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (pixels leaving the windows)
   519             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the windows)
   521             punpcklbw   xmm1,   xmm0                    ; expanding
   522             punpcklbw   xmm2,   xmm0                    ; expanding
   524             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
   525             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
   527             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5  (sum deltas d0..d3)
   528             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
   530             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
   531             pmaddwd     xmm1,   xmm2                    ; (in+out)*(in-out) = in^2 - out^2 (sumsq deltas q0..q3)
   533             paddd       xmm6,   xmm2                    ; lane 0 = sum   + d0
   534             paddd       xmm7,   xmm1                    ; lane 0 = sumsq + q0
   536             pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
   537             pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all four lanes
   539             psrldq      xmm1,       4                   ; q1  q2  q3  0
   540             psrldq      xmm2,       4                   ; d1  d2  d3  0
   542             pshufd      xmm3,   xmm1,   3               ; 0   q1  q1  q1
   543             pshufd      xmm4,   xmm2,   3               ; 0   d1  d1  d1
   545             paddd       xmm6,   xmm4
   546             paddd       xmm7,   xmm3
   548             pshufd      xmm3,   xmm1,   01011111b       ; 0   0   q2  q2
   549             pshufd      xmm4,   xmm2,   01011111b       ; 0   0   d2  d2
   551             paddd       xmm7,   xmm3
   552             paddd       xmm6,   xmm4
   554             pshufd      xmm3,   xmm1,   10111111b       ; 0   0   0   q3
   555             pshufd      xmm4,   xmm2,   10111111b       ; 0   0   0   d3
   557             paddd       xmm7,   xmm3                    ; xmm7 = sumsq for columns c..c+3
   558             paddd       xmm6,   xmm4                    ; xmm6 = sum   for columns c..c+3
   560             movdqa      xmm3,   xmm6
   561             pmaddwd     xmm3,   xmm3                    ; sum*sum
   563             movdqa      xmm5,   xmm7
   564             pslld       xmm5,   4
   566             psubd       xmm5,   xmm7                    ; 15*sumsq
   567             psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum
   569             psubd       xmm5,   flimit4
   570             psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit
   572             packssdw    xmm5,   xmm0
   573             packsswb    xmm5,   xmm0                    ; xmm5 = per-pixel byte mask
   575             movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four pixels at columns c..c+3
   576             movq        xmm2,   xmm1                    ; keep the unfiltered bytes
   578             punpcklbw   xmm1,   xmm0
   579             punpcklwd   xmm1,   xmm0
   581             paddd       xmm1,   xmm6                    ; pixel + sum
   582             paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding)
   584             psrad       xmm1,   4
   585             packssdw    xmm1,   xmm0
   587             packuswb    xmm1,   xmm0
   588             pand        xmm1,   xmm5                    ; filtered value where the mask is set
   590             pandn       xmm5,   xmm2                    ; original value elsewhere
   591             por         xmm5,   xmm1
   593             movd        [rsi+rcx-8],  mm0               ; store the result computed two iterations ago
   594             movq        mm0,    mm1                     ; advance the two-stage delay line
   596             movdq2q     mm1,    xmm5
   597             psrldq      xmm7,   12                      ; lane 0 = sumsq at column c+3, other lanes 0
   599             psrldq      xmm6,   12                      ; lane 0 = sum   at column c+3, other lanes 0
   600             add         rcx,    4
   602             cmp         rcx,    rdx
   603             jl          .nextcol4
   605         ;s+=pitch;
   606         movsxd rax, dword arg(1)
   607         add    arg(0), rax
   609         sub dword arg(2), 1 ;rows-=1
   610         cmp dword arg(2), 0
   611         jg .ip_row_loop
   613     add         rsp, 16
   614     pop         rsp
   616     ; begin epilog
   617     pop rdi
   618     pop rsi
   619     RESTORE_GOT
   620     RESTORE_XMM
   621     UNSHADOW_ARGS
   622     pop         rbp
   623     ret
   624 %undef flimit4
   627 ;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
   628 ;                            unsigned char blackclamp[16],
   629 ;                            unsigned char whiteclamp[16],
   630 ;                            unsigned char bothclamp[16],
   631 ;                            unsigned int width, unsigned int height, int pitch)
       ;
       ; Adds noise to an image plane in place, 16 pixels at a time.
       ;
       ; For every row, rand() is called and its low 8 bits are used as a byte
       ; offset into the noise buffer.  Each pixel is then clamped with
       ;     p = ((p -sat blackclamp) +sat bothclamp) -sat whiteclamp
       ; (unsigned saturating byte arithmetic) and the noise byte is added with
       ; wrap-around (paddb).
       ;
       ; Only arg(2) (blackclamp) is dereferenced; the white and both vectors are
       ; addressed at +16 and +32 from it.  The width loop always runs at least
       ; once and advances in steps of 16 bytes.
       ; NOTE(review): the clamp vectors are used as direct memory operands of
       ; psubusb/paddusb, so they presumably need to be 16-byte aligned - confirm.
   632 extern sym(rand)
   633 global sym(vp9_plane_add_noise_wmt) PRIVATE
   634 sym(vp9_plane_add_noise_wmt):
   635     push        rbp
   636     mov         rbp, rsp
   637     SHADOW_ARGS_TO_STACK 8
   638     GET_GOT     rbx
   639     push        rsi
   640     push        rdi
   641     ; end prolog
   643 .addnoise_loop:
   644     call sym(rand) WRT_PLT
   645     mov     rcx, arg(1) ;noise
   646     and     rax, 0xff   ; keep the low 8 bits of rand() as the noise offset
   647     add     rcx, rax    ; rcx = noise + offset
   649     ; we rely on the fact that the clamping vectors are stored contiguously
   650     ; in black/white/both order. Note that we have to reload this here because
   651     ; rdx could be trashed by rand()
   652     mov     rdx, arg(2) ; blackclamp
   655             mov     rdi, rcx               ; rdi = noise source for this row
   656             movsxd  rcx, dword arg(5) ;[Width]
   657             mov     rsi, arg(0) ;Pos
   658             xor         rax,rax            ; rax = byte offset within the row
   660 .addnoise_nextset:
   661             movdqu      xmm1,[rsi+rax]         ; get the source
   663             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   664             paddusb     xmm1, [rdx+32] ;bothclamp
   665             psubusb     xmm1, [rdx+16] ;whiteclamp
   667             movdqu      xmm2,[rdi+rax]         ; get the noise for these 16 pixels
   668             paddb       xmm1,xmm2              ; add it in
   669             movdqu      [rsi+rax],xmm1         ; store the result
   671             add         rax,16                 ; move to the next 16 pixels
   673             cmp         rax, rcx
   674             jl          .addnoise_nextset
   676     movsxd  rax, dword arg(7) ; Pitch
   677     add     arg(0), rax ; Start += Pitch
   678     sub     dword arg(6), 1   ; Height -= 1
   679     jg      .addnoise_loop    ; repeat while rows remain (flags come from the sub above)
   681     ; begin epilog
   682     pop rdi
   683     pop rsi
   684     RESTORE_GOT
   685     UNSHADOW_ARGS
   686     pop         rbp
   687     ret
       ; Read-only rounding constants used by the filters in this file.
   690 SECTION_RODATA
   691 align 16
       ; rd42: eight words of 4, the rounding term added before the ">> 3" in
       ; vp9_post_proc_down_and_across_xmm.
   692 rd42:
   693     times 8 dw 0x04
       ; four8s: four dwords of 8, the rounding term added before the ">> 4" in
       ; vp9_mbpost_proc_across_ip_xmm.
   694 four8s:
   695     times 4 dd 8

mercurial