media/libvpx/vp8/common/x86/postproc_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;macro in deblock functions
       ;
       ; FIRST_2_ROWS - first half of the 5-tap deblock filter.
       ;   In:    xmm0   = 16 centre pixels
       ;          xmm1   = first neighbour vector, xmm3 = second neighbour vector
       ;          flimit = memory operand holding 16 per-byte thresholds
       ;   Out:   xmm5   = pavgb(xmm1, xmm3)
       ;          xmm7   = byte mask: 0xFF where |xmm0-xmm1| >= flimit or
       ;                   |xmm0-xmm3| >= flimit, 0x00 otherwise
       ;          xmm0 is preserved.
       ;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm6
    15 %macro FIRST_2_ROWS 0
    16         movdqa      xmm4,       xmm0
    17         movdqa      xmm6,       xmm0
    18         movdqa      xmm5,       xmm1
    19         pavgb       xmm5,       xmm3
    21         ;calculate absolute value
               ; |a-b| for unsigned bytes = sat(a-b) + sat(b-a); one term is always 0
    22         psubusb     xmm4,       xmm1
    23         psubusb     xmm1,       xmm0
    24         psubusb     xmm6,       xmm3
    25         psubusb     xmm3,       xmm0
    26         paddusb     xmm4,       xmm1
    27         paddusb     xmm6,       xmm3
    29         ;get threshold
    30         movdqa      xmm2,       flimit
    31         pxor        xmm1,       xmm1
    32         movdqa      xmm7,       xmm2
    34         ;get mask
               ; sat(flimit - diff) == 0  <=>  diff >= flimit
    35         psubusb     xmm2,       xmm4
    36         psubusb     xmm7,       xmm6
    37         pcmpeqb     xmm2,       xmm1
    38         pcmpeqb     xmm7,       xmm1
    39         por         xmm7,       xmm2
    40 %endmacro
    42 %macro SECOND_2_ROWS 0
       ; SECOND_2_ROWS - second half of the 5-tap deblock filter; must follow
       ; FIRST_2_ROWS, whose xmm5 (neighbour average) and xmm7 (mask) it consumes.
       ;   In:    xmm0   = 16 centre pixels
       ;          xmm1   = third neighbour vector, xmm3 = fourth neighbour vector
       ;          xmm5   = average of the first two neighbours
       ;          xmm7   = "difference too large" mask accumulated so far
       ;          flimit = memory operand holding 16 per-byte thresholds
       ;   Out:   xmm0   = per byte: the original pixel where any of the four
       ;                   neighbour differences is >= flimit, otherwise
       ;                   pavgb(pavgb(xmm5, pavgb(xmm1, xmm3)), xmm0)
       ;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    43         movdqa      xmm6,       xmm0
    44         movdqa      xmm4,       xmm0
    45         movdqa      xmm2,       xmm1
    46         pavgb       xmm1,       xmm3
    48         ;calculate absolute value
    49         psubusb     xmm6,       xmm2
    50         psubusb     xmm2,       xmm0
    51         psubusb     xmm4,       xmm3
    52         psubusb     xmm3,       xmm0
    53         paddusb     xmm6,       xmm2
    54         paddusb     xmm4,       xmm3
               ; combine the two neighbour-pair averages
    56         pavgb       xmm5,       xmm1
    58         ;get threshold
    59         movdqa      xmm2,       flimit
    60         pxor        xmm1,       xmm1
    61         movdqa      xmm3,       xmm2
    63         ;get mask
    64         psubusb     xmm2,       xmm6
    65         psubusb     xmm3,       xmm4
    66         pcmpeqb     xmm2,       xmm1
    67         pcmpeqb     xmm3,       xmm1
    69         por         xmm7,       xmm2
    70         por         xmm7,       xmm3
               ; blend the neighbour average with the centre pixel
    72         pavgb       xmm5,       xmm0
    74         ;decide if or not to use filtered value
               ; xmm0 = (xmm0 & mask) + (xmm5 & ~mask); the two terms never overlap
    75         pand        xmm0,       xmm7
    76         pandn       xmm7,       xmm5
    77         paddusb     xmm0,       xmm7
    78 %endmacro
    80 %macro UPDATE_FLIMIT 0
       ; UPDATE_FLIMIT - copy the next 16 threshold bytes from [rbx] to the
       ; stack slot at [rsp] and advance rbx by 16.
       ;   Both accesses use movdqa, so [rbx] and [rsp] must be 16-byte aligned.
       ;   Clobb: xmm2
    81         movdqa      xmm2,       XMMWORD PTR [rbx]
    82         movdqa      [rsp],      xmm2
    83         add         rbx,        16
    84 %endmacro
    86 ;void vp8_post_proc_down_and_across_mb_row_sse2
    87 ;(
    88 ;    unsigned char *src_ptr,
    89 ;    unsigned char *dst_ptr,
    90 ;    int src_pixels_per_line,
    91 ;    int dst_pixels_per_line,
    92 ;    int cols,
    93 ;    int *flimits,
    94 ;    int size
    95 ;)
       ;
       ; For each of `size` rows:
       ;   1. "down" pass: filter src vertically (rows -2,-1,0,+1,+2) in 16-pixel
       ;      chunks and write the result to dst;
       ;   2. "across" pass: filter that dst row horizontally in place
       ;      (columns -2,-1,0,+1,+2).
       ; flimits is consumed 16 bytes per 16-pixel chunk through UPDATE_FLIMIT,
       ; so it must be 16-byte aligned (movdqa).
       ;
       ; Register roles inside the loops:
       ;   rsi = src cursor        rdi = dst cursor (row start during across pass)
       ;   rax = source stride     rcx = rows remaining
       ;   rdx = column offset     rbx = flimits cursor
       ;   mm0/mm1 = 16 filtered bytes whose store is delayed by one chunk
       ;
       ; Side effects visible in this code:
       ;   * writes 8 bytes before dst row start and 8 bytes at dst+cols
       ;     (border replication), and reads/rewrites [dst-16, dst);
       ;   * the down pass always processes whole 16-byte chunks;
       ;   * MMX registers are used and no emms is executed here.
       ;     NOTE(review): presumably the caller clears the MMX state - confirm.
    96 global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
    97 sym(vp8_post_proc_down_and_across_mb_row_sse2):
    98     push        rbp
    99     mov         rbp, rsp
   100     SHADOW_ARGS_TO_STACK 7
   101     SAVE_XMM 7
   102     push        rbx
   103     push        rsi
   104     push        rdi
   105     ; end prolog
   106     ALIGN_STACK 16, rax
   107     sub         rsp, 16
   109         ; put flimit on stack
   110         mov         rbx,        arg(5)           ;flimits ptr
   111         UPDATE_FLIMIT
   113 %define flimit [rsp]
   115         mov         rsi,        arg(0)           ;src_ptr
   116         mov         rdi,        arg(1)           ;dst_ptr
   118         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
   119         movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
   120 .nextrow:
   121         xor         rdx,        rdx              ;col
   122 .nextcol:
   123         ;load current and next 2 rows
   124         movdqu      xmm0,       XMMWORD PTR [rsi]
   125         movdqu      xmm1,       XMMWORD PTR [rsi + rax]
   126         movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
   128         FIRST_2_ROWS
   130         ;load above 2 rows
   131         neg         rax
   132         movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
   133         movdqu      xmm3,       XMMWORD PTR [rsi + rax]
   135         SECOND_2_ROWS
   137         movdqu      XMMWORD PTR [rdi], xmm0
   139         neg         rax                          ; positive stride
   140         add         rsi,        16
   141         add         rdi,        16
   143         add         rdx,        16
   144         cmp         edx,        dword arg(4)     ;cols
   145         jge         .downdone
   146         UPDATE_FLIMIT
   147         jmp         .nextcol
   149 .downdone:
   150         ; done with all the cols, start the across filtering in place
               ; rewind both cursors to the start of the row
   151         sub         rsi,        rdx
   152         sub         rdi,        rdx
   154         mov         rbx,        arg(5) ; flimits
   155         UPDATE_FLIMIT
   157         ; dup the first byte into the left border 8 times
   158         movq        mm1,   [rdi]
   159         punpcklbw   mm1,   mm1
   160         punpcklwd   mm1,   mm1
   161         punpckldq   mm1,   mm1
   162         mov         rdx,    -8
   163         movq        [rdi+rdx], mm1
   165         ; dup the last byte into the right border
   166         movsxd      rdx,    dword arg(4)
   167         movq        mm1,   [rdi + rdx + -1]
   168         punpcklbw   mm1,   mm1
   169         punpcklwd   mm1,   mm1
   170         punpckldq   mm1,   mm1
   171         movq        [rdi+rdx], mm1
   173         xor         rdx,        rdx
               ; prime the delayed-store registers with the bytes already at
               ; [rdi-16, rdi) so the first delayed store rewrites them unchanged
   174         movq        mm0,        QWORD PTR [rdi-16];
   175         movq        mm1,        QWORD PTR [rdi-8];
   177 .acrossnextcol:
   178         movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
   179         movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
   180         movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
   182         FIRST_2_ROWS
   184         movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
   185         movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
   187         SECOND_2_ROWS
               ; The previous chunk is stored only now, after this chunk has read
               ; its left neighbours, so the filter always sees unfiltered input.
   189         movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
   190         movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
   191         movdq2q     mm0,        xmm0
   192         psrldq      xmm0,       8
   193         movdq2q     mm1,        xmm0
   195         add         rdx,        16
   196         cmp         edx,        dword arg(4)     ;cols
   197         jge         .acrossdone
   198         UPDATE_FLIMIT
   199         jmp         .acrossnextcol
   201 .acrossdone:
   202         ; last 16 pixels
   203         movq        QWORD PTR [rdi+rdx-16], mm0
               ; store the final 8 bytes only when rdx landed exactly on cols;
               ; otherwise they lie past the end of the row and are dropped
   205         cmp         edx,        dword arg(4)
   206         jne         .throw_last_8
   207         movq        QWORD PTR [rdi+rdx-8], mm1
   208 .throw_last_8:
   209         ; done with this row
   210         add         rsi,rax                      ;next src line
               ; Strides are signed ints: reload them sign-extended, exactly as
               ; the entry code does, so a negative pitch is not turned into a
               ; large positive 64-bit offset.
   211         movsxd      rax, dword arg(3)            ;dst_pixels_per_line
   212         add         rdi,rax                      ;next destination
   213         movsxd      rax, dword arg(2)            ;src_pixels_per_line
   215         mov         rbx,        arg(5)           ;flimits
   216         UPDATE_FLIMIT
   218         dec         rcx                          ;decrement count
   219         jnz         .nextrow                     ;next row
   221     add rsp, 16
   222     pop rsp
   223     ; begin epilog
   224     pop rdi
   225     pop rsi
   226     pop rbx
   227     RESTORE_XMM
   228     UNSHADOW_ARGS
   229     pop         rbp
   230     ret
   231 %undef flimit
   233 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
   234 ;                            int pitch, int rows, int cols,int flimit)
       ;
       ; Vertical 15-tap smoothing filter, applied in place on strips of 8
       ; columns. For every pixel a sliding window over rows r-7..r+7 keeps
       ;   sum   (xmm5, 8 words) and
       ;   sumsq (xmm6 = low 4 dwords, xmm7 = high 4 dwords).
       ; Where  15*sumsq - sum*sum < flimit  the pixel is replaced by
       ;   (pixel + sum + vp8_rv[(r & 127) + lane]) >> 4,
       ; otherwise it is kept.
       ;
       ; Stack frame: unsigned char d[16][8] at [rsp] (ring buffer of filtered
       ; rows), flimit replicated into 4 dwords at [rsp+128].
       ;
       ; Side effects visible in this code:
       ;   * the 8 rows above row 0 and the 8 rows below the last row are
       ;     overwritten (border replication, and the delayed write-back of
       ;     ring-buffer slots that have not been filled yet);
       ;   * the shadowed copies of dst, rows and cols are modified;
       ;   * mm0 is used and no emms is executed here.
       ;     NOTE(review): presumably the caller clears the MMX state - confirm.
   235 extern sym(vp8_rv)
   236 global sym(vp8_mbpost_proc_down_xmm) PRIVATE
   237 sym(vp8_mbpost_proc_down_xmm):
   238     push        rbp
   239     mov         rbp, rsp
   240     SHADOW_ARGS_TO_STACK 5
   241     SAVE_XMM 7
   242     GET_GOT     rbx
   243     push        rsi
   244     push        rdi
   245     ; end prolog
   247     ALIGN_STACK 16, rax
   248     sub         rsp, 128+16
   250     ; unsigned char d[16][8] at [rsp]
   251     ; create flimit4 at [rsp+128]
   252     mov         eax, dword ptr arg(4) ;flimit
   253     mov         [rsp+128], eax
   254     mov         [rsp+128+4], eax
   255     mov         [rsp+128+8], eax
   256     mov         [rsp+128+12], eax
   257 %define flimit4 [rsp+128]
   259 %if ABI_IS_32BIT=0
   260     lea         r8,       [GLOBAL(sym(vp8_rv))]
   261 %endif
   263     ;rows +=8;
       ; the 8 extra iterations flush the ring buffer for the last 8 rows
   264     add         dword arg(2), 8
   266     ;for(c=0; c<cols; c+=8)
   267 .loop_col:
   268             mov         rsi,        arg(0) ; s
   269             pxor        xmm0,       xmm0        ;
   271             movsxd      rax,        dword ptr arg(1) ;pitch       ;
   273             ; this copies the last row down into the border 8 rows
   274             mov         rdi,        rsi
                   ; rows is an int and only its low dword is maintained (see
                   ; "add dword arg(2), 8" above), so load exactly 32 bits and
                   ; sign-extend, the same way pitch is loaded.
   275             movsxd      rdx,        dword ptr arg(2)
   276             sub         rdx,        9           ; (rows+8)-9 = index of the last row
   277             imul        rdx,        rax
   278             lea         rdi,        [rdi+rdx]
   279             movq        xmm1,       QWORD ptr[rdi]              ; last row
   280             mov         rcx,        8
   281 .init_borderd:                                                   ; initialize borders
   282             lea         rdi,        [rdi + rax]
   283             movq        [rdi],      xmm1
   285             dec         rcx
   286             jne         .init_borderd
   288             neg         rax                                     ; rax = -pitch
   290             ; this copies the first row up into the border 8 rows
   291             mov         rdi,        rsi
   292             movq        xmm1,       QWORD ptr[rdi]              ; first row
   293             mov         rcx,        8
   294 .init_border:                                                   ; initialize borders
   295             lea         rdi,        [rdi + rax]
   296             movq        [rdi],      xmm1
   298             dec         rcx
   299             jne         .init_border
   303             lea         rsi,        [rsi + rax*8];              ; rsi = s - 8*pitch
   304             neg         rax
   306             pxor        xmm5,       xmm5
   307             pxor        xmm6,       xmm6        ;
   309             pxor        xmm7,       xmm7        ;
   310             mov         rdi,        rsi
   312             mov         rcx,        15          ;
                   ; accumulate sum and sumsq over the 15 rows -8..+6;
                   ; on exit rdi points at row +7
   314 .loop_initvar:
   315             movq        xmm1,       QWORD PTR [rdi];
   316             punpcklbw   xmm1,       xmm0        ;
   318             paddw       xmm5,       xmm1        ;
   319             pmullw      xmm1,       xmm1        ;
   321             movdqa      xmm2,       xmm1        ;
   322             punpcklwd   xmm1,       xmm0        ;
   324             punpckhwd   xmm2,       xmm0        ;
   325             paddd       xmm6,       xmm1        ;
   327             paddd       xmm7,       xmm2        ;
   328             lea         rdi,        [rdi+rax]   ;
   330             dec         rcx
   331             jne         .loop_initvar
   332             ;save the var and sum
   333             xor         rdx,        rdx         ; rdx = r, the row counter
   334 .loop_row:
                   ; slide the window: drop row r-8 (rsi), take in row r+7 (rdi)
   335             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
   336             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
   338             punpcklbw   xmm1,       xmm0
   339             punpcklbw   xmm2,       xmm0
   341             paddw       xmm5,       xmm2
   342             psubw       xmm5,       xmm1
   344             pmullw      xmm2,       xmm2
   345             movdqa      xmm4,       xmm2
   347             punpcklwd   xmm2,       xmm0
   348             punpckhwd   xmm4,       xmm0
   350             paddd       xmm6,       xmm2
   351             paddd       xmm7,       xmm4
   353             pmullw      xmm1,       xmm1
   354             movdqa      xmm2,       xmm1
   356             punpcklwd   xmm1,       xmm0
   357             psubd       xmm6,       xmm1
   359             punpckhwd   xmm2,       xmm0
   360             psubd       xmm7,       xmm2
                   ; xmm3/xmm4 = sumsq*15 - sum*sum - flimit (low/high 4 lanes)
   363             movdqa      xmm3,       xmm6
   364             pslld       xmm3,       4
   366             psubd       xmm3,       xmm6
   367             movdqa      xmm1,       xmm5
   369             movdqa      xmm4,       xmm5
   370             pmullw      xmm1,       xmm1
   372             pmulhw      xmm4,       xmm4
   373             movdqa      xmm2,       xmm1
   375             punpcklwd   xmm1,       xmm4
   376             punpckhwd   xmm2,       xmm4
   378             movdqa      xmm4,       xmm7
   379             pslld       xmm4,       4
   381             psubd       xmm4,       xmm7
   383             psubd       xmm3,       xmm1
   384             psubd       xmm4,       xmm2
   386             psubd       xmm3,       flimit4
   387             psubd       xmm4,       flimit4
                   ; sign bit -> all-ones lane where the variance test is < flimit
   389             psrad       xmm3,       31
   390             psrad       xmm4,       31
   392             packssdw    xmm3,       xmm4
   393             packsswb    xmm3,       xmm0
                   ; row r itself sits 8 rows below rsi
   395             movq        xmm1,       QWORD PTR [rsi+rax*8]
   397             movq        xmm2,       xmm1
   398             punpcklbw   xmm1,       xmm0
   400             paddw       xmm1,       xmm5
   401             mov         rcx,        rdx
   403             and         rcx,        127
   404 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
   405             push        rax
   406             lea         rax,        [GLOBAL(sym(vp8_rv))]
   407             movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
   408             pop         rax
   409 %elif ABI_IS_32BIT=0
   410             movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
   411 %else
   412             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
   413 %endif
   415             paddw       xmm1,       xmm4
   416             ;paddw     xmm1,       eight8s
   417             psraw       xmm1,       4
   419             packuswb    xmm1,       xmm0
                   ; select: filtered value where the mask is set, original otherwise
   420             pand        xmm1,       xmm3
   422             pandn       xmm3,       xmm2
   423             por         xmm1,       xmm3
   425             and         rcx,        15
   426             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
                   ; write back the result computed 8 iterations ago (row r-8);
                   ; the delay keeps the sliding window reading unfiltered rows
   428             mov         rcx,        rdx
   429             sub         rcx,        8
   431             and         rcx,        15
   432             movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
   434             movq        [rsi],      mm0
   435             lea         rsi,        [rsi+rax]
   437             lea         rdi,        [rdi+rax]
   438             add         rdx,        1
   440             cmp         edx,        dword arg(2) ;rows
   441             jl          .loop_row
               ; s += 8. arg(0) is a pointer, so the add must be full register
               ; width: a dword add would drop the carry into the upper half of
               ; a 64-bit address. rcx is dead here (reloaded in .loop_col).
               mov         rcx,        8
   443         add         arg(0), rcx     ; s += 8
   444         sub         dword arg(3), 8 ; cols -= 8
   445         cmp         dword arg(3), 0
   446         jg          .loop_col
   448     add         rsp, 128+16
   449     pop         rsp
   451     ; begin epilog
   452     pop rdi
   453     pop rsi
   454     RESTORE_GOT
   455     RESTORE_XMM
   456     UNSHADOW_ARGS
   457     pop         rbp
   458     ret
   459 %undef flimit4
   462 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
   463 ;                                int pitch, int rows, int cols,int flimit)
       ;
       ; Horizontal 15-tap smoothing filter, applied in place one row at a time.
       ; A sliding window over columns c-7..c+7 keeps sum (xmm6) and sumsq
       ; (xmm7); 4 pixels are handled per .nextcol4 iteration. Where
       ;   15*sumsq - sum*sum < flimit
       ; the pixel is replaced by (pixel + sum + 8) >> 4, otherwise it is kept.
       ;
       ; Stack frame: flimit replicated into 4 dwords at [rsp].
       ;
       ; Side effects visible in this code:
       ;   * 8 bytes before each row start and 8 bytes at s+cols are overwritten
       ;     (border replication);
       ;   * the shadowed copies of src and rows are modified;
       ;   * MMX registers are used and no emms is executed here.
       ;     NOTE(review): presumably the caller clears the MMX state - confirm.
   464 global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
   465 sym(vp8_mbpost_proc_across_ip_xmm):
   466     push        rbp
   467     mov         rbp, rsp
   468     SHADOW_ARGS_TO_STACK 5
   469     SAVE_XMM 7
   470     GET_GOT     rbx
   471     push        rsi
   472     push        rdi
   473     ; end prolog
   475     ALIGN_STACK 16, rax
   476     sub         rsp, 16
   478     ; create flimit4 at [rsp]
   479     mov         eax, dword ptr arg(4) ;flimit
   480     mov         [rsp], eax
   481     mov         [rsp+4], eax
   482     mov         [rsp+8], eax
   483     mov         [rsp+12], eax
   484 %define flimit4 [rsp]
   487     ;for(r=0;r<rows;r++)
   488 .ip_row_loop:
   490         xor         rdx,    rdx ;sumsq=0;
   491         xor         rcx,    rcx ;sum=0;
   492         mov         rsi,    arg(0); s
   495         ; dup the first byte into the left border 8 times
   496         movq        mm1,   [rsi]
   497         punpcklbw   mm1,   mm1
   498         punpcklwd   mm1,   mm1
   499         punpckldq   mm1,   mm1
   501         mov         rdi,    -8
   502         movq        [rsi+rdi], mm1
   504         ; dup the last byte into the right border
               ; Use rax for the column offset: rdx already holds sumsq = 0 for
               ; .ip_var_loop and must not be overwritten with cols. rax is free
               ; here; it is reloaded by the movzx at the top of .ip_var_loop.
   505         movsxd      rax,    dword arg(3)
   506         movq        mm1,   [rsi + rax + -1]
   507         punpcklbw   mm1,   mm1
   508         punpcklwd   mm1,   mm1
   509         punpckldq   mm1,   mm1
   510         movq        [rsi+rax], mm1
   512 .ip_var_loop:
   513         ;for(i=-8;i<=6;i++)
   514         ;{
   515         ;    sumsq += s[i]*s[i];
   516         ;    sum   += s[i];
   517         ;}
   518         movzx       eax, byte [rsi+rdi]
   519         add         ecx, eax
   520         mul         al                  ; ax = al*al; the 8-bit form leaves edx intact
   521         add         edx, eax
   522         add         rdi, 1
   523         cmp         rdi, 6
   524         jle         .ip_var_loop
   527             ;mov         rax,    sumsq
   528             ;movd        xmm7,   rax
   529             movd        xmm7,   edx
   531             ;mov         rax,    sum
   532             ;movd        xmm6,   rax
   533             movd        xmm6,   ecx
   535             mov         rsi,    arg(0) ;s
   536             xor         rcx,    rcx     ; rcx = column index c
   538             movsxd      rdx,    dword arg(3) ;cols
   539             add         rdx,    8       ; 8 extra columns flush the delayed stores
   540             pxor        mm0,    mm0
   541             pxor        mm1,    mm1
   543             pxor        xmm0,   xmm0
   544 .nextcol4:
                   ; pixels leaving (c-8..c-5) and entering (c+7..c+10) the window
   546             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
   547             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
   549             punpcklbw   xmm1,   xmm0                    ; expanding
   550             punpcklbw   xmm2,   xmm0                    ; expanding
   552             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
   553             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
                   ; xmm2 = in - out (sum delta);
                   ; xmm1 = (in + out) * (in - out) = in^2 - out^2 (sumsq delta)
   555             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
   556             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
   558             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
   559             pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
   561             paddd       xmm6,   xmm2
   562             paddd       xmm7,   xmm1
   564             pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
   565             pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
                   ; add the remaining deltas so that lane k ends up holding the
                   ; running sum / sumsq for pixel c+k (prefix sum over 4 lanes)
   567             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
   568             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
   570             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
   571             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
   573             paddd       xmm6,   xmm4
   574             paddd       xmm7,   xmm3
   576             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
   577             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
   579             paddd       xmm7,   xmm3
   580             paddd       xmm6,   xmm4
   582             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
   583             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
   585             paddd       xmm7,   xmm3
   586             paddd       xmm6,   xmm4
                   ; xmm5 = sumsq*15 - sum*sum - flimit; sign bit -> select mask
   588             movdqa      xmm3,   xmm6
   589             pmaddwd     xmm3,   xmm3
   591             movdqa      xmm5,   xmm7
   592             pslld       xmm5,   4
   594             psubd       xmm5,   xmm7
   595             psubd       xmm5,   xmm3
   597             psubd       xmm5,   flimit4
   598             psrad       xmm5,   31
   600             packssdw    xmm5,   xmm0
   601             packsswb    xmm5,   xmm0
   603             movd        xmm1,   DWORD PTR [rsi+rcx]
   604             movq        xmm2,   xmm1
   606             punpcklbw   xmm1,   xmm0
   607             punpcklwd   xmm1,   xmm0
                   ; filtered = (pixel + sum + 8) >> 4
   609             paddd       xmm1,   xmm6
   610             paddd       xmm1,   [GLOBAL(four8s)]
   612             psrad       xmm1,   4
   613             packssdw    xmm1,   xmm0
   615             packuswb    xmm1,   xmm0
   616             pand        xmm1,   xmm5
   618             pandn       xmm5,   xmm2
   619             por         xmm5,   xmm1
                   ; results go through a two-stage pipeline (mm1 -> mm0 -> memory),
                   ; so each group of 4 pixels is written 8 columns late, after
                   ; the window no longer needs the unfiltered values
   621             movd        [rsi+rcx-8],  mm0
   622             movq        mm0,    mm1
   624             movdq2q     mm1,    xmm5
                   ; keep only lane 3 (the running totals for the last pixel)
   625             psrldq      xmm7,   12
   627             psrldq      xmm6,   12
   628             add         rcx,    4
   630             cmp         rcx,    rdx
   631             jl          .nextcol4
   633         ;s+=pitch;
   634         movsxd rax, dword arg(1)
   635         add    arg(0), rax
   637         sub dword arg(2), 1 ;rows-=1
   638         cmp dword arg(2), 0
   639         jg .ip_row_loop
   641     add         rsp, 16
   642     pop         rsp
   644     ; begin epilog
   645     pop rdi
   646     pop rsi
   647     RESTORE_GOT
   648     RESTORE_XMM
   649     UNSHADOW_ARGS
   650     pop         rbp
   651     ret
   652 %undef flimit4
   655 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
   656 ;                            unsigned char blackclamp[16],
   657 ;                            unsigned char whiteclamp[16],
   658 ;                            unsigned char bothclamp[16],
   659 ;                            unsigned int Width, unsigned int Height, int Pitch)
       ;
       ; Adds noise to an image plane in place. For every row:
       ;   1. rand() & 0xff selects a starting offset into the noise buffer;
       ;   2. each 16-pixel group is clamped (subtract blackclamp, add bothclamp,
       ;      subtract whiteclamp, all saturating) and the noise bytes are added
       ;      with wrap-around arithmetic (paddb).
       ;
       ; Notes derived from the code below:
       ;   * only arg(2) is used as the clamp base; whiteclamp and bothclamp are
       ;     read from [base+16] and [base+32], and all three are used as SSE
       ;     memory operands, which requires 16-byte alignment;
       ;   * the inner loop advances in 16-byte steps while the offset is below
       ;     Width, so whole 16-byte groups are always read and written;
       ;   * the row loop tests Height after the body, so one row is processed
       ;     even when Height is not positive;
       ;   * the shadowed copies of Start and Height are modified.
   660 extern sym(rand)
   661 global sym(vp8_plane_add_noise_wmt) PRIVATE
   662 sym(vp8_plane_add_noise_wmt):
   663     push        rbp
   664     mov         rbp, rsp
   665     SHADOW_ARGS_TO_STACK 8
   666     GET_GOT     rbx
   667     push        rsi
   668     push        rdi
   669     ; end prolog
   671 .addnoise_loop:
   672     call sym(rand) WRT_PLT
   673     mov     rcx, arg(1) ;noise
   674     and     rax, 0xff
   675     add     rcx, rax
   677     ; we rely on the fact that the clamping vectors are stored contiguously
   678     ; in black/white/both order. Note that we have to reload this here because
   679     ; rdx could be trashed by rand()
   680     mov     rdx, arg(2) ; blackclamp
   683             mov     rdi, rcx
   684             movsxd  rcx, dword arg(5) ;[Width]
   685             mov     rsi, arg(0) ;Pos
   686             xor         rax,rax
   688 .addnoise_nextset:
   689             movdqu      xmm1,[rsi+rax]         ; get the source
   691             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   692             paddusb     xmm1, [rdx+32] ;bothclamp
   693             psubusb     xmm1, [rdx+16] ;whiteclamp
   695             movdqu      xmm2,[rdi+rax]         ; get the noise for this line
   696             paddb       xmm1,xmm2              ; add it in
   697             movdqu      [rsi+rax],xmm1         ; store the result
   699             add         rax,16                 ; advance to the next 16 pixels
   701             cmp         rax, rcx
   702             jl          .addnoise_nextset
   704     movsxd  rax, dword arg(7) ; Pitch
   705     add     arg(0), rax ; Start += Pitch
   706     sub     dword arg(6), 1   ; Height -= 1
           ; jg consumes the flags set by the sub directly above
   707     jg      .addnoise_loop
   709     ; begin epilog
   710     pop rdi
   711     pop rsi
   712     RESTORE_GOT
   713     UNSHADOW_ARGS
   714     pop         rbp
   715     ret
   718 SECTION_RODATA
       ; four8s: four dwords of 8, 16-byte aligned. Added as the rounding term
       ; before the ">> 4" in vp8_mbpost_proc_across_ip_xmm.
   719 align 16
   720 four8s:
   721     times 4 dd 8

mercurial