media/libvpx/vp8/common/x86/postproc_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements
documented at https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %define VP8_FILTER_WEIGHT 128
    15 %define VP8_FILTER_SHIFT  7
    17 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
    18 ;                             int pitch, int rows, int cols,int flimit)
    19 extern sym(vp8_rv)
    20 global sym(vp8_mbpost_proc_down_mmx) PRIVATE
    21 sym(vp8_mbpost_proc_down_mmx):
    22     push        rbp
    23     mov         rbp, rsp
    24     SHADOW_ARGS_TO_STACK 5
    25     GET_GOT     rbx
    26     push        rsi
    27     push        rdi
    28     ; end prolog
    30     ALIGN_STACK 16, rax
    31     sub         rsp, 136
    33     ; unsigned char d[16][8] at [rsp]
    34     ; create flimit2 at [rsp+128]
    35     mov         eax, dword ptr arg(4) ;flimit
    36     mov         [rsp+128], eax
    37     mov         [rsp+128+4], eax
    38 %define flimit2 [rsp+128]
    40 %if ABI_IS_32BIT=0
    41     lea         r8,       [GLOBAL(sym(vp8_rv))]
    42 %endif
    44     ;rows +=8;
    45     add         dword ptr arg(2), 8
    47     ;for(c=0; c<cols; c+=4)
    48 .loop_col:
    49             mov         rsi,        arg(0)  ;s
    50             pxor        mm0,        mm0     ;
    52             movsxd      rax,        dword ptr arg(1) ;pitch       ;
    54             ; this copies the last row down into the border 8 rows
    55             mov         rdi,        rsi
    56             mov         rdx,        arg(2)
    57             sub         rdx,        9
    58             imul        rdx,        rax
    59             lea         rdi,        [rdi+rdx]
    60             movq        mm1,        QWORD ptr[rdi]              ; first row
    61             mov         rcx,        8
    62 .init_borderd                                                    ; initialize borders
    63             lea         rdi,        [rdi + rax]
    64             movq        [rdi],      mm1
    66             dec         rcx
    67             jne         .init_borderd
    69             neg         rax                                     ; rax = -pitch
    71             ; this copies the first row up into the border 8 rows
    72             mov         rdi,        rsi
    73             movq        mm1,        QWORD ptr[rdi]              ; first row
    74             mov         rcx,        8
    75 .init_border                                                    ; initialize borders
    76             lea         rdi,        [rdi + rax]
    77             movq        [rdi],      mm1
    79             dec         rcx
    80             jne         .init_border
    83             lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
    84             neg         rax
    87             pxor        mm5,        mm5
    88             pxor        mm6,        mm6     ;
    90             pxor        mm7,        mm7     ;
    91             mov         rdi,        rsi
    93             mov         rcx,        15          ;
    95 .loop_initvar:
    96             movd        mm1,        DWORD PTR [rdi];
    97             punpcklbw   mm1,        mm0     ;
    99             paddw       mm5,        mm1     ;
   100             pmullw      mm1,        mm1     ;
   102             movq        mm2,        mm1     ;
   103             punpcklwd   mm1,        mm0     ;
   105             punpckhwd   mm2,        mm0     ;
   106             paddd       mm6,        mm1     ;
   108             paddd       mm7,        mm2     ;
   109             lea         rdi,        [rdi+rax]   ;
   111             dec         rcx
   112             jne         .loop_initvar
   113             ;save the var and sum
   114             xor         rdx,        rdx
   115 .loop_row:
   116             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
   117             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
   119             punpcklbw   mm1,        mm0
   120             punpcklbw   mm2,        mm0
   122             paddw       mm5,        mm2
   123             psubw       mm5,        mm1
   125             pmullw      mm2,        mm2
   126             movq        mm4,        mm2
   128             punpcklwd   mm2,        mm0
   129             punpckhwd   mm4,        mm0
   131             paddd       mm6,        mm2
   132             paddd       mm7,        mm4
   134             pmullw      mm1,        mm1
   135             movq        mm2,        mm1
   137             punpcklwd   mm1,        mm0
   138             psubd       mm6,        mm1
   140             punpckhwd   mm2,        mm0
   141             psubd       mm7,        mm2
   144             movq        mm3,        mm6
   145             pslld       mm3,        4
   147             psubd       mm3,        mm6
   148             movq        mm1,        mm5
   150             movq        mm4,        mm5
   151             pmullw      mm1,        mm1
   153             pmulhw      mm4,        mm4
   154             movq        mm2,        mm1
   156             punpcklwd   mm1,        mm4
   157             punpckhwd   mm2,        mm4
   159             movq        mm4,        mm7
   160             pslld       mm4,        4
   162             psubd       mm4,        mm7
   164             psubd       mm3,        mm1
   165             psubd       mm4,        mm2
   167             psubd       mm3,        flimit2
   168             psubd       mm4,        flimit2
   170             psrad       mm3,        31
   171             psrad       mm4,        31
   173             packssdw    mm3,        mm4
   174             packsswb    mm3,        mm0
   176             movd        mm1,        DWORD PTR [rsi+rax*8]
   178             movq        mm2,        mm1
   179             punpcklbw   mm1,        mm0
   181             paddw       mm1,        mm5
   182             mov         rcx,        rdx
   184             and         rcx,        127
   185 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
   186             push        rax
   187             lea         rax,        [GLOBAL(sym(vp8_rv))]
   188             movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
   189             pop         rax
   190 %elif ABI_IS_32BIT=0
   191             movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
   192 %else
   193             movq        mm4,        [sym(vp8_rv) + rcx*2]
   194 %endif
   195             paddw       mm1,        mm4
   196             psraw       mm1,        4
   198             packuswb    mm1,        mm0
   199             pand        mm1,        mm3
   201             pandn       mm3,        mm2
   202             por         mm1,        mm3
   204             and         rcx,        15
   205             movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
   207             mov         rcx,        rdx
   208             sub         rcx,        8
   210             and         rcx,        15
   211             movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
   213             movd        [rsi],      mm1
   214             lea         rsi,        [rsi+rax]
   216             lea         rdi,        [rdi+rax]
   217             add         rdx,        1
   219             cmp         edx,        dword arg(2) ;rows
   220             jl          .loop_row
   223         add         dword arg(0), 4 ; s += 4
   224         sub         dword arg(3), 4 ; cols -= 4
   225         cmp         dword arg(3), 0
   226         jg          .loop_col
   228     add         rsp, 136
   229     pop         rsp
   231     ; begin epilog
   232     pop rdi
   233     pop rsi
   234     RESTORE_GOT
   235     UNSHADOW_ARGS
   236     pop         rbp
   237     ret
   238 %undef flimit2
   241 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
   242 ;                            unsigned char blackclamp[16],
   243 ;                            unsigned char whiteclamp[16],
   244 ;                            unsigned char bothclamp[16],
   245 ;                            unsigned int Width, unsigned int Height, int Pitch)
   246 extern sym(rand)
   247 global sym(vp8_plane_add_noise_mmx) PRIVATE
   248 sym(vp8_plane_add_noise_mmx):
   249     push        rbp
   250     mov         rbp, rsp
   251     SHADOW_ARGS_TO_STACK 8
   252     GET_GOT     rbx
   253     push        rsi
   254     push        rdi
   255     ; end prolog
   257 .addnoise_loop:
   258     call sym(rand) WRT_PLT
   259     mov     rcx, arg(1) ;noise
   260     and     rax, 0xff
   261     add     rcx, rax
   263     ; we rely on the fact that the clamping vectors are stored contiguously
   264     ; in black/white/both order. Note that we have to reload this here because
   265     ; rdx could be trashed by rand()
   266     mov     rdx, arg(2) ; blackclamp
   269             mov     rdi, rcx
   270             movsxd  rcx, dword arg(5) ;[Width]
   271             mov     rsi, arg(0) ;Pos
   272             xor         rax,rax
   274 .addnoise_nextset:
   275             movq        mm1,[rsi+rax]         ; get the source
   277             psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   278             paddusb     mm1, [rdx+32] ;bothclamp
   279             psubusb     mm1, [rdx+16] ;whiteclamp
   281             movq        mm2,[rdi+rax]         ; get the noise for this line
   282             paddb       mm1,mm2              ; add it in
   283             movq        [rsi+rax],mm1         ; store the result
   285             add         rax,8                 ; move to the next line
   287             cmp         rax, rcx
   288             jl          .addnoise_nextset
   290     movsxd  rax, dword arg(7) ; Pitch
   291     add     arg(0), rax ; Start += Pitch
   292     sub     dword arg(6), 1   ; Height -= 1
   293     jg      .addnoise_loop
   295     ; begin epilog
   296     pop rdi
   297     pop rsi
   298     RESTORE_GOT
   299     UNSHADOW_ARGS
   300     pop         rbp
   301     ret
   304 SECTION_RODATA ; read-only constant tables start here
   305 align 16 ; keep the tables 16-byte aligned
   306 Blur: ; table of 48 words: 16 x 16, 8 x 64, 16 x 16, 8 x 0 (not referenced by the code visible in this listing)
   307     times 16 dw 16
   308     times  8 dw 64
   309     times 16 dw 16
   310     times  8 dw  0
   312 rd: ; four words of 0x40 (64); NOTE(review): presumably a rounding constant (half of VP8_FILTER_WEIGHT) - confirm, it is not referenced by the code visible here
   313     times 4 dw 0x40

mercurial