media/libvpx/vp8/common/x86/idctllm_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ; /****************************************************************************
    15 ; * Notes:
    16 ; *
    17 ; * This implementation makes use of 16 bit fixed point version of two multiply
    18 ; * constants:
    19 ; *        1.   sqrt(2) * cos (pi/8)
    20 ; *        2.   sqrt(2) * sin (pi/8)
    21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
    22 ; * fixed point precision as the second one, we use a trick of
    23 ; *        x * a = x + x*(a-1)
    24 ; * so
    25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
    26 ; *
    27 ; * For the second constant, the 16 bit version is 35468, which is bigger
    28 ; * than the largest signed 16 bit value (32767), so in a signed 16 bit
    29 ; * multiply it is treated as the negative number 35468 - 65536. Adding x
    30 ; * back after the multiply compensates for that:
    30 ; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
    31 ; *
    32 ; **************************************************************************/
    35 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
    36 ;int pitch, unsigned char *dest,int stride)
       ;
       ; Inverse-transforms the 4x4 block of 16-bit coefficients at `input`,
       ; adds the result to the 4x4 block of bytes at `pred` (rows `pitch` bytes
       ; apart) and stores the saturated bytes at `dest` (rows `stride` bytes
       ; apart).
       ;
       ; Structure: butterfly pass -> 4x4 word transpose -> butterfly pass with
       ; rounding ((v + 4) >> 3) -> 4x4 word transpose -> add prediction, store.
       ;
       ; Registers: rax = input, later pitch; rsi = pred; rdx = dest;
       ;            rdi = stride; mm0-mm7 are all used as scratch.
       ; SHADOW_ARGS_TO_STACK / GET_GOT / arg() / GLOBAL() come from the included
       ; x86_abi_support.asm; their expansion is not visible in this file.
    37 global sym(vp8_short_idct4x4llm_mmx) PRIVATE
    38 sym(vp8_short_idct4x4llm_mmx):
    39     push        rbp
    40     mov         rbp, rsp
    41     SHADOW_ARGS_TO_STACK 5
    42     GET_GOT     rbx
    43     push        rsi
    44     push        rdi
    45     ; end prolog
           ; rsi and rdi are saved above and restored in the epilog.
    47     mov         rax,    arg(0)              ;input
    48     mov         rsi,    arg(1)              ;pred
           ; Load the 16 coefficients, 4 words (one row) per MMX register.
    50     movq        mm0,    [rax   ]            ; row 0
    51     movq        mm1,    [rax+ 8]            ; row 1
    52     movq        mm2,    [rax+16]            ; row 2
    53     movq        mm3,    [rax+24]            ; row 3
           ; Disabled code: would zero the 32-byte input buffer after loading it.
    55 %if 0
    56     pxor        mm7,    mm7
    57     movq        [rax],   mm7
    58     movq        [rax+8], mm7
    59     movq        [rax+16],mm7
    60     movq        [rax+24],mm7
    61 %endif
           ; rax is reused from here on: it no longer points at the input.
    62     movsxd      rax,    dword ptr arg(2)    ;pitch
    63     mov         rdx,    arg(3)              ;dest
    64     movsxd      rdi,    dword ptr arg(4)    ;stride
           ; ---- first pass: butterfly across the four row registers ----
           ; Every operation is lane-wise, so all 4 columns are processed at once.
    67     psubw       mm0,            mm2             ; b1= 0-2
    68     paddw       mm2,            mm2             ; 2 * row2, so that adding b1 gives row0 + row2
    70     movq        mm5,            mm1
    71     paddw       mm2,            mm0             ; a1 =0+2
           ; pmulhw keeps the high 16 bits of the signed product; adding the
           ; multiplicand back completes the multiply (see the notes at file top).
    73     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
    74     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
    76     movq        mm7,            mm3             ;
    77     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
    79     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
           ; NOTE(review): as computed here mm7 = ip3*cos - ip1*sin; confirm the
           ; sign convention of "c1" against the C reference implementation.
    80     psubw       mm7,            mm5             ; c1
    82     movq        mm5,            mm1
    83     movq        mm4,            mm3
    85     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    86     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
    88     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    89     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
    91     paddw       mm3,            mm5             ; d1
    92     movq        mm6,            mm2             ; a1
    94     movq        mm4,            mm0             ; b1
           ; Outputs: mm2 = a1 + d1, mm4 = b1 + mm7, mm0 = b1 - mm7, mm6 = a1 - d1.
           ; NOTE(review): the ";1" / ";2" tags below disagree with the transpose
           ; comments that follow, which label mm0 as row 1 ("1x") and mm4 as
           ; row 2 ("2x"). Confirm which labelling is intended.
    95     paddw       mm2,            mm3             ;0
    97     paddw       mm4,            mm7             ;1
    98     psubw       mm0,            mm7             ;2
   100     psubw       mm6,            mm3             ;3
           ; ---- 4x4 word transpose (lane comments list words high -> low) ----
   102     movq        mm1,            mm2             ; 03 02 01 00
   103     movq        mm3,            mm4             ; 23 22 21 20
   105     punpcklwd   mm1,            mm0             ; 11 01 10 00
   106     punpckhwd   mm2,            mm0             ; 13 03 12 02
   108     punpcklwd   mm3,            mm6             ; 31 21 30 20
   109     punpckhwd   mm4,            mm6             ; 33 23 32 22
   111     movq        mm0,            mm1             ; 11 01 10 00
   112     movq        mm5,            mm2             ; 13 03 12 02
   114     punpckldq   mm0,            mm3             ; 30 20 10 00
   115     punpckhdq   mm1,            mm3             ; 31 21 11 01
   117     punpckldq   mm2,            mm4             ; 32 22 12 02
   118     punpckhdq   mm5,            mm4             ; 33 23 13 03
   120     movq        mm3,            mm5             ; 33 23 13 03
           ; ---- second pass: same butterfly on the transposed data ----
   122     psubw       mm0,            mm2             ; b1= 0-2
   123     paddw       mm2,            mm2             ; 2 * input2, so that adding b1 gives 0+2
   125     movq        mm5,            mm1
   126     paddw       mm2,            mm0             ; a1 =0+2
   128     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
   129     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
   131     movq        mm7,            mm3             ;
   132     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
   134     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
   135     psubw       mm7,            mm5             ; c1
   137     movq        mm5,            mm1
   138     movq        mm4,            mm3
   140     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
   141     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
   143     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
   144     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
   146     paddw       mm3,            mm5             ; d1
           ; Rounding bias: a1 and b1 each feed two outputs, so adding 4 to both
           ; puts +4 into all four results before the >> 3 below.
   147     paddw       mm0,            [GLOBAL(fours)]
   149     paddw       mm2,            [GLOBAL(fours)]
   150     movq        mm6,            mm2             ; a1
   152     movq        mm4,            mm0             ; b1
           ; Same output labelling caveat as in the first pass.
   153     paddw       mm2,            mm3             ;0
   155     paddw       mm4,            mm7             ;1
   156     psubw       mm0,            mm7             ;2
   158     psubw       mm6,            mm3             ;3
           ; Arithmetic shift: (value + 4) >> 3 for every lane.
   159     psraw       mm2,            3
   161     psraw       mm0,            3
   162     psraw       mm4,            3
   164     psraw       mm6,            3
           ; ---- transpose back (same sequence as after the first pass) ----
   166     movq        mm1,            mm2             ; 03 02 01 00
   167     movq        mm3,            mm4             ; 23 22 21 20
   169     punpcklwd   mm1,            mm0             ; 11 01 10 00
   170     punpckhwd   mm2,            mm0             ; 13 03 12 02
   172     punpcklwd   mm3,            mm6             ; 31 21 30 20
   173     punpckhwd   mm4,            mm6             ; 33 23 32 22
   175     movq        mm0,            mm1             ; 11 01 10 00
   176     movq        mm5,            mm2             ; 13 03 12 02
   178     punpckldq   mm0,            mm3             ; 30 20 10 00
   179     punpckhdq   mm1,            mm3             ; 31 21 11 01
   181     punpckldq   mm2,            mm4             ; 32 22 12 02
   182     punpckhdq   mm5,            mm4             ; 33 23 13 03
           ; ---- add prediction and store: mm0, mm1, mm2, mm5 -> rows 0..3 ----
           ; Per row: load 4 pred bytes, widen to words against zero, add with
           ; signed saturation, pack to bytes with unsigned saturation, store 4.
   184     pxor        mm7,            mm7             ; zero, used for widening and packing
   186     movd        mm4,            [rsi]
   187     punpcklbw   mm4,            mm7
   188     paddsw      mm0,            mm4
   189     packuswb    mm0,            mm7
   190     movd        [rdx],          mm0
   192     movd        mm4,            [rsi+rax]
   193     punpcklbw   mm4,            mm7
   194     paddsw      mm1,            mm4
   195     packuswb    mm1,            mm7
   196     movd        [rdx+rdi],      mm1
   198     movd        mm4,            [rsi+2*rax]
   199     punpcklbw   mm4,            mm7
   200     paddsw      mm2,            mm4
   201     packuswb    mm2,            mm7
   202     movd        [rdx+rdi*2],    mm2
           ; Advance both pointers by one row so base + 2*stride addresses row 3.
   204     add         rdx,            rdi
   205     add         rsi,            rax
   207     movd        mm4,            [rsi+2*rax]
   208     punpcklbw   mm4,            mm7
   209     paddsw      mm5,            mm4
   210     packuswb    mm5,            mm7
   211     movd        [rdx+rdi*2],    mm5
   213     ; begin epilog
   214     pop rdi
   215     pop rsi
   216     RESTORE_GOT
   217     UNSHADOW_ARGS
   218     pop         rbp
   219     ret
   221 ;void vp8_dc_only_idct_add_mmx(
   222 ;short input_dc,
   223 ;unsigned char *pred_ptr,
   224 ;int pred_stride,
   225 ;unsigned char *dst_ptr,
   226 ;int stride)
       ;
       ; Adds the single rounded value (input_dc + 4) >> 3 to every byte of the
       ; 4x4 block at pred_ptr (rows pred_stride bytes apart) and stores the
       ; saturated bytes to the 4x4 block at dst_ptr (rows stride bytes apart).
       ;
       ; Registers: rax = pred_ptr, later dst_ptr; rdx = pred_stride, later
       ;            stride; rcx = 3 * rdx; mm0 = zero; mm5 = broadcast DC term;
       ;            mm1-mm4 = rows 0..3.
       ; SHADOW_ARGS_TO_STACK / GET_GOT / arg() / GLOBAL() come from the included
       ; x86_abi_support.asm; their expansion is not visible in this file.
   227 global sym(vp8_dc_only_idct_add_mmx) PRIVATE
   228 sym(vp8_dc_only_idct_add_mmx):
   229     push        rbp
   230     mov         rbp, rsp
   231     SHADOW_ARGS_TO_STACK 5
   232     GET_GOT     rbx
   233     ; end prolog
   235         movd        mm5,            arg(0) ;input_dc
   236         mov         rax,            arg(1) ;pred_ptr
   237         movsxd      rdx,            dword ptr arg(2) ;pred_stride
   239         pxor        mm0,            mm0              ; zero, used for widening and packing
   241         paddw       mm5,            [GLOBAL(fours)]  ; low word = input_dc + 4
   242         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride (offset of row 3)
   244         psraw       mm5,            3                ; low word = (input_dc + 4) >> 3
               ; Broadcast the low word into all four word lanes of mm5.
   246         punpcklwd   mm5,            mm5
   248         punpckldq   mm5,            mm5
               ; Load 4 prediction bytes from each of the four rows.
   250         movd        mm1,            [rax]
   251         movd        mm2,            [rax+rdx]
   252         movd        mm3,            [rax+2*rdx]
   253         movd        mm4,            [rax+rcx]
               ; rax and rdx are reused for the destination from here on.
   255         mov         rax,            arg(3) ;d -- destination
   256         movsxd      rdx,            dword ptr arg(4) ;dst_stride
               ; Per row: widen bytes to words against zero, add the DC term with
               ; signed saturation, then pack back to bytes.
   258         punpcklbw   mm1,            mm0
   259         paddsw      mm1,            mm5
   260         packuswb    mm1,            mm0              ; pack back to bytes, saturating to 0..255
   261         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * dst_stride (offset of row 3)
   263         punpcklbw   mm2,            mm0
   264         paddsw      mm2,            mm5
   265         packuswb    mm2,            mm0              ; pack back to bytes, saturating to 0..255
   267         punpcklbw   mm3,            mm0
   268         paddsw      mm3,            mm5
   269         packuswb    mm3,            mm0              ; pack back to bytes, saturating to 0..255
   271         punpcklbw   mm4,            mm0
   272         paddsw      mm4,            mm5
   273         packuswb    mm4,            mm0              ; pack back to bytes, saturating to 0..255
               ; Store 4 result bytes to each of the four destination rows.
   275         movd        [rax],          mm1
   276         movd        [rax+rdx],      mm2
   277         movd        [rax+2*rdx],    mm3
   278         movd        [rax+rcx],      mm4
   280     ; begin epilog
   281     RESTORE_GOT
   282     UNSHADOW_ARGS
   283     pop         rbp
   284     ret
   286 SECTION_RODATA
       ; Each constant is replicated into 4 words so that it fills one 64-bit
       ; MMX operand and applies to all four word lanes at once.
   287 align 16
       ; 0x8A8C = 35468, the 16 bit fixed point form of sqrt(2) * sin(pi/8)
       ; (35468 / 65536). pmulhw treats it as signed (negative), so the code
       ; adds the multiplicand back after each multiply by this constant.
   288 x_s1sqr2:
   289     times 4 dw 0x8A8C
   290 align 16
       ; 0x4E7B = 20091, the 16 bit fixed point form of sqrt(2) * cos(pi/8) - 1
       ; (20091 / 65536). The code adds the multiplicand back after the multiply
       ; to restore the "+ 1" (the x * a = x + x*(a-1) trick in the file notes).
   291 x_c1sqr2less1:
   292     times 4 dw 0x4E7B
   293 align 16
       ; Rounding bias of 4, added before the arithmetic shift right by 3.
   294 fours:
   295     times 4 dw 0x0004

mercurial