media/libvpx/vp8/common/x86/mfqe_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:59:08 +0100
branch:      TOR_BUG_9701
changeset:   10:ac0c01689b40
permissions: -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_filter_by_weight16x16_sse2
    15 ;(
    16 ;    unsigned char *src,
    17 ;    int            src_stride,
    18 ;    unsigned char *dst,
    19 ;    int            dst_stride,
    20 ;    int            src_weight
    21 ;)
    22 global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
    23 sym(vp8_filter_by_weight16x16_sse2):
    24     push        rbp
    25     mov         rbp, rsp
    26     SHADOW_ARGS_TO_STACK 5
    27     SAVE_XMM 6
    28     GET_GOT     rbx
    29     push        rsi
    30     push        rdi
    31     ; end prolog
    33     movd        xmm0, arg(4)                ; src_weight
    34     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    35     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    37     movdqa      xmm1, [GLOBAL(tMFQE)]
    38     psubw       xmm1, xmm0                  ; dst_weight
    40     mov         rax, arg(0)                 ; src
    41     mov         rsi, arg(1)                 ; src_stride
    42     mov         rdx, arg(2)                 ; dst
    43     mov         rdi, arg(3)                 ; dst_stride
    45     mov         rcx, 16                     ; loop count
    46     pxor        xmm6, xmm6
    48 .combine
    49     movdqa      xmm2, [rax]
    50     movdqa      xmm4, [rdx]
    51     add         rax, rsi
    53     ; src * src_weight
    54     movdqa      xmm3, xmm2
    55     punpcklbw   xmm2, xmm6
    56     punpckhbw   xmm3, xmm6
    57     pmullw      xmm2, xmm0
    58     pmullw      xmm3, xmm0
    60     ; dst * dst_weight
    61     movdqa      xmm5, xmm4
    62     punpcklbw   xmm4, xmm6
    63     punpckhbw   xmm5, xmm6
    64     pmullw      xmm4, xmm1
    65     pmullw      xmm5, xmm1
    67     ; sum, round and shift
    68     paddw       xmm2, xmm4
    69     paddw       xmm3, xmm5
    70     paddw       xmm2, [GLOBAL(tMFQE_round)]
    71     paddw       xmm3, [GLOBAL(tMFQE_round)]
    72     psrlw       xmm2, 4
    73     psrlw       xmm3, 4
    75     packuswb    xmm2, xmm3
    76     movdqa      [rdx], xmm2
    77     add         rdx, rdi
    79     dec         rcx
    80     jnz         .combine
    82     ; begin epilog
    83     pop         rdi
    84     pop         rsi
    85     RESTORE_GOT
    86     RESTORE_XMM
    87     UNSHADOW_ARGS
    88     pop         rbp
    90     ret
    92 ;void vp8_filter_by_weight8x8_sse2
    93 ;(
    94 ;    unsigned char *src,
    95 ;    int            src_stride,
    96 ;    unsigned char *dst,
    97 ;    int            dst_stride,
    98 ;    int            src_weight
    99 ;)
   100 global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
   101 sym(vp8_filter_by_weight8x8_sse2):
   102     push        rbp
   103     mov         rbp, rsp
   104     SHADOW_ARGS_TO_STACK 5
   105     GET_GOT     rbx
   106     push        rsi
   107     push        rdi
   108     ; end prolog
   110     movd        xmm0, arg(4)                ; src_weight
   111     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
   112     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
   114     movdqa      xmm1, [GLOBAL(tMFQE)]
   115     psubw       xmm1, xmm0                  ; dst_weight
   117     mov         rax, arg(0)                 ; src
   118     mov         rsi, arg(1)                 ; src_stride
   119     mov         rdx, arg(2)                 ; dst
   120     mov         rdi, arg(3)                 ; dst_stride
   122     mov         rcx, 8                      ; loop count
   123     pxor        xmm4, xmm4
   125 .combine
   126     movq        xmm2, [rax]
   127     movq        xmm3, [rdx]
   128     add         rax, rsi
   130     ; src * src_weight
   131     punpcklbw   xmm2, xmm4
   132     pmullw      xmm2, xmm0
   134     ; dst * dst_weight
   135     punpcklbw   xmm3, xmm4
   136     pmullw      xmm3, xmm1
   138     ; sum, round and shift
   139     paddw       xmm2, xmm3
   140     paddw       xmm2, [GLOBAL(tMFQE_round)]
   141     psrlw       xmm2, 4
   143     packuswb    xmm2, xmm4
   144     movq        [rdx], xmm2
   145     add         rdx, rdi
   147     dec         rcx
   148     jnz         .combine
   150     ; begin epilog
   151     pop         rdi
   152     pop         rsi
   153     RESTORE_GOT
   154     UNSHADOW_ARGS
   155     pop         rbp
   157     ret
   159 ;void vp8_variance_and_sad_16x16_sse2 | arg
   160 ;(
   161 ;    unsigned char *src1,          0
   162 ;    int            stride1,       1
   163 ;    unsigned char *src2,          2
   164 ;    int            stride2,       3
   165 ;    unsigned int  *variance,      4
   166 ;    unsigned int  *sad,           5
   167 ;)
   168 global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
   169 sym(vp8_variance_and_sad_16x16_sse2):
   170     push        rbp
   171     mov         rbp, rsp
   172     SHADOW_ARGS_TO_STACK 6
   173     GET_GOT     rbx
   174     push        rsi
   175     push        rdi
   176     ; end prolog
   178     mov         rax,        arg(0)          ; src1
   179     mov         rcx,        arg(1)          ; stride1
   180     mov         rdx,        arg(2)          ; src2
   181     mov         rdi,        arg(3)          ; stride2
   183     mov         rsi,        16              ; block height
   185     ; Prep accumulator registers
   186     pxor        xmm3, xmm3                  ; SAD
   187     pxor        xmm4, xmm4                  ; sum of src2
   188     pxor        xmm5, xmm5                  ; sum of src2^2
   190     ; Because we're working with the actual output frames
   191     ; we can't depend on any kind of data alignment.
   192 .accumulate
   193     movdqa      xmm0, [rax]                 ; src1
   194     movdqa      xmm1, [rdx]                 ; src2
   195     add         rax, rcx                    ; src1 + stride1
   196     add         rdx, rdi                    ; src2 + stride2
   198     ; SAD(src1, src2)
   199     psadbw      xmm0, xmm1
   200     paddusw     xmm3, xmm0
   202     ; SUM(src2)
   203     pxor        xmm2, xmm2
   204     psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
   205     paddusw     xmm4, xmm2
   207     ; pmaddubsw would be ideal if it took two unsigned values. instead,
   208     ; it expects a signed and an unsigned value. so instead we zero extend
   209     ; and operate on words.
   210     pxor        xmm2, xmm2
   211     movdqa      xmm0, xmm1
   212     punpcklbw   xmm0, xmm2
   213     punpckhbw   xmm1, xmm2
   214     pmaddwd     xmm0, xmm0
   215     pmaddwd     xmm1, xmm1
   216     paddd       xmm5, xmm0
   217     paddd       xmm5, xmm1
   219     sub         rsi,        1
   220     jnz         .accumulate
   222     ; phaddd only operates on adjacent double words.
   223     ; Finalize SAD and store
   224     movdqa      xmm0, xmm3
   225     psrldq      xmm0, 8
   226     paddusw     xmm0, xmm3
   227     paddd       xmm0, [GLOBAL(t128)]
   228     psrld       xmm0, 8
   230     mov         rax,  arg(5)
   231     movd        [rax], xmm0
   233     ; Accumulate sum of src2
   234     movdqa      xmm0, xmm4
   235     psrldq      xmm0, 8
   236     paddusw     xmm0, xmm4
   237     ; Square src2. Ignore high value
   238     pmuludq     xmm0, xmm0
   239     psrld       xmm0, 8
   241     ; phaddw could be used to sum adjacent values but we want
   242     ; all the values summed. promote to doubles, accumulate,
   243     ; shift and sum
   244     pxor        xmm2, xmm2
   245     movdqa      xmm1, xmm5
   246     punpckldq   xmm1, xmm2
   247     punpckhdq   xmm5, xmm2
   248     paddd       xmm1, xmm5
   249     movdqa      xmm2, xmm1
   250     psrldq      xmm1, 8
   251     paddd       xmm1, xmm2
   253     psubd       xmm1, xmm0
   255     ; (variance + 128) >> 8
   256     paddd       xmm1, [GLOBAL(t128)]
   257     psrld       xmm1, 8
   258     mov         rax,  arg(4)
   260     movd        [rax], xmm1
   263     ; begin epilog
   264     pop         rdi
   265     pop         rsi
   266     RESTORE_GOT
   267     UNSHADOW_ARGS
   268     pop         rbp
   269     ret
SECTION_RODATA
align 16
; 128-bit constant 128: rounding term added before the >> 8 scaling in
; vp8_variance_and_sad_16x16_sse2. NASM lacks ddq on some versions, so
; the value is spelled per assembler/endianness.
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
; Total blend weight: src_weight + dst_weight == 1 << MFQE_PRECISION.
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
; Rounding term added before the >> MFQE_PRECISION shift in the blend.
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08

mercurial